def fgsea(
    data: Union[MultimodalData, UnimodalData],
    log2fc_key: str,
    pathways: str,
    de_key: Optional[str] = "de_res",
    minSize: Optional[int] = 15,
    maxSize: Optional[int] = 500,
    nproc: Optional[int] = 0,
    seed: Optional[int] = 0,
    fgsea_key: Optional[str] = "fgsea_out",
) -> None:
    """Perform Gene Set Enrichment Analysis using fGSEA.

    This function calls the R package fgsea, which must be installed in R.

    Parameters
    ----------
    data: Union[``MultimodalData``, ``UnimodalData``]
        Single-cell or pseudo-bulk data.
    log2fc_key: ``str``
        Key in pre-computed DE results representing log2 fold change.
    pathways: ``str``
        Either a keyword or a path to a gene set file in GMT format. If a keyword, choose from "hallmark" and "canonical_pathways" (MSigDB H and C2/CP).
    de_key: ``str``, optional, default: ``"de_res"``
        Key name under which DE analysis results are stored. ``data.varm[de_key]`` should contain a record array of DE results.
    minSize: ``int``, optional, default: ``15``
        Minimal size of a gene set to consider.
    maxSize: ``int``, optional, default: ``500``
        Maximal size of a gene set to consider.
    nproc: ``int``, optional, default: ``0``
        Number of processes for parallel computation. If nproc > 0, set BPPARAM.
    seed: ``int``, optional, default: ``0``
        Random seed to make sure fGSEA results are reproducible.
    fgsea_key: ``str``, optional, default: ``"fgsea_out"``
        Key to use to store fGSEA results as a data frame.

    Returns
    -------
    ``None``

    Update ``data.uns``:
        ``data.uns[fgsea_key]``: fGSEA outputs sorted by padj.

    Examples
    --------
    >>> pg.fgsea(data, '3:log2FC', 'hallmark', fgsea_key='fgsea_res')
    """
    try:
        import rpy2.robjects as ro
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.packages import importr
        from rpy2.robjects.conversion import localconverter
    except ModuleNotFoundError as e:
        import sys
        logger.error(f"{e}\nNeed rpy2! Try 'pip install rpy2'.")
        sys.exit(-1)

    try:
        fgsea = importr("fgsea")
    except ModuleNotFoundError:
        import sys
        text = """Please install fgsea in order to run this function.\n
To install this package, start R and enter:\n
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("fgsea")"""
        logger.error(text)
        sys.exit(-1)

    ro.r(f"set.seed({seed})")
    pwdict = load_signatures_from_file(predefined_pathways.get(pathways, pathways))
    pathways_r = ro.ListVector(pwdict)
    log2fc = ro.FloatVector(data.varm[de_key][log2fc_key])
    log2fc.names = ro.StrVector(data.var_names)
    res = fgsea.fgsea(pathways_r, log2fc, minSize=minSize, maxSize=maxSize, nproc=nproc)
    unlist = ro.r(
        """
        function(df) {
            df$leadingEdge <- sapply(df$leadingEdge, function(x) {paste(unlist(x), collapse=',')})
            return(df)
        }
        """
    )
    with localconverter(ro.default_converter + pandas2ri.converter):
        res_df = ro.conversion.rpy2py(unlist(res))
    res_df.sort_values("padj", inplace=True)
    data.uns[fgsea_key] = res_df
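# Hedged usage sketch of the same rpy2 calls outside the function above, on toy data.
# Assumes R plus the Bioconductor 'fgsea' package are installed; the gene symbols and
# the single pathway below are made up for illustration only.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter

fgsea_pkg = importr("fgsea")
genes = ["GENE%d" % i for i in range(1, 61)]
pathways_r = ro.ListVector({"toy_pathway": ro.StrVector(genes[:20])})
stats = ro.FloatVector([(i - 30) / 10.0 for i in range(60)])  # toy ranking statistic
stats.names = ro.StrVector(genes)
res = fgsea_pkg.fgsea(pathways_r, stats, minSize=5, maxSize=500)
# Collapse the leadingEdge list column before converting, as fgsea() above does.
unlist = ro.r("function(df) {df$leadingEdge <- sapply(df$leadingEdge, "
              "function(x) paste(unlist(x), collapse=',')); df}")
with localconverter(ro.default_converter + pandas2ri.converter):
    res_df = ro.conversion.rpy2py(unlist(res))
print(res_df[["pathway", "pval", "padj"]])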
def SymMatrix_to_Matrix(matrix):
    # Flatten the symmetric matrix row by row into an R numeric vector and let
    # R's matrix() reshape it. Because the input is symmetric, R's column-major
    # fill yields the same result as a row-major fill would.
    v = robjects.FloatVector([e for row in matrix for e in row])
    r_obj = robjects.r['matrix'](v, nrow=matrix.dim)
    return r_obj
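# Hedged usage sketch for SymMatrix_to_Matrix(): it only needs an object that
# iterates row by row and exposes a .dim attribute, so a tiny hypothetical
# stand-in class is used here in place of the project's real SymMatrix type.
class ToySymMatrix(object):
    def __init__(self, rows):
        self.rows = rows
        self.dim = len(rows)

    def __iter__(self):
        return iter(self.rows)

r_mat = SymMatrix_to_Matrix(ToySymMatrix([[1.0, 0.5], [0.5, 1.0]]))
print(list(robjects.r['dim'](r_mat)))  # [2, 2]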
death_dic['Dead'] = 1 coeffs = [] pvalues = [] genes = [] ##This list tracks the gene names for i in range(len(final_genes[0])): kaplan = [] genes.append(final_genes[0][i][0]) for k, j in zip( clinical_and_files, final_genes ): ## These lists contain the clinical information and mRNA data in the same order. kaplan.append([k[1], k[2], k[3], k[4], k[5], j[i][1]]) data = [ ii[-1] for ii in kaplan ] ## Grabbing all the gene values for the current gene being analyzed ro.globalenv['expression'] = ro.FloatVector(data) res = ro.r( 'round(qnorm((rank(expression, na.last="keep")-0.5)/sum(!is.na(expression))), digit=5)' ) ## Perform inverse normal transformation inverse_norm = list(res) ## Convert robject to python list ## Prepare the variables for rpy2 ro.globalenv['gene'] = ro.FloatVector(inverse_norm) ro.globalenv['times'] = ro.IntVector([ii[0] for ii in kaplan]) ro.globalenv['died'] = ro.IntVector([death_dic[ii[1]] for ii in kaplan]) ro.globalenv['sex'] = ro.IntVector([ii[3] for ii in kaplan]) ro.globalenv['age'] = ro.IntVector([ii[4] for ii in kaplan]) res = ro.r('coxph(Surv(times,died) ~ gene + sex + age)' ) ## Perform Cox regression ## Parse the string of the result with python for the gene coefficient and pvalue for entry in str(res).split('\n'): try:
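# Hedged alternative to parsing str(res) line by line: pull the coefficient matrix
# out of summary(coxph(...)) by name instead. Assumes R's 'survival' package is
# loaded in the embedded session (the coxph/Surv calls above already require it)
# and that 'gene', 'sex', 'age', 'times' and 'died' are still assigned in globalenv.
ro.r('fit.coefs <- summary(coxph(Surv(times, died) ~ gene + sex + age))$coefficients')
gene_coeff = ro.r('fit.coefs["gene", "coef"]')[0]
gene_pvalue = ro.r('fit.coefs["gene", "Pr(>|z|)"]')[0]
print(gene_coeff, gene_pvalue)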
def main(args): theDB = DB(":memory:") for no, pe, files in os.walk(args[1]): for f in files: print("indexing: " + args[1] + "/" + f) inputData = FileStream(args[1] + "/" + f) lexer = JavaLexer(inputData) tokens = CommonTokenStream(lexer) parser = JavaParser(tokens) tree = parser.compilationUnit() theFileID = -1 with open(args[1] + "/" + f, 'r') as tehFile: theFileID = theDB.putFile(tehFile.read())[0] tokenList = JavaSourceIndexer(theDB, theFileID).visit(tree) #print(tokenList) theTestFiles[f] = tokenList problems = theTestFiles.keys() problems.sort() finds = 0.0 total = 0.0 diffSims = [] sameSims = [] for i in range(len(problems)): sameSim = 0 sameCount = 0 diffSim = 0 diffCount = 0 for j in range(len(problems)): if problems[j][:problems[j].find(".")] == problems[ i][:problems[i].find(".")] and problems[j] != problems[i]: sameSim += coSim(theTestFiles[problems[i]], theTestFiles[problems[j]]) sameCount += 1 else: diffSim += coSim(theTestFiles[problems[i]], theTestFiles[problems[j]]) diffCount += 1 sameSim /= sameCount diffSim /= diffCount if sameSim > diffSim: finds += 1.0 else: print(problems[i] + " was more similar to arbitrary code than to mutations of " + problems[i]) total += 1.0 diffSims.append(diffSim) sameSims.append(sameSim) print( str(finds / total) + " percent of files were more similar to mutations of themselves than to arbitary code." ) # t-test res = R.r['t.test'](R.FloatVector(sameSims), R.FloatVector(diffSims)) print( "The p-value of a student's t test, testing the difference in similarity between mutations of code, and arbitrary code is:" ) print(res.rx('p.value')[0][0]) print("The 95% confidence interval of the difference is:") print(res.rx('conf.int')[0])
print robj.r.mean(x) print "Here are some other stats" print "Sum" print robj.r.sum(x) print "Variance" print robj.r.var(x) # <headingcell level=3> # Part 3: Create and interact with multi-dimensional R objects # <codecell> # create R matrices v = robj.FloatVector(robj.r.rnorm(20)) m = robj.r.matrix(v, ncol = 2) print(m) print "According to R the column sums are" print robj.r.apply(m, 2, 'sum') # convert matrix into a numpy array m_np = np.array(m) print(m_np) # <codecell> # read in data as an R data.frame faithful = robj.DataFrame.from_csvfile('/media/sf_Dropbox/teaching/rpy/faithful.dat', sep=' ') print type(faithful) print faithful.names
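# Hedged follow-on: instead of poking at the R data.frame column by column, the
# whole thing can be handed to pandas. The converter API below is the rpy2 3.x
# form; older rpy2 releases used pandas2ri.ri2py() instead.
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

with localconverter(robj.default_converter + pandas2ri.converter):
    faithful_pd = robj.conversion.rpy2py(faithful)
print(faithful_pd.head())
print(faithful_pd.describe())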
def getPairwiseComparisons(dist1, dist2):
    # Two-sample Wilcoxon rank-sum test in R; index 2 of the returned htest
    # object is its 'p.value' component.
    return robjects.r["wilcox.test"](robjects.FloatVector(dist1),
                                     robjects.FloatVector(dist2))[2][0]
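# Hedged variant: the positional [2][0] above relies on p.value being the third
# component of R's htest object. Looking it up by name with rx2 is less brittle.
def getPairwiseComparisonsByName(dist1, dist2):
    res = robjects.r["wilcox.test"](robjects.FloatVector(dist1),
                                    robjects.FloatVector(dist2))
    return res.rx2("p.value")[0]

print(getPairwiseComparisonsByName([1.0, 2.0, 3.0, 4.0], [2.5, 3.5, 4.5, 5.5]))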
def run_fisher(args): ''' run Fisher's Exact test ''' sz_utils.make_dirs_if_necessary(args.outp) sz_utils.check_if_files_exist(args.ac_file) tables = sz_utils._count2table(args.ac_file)[0] task_q = mp.JoinableQueue() result_q = mp.Queue() create_procs(args.nproc, task_q, result_q, args.outp) sz_utils._assign_tables(tables, task_q, args.nproc) try: task_q.join() except KeyboardInterrupt: ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr") sys.exit() else: pvals, odds_ratios, log10_pvals = {}, {}, {} while args.nproc: file = result_q.get() with open(file, 'r') as fIN: for line in fIN: tmp_line = line.strip().split("\t") chr = tmp_line[0] pos = int(tmp_line[1]) pval = float(tmp_line[2]) odds_ratio = float(tmp_line[3]) log10_pval = tmp_line[4] if (chr, pos) not in pvals: pvals[chr, pos] = pval if (chr, pos) not in odds_ratios: odds_ratios[chr, pos] = odds_ratio if (chr, pos) not in log10_pvals: log10_pvals[chr, pos] = log10_pval os.remove(file) # pvals_split, odds_ratios_split = result_q.get() # pvals.update(pvals_split) # odds_ratios.update(odds_ratios_split) args.nproc -= 1 ColorText().info( "[poolseq_tk]: Running Fisher's Exact tests successfully\n", "stderr") # correcting raw p-values and make QQ plots ColorText().info( "[poolseq_tk]: multi-testing correction using %s method at %d%% level ..." % (args.adj_method, args.adj_cutoff * 100), "stderr") raw_pvals = [pvals[k] for k in sorted(pvals.iterkeys())] raw_pvals_vector = robjects.FloatVector(raw_pvals) padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method) ColorText().info(" [done]\n", "stderr") ColorText().info( "[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e" % (sz_utils.getFDR_BH(pvals, args.adj_cutoff)), "stderr") ColorText().info(" [done]\n", "stderr") # output p-values ColorText().info("[poolseq_tk]: output to files ...", "stderr") out_all = args.outp + ".fisher.all" out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100) out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff * 100) with open(out_all, 'w') as fALL, \ open(out_fdr, 'w') as fFDR, \ open(out_expect, 'w') as fEXPECT: for i, k in enumerate(sorted(pvals.iterkeys())): chr = k[0] pos = k[1] raw_pval = pvals[k] log_pval = log10_pvals[k] odds_ratio = odds_ratios[k] if padjust[i] <= args.adj_cutoff: sz_utils._results_outputter(fFDR, pos, chr, "\t".join(tables[k][1:3]), tables[k][3:], raw_pval, log_pval, padjust[i], odds_ratio) if ((args.oddsr_direction == "greater" and odds_ratios[k] > 1) or (args.oddsr_direction == "less" and odds_ratios[k] < 1)): sz_utils._results_outputter(fEXPECT, pos, chr, "\t".join(tables[k][1:3]), tables[k][3:], raw_pval, log_pval, padjust[i], odds_ratio) sz_utils._results_outputter(fALL, pos, chr, "\t".join(tables[k][1:3]), tables[k][3:], raw_pval, log_pval, padjust[i], odds_ratio) ColorText().info(" [done]\n", "stderr") ColorText().info("[poolseq_tk]: Program finishes successfully\n", "stderr")
doc_topic = model.doc_topic_ # calculate comments assigned to each topic topic_comments = np.dot(comments, doc_topic) ## plot results # import r devices #base = importr('base') rbarplot = robjects.r('barplot') #rprint = robjects.globalenv.get("print") #graphics = importr("graphics") # plots grdevices.png("./plots/topic_comments.png") rbarplot(robjects.FloatVector(topic_comments), xlab="Topics", ylab="Comments", main="Comments assigned to each topic", col="coral1") grdevices.dev_off() # Generate plots for other values of k ### k = 4 model = lda.LDA(n_topics=k, n_iter=500, random_state=1, eta=200 / float(len(vocab)), alpha=50 / float(k)) model.fit(X)
def ecdf(vectors, labels=None, colors=["red", "blue", "orange", "violet", "green", "brown"], xlab="", ylab="cumulative fraction", main="", legendWhere="topleft", lty=1, lwd=1, legendArgs=None, labelsIncludeN=True, **ecdfKwdArgs): """ Take a list of lists, convert them to vectors, and plots them sequentially on a CDF """ if ro is None: return #print "MEANS:", main #for vector, label in zip(convertToVectors, labels): # print label, numpy.mean(vector) def _expand(item): try: iter(item) return item except TypeError: return [item] * len(vectors) lty = _expand(lty) lwd = _expand(lwd) if not "xlim" in ecdfKwdArgs or ecdfKwdArgs["xlim"] is None: xlim = [ min(min(vector) for vector in vectors if len(vector) > 0), max(max(vector) for vector in vectors if len(vector) > 0) ] ecdfKwdArgs["xlim"] = xlim ecdfKwdArgs["xlim"] = ro.FloatVector(xlim) started = False for i, vector in enumerate(vectors): if len(vector) > 0: vector = ro.FloatVector(vector) ecdfKwdArgs.update({ "verticals": True, "do.points": False, "col.hor": colors[(i) % len(colors)], "col.vert": colors[(i) % len(colors)], "lty": lty[(i) % len(lty)], "lwd": lwd[(i) % len(lwd)] }) ecdf = r.ecdf(vector) if not started: r.plot(ecdf, main=main, xlab=xlab, ylab=ylab, **ecdfKwdArgs) started = True else: r.plot(ecdf, add=True, **ecdfKwdArgs) if labels is not None: if labelsIncludeN: labelsWithN = [] for i, label in enumerate(labels): labelsWithN.append(label + " (n=%d)" % len(vectors[i])) else: labelsWithN = labels legendArgs = asdict(legendArgs, {"cex": 0.7}) r.legend(legendWhere, legend=ro.StrVector(labelsWithN), lty=ro.IntVector(lty), lwd=ro.IntVector([lwdi * 2 for lwdi in lwd]), col=ro.StrVector(colors), bg="white", **legendArgs)
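# Hedged usage sketch for ecdf(): two toy samples with the default colours and
# line styles. Assumes the module-level rpy2 handles used inside the function
# (ro, r) are available and that an R graphics device can be opened.
import numpy
control = numpy.random.normal(0.0, 1.0, 300).tolist()
treated = numpy.random.normal(0.7, 1.0, 300).tolist()
ecdf([control, treated],
     labels=["control", "treated"],
     main="Toy ECDF comparison",
     xlab="value")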
# R vector of strings from rpy2.robjects.vectors import StrVector # Selectively install what needs to be install. # We are fancy, just because we can. # names_to_install = [x for packnames if not rpackages.isinstalled(x)] names_to_install = [x for x in packnames if not rpackages.isinstalled(x)] if len(names_to_install) > 0: utils.install_packages(StrVector(names_to_install)) MCDA = importr('MCDA') pi = robjects.r['pi'] print pi[0] res1 = robjects.FloatVector( [5490, 51.4, 8.5, 285, 6500, 70.6, 7, 288, 6489, 54.3, 7.5, 290]) performanceTable = robjects.r['matrix'](res1, nrow=3, ncol=4, byrow=True) performanceTable.rownames = robjects.StrVector(["Corsa", "Clio", "Fiesta"]) performanceTable.colnames = robjects.StrVector( ["Purchase Price", "Economy", "Aesthetics", "Boot Capacity"]) weights = robjects.FloatVector([0.35, 0.25, 0.25, 0.15]) weights.names = robjects.r['colnames'](performanceTable) criteriaMinMax = robjects.StrVector(["min", "max", "max", "max"]) positiveIdealSolutions = robjects.FloatVector( [0.179573776, 0.171636015, 0.159499658, 0.087302767]) negativeIdealSolutions = robjects.FloatVector( [0.212610118, 0.124958799, 0.131352659, 0.085797547])
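# Hedged numpy check of where positiveIdealSolutions / negativeIdealSolutions come
# from: standard TOPSIS vector normalisation followed by weighting, then the best
# and worst value per criterion according to criteriaMinMax. Toy recomputation only,
# independent of the MCDA package.
import numpy as np
table = np.array([[5490, 51.4, 8.5, 285],
                  [6500, 70.6, 7.0, 288],
                  [6489, 54.3, 7.5, 290]], dtype=float)
w = np.array([0.35, 0.25, 0.25, 0.15])
minmax = ["min", "max", "max", "max"]
weighted = w * table / np.sqrt((table ** 2).sum(axis=0))
pos = [weighted[:, j].min() if mm == "min" else weighted[:, j].max()
       for j, mm in enumerate(minmax)]
neg = [weighted[:, j].max() if mm == "min" else weighted[:, j].min()
       for j, mm in enumerate(minmax)]
print(np.round(pos, 6))   # ~ [0.179574, 0.171636, 0.159500, 0.087303]
print(np.round(neg, 6))   # ~ [0.212610, 0.124959, 0.131353, 0.085798]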
def main(): opt_parser = OptionParser() # Add Options. Required options should have default=None opt_parser.add_option("--in_prefix", dest="in_prefix", type="string", help="""Prefix of output files created from createAS_CountTables. In createAS_CountTables this was the -o option""", default=None) opt_parser.add_option("-i", dest="generic_file", type="string", help="""Run statistical tests on a generic table. A generic file with any type of value can also be used. The first line should be a header that starts with # and contains sample names.""", default=None) opt_parser.add_option("--generic", dest="samp_start_idx", type="int", help="""The samp_start_idx gives the 0-based index of the column containing the sample value.""", default=None) # opt_parser.add_option("--left_intron", # dest="left_input", # type="string", # help="""Resulting length-normalized file from createAS_CountTables.py, which # contains the exclusion and inclusion counts # for just the left side of an intron retention # event.""", # default=None) # opt_parser.add_option("--right_intron", # dest="right_input", # type="string", # help="""Resulting length-normalized file from createAS_CountTables.py, which # contains the exclusion and inclusion counts # for just the right side of an intron retention # event.""", # default=None) opt_parser.add_option("--all_psi_output", dest="all_psi_output", type="string", help="""Output file that will contain the PSI values for all events and samples. The last two columns will correspond to the raw-pvalue and corrected p-value. If a generic file is used, this will be the output file""", default=None) opt_parser.add_option("--simple_IR", dest="simple_IR", action="store_true", help="""Will test intron_retention events using total inclusion/exclusion reads and will not test the left and right side separately. It will still test for thresholds for both the left and right side""", default=False) opt_parser.add_option("--thresh", dest="threshold", type="float", help="""Threshold for minimum abundance in an event. Default=%d""" % DEF_THRESH, default=DEF_THRESH) opt_parser.add_option("--mt_correction", dest="mt_method", type="string", help="""Multiple testing correction Method: "BH" - Benjamini & Hochberg, "bonferroni". Must select these strings as the option""", default=None) opt_parser.add_option("--which_test", dest="which_test", type="string", help="""Which test to use. Either "t-test" or "Wilcoxon". Default=%s""" % DEF_TEST, default=DEF_TEST) opt_parser.add_option("--permutation", dest="permutation", action="store_true", help="""Will do permutation tests to get empircal p-value""", default=False) opt_parser.add_option("--samp2batch", dest="samp2batch_file", type="string", help="""If doing a permutation test, will account for potential batch effects""", default=None) opt_parser.add_option("--jcn_seq_len", dest="jcn_seq_len", type="int", help="""Junction length. Used as an option in getASEventReadCounts.py. Required if doing permutation approach""", default=None) opt_parser.add_option("--delta_thresh", dest="delta_thresh", type="float", help="""Minimum PSI(or generic value) difference between the maximum and minimum values for a given event to be considered a change. This should probably be less than the delta threshold used to filter significantly associated events. Default=%s""" % DEF_DPSI_THRESH, default=DEF_DPSI_THRESH) opt_parser.add_option("--sample_set1", dest="sample_set1", type="string", help="""Comma delimited list of samples in set 1 or a file with a list of names, one per line. 
Names must be in header columns of input files.""", default=None) opt_parser.add_option("--sample_set2", dest="sample_set2", type="string", help="""Comma delimited list of samples in set 2 or a file with a list of names, one per line. Names must be in header columns of input files.""", default=None) opt_parser.add_option("--as_only", dest="as_only", action="store_true", help="""Will output the psi table just to get a sense of alternative splicing. It will not perform any statistical analyses. Names must be in header columns of input files.""", default=None) opt_parser.add_option("--html_dir", dest="html_dir", type="string", help="""Optional: location to put html output table and associated images""", default=None) opt_parser.add_option("--html_out_sign_thresh", dest="sign_thresh", type="float", help="""Significance threshold of q-value for printed out html_table. DEF=%.2f""" % DEF_SIGN_CUTOFF, default=DEF_SIGN_CUTOFF) opt_parser.add_option("--pdf", dest="make_pdf", action="store_true", help="""Optional: Will create images as pdf instead of .png as the default.""", default=None) (options, args) = opt_parser.parse_args() # validate the command line arguments # opt_parser.check_required("-i") opt_parser.check_required("--all_psi_output") opt_parser.check_required("--mt_correction") permutation = options.permutation if permutation: opt_parser.check_required("--jcn_seq_len") jcn_seq_len = options.jcn_seq_len if options.in_prefix: prefix = options.in_prefix input_file = open("%s_AS_exclusion_inclusion_counts_lenNorm.txt" % prefix) left_input_file_name = "%s_left_intron_counts_lenNorm.txt" % prefix right_input_file_name = "%s_right_intron_counts_lenNorm.txt" % prefix # if permutation: # raw_input_file = open("%s_AS_exclusion_inclusion_counts.txt" % prefix) # raw_left_input_file_name = "%s_left_intron_counts.txt" % prefix # raw_right_input_file_name = "%s_right_intron_counts.txt" % prefix else: if not options.generic_file: print "Must include either --in_prefix or -i" opt_parser.print_help() sys.exit(1) input_file = open(options.generic_file) left_input_file_name = None right_input_file_name = None sum_thresh = options.threshold delta_thresh = options.delta_thresh simple_IR = options.simple_IR samp2batch = None if options.samp2batch_file: samp2batch = parseBatchFile(options.samp2batch_file) html_out_dir = options.html_dir html_out_table_name = None if html_out_dir: exec "import rpy2.robjects.lib.ggplot2 as ggplot2" in globals() html_out_dir = formatDir(html_out_dir) if not os.path.exists(html_out_dir): os.mkdir(html_out_dir) html_out_table_name = html_out_dir + "/index.html" sign_thresh = options.sign_thresh html_out = None if html_out_table_name: html_out = open(html_out_table_name, "w") initiateHTML_table(html_out) image_file_type = "png" if options.make_pdf: image_file_type = "pdf" as_only = options.as_only if not as_only: opt_parser.check_required("--sample_set1") opt_parser.check_required("--sample_set2") in_sample_set1 = options.sample_set1 in_sample_set2 = options.sample_set2 # JuncBASE table default samp_start_idx = 11 isGeneric = False if options.samp_start_idx: samp_start_idx = options.samp_start_idx isGeneric = True if permutation and isGeneric: print "Permutation test is only for JuncBASE tables" opt_parser.print_help() sys.exit(1) left_input_file = None right_input_file = None if left_input_file_name is None: print "Warning: No intron retention file given as input. Will not calculate IR events." 
else: left_input_file = open(left_input_file_name) right_input_file = open(right_input_file_name) # if permutation: # raw_left_input_file = open(raw_left_input_file_name) # raw_right_input_file = open(raw_right_input_file_name) all_psi_output = open(options.all_psi_output, "w") method = options.mt_method if method != "BH" and method != "bonferroni": print "Wrong method indicated." opt_parser.print_help() sys.exit(1) which_test = options.which_test if which_test != "Wilcoxon" and which_test != "t-test": print "Wrong method indicated." opt_parser.print_help() sys.exit(1) if which_test == "Wilcoxon": which_test = "wilcox.test" if which_test == "t-test": which_test = "t.test" idx2sample = {} # {event_type:(set1_medianPSI, set2medianPSI),]} event_type2PSI_vals_4_set = {} # {event:psi_vals_idx} event2PSI_val_idx = {} # {event_type:[pval]} event_type2pvals = {} # {event::pval_idx} event2idx = {} # {event:{col:psi}} event2col2psi = {} # {event:{col:sum_counts}} header = None total_samples = None lenNorm_lines = input_file.readlines() # if permutation: # raw_lines = raw_input_file.readlines() num_lines = len(lenNorm_lines) for j in xrange(num_lines): line = formatLine(lenNorm_lines[j]) if line.startswith("#"): header = line headerList = header.split("\t") if html_out: writeHTMLHeader(html_out, headerList) sampleList = headerList[samp_start_idx:] # Get sample idx for i in range(len(sampleList)): idx2sample[i] = sampleList[i] if as_only: # These are arbitrarily chosen and not really used in_sample_set1 = sampleList[0] in_sample_set2 = sampleList[1] # If there were no batches, all samples are in the same batch if permutation: if samp2batch is None: samp2batch = {}.fromkeys(sampleList, '0') # for sample in sample_set1: # idx2sample[sampleList.index(sample)] = sample # for sample in sample_set2: # idx2sample[sampleList.index(sample)] = sample sample_set1 = getSamples(in_sample_set1) sample_set2 = getSamples(in_sample_set2) sample_set1_checked = checkSamples(sampleList, sample_set1) sample_set2_checked = checkSamples(sampleList, sample_set2) # The threshold for the number of samples that need to have expressed AS # events in order to consider testing samp_set_thresh1 = float(len(sample_set1_checked)) * PROP_NON_NA samp_set_thresh2 = float(len(sample_set2_checked)) * PROP_NON_NA if permutation: # batch2setLabels : {batch:{"idx":[indexes in batch], # "samp_set":[parallele list indicating which sample set it is in]} (batch2setLabels, batch2len) = buildBatchDict(sampleList, samp2batch, sample_set1_checked, sample_set2_checked) continue line_list = line.split("\t") # if permutation: # raw_line_list = formatLine(raw_lines[j]).split("\t") # # if line_list[5] != raw_line_list[5] or line_list[6] != raw_line_list[6]: # print "Count files (raw and lenNorm) do not match up)" # opt_parser.print_help() # sys.exit(1) event = "\t".join(line_list[0:samp_start_idx]) counts = line_list[samp_start_idx:] if permutation: total_counts = [] if event in event2idx: print "Warning: Skipping duplicate event: %s" % event continue if isGeneric: event_type = "generic" else: event_type = getEventType(event) if event_type not in event_type2pvals: event_type2pvals[event_type] = [] event_type2PSI_vals_4_set[event_type] = [] total_samples = len(counts) # Fill PSI dict min_psi = INFINITY max_psi = -INFINITY set1_psis = [] set2_psis = [] all_psis = [] na_count = 0 for i in range(total_samples): if isGeneric: # psi is actually a generic value that is in the table psi = counts[i] else: (psi, sum_ct) = getPSI_sample_sum(counts[i], sum_thresh) 
if psi != NA: psi_val = float(psi) all_psis.append(psi_val) if psi_val < min_psi: min_psi = psi_val if psi_val > max_psi: max_psi = psi_val else: all_psis.append(NA) na_count += 1 if event in event2col2psi: event2col2psi[event][i] = psi else: event2col2psi[event] = {i:psi} if isGeneric: if psi < sum_thresh: continue else: if permutation: # Compare samples groups together in a wilcoxon rank sum test [col_excl, col_incl] = map(int,counts[i].split(";")) total_count = col_excl + col_incl total_counts.append(total_count) if total_count < sum_thresh: continue # Both samples have to be non-zero # if belowThreshold(sum_thresh, col_excl, col_incl): # continue if idx2sample[i] in sample_set1: if event2col2psi[event][i] != NA: set1_psis.append(event2col2psi[event][i]) elif idx2sample[i] in sample_set2: if event2col2psi[event][i] != NA: set2_psis.append(event2col2psi[event][i]) if as_only: if (float(total_samples - na_count)/total_samples) < PROP_NON_NA: continue else: if len(set1_psis) <= samp_set_thresh1 or len(set2_psis) <= samp_set_thresh2: continue if (max_psi - min_psi) < delta_thresh: continue if as_only: cur_len = len(event_type2pvals[event_type]) event_type2pvals[event_type].append(1.0) event2idx[event] = cur_len psi_vals_cur_len = len(event_type2PSI_vals_4_set[event_type]) event_type2PSI_vals_4_set[event_type].append((0.0,0.0)) event2PSI_val_idx[event] = psi_vals_cur_len continue psi_vals_cur_len = len(event_type2PSI_vals_4_set[event_type]) event_type2PSI_vals_4_set[event_type].append((robjects.r['median'](robjects.FloatVector(set1_psis))[0], robjects.r['median'](robjects.FloatVector(set2_psis))[0])) event2PSI_val_idx[event] = psi_vals_cur_len # Calculate p-val for intron retention later if event_type == "intron_retention": continue # cur_len2 = len(event_type2col2pvals[event_type][j]) # if event in event2pairs2idx: # event2pairs2idx[event][(0,j)] = cur_len # else: # event2pairs2idx[event] = {(0,j):cur_len} # if event in event2col2idx: # event2col2idx[event][j] = cur_len2 # else: # event2col2idx[event] = {j:cur_len2} # cur_len = len(event_type2pvals[event_type]) try: if permutation: # incl_iso_len = getEventInclLen(event, jcn_seq_len) null_dist = get_null_dist(line_list[samp_start_idx:], total_counts, all_psis, which_test, batch2setLabels, batch2len, sum(map(ord,event)), samp_set_thresh1, samp_set_thresh2) this_stat = compareTest(which_test, set1_psis, set2_psis, give_pvals=False) # For debugging # fig = plt.figure() # ax = fig.add_subplot(111) # ax.hist(null_dist, 100, normed=1) # plt.show() raw_pval = get_emp_pval(null_dist, this_stat) else: raw_pval = compareTest(which_test, set1_psis, set2_psis) except: print "Warning: Event not tested: %s" % event continue if robjects.r["is.nan"](raw_pval)[0]: continue event_type2pvals[event_type].append(raw_pval) event2idx[event] = cur_len # Now calculate intron retention if (not as_only) and (not isGeneric): if left_input_file: left_events2counts = getIntronLeftRightCounts(left_input_file, samp_start_idx) right_events2counts = getIntronLeftRightCounts(right_input_file, samp_start_idx) # if permutation: # raw_left_events2counts = getIntronLeftRightCounts(raw_left_input_file, samp_start_idx) # raw_right_events2counts = getIntronLeftRightCounts(raw_right_input_file, samp_start_idx) else: left_events2counts = {} right_events2counts = {} # if permutation: # raw_left_events2counts = {} # raw_right_events2counts = {} for event in left_events2counts: if event not in right_events2counts: continue # If the event is not in this dictionary, the sum of the left 
and # right counts did not pass the thresholds. if event not in event2PSI_val_idx: continue set1_psis_left = [] set2_psis_left = [] set1_psis_right = [] set2_psis_right = [] if simple_IR: set1_total_psis = [] set2_total_psis = [] left_total_counts = [] right_total_counts = [] left_all_psis = [] right_all_psis = [] if simple_IR: total_counts = [] total_str_counts = [] # to mimic left_events2counts structure all_psis = [] left_min_psi = 200 left_max_psi = -1 right_min_psi = 200 right_max_psi = -1 for j in range(total_samples): [left_col_excl, left_col_incl] = map(int,left_events2counts[event][j].split(";")) [right_col_excl, right_col_incl] = map(int,right_events2counts[event][j].split(";")) left_total = left_col_excl + left_col_incl right_total = right_col_excl + right_col_incl left_total_counts.append(left_total) right_total_counts.append(right_total) if simple_IR: # the exclusion counts are not necessarily the same on both # left and right because there may be other splice junctions # associated with the 5' and 3' splice site. For simplicity, # I will average the two values total_excl = int(round((left_col_excl + right_col_excl)/2.0)) total_incl = left_col_incl + right_col_incl total_counts.append(total_excl + total_incl) # Both samples have to be non-zero # if (belowThreshold(sum_thresh, left_col_excl, left_col_incl) # or # belowThreshold(sum_thresh, right_col_excl, right_col_incl)): # continue (left_psi, sum_ct) = getPSI_sample_sum(left_events2counts[event][j], sum_thresh) (right_psi, sum_ct) = getPSI_sample_sum(right_events2counts[event][j], sum_thresh) if simple_IR: (total_psi, total_sum_ct) = getPSI_sample_sum("%d;%d" % (total_excl, total_incl), sum_thresh) total_str_counts.append("%d;%d" % (total_excl, total_incl)) if left_psi != NA: left_psi_val = float(left_psi) left_all_psis.append(left_psi_val) if left_psi_val < left_min_psi: left_min_psi = left_psi_val if left_psi_val > left_max_psi: left_max_psi = left_psi_val else: left_all_psis.append(NA) if right_psi != NA: right_psi_val = float(right_psi) right_all_psis.append(right_psi_val) if right_psi_val < right_min_psi: right_min_psi = right_psi_val if right_psi_val > right_max_psi: right_max_psi = right_psi_val else: right_all_psis.append(NA) if simple_IR: if left_psi == NA or right_psi == NA: all_psis.append(NA) else: all_psis.append(float(total_psi)) if left_total < sum_thresh or right_total < sum_thresh: continue if idx2sample[j] in sample_set1: if left_psi != NA: set1_psis_left.append(left_psi) if right_psi != NA: set1_psis_right.append(right_psi) if simple_IR: if left_psi != NA and right_psi != NA: set1_total_psis.append(total_psi) elif idx2sample[j] in sample_set2: if left_psi != NA: set2_psis_left.append(left_psi) if right_psi != NA: set2_psis_right.append(right_psi) if simple_IR: if left_psi != NA and right_psi != NA: set2_total_psis.append(total_psi) if len(set1_psis_left) <= samp_set_thresh1 or len(set1_psis_right) <= samp_set_thresh1\ or len(set2_psis_left) <= samp_set_thresh2 or len(set2_psis_right) <= samp_set_thresh2: continue if (left_max_psi - left_min_psi) < delta_thresh: continue if (right_max_psi - right_min_psi) < delta_thresh: continue cur_len = len(event_type2pvals["intron_retention"]) try: if permutation: if simple_IR: null_dist = get_null_dist(total_str_counts, total_counts, all_psis, which_test, batch2setLabels, batch2len, sum(map(ord,event)), samp_set_thresh1, samp_set_thresh2) this_stat = compareTest(which_test, set1_total_psis, set2_total_psis, give_pvals=False) pval = get_emp_pval(null_dist, this_stat) 
else: # incl_iso_len = getEventInclLen(event, jcn_seq_len) null_dist = get_null_dist(left_events2counts[event], left_total_counts, left_all_psis, which_test, batch2setLabels, batch2len, sum(map(ord,event)), samp_set_thresh1, samp_set_thresh2) # # For debugging # fig = plt.figure() # ax = fig.add_subplot(111) # ax.hist(null_dist, 100, normed=1) # plt.show() this_stat = compareTest(which_test, set1_psis_left, set2_psis_left, give_pvals=False) left_pval = get_emp_pval(null_dist, this_stat) null_dist = get_null_dist(right_events2counts[event], right_total_counts, right_all_psis, which_test, batch2setLabels, batch2len, sum(map(ord,event)), samp_set_thresh1, samp_set_thresh2) # # For debugging # fig = plt.figure() # ax = fig.add_subplot(111) # ax.hist(null_dist, 100, normed=1) # plt.show() this_stat = compareTest(which_test, set1_psis_right, set2_psis_right, give_pvals=False) right_pval = get_emp_pval(null_dist, this_stat) else: if simple_IR: pval = compareTest(which_test, set1_total_psis, set2_total_psis) else: left_pval = compareTest(which_test, set1_psis_left, set2_psis_left) right_pval = compareTest(which_test, set1_psis_right, set2_psis_right) except: print "Warning: Event not tested: %s" % event continue if simple_IR: if robjects.r["is.nan"](pval)[0]: continue else: combined_pval = pval else: if robjects.r["is.nan"](left_pval)[0] or robjects.r["is.nan"](right_pval)[0]: continue else: # Old combined p_val method # combined_pval = (left_pval + right_pval) - left_pval * right_pval combined_pval = max(left_pval, right_pval) event_type2pvals["intron_retention"].append(combined_pval) event2idx[event] = cur_len # All pairs have been evaluated, so now do multiple testing correction on # everything event_type2adjusted_pvals = {} event_type2col2adjusted_pvals = {} # Used for printing boxplots data_counter = 0 for event_type in event_type2pvals: if as_only: event_type2adjusted_pvals[event_type] = list(event_type2pvals[event_type]) else: event_type2adjusted_pvals[event_type] = robjects.r['p.adjust'](robjects.FloatVector(event_type2pvals[event_type]), method) # Now go through all events and print out pvals all_psi_output.write(header) if as_only: all_psi_output.write("\n") else: all_psi_output.write("\tset1_med\tset2_med\tdelta_val\traw_pval\tcorrected_pval\n") for event in event2idx: if isGeneric: event_type = "generic" else: event_type = getEventType(event) this_idx = event2idx[event] if this_idx == NA: psi_vals = [] for i in range(total_samples): psi_vals.append(event2col2psi[event][i]) outline = "%s\t%s\tNA\tNA\n" % (event, "\t".join(psi_vals)) all_psi_output.write(outline) continue psi_vals = [] for i in range(total_samples): psi_vals.append(event2col2psi[event][i]) outline = "%s\t%s" % (event, "\t".join(psi_vals)) if as_only: outline += "\n" all_psi_output.write(outline) continue # Add median PSI and delta PSI values this_psi_vals_idx = event2PSI_val_idx[event] outline += "\t%.2f\t%.2f\t%.2f" % (event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][0], event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][1], event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][1] - event_type2PSI_vals_4_set[event_type][this_psi_vals_idx][0]) outline += "\t%f\t%f\n" % (event_type2pvals[event_type][this_idx], event_type2adjusted_pvals[event_type][this_idx]) all_psi_output.write(outline) if html_out: if event_type2adjusted_pvals[event_type][this_idx] < sign_thresh: data_counter = printDataToHTML(grdevices, html_out_dir, html_out, outline, samp_start_idx, idx2sample, sample_set1, sample_set2, data_counter, 
image_file_type) all_psi_output.close() sys.exit(0)
def meanVar(_files, _gff_file, _output): NFILE = len(_files) if NFILE == 1: sys.stderr.write("Need at least two samples for each group.\n") sys.exit(1) ## Dictionary of gene counts _dict_counts = dict() _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False) idx = 0 count = 0 transcript = set() cur_line = None lines = 0 for feature in _gff_file: lines += 1 if feature.type in GENE or lines == num_lines: if len(transcript) > 1: _dict_counts[cur_line.name] = [0] * NFILE _genes[cur_line.iv] += cur_line.name count += 1 cur_line = feature transcript.clear() if feature.type in EXON: transcript.add(feature.attr["Parent"]) print "Number of genes", count _file_raw_count = open(_output + '.rawcounts', 'w') _file_nb_count = open(_output + '.nbcounts', 'w') ## This loop read through the input list and call countbam for each input file for f in _files: bam_file = HTSeq.BAM_Reader(f) _dict_counts = countbam(bam_file, _genes, _dict_counts, idx) idx += 1 sys.stderr.write("Library %d has generated.\n" % idx) ## Print raw counts in file specified by <out> for key, value in sorted(_dict_counts.iteritems()): _file_raw_count.write(key + "\t" + "\t".join(map(str, value)) + "\n") _file_raw_count.close() ## Calculate group mean and variance list_mean = list() list_var = list() for key, value in sorted(_dict_counts.iteritems()): list_mean.append(np.mean(np.array(value))) list_var.append(np.var(np.array(value))) ## Computer loess esimates ## The following code is using rpy2 module a = robjects.FloatVector(list_mean) b = robjects.FloatVector(list_var) df = robjects.DataFrame({"mean": a, "var": b}) non0_df = df.rx(df.rx2("mean").ro > 0, True) ## subsetting if mean > 0 loess_fit = r.loess("var ~ mean", data=non0_df, degree=2) var_pred = r.predict(loess_fit, a) # This loop overwrite global variable dict_counts for recoding new count data count_idx = 0 for key, value in sorted(_dict_counts.iteritems()): n = math.pow(list_mean[count_idx], 2) / (var_pred[count_idx] - list_mean[count_idx]) n = int(n) # n: number of failures if n <= 0: _dict_counts[key] = [0] * NREPS else: p = n / float(n + list_mean[count_idx]) # p: prob of success _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist() count_idx += 1 for key, value in sorted(_dict_counts.iteritems()): _file_nb_count.write(key + "\t" + "\t".join(map(str, value)) + "\n") _file_nb_count.close() _file_raw_count.close() return _dict_counts
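# Hedged check of the negative-binomial parameterization used above: with
# n = mean^2 / (var - mean) and p = n / (n + mean), scipy's nbinom reproduces the
# requested mean and (loess-smoothed) variance, provided var > mean. Toy values only.
from scipy.stats import nbinom
mean, var = 50.0, 120.0                    # overdispersed: var > mean
n = mean ** 2 / (var - mean)
p = n / (n + mean)
m_check, v_check = nbinom.stats(n, p, moments='mv')
print(m_check, v_check)                    # ~50.0, ~120.0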
def ssplines(y, x, lambd=0):
    """
    ssplines(y, x, lambd=0) fits a cubic smoothing spline, i.e. the solution of

        minimize (1/2)||y - f(x)||^2 + lambda * ||D^2 f(x)||_2^2,

    with problem data y and lambda, lambda > 0. This function uses rpy2 to call
    the smoothing-spline implementation in R (smooth.spline).

    Input arguments:
        - y: n-vector; original signal, dependent variable y(x)
        - x: n-vector; independent variable
        - lambd: scalar; positive regularization parameter

    Output arguments:
        list[0]
            - y_ss: n-vector; filtered solution
            - dy_ss: n-vector; derivative of the filtered solution
        list[1]
            - residual: l-2 norm of (y - f(x))
            - reg_residual: l-2 norm of D^2 f(x)
        list[2]
            - GCV: generalized cross-validation score
            - df: effective degrees of freedom

    Author: Alexandre Cortiella
    Affiliation: University of Colorado Boulder
    Department: Aerospace Engineering Sciences
    Date: 11/09/2020
    Version: v1.0
    Updated: 11/09/2020
    """
    import numpy as np
    from numpy.linalg import norm
    from scipy import interpolate

    # Transform into R vectors
    r_y = robjects.FloatVector(y)
    r_x = robjects.FloatVector(x)

    # Create smooth.spline object with specific inputs
    r_smooth_spline = robjects.r['smooth.spline']  # extract R function
    kwargs = {"x": r_x, "y": r_y, "lambda": float(lambd)}
    spline1 = r_smooth_spline(**kwargs)

    # Compute filtered signal for a specific lambda
    y_ss = np.array(spline1.rx2('y'))
    df = np.array(spline1.rx2('df'))

    # Compute its derivatives via an interpolating cubic spline
    tck = interpolate.splrep(r_x, y_ss, k=3, s=0)
    dy_ss = interpolate.splev(r_x, tck, der=1)
    ddy_ss = interpolate.splev(r_x, tck, der=2)

    # Compute residuals
    residual = norm(y - y_ss)
    reg_residual = norm(ddy_ss)

    # Compute GCV
    m = len(y)
    GCV = (m * norm(y - y_ss) ** 2) / (m - df) ** 2

    return [(y_ss, dy_ss), (residual, reg_residual), (GCV, df)]
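# Hedged usage sketch: sweep a few lambda values and keep the one with the smallest
# GCV score that ssplines() returns. Toy data; assumes the module-level imports the
# function relies on (robjects and a working R installation) are in place.
import numpy as np
x = np.linspace(0.0, 1.0, 200)
y = np.sin(2 * np.pi * x) + 0.1 * np.random.randn(200)
lambdas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
gcv_scores = []
for lam in lambdas:
    (_, _), (_, _), (gcv, _) = ssplines(y, x, lambd=lam)
    gcv_scores.append(float(gcv))
best_lambda = lambdas[int(np.argmin(gcv_scores))]
print("lambda with smallest GCV:", best_lambda)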
def get_heatmap(Heatmap, ColSideMatrix=False, RowSideMatrix=False, ColorScheme="bluered", BreakBegin=-1.0, BreakEnd=1.0, scale="row", key=True, keysize=1.0, symbreaks=False, density="none", symkey=False, trace="none", cexRow=0.75, cexCol=0.01, Rowv=True, Colv=True, BottomMargin=5, RightMargin=10, Legend=False): ''' ''' ro.r['source'](os.path.dirname(os.path.realpath(__file__)) + "/Heatmap3.R") ro.r['library']('gplots') ro.r['library']('devtools') Breaks = ro.FloatVector(list(arange(BreakBegin, BreakEnd + 0.1, 0.001))) Cluster = ro.r('function(c) {hclust(c,method="average")}') Distance = ro.r('function(c) {dist(c,method="euclidean")}') if ColSideMatrix and not RowSideMatrix: ro.r['heatmap.3'](Heatmap, ColSideColors=ColSideMatrix, col=ro.r[ColorScheme](len(Breaks) - 1), breaks=Breaks, hclustfun=Cluster, distfun=Distance, scale=scale, key=key, keysize=keysize, symbreaks=symbreaks, density=density, symkey=symkey, trace=trace, cexRow=cexRow, cexCol=cexCol, Rowv=Rowv, Colv=False, margins=ro.IntVector([BottomMargin, RightMargin])) elif RowSideMatrix: ro.r['heatmap.3'](Heatmap, ColSideColors=ColSideMatrix, RowSideColors=RowSideMatrix, col=ro.r[ColorScheme](len(Breaks) - 1), breaks=Breaks, hclustfun=Cluster, distfun=Distance, scale=scale, key=key, keysize=keysize, symbreaks=symbreaks, density=density, symkey=symkey, trace=trace, cexRow=cexRow, cexCol=cexCol, Rowv=Rowv, Colv=False, margins=ro.IntVector([BottomMargin, RightMargin])) else: ro.r['heatmap.3'](Heatmap, col=ro.r[ColorScheme](len(Breaks) - 1), breaks=Breaks, hclustfun=Cluster, distfun=Distance, scale=scale, key=key, keysize=keysize, symbreaks=symbreaks, density=density, symkey=symkey, trace=trace, cexRow=cexRow, cexCol=cexCol, Rowv=Rowv, Colv=Colv, margins=ro.IntVector([BottomMargin, RightMargin])) if Legend: ncol = 1 if len(Legend.values()) >= 6: ncol = 2 Fill = ro.StrVector(Legend.values()) Legend = ro.StrVector(Legend.keys()) ro.r['legend']("topright", legend=Legend, fill=Fill, border=False, bty="n", cex=0.6, ncol=ncol, **{ 'y.intersp': 0.7 }) return
def run_main(sig_info=None, gctx = None, allele_col = None, o = None, r = None, c = None, i = None, conn_null = None, ie_col = None, ie_filter = None, num_reps = None, cell_id = None, plate_id = None): #default values i = int(i) if i != None else int(1000) ie_col = str(ie_col) if ie_col != None else str(x_ie_a549) ie_filter = float(ie_filter) if ie_filter != None else float(0.0) num_reps = int(num_reps) if num_reps != None else int(3) sig_info_file = open(sig_info) output_file_prefix = open(o + ".txt", "w") # Output distribution files controls = grp.grp.read(c) reference_test_filename = r ref2test_allele = None if reference_test_filename: ref2test_allele = parseRefTestFile(reference_test_filename) if ref2test_allele == None: print("Error reading in comparisons file") sys.exit() this_gctx = parse(gctx) # this_gctx.read() num_iterations = int(i) num_reps = int(num_reps) conn_null_input = conn_null if conn_null_input: conn_nulls_from_input_str = grp.grp.read(conn_null_input) conn_nulls_from_input = map(float, conn_nulls_from_input_str) (allele2distil_id, allele2WT, allele2gene, allele2cell_id, WT_alleles) = parse_sig_info(sig_info_file, ref2test_allele, allele_col, ie_col, ie_filter, cell_id, plate_id) clean_controls = [] for this_control in controls: if this_control in allele2distil_id: clean_controls.append(this_control) #calculates if no inputs replicate_null_dist, connectivity_null_dist = getNullDist(this_gctx, allele2distil_id, clean_controls, num_iterations, num_reps) #overwrites conn_null_dist if its an input if conn_null_input: connectivity_null_dist = conn_nulls_from_input if not conn_null: conn_null_dist_out = open(o + "_conn_null.txt", "w") for x in connectivity_null_dist: conn_null_dist_out.write("%f\n" % x) conn_null_dist_out.close() WT_dict, wt_rep_pvals, wt_ordered = buildWT_dict(this_gctx, allele2distil_id, WT_alleles, replicate_null_dist, num_reps) # Print header to output file output_file_prefix.write("gene\tmut\tmut_rep\twt_rep\tmut_wt_connectivity\t") output_file_prefix.write("wt\tcell_line\t") output_file_prefix.write("mut_wt_rep_pval\tmut_wt_conn_null_pval\twt_mut_rep_vs_wt_mut_conn_pval\tkruskal_diff\t") output_file_prefix.write("mut_wt_rep_c_pval\tmut_wt_conn_null_c_pval\twt_mut_rep_vs_wt_mut_conn_c_pval\n") mut_rep_pvals = [] mut_wt_rep_pvals = [] mut_wt_conn_pvals = [] mut_wt_rep_vs_wt_mut_conn_pvals = [] outlines = [] # Build comparison for allele in allele2WT: # Don't calculate for the WT allele if allele == allele2WT[allele]: continue mut_rankpt, mut_rankpt_dist = getSelfConnectivity(this_gctx, allele2distil_id[allele], num_reps) self_pval = getPairwiseComparisons(mut_rankpt_dist, replicate_null_dist) mut_rep_pvals.append(self_pval) mut_wt_conn_rankpt, mut_wt_conn_dist = getConnectivity(this_gctx, allele2distil_id[allele], allele2distil_id[allele2WT[allele]], num_reps) conn_pval = getPairwiseComparisons(mut_wt_conn_dist, connectivity_null_dist) mut_wt_conn_pvals.append(conn_pval) mut_wt_rep_pval = getPairwiseComparisons(mut_rankpt_dist, WT_dict[allele2WT[allele]]["wt_rep_dist"]) mut_wt_rep_pvals.append(mut_wt_rep_pval) wt_mut_rep_vs_wt_mut_conn_pval = getKruskal(WT_dict[allele2WT[allele]]["wt_rep_dist"], mut_rankpt_dist, mut_wt_conn_dist) mut_wt_rep_vs_wt_mut_conn_pvals.append(wt_mut_rep_vs_wt_mut_conn_pval) medians = [] medians.append(median(WT_dict[allele2WT[allele]]["wt_rep_dist"])) medians.append(median(mut_rankpt_dist)) medians.append(median(mut_wt_conn_dist)) median_diff = max(medians)-min(medians) out_elems = [allele2gene[allele], allele, "%f" % mut_rankpt, 
"%f" % WT_dict[allele2WT[allele]]["wt_rep"], "%f" % mut_wt_conn_rankpt, allele2WT[allele], allele2cell_id[allele], "%f" % mut_wt_rep_pval, "%f" % conn_pval, "%f" % wt_mut_rep_vs_wt_mut_conn_pval, "%f" % median_diff] outline = "\t".join(out_elems) outlines.append(outline) # Calculate corrected pvalues mut_wt_rep_c_pvals = robjects.r['p.adjust'](robjects.FloatVector(mut_wt_rep_pvals), "BH") mut_wt_conn_c_pvals = robjects.r['p.adjust'](robjects.FloatVector(mut_wt_conn_pvals), "BH") mut_wt_rep_vs_wt_mut_conn_c_pvals = robjects.r['p.adjust'](robjects.FloatVector(mut_wt_rep_vs_wt_mut_conn_pvals), "BH") # Write to file num_lines = len(outlines) for i in range(num_lines): this_outline = outlines[i] this_outline += "\t%f\t" % mut_wt_rep_c_pvals[i] this_outline += "%f\t" % mut_wt_conn_c_pvals[i] this_outline += "%f\n" % mut_wt_rep_vs_wt_mut_conn_c_pvals[i] output_file_prefix.write(this_outline)
def plot_squiggle(args, filename, start_times, mean_signals): """ Use rpy2 to create a squiggle plot of the read """ r = robjects.r r.library("ggplot2") grdevices = importr('grDevices') # set t_0 as the first measured time for the read. t_0 = start_times[0] total_time = start_times[-1] - start_times[0] # adjust times to be relative to t_0 r_start_times = robjects.FloatVector([t - t_0 for t in start_times]) r_mean_signals = robjects.FloatVector(mean_signals) # infer the appropriate number of events given the number of facets num_events = len(r_mean_signals) events_per_facet = (num_events / args.num_facets) + 1 # dummy variable to control faceting facet_category = robjects.FloatVector([(i / events_per_facet) + 1 for i in range(len(start_times))]) # make a data frame of the start times and mean signals d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category} df = robjects.DataFrame(d) gp = ggplot2.ggplot(df) if not args.theme_bw: pp = gp + ggplot2.aes_string(x='start', y='mean') \ + ggplot2.geom_step(size=0.25) \ + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \ + ggplot2.scale_x_continuous('Time (seconds)') \ + ggplot2.scale_y_continuous('Mean signal (picoamps)') \ + ggplot2.ggtitle('Squiggle plot for read: ' + filename + "\nTotal time (sec): " + str(total_time)) \ + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)}) else: pp = gp + ggplot2.aes_string(x='start', y='mean') \ + ggplot2.geom_step(size=0.25) \ + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \ + ggplot2.scale_x_continuous('Time (seconds)') \ + ggplot2.scale_y_continuous('Mean signal (picoamps)') \ + ggplot2.ggtitle('Squiggle plot for read: ' + filename + "\nTotal time (sec): " + str(total_time)) \ + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)}) \ + ggplot2.theme_bw() if args.saveas is not None: plot_file = os.path.basename(filename) + "." + args.saveas if os.path.isfile(plot_file): raise Exception( 'Cannot create plot for %s: plot file %s already exists' % (filename, plot_file)) if args.saveas == "pdf": grdevices.pdf(plot_file, width=8.5, height=11) elif args.saveas == "png": grdevices.png(plot_file, width=8.5, height=11, units="in", res=300) pp.plot() grdevices.dev_off() else: pp.plot() # keep the plot open until user hits enter print('Type enter to exit.') raw_input()
def getKruskal(wt_rankpt_dist, mut_rankpt_dist, mut_wt_conn_dist):
    # Kruskal-Wallis test across the three distributions; index 2 of the
    # returned htest object is its 'p.value' component.
    return robjects.r["kruskal.test"](robjects.ListVector(
        {'a': robjects.FloatVector(wt_rankpt_dist),
         'b': robjects.FloatVector(mut_rankpt_dist),
         'c': robjects.FloatVector(mut_wt_conn_dist)}))[2][0]
def RWilcox(x1, x2):
    # Paired Wilcoxon signed-rank test in R; index 2 of the returned htest
    # object is its 'p.value' component.
    x1, x2 = list(x1), list(x2)
    a, b = rob.FloatVector(x1), rob.FloatVector(x2)
    return rob.r["wilcox.test"](a, b, paired=True)[2][0]
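# Hedged usage sketch: paired Wilcoxon signed-rank test on two toy measurement
# series (no tied differences, so R can use the exact test). The name-based lookup
# res.rx2("p.value") would also work here in place of the positional [2][0].
before = [4.1, 5.0, 6.2, 5.8, 7.4]
after = [4.6, 5.7, 6.1, 6.4, 8.3]
print(RWilcox(before, after))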
@classmethod
def numpy2ri_close(cls):
    """
    Turn off automatic conversion between R objects and numpy objects.
    :return: None
    """
    numpy2ri.deactivate()


if __name__ == '__main__':
    r_env = REnv()
    print(r_env[robjects.StrVector('abc')])
    print(type(r_env[robjects.StrVector('abc')]))
    print(isinstance(robjects.StrVector('abc'), DataFrame))
    print(r_env[importr('base').pi])

    v = robjects.FloatVector([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])
    m = robjects.r['matrix'](v, nrow=2)
    print(type(m))
    print(np.array(m))
    print(r_env[m])

    d = {
        'a': robjects.IntVector((1, 2, 3)),
        'b': robjects.IntVector((4, 5, 6))
    }
    dataf = robjects.DataFrame(d)
    print(isinstance(dataf, Vector))
    print(r_env[dataf])
    '''
    print('='*80)
def main(argv=None): try: usage = "camelPeaks.py [OPTIONS]" desc = """A ChIP-seq peak deconvolution algorithm.""" parser = optparse.OptionParser(usage=usage, description=desc) for opt in opts: parser.add_option(opt[0], opt[1], help=opt[2], **opt[3]) (opt, args) = parser.parse_args() if not (opt.peaks and os.path.exists(opt.peaks)): parser.print_help() raise Usage("Specify a valid peaks file with -p.") if not (opt.forward and os.path.exists(opt.forward)): parser.print_help() raise Usage("Specify a valid forward strand density file with -f.") if not (opt.reverse and os.path.exists(opt.reverse)): parser.print_help() raise Usage("Specify a valid reverse strand density file with -r.") #### if opt.chromosome and opt.length: chrmeta = {opt.chromosome: {'length': opt.length}} else: chrmeta = opt.genome peak_track = track(opt.peaks, chrmeta=chrmeta) chrmeta = peak_track.chrmeta if opt.chromosome: chrmeta = {opt.chromosome: chrmeta[opt.chromosome]} track_info = { 'datatype': peak_track.info.get('datatype', 'qualitative') } outbed = track(opt.output + "_peaks.bed", chrmeta=chrmeta, fields=["chr", "start", "end", "name", "score"]) outwig = track(opt.output + "_deconv.bedgraph", chrmeta=chrmeta) outwig.open(mode='overwrite') topts = {'chrmeta': chrmeta, 'readonly': True} for chrom, cv in chrmeta.iteritems(): peak_stream = sorted_stream(peak_track.read(selection=chrom), [chrom]) strands = { track(opt.forward, **topts).read(chrom, fields=[ 'start', 'end', 'score' ]): 'plus', track(opt.reverse, **topts).read(chrom, fields=[ 'start', 'end', 'score' ]): 'minus' } robjects.r('options(stringsAsFactors=F)') robjects.r('counts=data.frame()') for row_count, peak in enumerate(peak_stream): start = int(peak[peak_stream.fields.index('start')]) end = int(peak[peak_stream.fields.index('end')]) if end - start > opt.sizecutoff: continue if start < 0: start = 0 if not (end <= cv['length']): end = cv['length'] if 'name' in peak_stream.fields: reg_name = peak[peak_stream.fields.index('name')] else: reg_name = str(row_count + 1) data_block = robjects.DataFrame({ 'pos': robjects.IntVector(range(start + 1, end + 1)), 'plus': robjects.FloatVector([0] * (end - start)), 'minus': robjects.FloatVector([0] * (end - start)), 'name': robjects.StrVector([reg_name] * (end - start)) }) for stream, strnd in strands.iteritems(): for row in stream: if row[0] < start: continue if row[1] > end: break data_block.rx2(strnd)[(row[0]-start):(row[1]-start)] = \ robjects.FloatVector([row[2]]*(row[1]-row[0])) robjects.r.assign('newblock', data_block) robjects.r('counts=rbind(counts,newblock)') robjects.r('read.length=%i' % opt.extension) robjects.r('chr.name="%s"' % chrom) robjects.r('pdf.file="%s.pdf"' % opt.output) robjects.r('mu=%i' % opt.mu) robjects.r('ktype="%s"' % opt.kernel) robjects.r('source("%s")' % os.path.join(opt.script, "deconv_fcts.R")) robjects.r(""" counts = split(counts[,c("pos","plus","minus")],counts$name) pdf(file=pdf.file,title='chip-seq',paper='a4',width=8,height=11) par(cex=1.5,lwd=1.5) ccf = cross.correlate(counts,threshold=.5) plot(ccf$lag,ccf$acf,t='l',ylim=c(0,1), xlab='Lag',ylab='Cross-correlation', main=paste('Strand cross-correlation',chr.name)) cut.ccf = ccf$acf cut.ccf[which(ccf$lag<mu)] = 0 lambda = ccf$lag[which.max(cut.ccf)] sol = inverse.solve(counts,mu=mu,lambda=lambda,len=read.length,regul=1e-3,optimize=TRUE,ktype=ktype) col = 'red' lab = paste('lambda=',sol$par$lambda,sep='') abline(v=sol$par$lambda,col=col) text(sol$par$lambda,0,lab,col=col,pos=4) col = 'blue' lab = paste('mu=',sol$par$mu,sep='') 
abline(v=sol$par$mu,col=col) text(sol$par$mu,0.3,lab,col=col,pos=4) col = 'darkgreen' lab = paste('l=',read.length,sep='') abline(v=read.length,col=col) text(read.length,0.6,lab,col=col,pos=4) par(mfrow=c(4,2)) for (n in names(counts)) { if (sol$sol[[n]]$value>.65) next plot.sol(counts[[n]],sol$sol[[n]],sol$par) title(sub=chr.name) } dev.off() bed = data.frame() cutoff = 1e-3 for (n in names(counts)) { I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob)) if (length(I)<2) next interval = range(counts[[n]]$pos[I]) score = sum(sol$sol[[n]]$prob[I]) name = paste('ID=',n,';FERR=',round(sol$sol[[n]]$val,digits=4),sep='') bed = rbind(bed,data.frame( start=interval[1],end=interval[2], name=name,score=score)) } bed[,'start'] = as.integer(bed[,'start']-1) wig = data.frame() for (n in names(counts)) { I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob)) wig = rbind(wig,data.frame( pos = as.integer(counts[[n]]$pos[I]), score = as.numeric(sol$sol[[n]]$prob[I]))) } """) nrow = robjects.r("nrow(bed)")[0] outbed.write(((robjects.r("bed").rx2('start')[ri], robjects.r("bed").rx2('end')[ri], robjects.r("bed").rx2('name')[ri], robjects.r("bed").rx2('score')[ri]) for ri in xrange(nrow)), fields=["start", "end", "name", "score"], chrom=chrom, mode='append') nrow = robjects.r("nrow(wig)")[0] outwig.write(((robjects.r("wig").rx2('pos')[ri] - 1, robjects.r("wig").rx2('pos')[ri], robjects.r("wig").rx2('score')[ri]) for ri in xrange(nrow)), fields=["start", "end", "score"], chrom=chrom, mode='append') outwig.close() print "************OUTPUT FILES**********" print "\n".join([ opt.output + ".pdf", opt.output + "_peaks.bed", opt.output + "_deconv.bedgraph" ]) print "************PARAMETERS**********" print "lambda=%f|mu=%f|len=%i" % (robjects.r("sol$par$lambda")[0], robjects.r("sol$par$mu")[0], robjects.r("read.length")[0]) sys.exit(0) except Usage, err: print >> sys.stderr, err.msg print >> sys.stderr, usage sys.exit(2)
def FloatV(self, L): return robjects.FloatVector(L)
def array_to_rmatrix(self, X):
    # R matrices are filled column-major, so flatten the transposed array
    # before handing it to R's matrix().
    nr, nc = X.shape
    xvec = robj.FloatVector(X.transpose().reshape((X.size)))
    xr = robj.r.matrix(xvec, nrow=nr, ncol=nc)
    return xr
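# Hedged sanity check for array_to_rmatrix(): R stores matrices column-major, which
# is why the method flattens the transposed array. 'converter' below is a
# hypothetical instance of whatever class owns this method.
import numpy as np
X = np.arange(6, dtype=float).reshape(2, 3)   # [[0, 1, 2], [3, 4, 5]]
xr = converter.array_to_rmatrix(X)
print(list(robj.r['dim'](xr)))                # [2, 3]
print(xr.rx(1, 3)[0] == X[0, 2])              # True: R entry (1, 3) is X[0, 2]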
def buildDMRStats(tables, method, outfile): '''build dmr summary statistics. Creates some diagnostic plots in <exportdir>/<method> directory. Tables should be labeled <tileset>_<design>_<method>. ''' dbhandle = sqlite3.connect(PARAMS["database"]) def togeneset(tablename): return re.match("([^_]+)_", tablename).groups()[0] keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = IOTools.openFile(outfile, "w") outf.write("\t".join(( "tileset", "design", "track1", "track2", "tested", "\t".join(["status_%s" % x for x in keys_status]), "significant", "up", "down", "twofold", "twofold_up", "twofold_down", )) + "\n") all_tables = set(Database.getTables(dbhandle)) outdir = os.path.join(PARAMS["exportdir"], "diff_methylation") for tablename in tables: prefix = P.snip(tablename, "_%s" % method) tileset, design = prefix.split("_") def toDict(vals, l=2): return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals]) E.info("collecting data from %s" % tablename) tested = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name""" % locals()).fetchall()) status = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name,status""" % locals()).fetchall(), 3) signif = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE significant GROUP BY treatment_name,control_name""" % locals()).fetchall()) fold2 = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE (l2fold >= 1 or l2fold <= -1) AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) fold2up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 1 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) fold2down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < -1 AND significant GROUP BY treatment_name,control_name,significant""" % locals()).fetchall()) groups = tested.keys() for treatment_name, control_name in groups: k = (treatment_name, control_name) outf.write("\t".join( map(str, (tileset, design, treatment_name, control_name, tested[k], "\t".join([ str(status[(treatment_name, control_name, x)]) for x in keys_status ]), signif[(k)], up[k], down[k], fold2[k], fold2up[k], fold2down[k]))) + "\n") ########################################### ########################################### ########################################### # plot length versus P-Value data = Database.executewait( dbhandle, '''SELECT end - start, pvalue FROM %(tablename)s WHERE significant''' % locals()).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = zip(*data) pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals( ) R.png(pngfile) 
R.smoothScatter(R.log10(ro.FloatVector(data[0])), R.log10(ro.FloatVector(data[1])), xlab='log10( length )', ylab='log10( pvalue )', log="x", pch=20, cex=.1) R['dev.off']() outf.close()
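# Hedged standalone sketch (illustrative, not from the pipeline): what the
# toDict helper in buildDMRStats produces -- a defaultdict keyed by
# (treatment, control) tuples, so combinations missing from a query silently
# count as 0 in the summary table. The example rows are made up.
import collections

def toDict(vals, l=2):
    return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals])

rows = [("KO", "WT", 1250), ("KO2", "WT", 980)]   # e.g. fetchall() output
tested = toDict(rows)
print(tested[("KO", "WT")])        # 1250
print(tested[("KO", "missing")])   # 0 (defaultdict fallback)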
def generateDemandPlanning(input_url, PPOSQuantity=1000, PlannedWeek=1, PPOSToBeDisaggregated='PPOS1', MinPackagingSize=10, planningHorizon=10): """Generate random demand from spreadsheet at input_url. """ # id is given as an integer and minus one # ToDo we have to standardize data # PPOSToBeDisaggregated='PPOS'+str(PPOSToBeDisaggregated+'1') # Read data from the exported Excel file from RapidMiner and call the Import_Excel object of the KE tool to import this data in the tool demand_data = urllib.urlopen(input_url).read() workbook = xlrd.open_workbook(file_contents=demand_data) worksheets = workbook.sheet_names() worksheet_RapidMiner = worksheets[0] A = Import_Excel() Turnovers = A.Input_data( worksheet_RapidMiner, workbook) #Dictionary with the data from the Excel file #Create lists with the MAs' names and the Turnovers for the first twelve weeks of 2010 retrieving this data from the dictionary PPOS = Turnovers.get('Ppos', []) SP = Turnovers.get('SP', []) MA = Turnovers.get('FP Material No PGS+', []) GlobalDemand = Turnovers.get('Global demand', []) #Call the Distributions object and fit the data from the list in Normal distribution, so as to have info on Global demand (mean and standard deviation) D = Distributions() E = HandleMissingValues() MA = E.DeleteMissingValue(MA) t = D.Normal_distrfit(GlobalDemand) avg = t.get('mean') stdev = t.get('stdev') def constrained_sum_sample_pos(n, total): """Return a randomly chosen list of n positive integers summing to total. Each such list is equally likely to occur.""" dividers = sorted(random.sample(xrange(1, total), n - 1)) return [a - b for a, b in zip(dividers + [total], [0] + dividers)] def constrained_sum_sample_nonneg(n, total): """Return a randomly chosen list of n nonnegative integers summing to total. Each such list is equally likely to occur.""" return [x - 1 for x in constrained_sum_sample_pos(n, total + n)] DemandProfile = {} #Create a dictionary week = [] # list that defines the planning horizon, i.e. 
10 weeks for i in range(int(planningHorizon)): week.append(i + 1) for i in week: Demand = int( abs(random.normalvariate(avg, stdev)) ) # Generate a random, non-negative, integer number from the Normal distribution AllocatedPercent = 0.8 - ( 0.05 * i ) # Defines a number starts with 0.8 or 80% and reduced with every iteration at 0.05 or 5% Remaining_Demand = int( (1 - AllocatedPercent) * Demand) # Defines the Remaining demand a = constrained_sum_sample_nonneg(len(MA), 100) myInt = 100 a = robjects.FloatVector(a) lista = [ x / myInt for x in a ] # Define a list with the same length as the MA list and elements float numbers with total sum equal to 1 b = constrained_sum_sample_nonneg( len(MA), Remaining_Demand ) # Define a list with the same length as the MA list and elements with total sum the Remaining demand dicta = {} for index in range(0, len(MA)): MinUnits = round(b[index] * (random.uniform(0, 0.2)), 0) TotalUnits = b[index] if TotalUnits < MinPackagingSize: TotalUnits = 0 if MinUnits < MinPackagingSize: MinUnits = 0 dicta.update( {MA[index]: [TotalUnits, MinUnits]} ) # it updates a dictionary with key the different MAs and values the remaining demand and (b[index]*lista[index]) DemandProfile.update( {i: dicta} ) #It updates a dictionary with key the number of each iteration (week) and value the dictionary dicta Table = [] i = 0 for i in range(len(MA)): Table.append([PPOS[i], SP[i], MA[i]]) i += 1 uniquePPOS = [] for ppos in PPOS: if not ppos in uniquePPOS and ppos != '': uniquePPOS.append(ppos) book = Workbook() sheet1 = book.add_sheet('Future1', cell_overwrite_ok=True) aggrTable = [] for key in DemandProfile.keys(): for elem in DemandProfile[key]: if DemandProfile[key].get(elem)[0] > 0: MAkey = elem totalUnits = DemandProfile[key].get(elem)[0] minUnits = DemandProfile[key].get(elem)[1] plannedWeek = key aggrTable.append([MAkey, totalUnits, minUnits, plannedWeek]) else: continue t = 1 aggrTable.sort(key=lambda x: x[1], reverse=False) for i in sorted(aggrTable, key=lambda x: int(x[3])): sheet1.write(0, 0, 'Order ID') sheet1.write(0, 1, 'MA ID') sheet1.write(0, 2, 'Total # Units') sheet1.write(0, 3, 'Min # Units') sheet1.write(0, 4, 'Planned Week') sheet1.write(t, 1, (i[0].replace('MA', '', 1))) sheet1.write(t, 2, i[1]) sheet1.write(t, 3, i[2]) sheet1.write(t, 4, i[3]) sheet1.write(t, 0, t) t += 1 # open json file futureDemandProfileFile = open('futureDemandProfile.json', mode='w') futureDemandProfile = {} t = 1 for i in sorted(aggrTable, key=lambda x: int(x[3])): dicta = { 'MAID': i[0], 'TotalUnits': i[1], 'MinUnits': i[2], 'PlannedWeek': i[3] } futureDemandProfile[t] = dicta futureDemandProfileString = json.dumps(futureDemandProfile, indent=5) t += 1 #write json file futureDemandProfileFile.write(futureDemandProfileString) ###==================================================================================================### sheet2 = book.add_sheet('PPOS', cell_overwrite_ok=True) dictPPOS = {} dictPPOSMA = {} for ind in uniquePPOS: indices = [i for i, j in enumerate(PPOS) if j == ind] mas = [ma for ma in MA if (MA.index(ma) in indices)] dictPPOSMA.update({ind: mas}) t = 1 for key in dictPPOSMA.keys(): for elem in dictPPOSMA[key]: if key == PPOSToBeDisaggregated: c = constrained_sum_sample_nonneg(len(dictPPOSMA[key]), PPOSQuantity) d = constrained_sum_sample_nonneg(len(dictPPOSMA[key]), 100) myInt = 100 d = robjects.FloatVector(d) listd = [x / myInt for x in d] for i in range(0, len(dictPPOSMA[key])): MinUnits = round(c[i] * (random.uniform(0, 0.2)), 0) TotalUnits = c[i] if TotalUnits < 
MinPackagingSize: TotalUnits = 0 if MinUnits < MinPackagingSize: MinUnits = 0 dictPPOS.update( {dictPPOSMA[key][i]: [TotalUnits, MinUnits]}) t = 1 for i in range(0, len(dictPPOS)): sheet2.write(0, 0, 'Order ID') sheet2.write(0, 1, 'MA ID') sheet2.write(0, 2, 'Total # Units') sheet2.write(0, 3, 'Min # Units') sheet2.write(0, 4, 'Planned Week') sheet2.write(t, 0, t) # XXX the MA id should not have MA prefix... sheet2.write(t, 1, dictPPOSMA[PPOSToBeDisaggregated][i].replace('MA', '', 1)) sheet2.write(t, 2, dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][0]) sheet2.write(t, 3, dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][1]) sheet2.write(t, 4, PlannedWeek) t += 1 # open json file PPOSProfileFile = open('PPOSProfile.json', mode='w') PPOSProfile = {} t = 1 for i in range(0, len(dictPPOS)): dictb = { 'MAID': dictPPOSMA[PPOSToBeDisaggregated][i], 'TotalUnits': dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][0], 'MinUnits': dictPPOS[dictPPOSMA[PPOSToBeDisaggregated][i]][1], 'PlannedWeek': PlannedWeek } PPOSProfile[t] = dictb PPOSProfileString = json.dumps(PPOSProfile, indent=5) t += 1 #write json file PPOSProfileFile.write(PPOSProfileString) import StringIO out = StringIO.StringIO() book.save(out) book.save('DP.xls') return out.getvalue()
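# Hedged sketch of the constrained-sum helpers used in generateDemandPlanning
# (a Python 3 rendering; the original uses xrange). The idea: n-1 distinct
# random dividers in (0, total) split the interval into n positive parts that
# always sum to total; subtracting 1 from each part gives the nonnegative case.
import random

def constrained_sum_sample_pos(n, total):
    """Randomly chosen list of n positive integers summing to total."""
    dividers = sorted(random.sample(range(1, total), n - 1))
    return [a - b for a, b in zip(dividers + [total], [0] + dividers)]

def constrained_sum_sample_nonneg(n, total):
    """Randomly chosen list of n nonnegative integers summing to total."""
    return [x - 1 for x in constrained_sum_sample_pos(n, total + n)]

parts = constrained_sum_sample_nonneg(5, 100)
print(parts, sum(parts))   # five nonnegative ints, sum == 100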
def plot1(moptions, significant_pos, curn): m_signal = [] #deque() #[] m_pos = [] #deque() #[] m_ds = [] #deque() #[] curchr = significant_pos[0][0] curstrand = significant_pos[0][1] curpos = significant_pos[0][2] if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"] == "ks"): mtitle = ( "1=%s VS\n 2=%s:\n p-value=%.1E (ks test p=%.1E) at pos %d of %s strand in %s. Rank %d " % (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][3][1], significant_pos[1][2][1], curpos + 1, curstrand, curchr, curn + 1)) else: mtitle = ( "1=%s VS\n 2=%s:\n p-value=%.1E at pos %d of %s strand in %s. Rank %d " % (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][2][1], curpos + 1, curstrand, curchr, curn + 1)) ds0 = moptions[moptions['ds2'][0]] ds1 = moptions[moptions['ds2'][1]] ds2 = [ds0, ds1] sk = (curchr, curstrand) noenough = False pv3 = {} cur_ind = moptions['sign_test'].index(significant_pos) print significant_pos, cur_ind, curn nearybysize = moptions["window"] if moptions['RegionRankbyST'] == 1: nearybysize = int(nearybysize * 2) #for mind in range(cur_ind-moptions["window"], cur_ind+moptions["window"]+1): for mind in range(cur_ind - nearybysize, cur_ind + nearybysize + 1): if pos_check(moptions['sign_test'], cur_ind, mind): #print len(moptions['sign_test']), cur_ind, mind pk = moptions['sign_test'][mind][0][2] pv = moptions['sign_test'][mind][1] pv3[(pk, ds0['base'][sk][pk])] = pv else: noenough = True if noenough: break for mds_ind in range(len(ds2)): mna = ds2[mds_ind]['base'][sk][pk] for sg in ds2[mds_ind]['norm_mean'][sk][pk]: m_ds.append("%d" % (mds_ind + 1)) if moptions["neighborPvalues"] > 0 and ( not moptions["testMethod"] == "ks"): if has_ut == 1: m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' % (pk + 1, mna, pv[0][1], pv[1][1], pv[2][1], pv[3][1])) else: m_pos.append('%d/%s\n%.1E\n%.1E' % (pk + 1, mna, pv[2][1], pv[3][1])) else: if has_ut == 1: m_pos.append( '%d/%s\n%.1E\n%.1E\n%.1E' % (pk + 1, mna, pv[0][1], pv[1][1], pv[2][1])) else: m_pos.append('%d/%s\n%.1E' % (pk + 1, mna, pv[2][1])) m_signal.append(round(sg, 3)) #for pk in range(curpos-moptions["window"], curpos+moptions["window"]+1): # pv = None; # if pk==curpos: pv = significant_pos[1] # else: # if ds1['norm_mean'].has_key(sk) and ds1['norm_mean'][sk].has_key(pk) and ds0['norm_mean'].has_key(sk) and ds0['norm_mean'][sk].has_key(pk): # pv = getUtest(ds0['norm_mean'][sk][pk], ds1['norm_mean'][sk][pk]) # if pv==None: # noenough = True; # else: # cur_comb_pv = get_fisher_comb_pvalues(moptions, significant_pos) # if not cur_comb_pv==None: # pv.append(cur_comb_pv) # pv3[(pk, ds0['base'][sk][pk])] = pv # if noenough: break; # # for mds_ind in range(len(ds2)): # mna = ds2[mds_ind]['base'][sk][pk] # for sg in ds2[mds_ind]['norm_mean'][sk][pk]: # m_ds.append("%d" % (mds_ind+1)) # if moptions["neighborPvalues"]>0: # m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0], pv[1],pv[2],pv[3])) # else: # m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0], pv[1],pv[2])) # m_signal.append(round(sg,3)) if not noenough: closesize = moptions["neighborPvalues"] * 2 if moptions['RegionRankbyST'] == 1: closesize = moptions["window"] if closesize < 1: closesize = 1 #if significant_pos[0][1]=='-' and 3072-moptions["neighborPvalues"]*3<=significant_pos[0][2]<=3072+moptions["neighborPvalues"]*3: if significant_pos[0][1] == '-' and 3072 - closesize < significant_pos[ 0][2] < 3072 + closesize: print 'Rank', curn + 1, moptions["testMethod"], moptions[ "FileID"], significant_pos[0][0], significant_pos[0][ 1], 
significant_pos[0][2] + 1, significant_pos[0][3] #poskeys = deque(); pvsp3 = [deque(), deque(), deque()] poskeys = [] pvsp3 = [[], [], [], []] #print 'pvsp3', pvsp3 pv3keys = pv3.keys() pv3keys.sort() for pv3k in pv3keys: if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"] == "ks"): print('%d/%s' % (pv3k[0] + 1, pv3k[1])), ( 'u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E) pv5=%.3E(%.3E)' % (pv3[pv3k][0][1], pv3[pv3k][0][0], pv3[pv3k][1][1], pv3[pv3k][1][0], pv3[pv3k][2][1], pv3[pv3k][2][0], pv3[pv3k][3][1], pv3[pv3k][3][0])) else: print('%d/%s' % (pv3k[0] + 1, pv3k[1])), ( 'u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E)' % (pv3[pv3k][0][1], pv3[pv3k][0][0], pv3[pv3k][1][1], pv3[pv3k][1][0], pv3[pv3k][2][1], pv3[pv3k][2][0])) poskeys.append('%d/%s' % (pv3k[0] + 1, pv3k[1])) #pvsp3[0].append(pv3[pv3k][0]) #pvsp3[1].append(pv3[pv3k][1]) #pvsp3[2].append(pv3[pv3k][2]) pvsp3[0].append(round(math.log10(pv3[pv3k][0][1]), 3)) pvsp3[1].append(round(math.log10(pv3[pv3k][1][1]), 3)) pvsp3[2].append(round(math.log10(pv3[pv3k][2][1]), 3)) if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"] == "ks"): pvsp3[3].append(round(math.log10(pv3[pv3k][3][1]), 3)) print '' stu = { "Position": robjects.StrVector(poskeys), "Pvalue": robjects.FloatVector(pvsp3[0]) } stru = robjects.DataFrame(stu) stt = { "Position": robjects.StrVector(poskeys), "Pvalue": robjects.FloatVector(pvsp3[1]) } strt = robjects.DataFrame(stt) stks = { "Position": robjects.StrVector(poskeys), "Pvalue": robjects.FloatVector(pvsp3[2]) } strks = robjects.DataFrame(stks) if moptions["neighborPvalues"] > 0 and (not moptions["testMethod"] == "ks"): stcb = { "Position": robjects.StrVector(poskeys), "Pvalue": robjects.FloatVector(pvsp3[3]) } else: stcb = { "Position": robjects.StrVector([]), "Pvalue": robjects.FloatVector(pvsp3[3]) } strcb = robjects.DataFrame(stcb) pydf = { "Signal": robjects.FloatVector(m_signal), "Position": robjects.StrVector(m_pos), "DS": robjects.FactorVector(robjects.StrVector(m_ds)) } plotDat = robjects.DataFrame(pydf) mrtitle = robjects.StrVector([mtitle]) mhasbox = robjects.IntVector([has_boxplot]) sys.stdout.flush() robjects.globalenv['Base_Most_Significant_Plot'](plotDat, stru, strt, strks, strcb, mrtitle, mhasbox) return noenough
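# Hedged standalone sketch (values made up): the DataFrame construction pattern
# used in plot1 -- a dict of equal-length R vectors becomes an R data.frame that
# can be handed to an R plotting function stored in robjects.globalenv.
import rpy2.robjects as robjects

pydf = {
    "Signal": robjects.FloatVector([0.12, -0.53, 0.88, 0.05]),
    "Position": robjects.StrVector(["10/A", "10/A", "11/C", "11/C"]),
    "DS": robjects.FactorVector(robjects.StrVector(["1", "2", "1", "2"])),
}
plotDat = robjects.DataFrame(pydf)
print(robjects.r["nrow"](plotDat))   # [1] 4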
def Quantile_Normalize(input_file, data_start): """ Take an input file, parse each line up to the data_start column and add those position elements to a list as a string. Take the elements of each line from data_start to end and add to an array, using headers to keep track of where to add each element to array. Quantile normalizes final array and returns both the list of positions and quantile normalized numpy array. Args: input_file = The input file to quantile normalize. data_start = Index of column in which actual data to be normalized starts. Returns: header = Header of output file. pos_list = List of positions for each line. norm_matrix = Quantile normalized matrix of data. """ #Open input file with open(input_file) as f: print("Creating data matrix, may take a few minutes.") #Get header and print to output header = f.readline().strip() #Determine number of samples in file samples = header.strip().split("\t")[data_start:] #Initialize list to hold all the other lists pos_list = [] sample_list = [] #Add appropriate number of lists to master list for item in samples: sample_list.append([]) # Debug chroms = [] #Iterate through file and store each column in a list for line in f: #Used to keep track of data index later count = 0 line = line.strip().split("\t") if line[0] not in chroms: print(line[0]) chroms.append(line[0]) position = line[0:data_start] pos_list.append("\t".join(position)) data = line[data_start:] #Add data to appropriate list for entry in data: #Add pseudocount sample_list[count].append(float(entry) + 0.1) count += 1 print("Converting to R matrix.") #Actually do the QN matrix = sample_list del sample_list v = robjects.FloatVector( [element for col in matrix for element in col]) m = robjects.r['matrix'](v, ncol=len(matrix), byrow=False) print("Performing quantile normalization.") Rnormalized_matrix = preprocessCore.normalize_quantiles(m) norm_matrix = np.array(Rnormalized_matrix) return header, pos_list, norm_matrix
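# Hedged, minimal sketch of the core of Quantile_Normalize on a toy matrix,
# assuming Bioconductor's preprocessCore is installed (the original imports it
# at module level). Each inner list is one sample, i.e. one column of the R
# matrix, because the data are concatenated column-wise and byrow=False.
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

preprocessCore = importr("preprocessCore")

matrix = [[5.0, 2.0, 3.0],   # sample 1
          [4.0, 1.0, 4.0],   # sample 2
          [3.0, 4.0, 6.0]]   # sample 3
v = robjects.FloatVector([element for col in matrix for element in col])
m = robjects.r["matrix"](v, ncol=len(matrix), byrow=False)
norm = np.array(preprocessCore.normalize_quantiles(m))
print(norm.shape)   # (3, 3); columns now share the same distribution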
# Also check if anyone's missing
df_new = df_both["lh"].append(df_both["rh"])

# Redo index
df_new.index = range(df_new.shape[0])

# Sort rows by cluster, network, and statistic
print "Sort by Cluster, Network, and Stat"

import rpy2.robjects as robjects

r = robjects.r
cluster = robjects.IntVector(df_new.Cluster.tolist())
network = robjects.StrVector(df_new.YeoNetwork.tolist())
stat = robjects.FloatVector(df_new.Stat.tolist())
# r.order returns 1-based positions, so subtract 1 before indexing in pandas
o = np.array(r.order(cluster, network, stat, decreasing=True)) - 1
df2 = df_new.ix[o, :]

#####
print "Combine, Select, Mash"

# Combine the aparc, subcortical, and cerebellum
cols = [
    "Cluster", "Network", "Hemi", "Region", "BA", "x", "y", "z", "Statistic"
]
dict3 = {k: [] for k in cols}
for i, row in df2.iterrows():
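# Hedged standalone sketch of the R-order trick above, with made-up values:
# r.order sorts by the first vector, breaking ties with the later ones, and
# returns 1-based positions, hence the "- 1" before indexing back into pandas.
import numpy as np
import rpy2.robjects as robjects

r = robjects.r
cluster = robjects.IntVector([2, 1, 2, 1])
network = robjects.StrVector(["Vis", "DMN", "DMN", "Vis"])
stat = robjects.FloatVector([3.1, 2.4, 5.0, 1.2])
o = np.array(r.order(cluster, network, stat, decreasing=True)) - 1
print(o)   # 0-based row order: cluster first, then network, then stat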
def main(): xcms = importr('xcms') fn_feat = sys.argv[1] fn_mzML = fn_feat.replace("features.tsv", "mzML") shift0 = float(sys.argv[2]) #load mzML file fl = xcms.xcmsRaw(fn_mzML, profstep=0, includeMSn=False) lines = open(fn_feat, 'r').readlines() tags = {k: n for n, k in enumerate(lines[0].strip().split('\t'))} pairs = {} for c in charges: pairs[c] = [] for n, l in enumerate(lines[1:]): es = l.strip().split('\t') charge = int(es[tags['charge']]) if charge not in charges: continue rtStart = float(es[tags['rtStart']]) rtEnd = float(es[tags['rtEnd']]) mz = float(es[tags['mz']]) mzApex = float(es[tags['mostAbundantMz']]) rtApex = float(es[tags['rtApex']]) intApex = float(es[tags['intensityApex']]) intSum = float(es[tags['intensitySum']]) mz0 = mz #Apex # p -- 0 -- q mz_p = mz0 - shift0 / charge mz_q = mz0 + shift0 / charge rt_range = rob.FloatVector([rtStart * 60.0, rtEnd * 60.0]) mz_p_range = rob.FloatVector([mz_p - tol, mz_p + tol]) EIC_p = xcms.rawEIC(fl, mz_p_range, rt_range) scan_p, intens_p = EIC_p.items() mz0_range = rob.FloatVector([mz0 - tol, mz0 + tol]) EIC0 = xcms.rawEIC(fl, mz0_range, rt_range) scan0, intens0 = EIC0.items() mz_q_range = rob.FloatVector([mz_q - tol, mz_q + tol]) EIC_q = xcms.rawEIC(fl, mz_q_range, rt_range) scan_q, intens_q = EIC_q.items() corr1, r1, Np1 = check_chromatograms_corr(intens_p, intens0) #print( corr1, r1, Np1 ) #if corr1 > max_R2: if corr1 > -Np1 / A + 4.0 / A + 0.95: #print(corr1, r1) p = paired_feats() p.mz = mz_p #mz p.mzLApex = mzApex - shift0 / charge #mz_p p.mzHApex = mzApex #mz0 p.rtStart = rtStart p.rtEnd = rtEnd p.rtApex = rtApex p.intApex = intApex p.intR = r1 p.intSum = intSum p.charge = charge p.corr = corr1 p.Np = Np1 pairs[charge].append(p) #print(p.output()) corr2, r2, Np2 = check_chromatograms_corr(intens0, intens_q) #print( corr2, r2, Np2 ) #if corr2 > max_R2: if corr2 > -Np2 / A + 4.0 / A + 0.95: #print(corr2, r2) p = paired_feats() p.mz = mz0 #mz p.mzLApex = mzApex #mz0 p.mzHApex = mzApex + shift0 / charge #mz_q p.rtStart = rtStart p.rtEnd = rtEnd p.rtApex = rtApex p.intApex = intApex p.intR = r2 p.intSum = intSum p.charge = charge p.corr = corr2 p.Np = Np2 pairs[charge].append(p) #print(p.output()) #merge all pair for c in charges: #print("charge:", c, "Npair:", len(pairs[c])) closed_pairs = [] sorted_pairs = sorted(pairs[c], key=lambda x: x.mz) #LApex nlast = len(sorted_pairs) if nlast == 0: continue p0 = 0 p1 = 1 closed_pairs.append(sorted_pairs[p0]) while p1 < nlast: mz0 = sorted_pairs[p0].mz #LApex mz1 = sorted_pairs[p1].mz #LApex mz_tol = tol if mz1 - mz0 > mz_tol: merged_pairs = merge_pairs(closed_pairs) for p in merged_pairs: print(p.output()) closed_pairs = [] p0 = p1 p1 = p0 + 1 closed_pairs.append(sorted_pairs[p0]) else: closed_pairs.append(sorted_pairs[p1]) p1 = p1 + 1 merged_pairs = merge_pairs(closed_pairs) for p in merged_pairs: print(p.output())
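# Hedged sketch (illustrative only): extracting a single EIC with xcms via
# rpy2, as done in main() for the unshifted and shifted m/z windows. The file
# name, m/z value and tolerance below are placeholders; in the original, `tol`,
# `charges`, and `A` are module-level constants not shown here.
import rpy2.robjects as rob
from rpy2.robjects.packages import importr

xcms = importr("xcms")
fl = xcms.xcmsRaw("example.mzML", profstep=0, includeMSn=False)  # placeholder file

mz, tol = 523.284, 0.01
rt_range = rob.FloatVector([60.0 * 10.0, 60.0 * 12.0])   # retention time in seconds
mz_range = rob.FloatVector([mz - tol, mz + tol])
eic = xcms.rawEIC(fl, mz_range, rt_range)
# eic is an R list with 'scan' and 'intensity' components, unpacked with
# .items() in the function above before the chromatogram correlation check.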
def make_isa_data(nrows=300, ncols=50, nclusts=3, nclustrows=None,
                  nclustcols=None, noise=0, bicluster_signals=None,
                  bicluster_noise=None, noverlap_rows=0, noverlap_cols=None,
                  shuffle=None):
    """Make ISA-style data.

    Generates a dataset using the Bioconductor 'isa2' package's isa.in.silico
    function. If an argument is None, it is not included, and isa2's defaults
    are used. Requires that 'isa2' be installed.

    Args:
        * nrows: Number of rows in the data matrix.
        * ncols: Number of columns in the data matrix.
        * nclusts: Number of biclusters.
        * nclustrows: Rows in each bicluster. Defaults to
            round(0.5 * num_rows/num_fact).
        * nclustcols: Cols in each bicluster. Defaults to
            round(0.5 * num_cols/num_fact).
        * noise: Standard deviation of normal noise in the background.
        * bicluster_signals: List of base signals for each bicluster.
            Defaults to 1's.
        * bicluster_noise: List of noise standard deviations for each
            bicluster. Defaults to 0's.
        * noverlap_rows: Number of bicluster rows that overlap.
        * noverlap_cols: Number of bicluster columns that overlap.
            Defaults to noverlap_rows.
        * shuffle: If True, shuffle rows and columns.
    """
    args = locals()
    isa_map = dict(
        nrows='num_rows',
        ncols='num_cols',
        nclusts='num_fact',
        nclustrows='mod_row_size',
        nclustcols='mod_col_size',
        noise='noise',
        bicluster_signals='mod_signal',
        bicluster_noise='mod_noise',
        noverlap_rows='overlap_row',
        noverlap_cols='overlap_col',
    )
    isa_args = dict()
    for key, argkey in isa_map.iteritems():
        isa_args[argkey] = args[key]

    # remove empty keys so isa2's own defaults apply
    empty_keys = []
    for key in isa_args:
        if isa_args[key] is None:
            empty_keys.append(key)
    for key in empty_keys:
        isa_args.pop(key)

    for key in ['mod_signal', 'mod_noise']:
        if key in isa_args:
            isa_args[key] = robjects.FloatVector(list(isa_args[key]))

    robjects.r.library('isa2')

    # get data
    func = robjects.r['isa.in.silico']
    result = func(**isa_args)

    # convert to python
    data = numpy.array(robjects.Matrix(result[0])).copy()
    rows = numpy.array(robjects.Matrix(result[1])).copy()
    cols = numpy.array(robjects.Matrix(result[2])).copy()

    nbiclusters = rows.shape[1]
    row_list = []
    for i in range(nbiclusters):
        row = list(rows[:, i].nonzero()[0])
        row_list.append(row)
    col_list = []
    for i in range(nbiclusters):
        col = list(cols[:, i].nonzero()[0])
        col_list.append(col)

    expected = []
    for r, c in zip(row_list, col_list):
        expected.append(Bicluster(r, c, data))

    if shuffle:
        data, expected = _shuffle_(data, expected)

    return data, expected
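# Hedged usage sketch for make_isa_data (assumes the isa2 Bioconductor package
# is installed and that Bicluster and _shuffle_ from this module are available).
data, expected = make_isa_data(nrows=200, ncols=40, nclusts=2,
                               noise=0.1, shuffle=True)
print(data.shape)     # (200, 40) data matrix with 2 planted biclusters
print(len(expected))  # 2 expected Bicluster objects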
def geweke(data):
    """Geweke convergence diagnostic via the R 'coda' package.

    Returns the z-score component of geweke.diag applied to the chain.
    """
    robjects.r('library(coda)')
    r_geweke = robjects.r['geweke.diag']
    # coerce the trace into an (n x 1) R matrix, treated as a single-chain sample
    data = robjects.r.matrix(robjects.FloatVector(data), nrow=len(data))
    return r_geweke(data)[0]
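# Hedged usage sketch: Geweke diagnostic on a toy MCMC trace, assuming the coda
# R package is installed. The [0] in the function above picks out the z-score
# component of geweke.diag's return value; |z| > 2 is the usual warning sign.
import numpy as np

chain = np.random.normal(size=1000)   # stand-in for a sampled parameter trace
z = geweke(list(chain))
print(z)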