def _replot(self):
    """Reset the R graphics device and redraw the region plot.

    Closes the current device if one is open, sets the background colour,
    splits the drawing surface into a top screen and two bottom screens,
    selects screen 1 and delegates the drawing to ``self._plot_regions()``.
    """
    # Close any device that is still open so the layout starts from scratch.
    if self._grDevices.dev_list() != r("NULL"):
        self._grDevices.dev_off()
    # The original passed a bare, undefined ``bg`` here (NameError). Use the
    # background stored on the instance when available; "white" mirrors the
    # constructor's default.
    self._graphics.par(bg=getattr(self, "_bg", "white"))
    # Two rows, then split the second row into two columns.
    self._graphics.split_screen(r.c(2, 1))
    self._graphics.split_screen(r.c(1, 2), screen=2)
    self._graphics.screen(1)
    self._plot_regions()
def convert_hgnc2ensembl(hgnc_id):
    """Look up the Ensembl gene id for an HGNC symbol through biomaRt.

    :param hgnc_id: HGNC symbol used as the ``hgnc_symbol`` filter value.
    :return: first ``ensembl_gene_id`` of the biomaRt result, or ``None``
        (after printing a message) when the result cannot be read.
    """
    init_biomaRt()
    v = R.c(hgnc_id)
    res = R.getBM(attributes=R.c("ensembl_gene_id"), filters="hgnc_symbol",
                  values=v, mart=__mart)
    try:
        return R.get("ensembl_gene_id", res)[0]
    except Exception:
        # Best-effort lookup: report and return None, but no longer swallow
        # KeyboardInterrupt/SystemExit as the bare ``except`` did.
        print('Error convert_hgnc2ensembl: ' + str(hgnc_id) + ' not found in database')
        return None
def convert_list_ensembl2hgnc(ensembl_id_list):
    """Look up HGNC symbols for a list of Ensembl gene ids through biomaRt.

    :param ensembl_id_list: Ensembl gene ids used as the ``ensembl_gene_id``
        filter values.
    :return: the ``hgnc_symbol`` column of the biomaRt result, or ``None``
        (after printing a message) when the result cannot be read.
    """
    init_biomaRt()
    v = R.c(ensembl_id_list)
    res = R.getBM(attributes=R.c("hgnc_symbol"), filters="ensembl_gene_id",
                  values=v, mart=__mart)
    try:
        return R.get("hgnc_symbol", res)
    except Exception:
        # The original message referenced an undefined ``ensembl_id``, so the
        # handler itself raised NameError instead of returning None.
        print('Error convert_ensembl2hgnc: ' + str(ensembl_id_list) + ' not found in database')
        return None
def _roc_curve_r(observations, predictions, FDRth=0.05):
    """Build a ROC curve with R's pROC and locate the threshold for a given FDR.

    :param observations: known truth set, passed to ``pROC.roc`` as first argument
    :param predictions: all data, passed to ``pROC.roc`` as second argument
    :param FDRth: FDR level; rows with ``ppv >= 1 - FDRth`` are candidates
    :return: tuple ``(df, auc, SENS, FDR5percTh)`` where ``df`` holds the
        threshold/ppv/sensitivity/specificity coordinates, ``auc`` is the area
        under the curve, ``FDR5percTh`` is the largest candidate threshold
        (NaN when there is none) and ``SENS`` is the sensitivity at the first
        row whose threshold does not exceed it (row 0 when it is NaN).
    """
    obs_rtbl = numpy2ri.py2ri(observations)
    prd_rtbl = numpy2ri.py2ri(predictions)

    roc_prm = {'direction': '>'}
    RES = pROC.roc(obs_rtbl, prd_rtbl, **roc_prm)
    auc = pandas2ri.ri2py(RES.rx2('auc'))[0]

    # One list drives both the R request and the DataFrame column labels.
    columns = ['threshold', 'ppv', 'sensitivity', 'specificity']
    coor_prm = {'ret': r.c(*columns)}
    COORS = pROC.coords(RES, 'all', **coor_prm)
    cords = numpy2ri.ri2py(COORS)
    df = pd.DataFrame(cords.T, columns=columns)

    FDR5percTh = (df[df.ppv >= (1 - FDRth)])['threshold'].max()
    if not np.isnan(FDR5percTh):
        index_min = min(df[df.threshold <= FDR5percTh].index.tolist())
    else:
        # No coordinate reaches the requested precision: fall back to row 0.
        index_min = 0

    SENS = df.at[index_min, 'sensitivity']
    return df, auc, SENS, FDR5percTh
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Run DESeq2 on SCRAN-normalised counts for the given comparisons.

    :param counts: counts data frame; its columns are counted as cells
    :param conds: one condition label per column of ``counts``
    :param comparisons: iterable of ``(A, B)`` condition pairs to contrast
    :param alpha: significance level forwarded to DESeq2 ``results``
    :param scran_clusters: when True, size factors are computed per
        ``quickCluster`` cluster instead of over all cells
    :return: list with one result data frame per comparison, in input order
    """
    results = list()
    n_cells = len(counts.columns)
    pandas2ri.activate()
    try:
        # Loaded so the DESeq2 functions are reachable through ``r``.
        RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
        as_matrix = r["as.matrix"]

        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))

        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells / 10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list({round((min_cluster_size / 2) / i) for i in [5, 4, 3, 2, 1]})
            sce = scran.computeSumFactors(sce, clusters=r_clusters, sizes=sizes, positive=True)
        else:
            sizes = list({round((n_cells / 2) * i) for i in [0.1, 0.2, 0.3, 0.4, 0.5]})
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)

        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)

        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global conversion, also when R raises; the original
        # only deactivated on success.
        pandas2ri.deactivate()
    return results
def _prepare_r_instance(self, data: pd.DataFrame):
    """Convert ``data`` into an R ``sts`` object of the surveillance package.

    If the index carries no frequency, one is inferred and written back onto
    ``data.index``; a ``ValueError`` is raised when none can be inferred.
    The ``n_cases`` column supplies the observed counts.
    """
    if data.index.freq is None:
        inferred = pd.infer_freq(data.index)
        if inferred is None:
            raise ValueError(
                f"The time series index has no valid frequency. Index={data.index}"
            )
        data.index.freq = inferred

    first_year = data.index[0].year
    print("input", r.c(first_year, _get_start_epoch(data)))

    start_vec = r.c(first_year, _get_start_epoch(data))

    # Each date is pushed through R's as.Date/as.numeric to obtain the epoch.
    as_date = r["as.Date"]
    as_numeric = r["as.numeric"]
    epoch_vec = robjects.IntVector(
        [as_numeric(as_date(day.isoformat()))[0] for day in data.index.date]
    )

    return surveillance.sts(
        start=start_vec,
        epoch=epoch_vec,
        freq=_get_freq(data),
        observed=data["n_cases"].values,
        epochAsDate=True,
    )
def __init__(self, polyfile=None, findfile=None, bg="white"):
    """Load the R graphics packages, start R event processing and draw.

    :param polyfile: stored as ``self._polyfile``
    :param findfile: stored as ``self._findfile``
    :param bg: background colour; stored as ``self._bg`` (it was previously
        accepted but never kept)
    """
    self._graphics = importr("graphics")
    self._grDevices = importr("grDevices")
    rpy2.interactive.process_revents.start()
    self._NA = r("NA")[0]
    # Helper turning a Python sequence into an R vector via c(...).
    self._C = lambda seq: r.c(*seq)
    self._polyfile = polyfile
    self._findfile = findfile
    # Must be set before the first redraw so the plot can pick it up.
    self._bg = bg
    self._replot()
def _call_surveillance_algo(self, disprog_obj, detection_range):
    """Call ``surveillance.algo_farrington`` on ``disprog_obj``.

    The control list is assembled from this instance's settings and the
    given ``detection_range``; the R result is returned unchanged.
    """
    settings = {
        "range": detection_range,
        "b": self.years_back,
        "w": self.window_half_width,
        "reweight": self.reweight,
        "alpha": self.alpha,
        "trend": self.trend,
        "limit54": r.c(self.min_cases_in_past_periods, self.past_period_cutoff),
        "powertrans": self.power_transform,
    }
    return surveillance.algo_farrington(disprog_obj, control=r.list(**settings))
def plot_me(sub_f, label):
    """Plot the survival fit for one feature subset and title the panel.

    Reads ``surv``, ``fmla``, ``colors``, ``ann``, ``q`` and ``std`` from the
    surrounding scope.

    :param sub_f: feature values to plot; binned with ``to_quants`` when
        ``get_vec_type`` reports 'real' and there are more than 10 unique values
    :param label: panel title
    """
    if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
        sub_f = to_quants(sub_f, q=q, std=std)

    m = get_cox_ph(surv, sub_f, formula=fmla)
    # Third element of the model's stored call is reused as data for survfit.
    r_data = m.rx2('call')[2]
    p = log_rank(sub_f, surv)['p']
    ls = r.c(*colors)

    r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
           xlab='Years to Event', ylab='Survival')
    r.title(label, cex=3.)

    if ann == 'p':
        r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
    elif ann is not None:
        r.text(0, labels=ann, pos=4)
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Run DESeq2 on a counts matrix for the given conditions and comparisons.

    :param counts: counts data frame handed to ``DESeqDataSetFromMatrix``
    :param conds: one condition label per column of ``counts``
    :param comparisons: iterable of ``(A, B)`` condition pairs to contrast
    :param alpha: significance level forwarded to DESeq2 ``results``
    :param size_factors: optional precomputed size factors; when given they
        are assigned and dispersion estimation plus the Wald test are run
        explicitly instead of calling ``DESeq``
    :return: list with one result data frame per comparison, in input order
    """
    results = list()
    pandas2ri.activate()
    try:
        # Loaded so the DESeq2 functions are reachable through ``r``.
        RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(
            multicore.MulticoreParam(multiprocessing.cpu_count() - 1))

        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)

        if size_factors is None:
            dds = r.DESeq(dds, parallel=True)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)

        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global conversion, also when R raises; the original
        # only deactivated on success.
        pandas2ri.deactivate()
    return results
def _call_surveillance_algo(self, sts, detection_range):
    """Call ``surveillance.glrpois`` on ``sts`` and return the R result.

    The control list combines ``detection_range`` with this instance's
    settings; several names contain dots, so it is built as a mapping.
    """
    control_args = {}
    control_args["range"] = detection_range
    control_args["c.ARL"] = self.glr_test_threshold
    control_args["m0"] = robjects.NULL
    # Mtilde is fixed to 1, the only valid value for "epi" and "intercept".
    control_args["Mtilde"] = 1
    control_args["M"] = self.m
    control_args["change"] = self.change
    # Role of theta: if NULL the GLR scheme is used; otherwise the prespecified
    # value for κ or λ is used in a recursive LR scheme, which is faster.
    control_args["theta"] = robjects.NULL
    control_args["dir"] = r.c(*self.direction)
    control_args["ret"] = self.upperbound_statistic

    return surveillance.glrpois(sts, control=r.list(**control_args))
def __add_GO_info(dictionary):
    """Annotate each GO id in ``dictionary`` with ontology, term and definition.

    The keys are used as GO ids in a query against the ``go_term`` table of
    the connection returned by ``R.GO_dbconn()``. For every returned row the
    entry ``dictionary[go_id]`` gains the keys 'ontology', 'term' and
    'definition'. The same (mutated) dictionary is returned.
    """
    go_ids = R.c(dictionary.keys())
    # Wrap every id in single quotes and join them for the SQL IN (...) list.
    quoted_ids = R.paste("'", go_ids, "'", sep="")
    qTerms = R.paste(quoted_ids, collapse=",")

    connection = R.GO_dbconn()
    query = R.paste("SELECT ontology, go_id, term, definition FROM go_term WHERE go_id IN (", qTerms, ");", sep="")
    rows = R.dbGetQuery(connection, query)

    for row in rows.iter_row():
        go_id = row.rx2('go_id')[0]
        ontology = row.rx2('ontology')[0]
        term = row.rx2('term')[0]
        definition = row.rx2('definition')[0]
        entry = dictionary[go_id]
        entry['ontology'] = ontology
        entry['term'] = term
        entry['definition'] = definition

    return dictionary
def _call_surveillance_algo(self, sts, detection_range):
    """Call ``surveillance.farringtonFlexible`` on ``sts`` and return the result.

    The control list is assembled from ``detection_range`` and this
    instance's settings.
    """
    settings = {
        "range": detection_range,
        "b": self.years_back,
        "w": self.window_half_width,
        "reweight": self.reweight,
        "weightsThreshold": self.weights_threshold,
        "alpha": self.alpha,
        "trend": self.trend,
        "trend_threshold": self.trend_threshold,
        "limit54": r.c(self.min_cases_in_past_periods, self.past_period_cutoff),
        "powertrans": self.power_transform,
        "pastWeeksNotIncluded": self.past_weeks_not_included,
        "thresholdMethod": self.threshold_method,
    }
    return surveillance.farringtonFlexible(sts, control=r.list(**settings))
def _call_surveillance_algo(self, sts, detection_range):
    """Call ``surveillance.glrnb`` on ``sts`` and return the R result.

    The control list combines ``detection_range`` with this instance's
    settings; several names contain dots, so it is built as a mapping.
    """
    control_args = {}
    control_args["range"] = detection_range
    control_args["c.ARL"] = self.glr_test_threshold
    control_args["m0"] = robjects.NULL
    control_args["alpha"] = self.alpha
    # Mtilde is fixed to 1, the only valid value for "epi" and "intercept".
    control_args["Mtilde"] = 1
    control_args["M"] = self.m
    control_args["change"] = self.change
    control_args["theta"] = robjects.NULL
    control_args["dir"] = r.c(*self.direction)
    control_args["ret"] = self.upperbound_statistic
    control_args["xMax"] = self.x_max

    return surveillance.glrnb(sts, control=r.list(**control_args))
def process(outf, dti_f, bval_f, python=False):
    """Extract one b0 volume per scan and store all of them in a single file.

    For every b-value file, the smallest b-value is set to 0 and the first
    volume at that position is taken from the matching DTI image.

    :param outf: output file name
    :param dti_f: DTI image files, index-aligned with ``bval_f``
    :param bval_f: b-value text files
    :param python: when True, pickle an ordered dict ``{basename: flat array}``
        to ``outf``; otherwise save an R list named ``bar`` (one matrix per
        scan, values shifted by +1) with R's ``save(..., compress=TRUE)``
    """
    if python:
        import collections
        b0s = collections.OrderedDict()
    else:
        # Start from an empty R list so the loop can always append and an
        # empty input no longer leaves ``myl`` undefined.
        myl = r.list()

    for idx, scan in enumerate(bval_f):
        print(scan)
        basename = os.path.basename(scan)
        print(basename)
        bval = np.loadtxt(scan)
        bval[np.where(bval == np.min(bval))] = 0
        im = nb.load(dti_f[idx])
        b0_loc = np.where(bval == np.min(bval))[0][0]
        dti = im.get_data()[:, :, :, b0_loc]
        if python:
            b0s[basename] = np.ravel(dti)
        else:
            ro = numpy2ri(np.ravel(dti + 1))
            rr = robj.Matrix(ro)
            # ``r.list(basename=rr)`` named every element literally "basename";
            # unpack a dict so the element is named after the scan file, as in
            # the pickle branch.
            myl = r.c(myl, r.list(**{basename: rr}))

    if python:
        import pickle
        # write python dict to a file
        with open(outf, 'wb') as output:
            pickle.dump(b0s, output)
    else:
        r.assign('bar', myl)
        r("save(bar, file='" + outf + "', compress=TRUE)")
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Run DESeq2 on a counts matrix for the given conditions and comparisons.

    :param counts: counts data frame handed to ``DESeqDataSetFromMatrix``
    :param conds: one condition label per column of ``counts``
    :param comparisons: iterable of ``(A, B)`` condition pairs to contrast
    :param alpha: significance level forwarded to DESeq2 ``results``
    :param size_factors: optional precomputed size factors; when given they
        are assigned and dispersion estimation plus the Wald test are run
        explicitly instead of calling ``DESeq``
    :return: list with one result data frame per comparison, in input order
    """
    results = list()
    pandas2ri.activate()
    try:
        # Loaded so the DESeq2 functions are reachable through ``r``.
        RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count() - 1))

        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)

        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True,
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)

        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B),
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global conversion, also when R raises; the original
        # only deactivated on success.
        pandas2ri.deactivate()
    return results
def plot_segments(cbs_fc, cbs_normfc, outdir='./'):
    """Plot raw and normalised segmentations per chromosome into one PDF.

    :param cbs_fc: raw fold changes; mapping of chromosome name to a 3-tuple
        whose last element is the segmentation object to plot
    :param cbs_normfc: normalised fold changes, same layout and keys
    :param outdir: directory receiving
        ``09_Raw_vs_postCRISPRcleanR_segmentation_fold_changes.pdf``
    :return: None
    """
    pdf_prm = {'file': "{}/09_Raw_vs_postCRISPRcleanR_segmentation_fold_changes.pdf".format(outdir),
               'width': 7.5,
               'height': 7.5}
    grdevices.pdf(**pdf_prm)
    try:
        # Two stacked panels per page: raw on top, normalised below.
        r.par(mfrow=r.c(2, 1))
        for chr_name, (_, _, cnseg_raw) in cbs_fc.items():
            (_, _, cnseg_norm) = cbs_normfc[chr_name]

            plot_prm = {'main': "raw_FCs_chr{}".format(chr_name),
                        'xlab': 'sgRNA_Index',
                        'ylab': 'FCs'}
            dnacopy.plotSample(cnseg_raw, **plot_prm)

            # plot normalised fold changes
            plot_prm = {'main': "CRISPRcleanR_FCs_chr{}".format(chr_name),
                        'xlab': 'sgRNA_Index',
                        'ylab': 'FCs'}
            dnacopy.plotSample(cnseg_norm, **plot_prm)
    finally:
        # Close the PDF device even when plotting fails, so the file is
        # finalised and no device is left open.
        grdevices.dev_off()
def plot_me(sub_f, label):
    """Plot the survival fit for one feature subset and title the panel.

    Reads ``surv``, ``fmla``, ``colors``, ``ann``, ``q`` and ``std`` from the
    surrounding scope.

    :param sub_f: feature values to plot; binned with ``to_quants`` when
        ``get_vec_type`` reports 'real' and there are more than 10 unique values
    :param label: panel title
    """
    if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
        sub_f = to_quants(sub_f, q=q, std=std)

    m = get_cox_ph(surv, sub_f, formula=fmla)
    # Third element of the model's stored call is reused as data for survfit.
    r_data = m.rx2('call')[2]
    p = log_rank(sub_f, surv)['p']
    ls = r.c(*colors)

    r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
           xlab='Years to Event', ylab='Survival')
    r.title(label, cex=3.)

    if ann == 'p':
        r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
    elif ann is not None:
        r.text(0, labels=ann, pos=4)
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png', show=False,
                         title=True, labels=None, colors=['blue', 'red'], ann=None,
                         show_legend=True, q=.25, std=None):
    """Draw survival curves for ``feature`` into a PNG file through R.

    One panel is drawn per unique value of ``assignment`` (a single panel
    when ``assignment`` is None).

    :param feature: values to stratify by; accessed with ``.map``,
        ``.unique``, ``.nunique``, ``.dtype``, ``.name``, ``.index`` and ``.ix``
    :param surv: survival data; indexed with ``.ix[:, 'event']`` and
        ``.ix[:, 'days']`` here and passed to ``get_cox_ph`` / ``log_rank``
    :param assignment: optional grouping used to split the figure into panels
    :param filename: PNG file handed to ``r.png``
    :param show: when truthy, return ``Show(filename)``
    :param title: not referenced in this function body
    :param labels: legend labels; replaced in some branches below
    :param colors: colour names; replaced in some branches below
    :param ann: 'p' annotates each panel with the log-rank p-value; any other
        non-None value is drawn as text
    :param show_legend: True draws a legend inside the plot, 'out' draws it
        in the right margin, other values draw none
    :param q: forwarded to ``to_quants``; also used to build the labels
    :param std: forwarded to ``to_quants``
    :return: ``Show(filename)`` when ``show`` is truthy, otherwise None
    """
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm it against the original file.
    if assignment is None:
        num_panels = 1
        # Constant assignment: every sample falls into the single panel.
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name != None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)
    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)

    fmla = robjects.Formula('Surv(days, event) ~ feature')
    r.par(mfrow=r.c(1, num_panels))
    r.par(mar=r.c(4, 5, 4, 1))
    r.par(xpd=True)

    # Features that will be binned by plot_me get fixed colours and labels.
    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        colors = ['blue', 'orange', 'red']
        if q == .5:
            labels = ['Bottom 50%', 'Top 50%']
        else:
            labels = [
                'Bottom {}%'.format(int(q * 100)), 'Normal',
                'Top {}%'.format(int(q * 100))
            ]

    # Colour vector used by the legend calls below.
    ls = r.c(*colors)

    def plot_me(sub_f, label):
        # Draw one panel: bin the feature if needed, plot the fit, add title
        # and optional annotation.
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)
        m = get_cox_ph(surv, sub_f, formula=fmla)
        r_data = m.rx2('call')[2]
        p = log_rank(sub_f, surv)['p']
        ls = r.c(*colors)
        r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
               xlab='Years to Event', ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann != None:
            r.text(0, labels=ann, pos=4)

    if show_legend == 'out':
        # Widen the right margin to make room for an outside legend.
        r.par(xpd=True, mar=r.c(4, 5, 5, 8))
    for value in sorted(assignment.ix[feature.index].dropna().unique()):
        f = feature.ix[assignment[assignment == value].index]
        # Groups with a single feature value are skipped.
        if len(f.unique()) > 1:
            plot_me(f, name(value))

    # NOTE(review): ``value`` below is the last value of the loop above, so
    # the legend position is based on the final group only; confirm intent.
    if show_legend == True:
        mean_s = surv.ix[:, 'event'].ix[assignment[assignment == value].index].mean()
        # Legend goes lower-left when the mean of 'event' is below 0.5.
        if mean_s < .5:
            r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                     lty=1, col=ls, lwd=3, bty='o')
        else:
            r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    elif show_legend == 'out':
        r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                 lty=1, col=ls, lwd=3, bty='o')
    r('dev.off()')
    if show:
        return Show(filename)
if max([len(x) for x in headers]) > 40 / len(headers): # remove xlabel: extra_options = re.sub(", xlab='[^']+'", "", extra_options) extra_options += ", names.arg=headers, las=2" R("""op <- par(mar=c(11,4,4,2))""" ) # the 10 allows the names.arg below the barplot R("""barplot(as.matrix(matrix), beside=TRUE %s)""" % extra_options) elif method == "scatter+marginal": if options.title: # set the size of the outer margins - the title needs to be added at the end # after plots have been created R.par(oma=R.c(0, 0, 4, 0)) R("""matrix""") R(""" x <- matrix[,1]; y <- matrix[,2]; xhist <- hist(x, breaks=20, plot=FALSE); yhist <- hist(y, breaks=20, plot=FALSE); top <- max(c(xhist$counts, yhist$counts)); nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE ); par(mar=c(3,3,1,1)) ; plot(x, y, cex=%s, pch="o" %s) ; par(mar=c(0,3,1,1)) ; barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ; par(mar=c(3,0,1,1)) ; title(main='%s');
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png', show=False,
                         title=True, labels=None, colors=['blue', 'red'], ann=None,
                         show_legend=True, q=.25, std=None):
    """Draw survival curves for ``feature`` into a PNG file through R.

    One panel is drawn per unique value of ``assignment`` (a single panel
    when ``assignment`` is None).

    :param feature: values to stratify by; accessed with ``.map``,
        ``.unique``, ``.nunique``, ``.dtype``, ``.name``, ``.index`` and ``.ix``
    :param surv: survival data; indexed with ``.ix[:, 'event']`` and
        ``.ix[:, 'days']`` here and passed to ``get_cox_ph`` / ``log_rank``
    :param assignment: optional grouping used to split the figure into panels
    :param filename: PNG file handed to ``r.png``
    :param show: when truthy, return ``Show(filename)``
    :param title: not referenced in this function body
    :param labels: legend labels; replaced in some branches below
    :param colors: colour names; replaced in some branches below
    :param ann: 'p' annotates each panel with the log-rank p-value; any other
        non-None value is drawn as text
    :param show_legend: True draws a legend inside the plot, 'out' draws it
        in the right margin, other values draw none
    :param q: forwarded to ``to_quants``; also used to build the labels
    :param std: forwarded to ``to_quants``
    :return: ``Show(filename)`` when ``show`` is truthy, otherwise None
    """
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm it against the original file.
    if assignment is None:
        num_panels = 1
        # Constant assignment: every sample falls into the single panel.
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name != None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)
    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)

    fmla = robjects.Formula('Surv(days, event) ~ feature')
    r.par(mfrow=r.c(1, num_panels))
    r.par(mar=r.c(4, 5, 4, 1))
    r.par(xpd=True)

    # Features that will be binned by plot_me get fixed colours and labels.
    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        colors = ['blue', 'orange', 'red']
        if q == .5:
            labels = ['Bottom 50%', 'Top 50%']
        else:
            labels = ['Bottom {}%'.format(int(q * 100)), 'Normal', 'Top {}%'.format(int(q * 100))]

    # Colour vector used by the legend calls below.
    ls = r.c(*colors)

    def plot_me(sub_f, label):
        # Draw one panel: bin the feature if needed, plot the fit, add title
        # and optional annotation.
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)
        m = get_cox_ph(surv, sub_f, formula=fmla)
        r_data = m.rx2('call')[2]
        p = log_rank(sub_f, surv)['p']
        ls = r.c(*colors)
        r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
               xlab='Years to Event', ylab='Survival');
        r.title(label, cex=3.)
        if ann == 'p':
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann != None:
            r.text(0, labels=ann, pos=4)

    if show_legend == 'out':
        # Widen the right margin to make room for an outside legend.
        r.par(xpd=True, mar=r.c(4, 5, 5, 8))
    for value in sorted(assignment.ix[feature.index].dropna().unique()):
        f = feature.ix[assignment[assignment == value].index]
        # Groups with a single feature value are skipped.
        if len(f.unique()) > 1:
            plot_me(f, name(value))

    # NOTE(review): ``value`` below is the last value of the loop above, so
    # the legend position is based on the final group only; confirm intent.
    if show_legend == True:
        mean_s = surv.ix[:, 'event'].ix[assignment[assignment == value].index].mean()
        # Legend goes lower-left when the mean of 'event' is below 0.5.
        if mean_s < .5:
            r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                     lty=1, col=ls, lwd=3, bty='o')
        else:
            r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    elif show_legend == 'out':
        r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                 lty=1, col=ls, lwd=3, bty='o')
    r('dev.off()')
    if show:
        return Show(filename)
def main(argv=None):
    """Script entry point: read a table, compute statistics and plot it in R.

    Parses command line options, loads the table from ``--file`` or stdin
    into the R variable ``matrix``, writes the requested statistics and
    draws the requested plots, either to the ``--hardcopy`` file or to an
    interactive R device followed by a Python console.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source; confirm it against the original file.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")

    # NOTE(review): this help text uses "%Default" while all others use
    # "%default"; presumably a typo - confirm.
    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%Default].")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")

    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot",
                               "scatter+marginal", "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal", action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")

    parser.add_option("--allow-empty-file", dest="fail_on_empty", action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty", dest="fail_on_empty", action="store_true",
                      help="fail on empty input [default=%default].")

    # NOTE(review): the default below is named "diagonal" while the option
    # above stores into "add_diagonal" - confirm which name is intended.
    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    # A single positional argument is taken as the input file.
    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    # Column numbers on the command line are 1-based; convert to 0-based.
    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = IOTools.openFile(options.input_filename, "r").readlines()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    # Drop comment lines.
    lines = [x for x in lines if x[0] != "#"]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format).
        # NOTE(review): this re-parses ``lines`` from the first input; the
        # second file is never opened here - confirm.
        matrix2, headers2, colours2, legend2 = readTable(
            lines,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    # Number of rows of the matrix held in R.
    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "n",
                                            "method",
                                            "alternative")) + "\n")
            # One test per unordered pair of columns.
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % (
                            x, headers[x], y, headers[y], msg))
                        options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                                             (headers[x], headers[y],
                                              "na",
                                              "na",
                                              "na",
                                              0,
                                              "na",
                                              "na"))
                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2(
                                 'cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2(
                                 'df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            # NOTE(review): the header has 7 columns while each row writes 8
            # fields, and results are indexed with [] here but with rx2() in
            # the pearson branch - confirm.
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1))
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"),
                                             take=options.columns,
                                             headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if take:
                E.warn("removing empty columns %s before plotting" % str(take))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                # NOTE(review): ``headers`` was just filtered, and the legend
                # is rebuilt from headers rather than from ``legend`` - confirm.
                if legend:
                    legend = [headers[x] for x in take]

        # Extra arguments are spliced as text into the R plot calls below.
        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        # Open the output device matching the hardcopy file extension.
        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                # NOTE(review): R's JPEG device is named ``jpeg``; verify that
                # ``R.jpg`` resolves.
                R.jpg(options.hardcopy, width=1024, height=768, type="cairo")

        for method in options.plot:

            # Smaller plotting symbols for larger data sets.
            if ndata < 100:
                point_size = "1"
                pch = "o"
            elif ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))

            if method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))
                dat = R(
                    """dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R(
                    """new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R(
                    """matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                # Put the fitted equation above the plot.
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"], mod["coefficients"][
                        "(Intercept)"], R("""cor( dat )[2]"""), ndata),
                    3,
                    cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R(
                        """panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""")
                else:
                    R(
                        """panel.hist <- function( x,y,... ) { points(x,y,...); }""")

                # There used to be an argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                extra_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % extra_options)

            elif method == "bar" or method == "bar-stacked":
                if not options.colours:
                    extra_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                # NOTE(review): ``extra_options`` already starts with ", ", so
                # this template yields two consecutive commas - confirm.
                R("""barplot(as.matrix(matrix), %s)""" % extra_options)

            elif method == "bar-besides":
                if not options.colours:
                    extra_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                # Scatter plot in the lower-left cell with marginal histograms
                # above and to the right of it.
                R(""" x <- matrix[,1]; y <- matrix[,2]; xhist <- hist(x, breaks=20, plot=FALSE); yhist <- hist(y, breaks=20, plot=FALSE); top <- max(c(xhist$counts, yhist$counts)); nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE ); par(mar=c(3,3,1,1)) ; plot(x, y, cex=%s, pch="o" %s) ; par(mar=c(0,3,1,1)) ; barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ; par(mar=c(3,0,1,1)) ; title(main='%s'); barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ; title(main='%s'); """ % (point_size, extra_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                # Build the list of column pairs to plot against each other.
                if method == "panel":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    pairs = []
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                # print matching columns
                elif method == "matched":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                # Grid dimensions for the panel layout.
                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                PosInf = 1e300000
                NegInf = -1e300000

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))
                for a, b in pairs:
                    # NOTE(review): ``new_matrix`` is not used afterwards, and
                    # the membership test against float("nan") probably never
                    # matches NaN values - confirm.
                    new_matrix = [x for x in zip(
                        list(matrix[a].values())[0],
                        list(matrix[b].values())[0])
                        if x[0] not in (float("nan"), PosInf, NegInf) and
                        x[1] not in (float("nan"), PosInf, NegInf)]
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a], xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" %
                              (headers[b], headers[a], msg))

        # Close the file device so the hardcopy is written out.
        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    # Without a hardcopy, hand over to an interactive console so the plot
    # stays open; this needs stdin, so it is skipped when data came from stdin.
    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin.")

    E.Stop()
def main(argv=None):
    """script main.

    Merge the transcripts of two GTF files, intersect the two merged sets
    and, unless ``--no-venn`` is given, draw a venn diagram of the overlap.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if one of the two GTF files or the scripts directory
            is not supplied, or if an input does not end in ``.gtf`` or
            ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--gtf-a", dest="gtf_a", type="string",
                      help="supply a gtf file - will compress uncompressed files")
    parser.add_option("-b", "--gtf-b", dest="gtf_b", type="string",
                      help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s", "--scripts-dir", dest="scripts_dir", type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # fail early with a readable message instead of an AttributeError on None
    if not options.gtf_a or not options.gtf_b:
        raise ValueError("two gtf files are required (--gtf-a and --gtf-b)")
    if not options.scripts_dir:
        raise ValueError("the scripts directory is required (--scripts-dir)")

    gtf_files = [options.gtf_a, options.gtf_b]
    merged_files = []
    prefices = []

    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            suffix, reader = ".gtf.gz", "zcat"
        elif gtf.endswith(".gtf"):
            suffix, reader = ".gtf", "cat"
        else:
            raise ValueError(
                "cannot perform merge on %s: is not a gtf file" % gtf)

        prefix = P.snip(gtf, suffix)
        outfile = prefix + ".merged.gtf.gz"
        prefices.append(prefix)
        merged_files.append(outfile)

        # NOTE: `statement` is not read in this function; it is kept as a
        # local immediately before P.run() exactly as in the original code.
        statement = '''%s %s | python %s/gtf2gtf.py --merge-transcripts --log=%s.log | gzip > %s''' % (
            reader, gtf, options.scripts_dir, outfile, outfile)
        P.run()

    # Both lists were filled in the order (gtf-a, gtf-b).  Mapping by
    # position avoids the substring matching used previously, which left
    # gtf_b undefined when one prefix was contained in the other file name.
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files
    # The %(name)s placeholders refer to the locals gtf_a, gtf_b, scriptsdir
    # and intersection_out, so these names must not be renamed.
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join(
        [prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))

        # produce the venn diagram
        intersection_file = intersection_out

        # create dictionary key
        gtf_pair = (gtf_a, gtf_b)

        # count the entries of each file, closing each handle when done
        counts = {}
        for label, filename in (("gtf-a", gtf_a),
                                ("gtf-b", gtf_b),
                                ("intersection", intersection_file)):
            E.info("counting entries in %s" % filename)
            infile = IOTools.openFile(filename)
            try:
                counts[label] = sum(1 for entry in GTF.iterator(infile))
            finally:
                infile.close()
            # two spaces reproduce the output of the former print statement
            print("counts for %s:  %i" % (label, counts[label]))

        count_gtf_merged_a = counts["gtf-a"]
        count_gtf_merged_b = counts["gtf-b"]
        count_intersection = counts["intersection"]

        # this is the important bit - basically take an arbitrary list of
        # numbers to represent the entries of gtf-b, then use the
        # intersection count to represent the overlapping section in gtf-a
        # and add a set of random numbers to make up the remaining,
        # non-overlapping entries of gtf-a
        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {
            "gtf-b": [str(i) for i in range(count_gtf_merged_b)],
            "gtf-a": [str(i) for i in range(count_intersection)] +
            [str(random.random())
             for i in range(count_intersection, count_gtf_merged_a)]}

        R_source = os.path.join(
            os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)

        venn_file = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % venn_file)

        R["venn.diagram2"](
            R.list(A=result[gtf_pair]["gtf-a"],
                   B=result[gtf_pair]["gtf-b"]),
            venn_file,
            **{'cat.cex': 1.5,
               'main.fontfamily': "Arial",
               'cat.pos': FloatVector((0, 0)),
               'cat.fontfamily': "Arial",
               'main.cex': 1.8,
               'height': 1000,
               'width': 1000,
               'cex': 2,
               'fontfamily': "Arial",
               'lwd': R.c(1, 1),
               'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
               'category.names': R.c(prefix_a, prefix_b),
               'margin': R.c(0.1, 0.1, 0.1, 0.1)})

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values (or, with ``--norm-test``, one list plus a
    sample drawn from a normal distribution with the same mean and standard
    deviation), runs the selected statistical test through R and, unless
    ``--skip-plot`` is given, draws a box plot, a quantile-quantile plot and
    relative/absolute frequency histograms.  The test result and summary
    statistics of both value lists are written to ``options.stdout``.

    Positional arguments of the form ``key=value`` are forwarded to the R
    test function as keyword arguments, all others as positional arguments.

    Raises:
        ValueError: if no second input is available, or if a paired test is
            requested for lists of different length.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.")
    parser.add_option("-f", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option("-n", "--norm-test", dest="norm_test", action="store_true",
                      help="test if a set of values is normally distributed. "
                      "Mean and variance are calculated from the data.")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        max_value=None,
        plot=True,
        header="value",
        title=None,
    )

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    # split the remaining arguments into keyword and positional arguments
    # for the R test function
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split on the first "=" only: the value may itself contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile_map:
            map_category2value = IOTools.ReadMap(infile_map,
                                                 map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    # only close what was opened here, never stdin
    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" % (
                len(values1), mean, stddev))
        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        if not options.filename_input2:
            raise ValueError(
                "no second input: supply --infile2 or use --norm-test")
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2,
                map_function=f,
                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (
                len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be
            # added at the end after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # compare against None so that an explicit bound of 0 counts as set
        have_range = (options.min_value is not None or
                      options.max_value is not None)

        extra_options = ""
        if options.num_bins and not have_range:
            extra_options += ", breaks=%i" % options.num_bins
        elif options.num_bins and have_range:
            # num_bins bins need num_bins + 1 break points covering the
            # whole of [min_value, max_value]; hist() rejects breaks that
            # do not span the data.
            bin_size = float(max_value - min_value) / options.num_bins
            breaks = [min_value + x * bin_size
                      for x in range(options.num_bins)]
            # append the upper bound itself to avoid rounding shortfall
            breaks.append(max_value)
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))
        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [min_value + x * options.bin_size
                      for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    # the png device is opened whenever a hardcopy is requested, so it has
    # to be closed in that case too, independent of --skip-plot
    if options.hardcopy:
        R.dev_off()

    E.Stop()
import os

import numpy
import pandas

# path depends on where you installed Python. Mine is the Anaconda
# distribution.  The variable is set before rpy2 is imported below.
os.environ['R_USER'] = '******'

from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr

# switch on the pandas <-> R conversion
pandas2ri.activate()

# One-off installation of the R package, kept for reference:
# utils = importr('utils')
# utils.install_packages('warbleR')
warbleR = importr('warbleR')

# selection table with a single row: sound file, selection number and the
# start and end of the selection
selection_columns = ['sound.files', 'selec', 'start', 'end']
selection_rows = [['001_K.wav', 1, 2, 3]]
dataframe = pandas.DataFrame(selection_rows, columns=selection_columns)
print(dataframe)

# run warbleR's specan on the selection table with bp = c(0, 28000)
band_pass = r.c(0, 28000)
spectral_parameters = warbleR.specan(X=dataframe, bp=band_pass)
print(spectral_parameters)
# set vertical orientation if max( [len(x) for x in headers] ) > 40 / len(headers): # remove xlabel: extra_options = re.sub( ", xlab='[^']+'", "", extra_options ) extra_options += ", names.arg=headers, las=2" R("""op <- par(mar=c(11,4,4,2))""") # the 10 allows the names.arg below the barplot R("""barplot(as.matrix(matrix), beside=TRUE %s)""" % extra_options) elif method == "scatter+marginal": if options.title: # set the size of the outer margins - the title needs to be added at the end # after plots have been created R.par(oma=R.c(0,0,4,0) ) R( """matrix""" ) R( """ x <- matrix[,1]; y <- matrix[,2]; xhist <- hist(x, breaks=20, plot=FALSE); yhist <- hist(y, breaks=20, plot=FALSE); top <- max(c(xhist$counts, yhist$counts)); nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE ); par(mar=c(3,3,1,1)) ; plot(x, y, cex=%s, pch="o" %s) ; par(mar=c(0,3,1,1)) ; barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ; par(mar=c(3,0,1,1)) ; title(main='%s');
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values (or, with ``--norm-test``, one list plus a
    sample drawn from a normal distribution with the same mean and standard
    deviation), runs the selected statistical test through R and, unless
    ``--skip-plot`` is given, draws a box plot, a quantile-quantile plot and
    relative/absolute frequency histograms.  The test result and summary
    statistics of both value lists are written to ``options.stdout``.

    Positional arguments of the form ``key=value`` are forwarded to the R
    test function as keyword arguments, all others as positional arguments.

    Raises:
        ValueError: if no second input is available, or if a paired test is
            requested for lists of different length.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option(
        "-a", "--hardcopy", dest="hardcopy", type="string",
        help="write hardcopy to file.", metavar="FILE")
    parser.add_option(
        "-1", "--infile1", dest="filename_input1", type="string",
        help="input filename for distribution 1.")
    parser.add_option(
        "-2", "--infile2", dest="filename_input2", type="string",
        help="input filename for distribution 2.")
    parser.add_option(
        "--plot-legend", dest="legend", type="string",
        help="legend for histograms.")
    parser.add_option(
        "-f", "--infile-map", dest="filename_input_map", type="string",
        help="input filename for mapping categories to values.")
    parser.add_option(
        "-n", "--norm-test", dest="norm_test", action="store_true",
        help="test if a set of values is normally distributed. "
        "Mean and variance are calculated from the data.")
    parser.add_option(
        "-b", "--num-bins", dest="num_bins", type="int",
        help="""number of bins (for plotting purposes only).""")
    parser.add_option(
        "--bin-size", dest="bin_size", type="float",
        help="""bin size for plot.""")
    parser.add_option(
        "--min-value", dest="min_value", type="float",
        help="""minimum_value for plot.""")
    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="""maximum_value for plot.""")
    parser.add_option(
        "--skip-plot", dest="plot", action="store_false",
        help="""skipping plotting.""")
    parser.add_option(
        "--header-names", dest="header", type="string",
        help="""header of value column [default=%default].""")
    parser.add_option(
        "--title", dest="title", type="string",
        help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        max_value=None,
        plot=True,
        header="value",
        title=None,
    )

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    # split the remaining arguments into keyword and positional arguments
    # for the R test function
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split on the first "=" only: the value may itself contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile_map:
            map_category2value = IOTools.ReadMap(
                infile_map, map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(
        infile1, map_function=f, map_category=map_category2value)

    # only close what was opened here, never stdin
    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write(
            "# errors in input1: %s\n" % ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" %
            (len(values1), mean, stddev))
        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        if not options.filename_input2:
            raise ValueError(
                "no second input: supply --infile2 or use --norm-test")
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2, map_function=f, map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write(
            "# errors in input2: %s\n" % ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False,
                               correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1, values2, paired=True,
                               correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be
            # added at the end after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # compare against None so that an explicit bound of 0 counts as set
        have_range = (options.min_value is not None
                      or options.max_value is not None)

        extra_options = ""
        if options.num_bins and not have_range:
            extra_options += ", breaks=%i" % options.num_bins
        elif options.num_bins and have_range:
            # num_bins bins need num_bins + 1 break points covering the
            # whole of [min_value, max_value]; hist() rejects breaks that
            # do not span the data.
            bin_size = float(max_value - min_value) / options.num_bins
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            # append the upper bound itself to avoid rounding shortfall
            breaks.append(max_value)
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))
        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size
                for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    # the png device is opened whenever a hardcopy is requested, so it has
    # to be closed in that case too, independent of --skip-plot
    if options.hardcopy:
        R.dev_off()

    E.Stop()
def _rotate_long_labels(headers, plot_options):
    """Return *plot_options* switched to vertical axis labels if *headers* are long."""
    if max([len(x) for x in headers]) > 40 / len(headers):
        # remove xlabel:
        plot_options = re.sub(", xlab='[^']+'", "", plot_options)
        plot_options += ", names.arg=headers, las=2"
        # the enlarged bottom margin leaves room for the names.arg labels
        R("""op <- par(mar=c(11,4,4,2))""")
    return plot_options


def main(argv=None):
    """script main.

    Read a table from a file or stdin, load it into R as ``matrix``,
    write the requested statistics to ``options.stdout`` and draw the
    requested plots, either into a hardcopy file or on screen followed by
    an interactive console.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        IOError: if the input is empty and ``--fail-on-empty`` is in effect.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-c", "--columns", dest="columns", type="string",
        help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")
    # optparse only expands the lower-case spelling "%default"
    parser.add_option(
        "--logscale", dest="logscale", type="string",
        help="log-transform one or both axes [default=%default].")
    parser.add_option(
        "-a", "--hardcopy", dest="hardcopy", type="string",
        help="write hardcopy to file [default=%default].", metavar="FILE")
    parser.add_option(
        "-f", "--file", dest="input_filename", type="string",
        help="filename with table data [default=%default].", metavar="FILE")
    parser.add_option(
        "-2", "--file2", dest="input_filename2", type="string",
        help="additional data file [default=%default].", metavar="FILE")
    parser.add_option(
        "-s", "--stats", dest="statistics", type="choice",
        choices=("correlation", "spearman", "pearson", "count"),
        help="statistical quantities to compute [default=%default]",
        action="append")
    parser.add_option(
        "-p", "--plot", dest="plot", type="choice",
        choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                 "bar-besides", "1_vs_x", "matched", "boxplot",
                 "scatter+marginal", "scatter-regression"),
        help="plots to plot [default=%default]",
        action="append")
    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="min threshold to use for counting method [default=%default].")
    parser.add_option(
        "-o", "--colours", dest="colours", type="int",
        help="column with colour information [default=%default].")
    parser.add_option(
        "-l", "--plot-labels", dest="labels", type="string",
        help="column labels for x and y in matched plots [default=%default].")
    parser.add_option(
        "-d", "--add-diagonal", dest="add_diagonal", action="store_true",
        help="add diagonal to plot [default=%default].")
    parser.add_option(
        "-e", "--plot-legend", dest="legend", type="int",
        help="column with legend [default=%default].")
    parser.add_option(
        "-r", "--options", dest="r_options", type="string",
        help="R plotting options [default=%default].")
    parser.add_option(
        "--format", dest="format", type="choice",
        choices=("full", "sparse"),
        help="output format [default=%default].")
    parser.add_option(
        "--title", dest="title", type="string",
        help="""plot title [default=%default].""")
    parser.add_option(
        "", "--xrange", dest="xrange", type="string",
        help="x viewing range of plot [default=%default].")
    parser.add_option(
        "", "--yrange", dest="yrange", type="string",
        help="y viewing range of plot[default=%default].")
    parser.add_option(
        "--allow-empty-file", dest="fail_on_empty", action="store_false",
        help="do not fail on empty input [default=%default].")
    parser.add_option(
        "--fail-on-empty", dest="fail_on_empty", action="store_true",
        help="fail on empty input [default=%default].")

    # the default is keyed on the option's dest "add_diagonal"
    parser.set_defaults(hardcopy=None,
                        input_filename="",
                        input_filename2=None,
                        columns="all",
                        logscale=None,
                        statistics=[],
                        plot=[],
                        threshold=0.0,
                        labels="x,y",
                        colours=None,
                        add_diagonal=False,
                        legend=None,
                        title=None,
                        xrange=None,
                        yrange=None,
                        r_options="",
                        fail_on_empty=True,
                        format="full")

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    # columns are given 1-based on the command line and used 0-based here
    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    # read data matrix
    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
        try:
            lines = infile.readlines()
        finally:
            infile.close()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if not x.startswith("#")]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format) from the
        # second file; previously the lines of the first file were parsed
        # a second time.
        infile2 = IOTools.openFile(options.input_filename2, "r")
        try:
            lines2 = [x for x in infile2.readlines()
                      if not x.startswith("#")]
        finally:
            infile2.close()
        # the return values are not used below; presumably readTable makes
        # the table available in R as "matrix2" - verify against readTable
        readTable(lines2,
                  "matrix2",
                  take_columns=options.columns,
                  headers=True,
                  colours=options.colours,
                  row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    # After the 1-based -> 0-based conversion the first column is index 0,
    # so the option must be tested against None, not for truthiness.  The
    # R variable "colours" only exists if colours were actually read.
    has_colours = options.colours is not None and bool(colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(options.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1", "var2", "coeff",
                                            "passed", "pvalue", "n",
                                            "method", "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R("""cor.test( matrix[,%i], matrix[,%i] )""" %
                                   (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn(
                            "correlation not computed for columns %i(%s) and %i(%s): %s"
                            % (x, headers[x], y, headers[y], msg))
                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             "na", "na", "na", 0, "na", "na"))
                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2('cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2('df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2", "coeff",
                                            "passed", "pvalue",
                                            "method", "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')"""
                        % (x + 1, y + 1))
                    # cor.test sets no "parameter" (df) for Spearman, so
                    # each row has exactly the seven fields of the header.
                    pvalue = result.rx2('p.value')[0]
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result.rx2('estimate')[0],
                         Stats.getSignificance(float(pvalue)),
                         pvalue,
                         result.rx2('method')[0],
                         result.rx2('alternative')[0]))

        elif method == "count":
            # number of shared elements > threshold
            with open(options.input_filename, "r") as infile_counts:
                m, r, c = MatlabTools.ReadMatrix(infile_counts,
                                                 take=options.columns,
                                                 headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if take:
                E.warn("removing empty columns %s before plotting" %
                       str(take))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                if legend:
                    # NOTE(review): headers has already been filtered at
                    # this point, so indexing it with the old column
                    # indices looks wrong - confirm the intended legend.
                    legend = [headers[x] for x in take]

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if has_colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                # the R graphics device is called jpeg(), there is no jpg()
                R.jpeg(options.hardcopy, width=1024, height=768, type="cairo")

        # point size and symbol depend only on the number of data points
        if ndata < 1000:
            point_size = "1"
            pch = "o"
        else:
            point_size = "0.5"
            pch = "."

        for method in options.plot:

            # every plot starts from the shared options; changes made for
            # one method must not leak into the next one
            plot_options = extra_options

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, plot_options))

            elif method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, plot_options))
                R("""dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                # same accessor style as in the pearson branch above
                coefficients = mod.rx2("coefficients")
                R.mtext("y = %f * x + %f, r=%6.4f, n=%i" %
                        (coefficients.rx2("x")[0],
                         coefficients.rx2("(Intercept)")[0],
                         R("""cor( dat )[2]""")[0],
                         ndata),
                        3,
                        cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R("""panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""")
                else:
                    R("""panel.hist <- function( x,y,... ) { points(x,y,...); }""")

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if has_colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                # without a title there is nothing to print as main
                if options.title:
                    plot_options += ",main='%s'" % options.title
                # set vertical orientation
                plot_options = _rotate_long_labels(headers, plot_options)
                R("""boxplot( matrix %s)""" % plot_options)

            elif method == "bar" or method == "bar-stacked":
                if not has_colours:
                    plot_options += ", col=rainbow(5)"
                # set vertical orientation
                plot_options = _rotate_long_labels(headers, plot_options)
                R("""barplot(as.matrix(matrix), %s)""" % plot_options)

            elif method == "bar-besides":
                if not has_colours:
                    plot_options += ", col=rainbow(%i)" % ndata
                # set vertical orientation
                plot_options = _rotate_long_labels(headers, plot_options)
                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  plot_options)

            elif method == "scatter+marginal":
                if options.title:
                    # set the size of the outer margins - the title needs
                    # to be added at the end after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
                x <- matrix[,1];
                y <- matrix[,2];
                xhist <- hist(x, breaks=20, plot=FALSE);
                yhist <- hist(y, breaks=20, plot=FALSE);
                top <- max(c(xhist$counts, yhist$counts));
                nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
                par(mar=c(3,3,1,1)) ;
                plot(x, y, cex=%s, pch="o" %s) ;
                par(mar=c(0,3,1,1)) ;
                barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
                par(mar=c(3,0,1,1)) ;
                title(main='%s');
                barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
                title(main='%s');
                """ % (point_size, plot_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                ncolumns = len(headers)
                if method == "panel":
                    # all pairs of columns
                    pairs = [(x, y)
                             for x in range(ncolumns - 1)
                             for y in range(x + 1, ncolumns)]
                elif method == "1_vs_x":
                    # first column against all others
                    pairs = [(0, x) for x in range(1, ncolumns)]
                else:
                    # "matched": each column against the next column with
                    # the same header
                    pairs = []
                    for x in range(ncolumns - 1):
                        for y in range(x + 1, ncolumns):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                if not pairs:
                    # nothing to lay out; also avoids a division by zero
                    E.warn("no column pairs to plot for method %s" % method)
                    continue

                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))

                for a, b in pairs:
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )"""
                          % (a + 1, b + 1, headers[b], headers[a],
                             xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" %
                              (headers[b], headers[a], msg))

    # the hardcopy handling lives inside the plot section above, so there
    # is nothing to close when no plot was requested
    if options.plot and options.hardcopy:
        R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin."
            )

    E.Stop()
def R_Factor_Analysis( comm_str, csv_data, csv_colvars, csv_coltypes, fpref, test_arr, # -> can be NULL for interior calc Nfac, # -> can be 0 for interior calc Ntopload, # -> can be 0 for interior calc flab, DO_GRAPH, N_cent = 99, # 'centile' N_iter = 5000, # 'iterations' ftype = 'jpeg'): '''Perform factor analysis using R function factanal(). User can specify the number of latent factors using the paran() function, which implements Horn's test. Returns: Factor scores and loadings''' # R libraries used here. paran = importr('paran') # some stuff about format types if PARN_OUT_types.__contains__(ftype): ii = PARN_OUT_types.index(ftype) OUT_dev = PARN_OUT_devs[ii] OUT_paran = fpref+'.'+ftype else: print "** Error! ", print "Output file type '%s' is not valid. Select from:" % (ftype) print "\t", for x in PARN_OUT_types: print " '"+x+"' ", print "\n" sys.exit(32) fff = open(fpref+'.log','w') if comm_str: fff.write('# '+comm_str+"\n") # SETUP THE VARIABLE VALUES Lx,Ly = np.shape(csv_data) # if user hasn't entered a selection, then use 'em all. 
if not(test_arr): test_arr = list(csv_colvars) # Get rid of variable columns with 'NA' test_arr = Cut_ColVars_with_NAs(csv_data, csv_colvars, test_arr) # check for duplicate columns, which lead to bad singularities test_arr = CheckForDuplicates( test_arr ) # if user hasn't entered a label, then use: if not(flab): flab = 'FACTOR' # only select variables that are represented in the csv_data headings, # as well as being either int or float VARS_inds = [] VARS_names = [] for x in test_arr: if csv_colvars.__contains__(x): ii = csv_colvars.index(x) if [int, float].__contains__(csv_coltypes[ii]): VARS_inds.append(ii) VARS_names.append(x) Nvars = len(VARS_names) Y = np.zeros((Lx,Nvars), dtype=float) print "++ Factor analysis contains %s variables:" % (Nvars) fff.write("\n++ Factor analysis contains %s variables:\n" % (Nvars)) for j in range(Nvars): jj = VARS_inds[j] print "\t %s" % (VARS_names[j]) fff.write("\t %s\n" % (VARS_names[j])) for i in range(Lx): Y[i,j] = csv_data[i][jj] i = CorMatCheck(Y, VARS_names) # SETUP THE NUMBER OF FACTORS # use eval info to pick number of vars, if user hasn't if not(Nfac): print "++ Graphing of parallel analysis (PA) Horn's test is:", if DO_GRAPH: print "ON." else: print "OFF." print "++ PA percentile in Horn's test is: ", N_cent print "++ Number of PA Monte Carlo iterations: ", N_iter # mostly default values, some user control PARN = r.paran( Y, iterations=N_iter, centile=N_cent, quietly=False, status=True, all=True, cfa=True, graph=DO_GRAPH, color=True, col=r.c("black","red","blue"), lty=r.c(1,2,3), lwd=1, legend=True, file=OUT_paran, width=640, height=640, grdevice=OUT_dev, seed=0) if DO_GRAPH: grDevices.dev_off() print "++ Don't worry about the briefly passing image." print "\tIt has been saved as: %s\n\n" % ( OUT_paran ) N_PARN_arr = np.array(PARN.rx2('Retained')) Nfac = int(N_PARN_arr[0]) else: if Nfac > Nvars: print "*+ Warning! The user has selected a number of factors larger" print "\tthan the number of variables (%d > %d)!" 
% (Nfac, Nvars) print "\t-> Therefore, we're setting it to be %d," % (Nvars) print "\t but you might still want to check if anything went awry?" else: print "++ The user has selected the number of factors" print "\tto be %d out of %d." % (Nfac, Nvars) # RUN THE FACTOR ANALYSIS IN R FA_out = r.factanal(Y, factors=Nfac, scores='regression', rotation="varimax") FA_scores =np.array(FA_out.rx2('scores')) FA_loadings =np.array(FA_out.rx2('loadings')) # match up highest loadings with the variable names, so we have an # idea of what's going into the sausage # how many loadings to output. # Can be: ALL, 5, or user-entered other if not(Ntopload): Ntopload = min(Nvars, 5) elif Ntopload<0 : Ntopload = Nvars else: Ntopload = min(Nvars, Ntopload) if Ntopload==Nvars: strNtopload = "ALL "+str(Nvars) else: strNtopload = 'top '+str(Ntopload)+'/'+str(Nvars) # ordering process FA_titles = [] print "\n++ Factor loading contributions (%s):" % (strNtopload) fff.write("\n++ Factor loading contributions (%s):\n" % (strNtopload)) for i in range(Nfac): P = list(FA_loadings[:,i]) Q = list(VARS_names) PQ = sorted(zip(P,Q),reverse=1) str_title = "%s_%02d" % (flab, i+1) FA_titles.append(str_title) print "\n\t"+str_title fff.write("\n\t"+str_title+"\n") for j in range(Ntopload): print "\t%20s %12.5f" % (PQ[j][1],PQ[j][0]) fff.write("\t%20s %12.5f\n" % (PQ[j][1],PQ[j][0])) fff.close() return FA_scores, FA_titles, VARS_names
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Run a differential expression analysis with SCRAN + DESeq2 through R.

    The counts are wrapped in a SingleCellExperiment, size factors are
    computed with scran (optionally per quickCluster cluster), the object
    is normalized and converted to a DESeq2 data set, and DESeq2 is run
    with the design ``~ conditions``.

    :param counts: counts table; ``counts.columns`` is used as the number
        of cells and the table is converted with ``pandas2ri.py2ri``
        (presumably a pandas DataFrame of genes x cells - confirm with
        callers).
    :param conds: condition label for each cell, converted to an R
        character vector and used as the ``conditions`` factor.
    :param comparisons: iterable of ``(A, B)`` pairs; each pair is passed
        to DESeq2 ``results`` as ``contrast=c("conditions", A, B)``.
    :param alpha: significance level passed to DESeq2 ``results``.
    :param scran_clusters: when true, cells are clustered with
        ``scran.quickCluster`` first and the clusters are passed to
        ``computeSumFactors``.
    :return: list with one data frame of DESeq2 results per comparison,
        indexed by the R row names (genes).

    Any error raised while talking to R propagates to the caller; the
    pandas <-> R conversion is deactivated again in either case.
    """
    results = list()
    n_cells = len(counts.columns)
    pandas2ri.activate()
    try:
        # Loading the libraries makes their functions (DESeq, results,
        # convertTo, ...) reachable through ``r``.
        RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(
            multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts),
                                            max(n_cells / 10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            # pool sizes derived from the smallest cluster
            sizes = list(
                set([
                    round((min_cluster_size / 2) / i) for i in [5, 4, 3, 2, 1]
                ]))
            sce = scran.computeSumFactors(sce,
                                          clusters=r_clusters,
                                          sizes=sizes,
                                          positive=True)
        else:
            # pool sizes derived from the total number of cells
            sizes = list(
                set([
                    round((n_cells / 2) * i)
                    for i in [0.1, 0.2, 0.3, 0.4, 0.5]
                ]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds,
                               contrast=r.c("conditions", A, B),
                               alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Always undo the global conversion switch, also when R fails.
        pandas2ri.deactivate()
    return results
def run_all(site_type: str,
            gene_sets: Mapping[str, Path] = GENE_SETS,
            gene_set_filter: Tuple[int] = (5, 1000),
            correct=False,
            **kwargs):
    """Compute ActivePathways results for every analysis / gene set pair.

    The pan_cancer and clinvar ActiveDriver analyses are each combined
    with every provided GMT gene set for the given site_type.

    Args:
        site_type: site filter which will be passed to ActiveDriver analysis
        gene_sets: gene sets to be considered; a value may also be a
            callable which returns the path when called
        gene_set_filter: a two-tuple: (min, max) number of genes
            required to be in a gene set.
            If not set, the default of (5, 1000) is used
        correct: forwarded to `run_active_pathways`
        **kwargs: forwarded to `run_active_pathways`; the key
            'geneset.filter' is always set from `gene_set_filter`

    Each result is written to
    `output_dir / <analysis name> / <gene set> / <site_type> / pathways.tsv`;
    any previous content of that directory is removed first.

    Returns:
        Mapping of (analysis, gene set name) to the absolute directory
        holding the newly computed ActivePathways results
    """
    data_table = importr('data.table')
    kwargs['geneset.filter'] = IntVector(gene_set_filter)

    analyses = (
        active_driver.pan_cancer_analysis,
        active_driver.clinvar_analysis,
    )
    computed = {}

    for analysis in analyses:
        for set_name in gene_sets:
            target = output_dir / analysis.name / set_name / site_type

            # start from an empty directory: drop stale results, recreate
            rmtree(target, ignore_errors=True)
            target.mkdir(parents=True)
            target = target.absolute()

            driver_result = analysis(site_type)

            print(
                f'Preparing active pathways: {analysis.name} for '
                f'{len(driver_result["all_gene_based_fdr"])} genes'
            )
            print(f'Gene sets/background: {set_name}')

            # a gene set may be given lazily, as a callable producing a path
            gmt_source = gene_sets[set_name]
            if callable(gmt_source):
                gmt_source = gmt_source()

            pathways = run_active_pathways(
                driver_result,
                str(gmt_source),
                cytoscape_dir=target,
                correct=correct,
                **kwargs
            )

            tsv_path = str(target / 'pathways.tsv')
            data_table.fwrite(pathways, tsv_path,
                              sep='\t', sep2=r.c('', ',', ''))

            computed[(analysis, set_name)] = target

    return computed
def draw_r(regions, points, **kwargs):
    """Plot regions and points in an R graphics window and wait for it to close.

    The device is split into a main plot (screen 1) and two legend panels
    (screens 3 and 4).  Polygons from ``regions.polys()`` are drawn largest
    area first, then guide lines, then the points.  If a truthy ``png``
    keyword argument is given, the plot is also printed to that file.  The
    function returns only once R reports no open graphics device.
    """
    # initialize the environment
    from rpy2.interactive import process_revents
    from rpy2.robjects import r
    from rpy2.robjects.packages import importr

    NA = r("NA")[0]

    def to_r_color(rgb):
        # R colour from a colour triple
        return r.rgb(*rgb, maxColorValue=256)

    def to_r_vector(seq):
        # R vector built from a Python sequence
        return r.c(*seq)

    OOB = 40

    graphics = importr("graphics")
    grDevices = importr("grDevices")
    process_revents.start()

    # layout: top half is the plot, bottom half is split into two legends
    graphics.par(bg="white")
    graphics.split_screen(r.c(2, 1))
    graphics.split_screen(r.c(1, 2), screen=2)
    graphics.screen(1)

    # prepare the regions for plotting
    ul, lr = regions.box()
    xlim = r.c(ul[0], lr[0])
    ylim = r.c(lr[1], ul[1])

    # create the main plot window
    graphics.plot(
        r.c(), r.c(),
        main=regions.name(),
        type="p", pch="+",
        xlim=xlim, ylim=ylim,
        xlab="", ylab="",
        xaxp=r.c(0, lr[0], lr[0]/200),
        yaxp=r.c(0, lr[1], lr[1]/200),
        bg="white")

    # plot the polygons, largest area first
    by_area = sorted(regions.polys(), key=lambda p: p.area, reverse=True)
    for poly in by_area:
        poly_xs, poly_ys = zip(*poly.boundary[0].coords)
        base_color = regions.color(poly.name(), default=NA)
        cr, cg, cb = r.col2rgb(base_color)
        fill = r.rgb(cr, cg, cb, alpha=128, maxColorValue=255)
        graphics.polygon(to_r_vector(poly_xs), to_r_vector(poly_ys), col=fill)

    # plot the grid
    graphics.abline(v=r.c(OOB, lr[0]-OOB), lty=2)
    graphics.abline(h=r.seq(0, lr[1], 200), col="lightgray", lty=2)
    graphics.abline(v=r.seq(0, lr[0], 200), col="lightgray", lty=2)

    # plot the points
    xs, ys, names = zip(*[(pt[0].x, pt[0].y, pt[1]) for pt in points])
    colors = [to_r_color(points.Color(name)) for name in names]
    graphics.points(to_r_vector(xs), to_r_vector(ys),
                    xlab="", ylab="", pch="+", col=to_r_vector(colors))

    # save as a png
    png_target = kwargs.get('png')
    if png_target:
        grDevices.dev_print(grDevices.png, file=png_target,
                            width=lr[0], height=lr[1])

    # derive legend contents: colors, counts, names
    # (uniq_tids keeps the ids in order of first appearance)
    tid_counts = {}
    uniq_tids = []
    for tid in names:
        if tid not in tid_counts:
            tid_counts[tid] = 0
            uniq_tids.append(tid)
        tid_counts[tid] += 1

    uniq_colors = [to_r_color(points.Color(tid)) for tid in uniq_tids]
    uniq_names = ["%d\t%s" % (tid, IDs.TileID[tid]) for tid in uniq_tids]
    name_counts = ["%d\t%s: %d" % (tid, IDs.TileID[tid], count)
                   for tid, count in tid_counts.items()]

    # display the colors legend
    legend_args = {'y_intersp': 0.7, 'cex': 0.7}
    graphics.screen(3)
    graphics.legend("center", title="Tile Colors",
                    legend=to_r_vector(uniq_names),
                    col=to_r_vector(uniq_colors),
                    pch="+", pt_cex=1, **legend_args)

    # display the counts legend
    graphics.screen(4)
    graphics.legend("center", title="Tile Counts",
                    legend=to_r_vector(name_counts), **legend_args)

    # sleep until the window is closed
    while grDevices.dev_list() != r("NULL"):
        time.sleep(0.1)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Steps performed:

    1. each of the two GTF files (``-a`` / ``-b``) is piped through
       ``gtf2gtf.py --method=merge-transcripts`` and written to
       ``<prefix>.merged.gtf.gz``;
    2. the two merged files are intersected with ``intersectBed`` into
       ``<prefix_a>_vs_<prefix_b>.intersection.gtf.gz``;
    3. unless ``--no-venn`` is set, the entries of the three files are
       counted and a Venn diagram is drawn through the R function
       ``venn.diagram2`` (sourced from ``venn_diagram.R`` in the scripts
       directory) to ``<prefix_a>_vs_<prefix_b>.overlap.png``.

    Raises ValueError if one of the two GTF files is missing or does not
    end in ``.gtf`` / ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-gtf-file", dest="gtf_a", type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b", "--second-gtf-file", dest="gtf_b", type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s", "--scripts-dir", dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]
    if not all(gtf_files):
        raise ValueError(
            "both --first-gtf-file and --second-gtf-file are required")

    merged_files = []
    prefices = []

    E.info("merging gtf files")
    for gtf in gtf_files:
        # pick the command that streams the file and the suffix to strip;
        # ".gtf.gz" must be tested before ".gtf"
        if gtf.endswith(".gtf.gz"):
            suffix, reader = ".gtf.gz", "zcat"
        elif gtf.endswith(".gtf"):
            suffix, reader = ".gtf", "cat"
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file"
                             % gtf)

        prefix = IOTools.snip(gtf, suffix)
        outfile = prefix + ".merged.gtf.gz"
        prefices.append(prefix)
        merged_files.append(outfile)

        statement = '''%s %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
            reader, gtf, options.scripts_dir, outfile, outfile)
        # both file flavours are run through the same executor
        P.execute(statement)

    # prefices / merged_files were filled in the order (a, b), so assign by
    # position; matching by substring breaks when one prefix is contained
    # in the other file's name
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    # NOTE(review): P.run() is called without arguments; presumably it
    # reads `statement` and fills the %(name)s placeholders from the local
    # variables of this function, so the names gtf_a, gtf_b, scriptsdir and
    # intersection_out must stay as they are - confirm against P.
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b
                                    ]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:

        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))
        # produce the venn diagram
        intersection_file = intersection_out

        def count_entries(path):
            # count GTF entries, closing the file once counting is done
            with IOTools.openFile(path) as handle:
                return sum(1 for _ in GTF.iterator(handle))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        count_gtf_merged_a = count_entries(gtf_a)
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        count_gtf_merged_b = count_entries(gtf_b)
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        count_intersection = count_entries(intersection_file)
        print("counts for intersection: ", count_intersection)

        # this is the important bit - basically take an arbitrary list of
        # numbers to represent the entries of set B, then use the
        # intersection count to represent the overlapping section in set A
        # and add a set of random numbers to make up the remaining,
        # non-overlapping part of set A
        E.info("assembling count lists")
        members_b = [str(i) for i in range(count_gtf_merged_b)]
        members_a = [str(i) for i in range(count_intersection)]
        members_a.extend(
            str(random.random())
            for _ in range(count_intersection, count_gtf_merged_a))

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)

        overlap_png = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % overlap_png)

        R["venn.diagram2"](R.list(A=members_a, B=members_b),
                           overlap_png,
                           **{
                               'cat.cex': 1.5,
                               'main.fontfamily': "Arial",
                               'cat.pos': FloatVector((0, 0)),
                               'cat.fontfamily': "Arial",
                               'main.cex': 1.8,
                               'height': 1000,
                               'width': 1000,
                               'cex': 2,
                               'fontfamily': "Arial",
                               'lwd': R.c(1, 1),
                               'fill': R.c(R.rgb(0, 0, 0.5, 0.5),
                                           R.rgb(0.5, 0, 0, 0.5)),
                               'category.names': R.c(prefix_a, prefix_b),
                               'margin': R.c(0.1, 0.1, 0.1, 0.1)
                           })

    # write footer and output benchmark information.
    E.Stop()