def plot(self, hardcopy=None):
    """Plot q-value diagnostics via the R ``qvalue`` package.

    Rebuilds a ``qvalue`` S3 object from the components previously
    computed and stored on this instance (``mPValues``, ``mPi0``,
    ``mQValues``, ``mLambda``) and renders it with ``qplot``.

    :param hardcopy: optional filename; when given, the plot is written
        to this PNG file (cairo backend) instead of the active R device.
    """
    if hardcopy:
        R.png(hardcopy, width=1024, height=768, type="cairo")

    R.require('qvalue')

    # build a qobj: push the Python-side vectors into the R global
    # environment under fixed names
    R.assign("pval", self.mPValues)
    R.assign("pi0", self.mPi0)
    R.assign("qval", self.mQValues)
    R.assign("lambda", self.mLambda)
    # assemble the pieces into an R list and tag it with the S3 class
    # "qvalue" so that qplot() dispatches to qvalue's plot method
    R("""qobj <-list( pi0=pi0, qvalues=qval, pvalues=pval, lambda=lambda)""")
    R(""" class(qobj) <- "qvalue" """)
    R("""qplot(qobj)""")

    # close the device only if we opened one above
    if hardcopy:
        R.dev_off()
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")
    dbhandle = sqlite3.connect(PARAMS["database"])

    prefix = P.snip(infile, ".load")
    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same as only one triangle is stored of the
        # pairwise results. do not plot "undefined" lfold values
        # (where treatment_mean or control_mean = 0) do not plot lfold
        # values where the confidence bounds contain 0.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            statement = """
            SELECT CASE WHEN d.treatment_mean < d.control_mean
                        THEN d.treatment_mean
                        ELSE d.control_mean END,
                   d.l2fold, d.significant
            FROM %(tablename_diff)s AS d
            WHERE treatment_name = '%(track1)s' AND
                  control_name = '%(track2)s' AND
                  status = 'OK' AND
                  treatment_mean > 0 AND
                  control_mean > 0
            """ % locals()

            # materialize the transposed result: zip() returns a one-shot
            # iterator on Python 3, which would break both the len() check
            # and the data[0]/data[1]/data[2] indexing below
            data = list(zip(*Database.executewait(dbhandle, statement)))

            pngfile = ("%(outdir)s/%(geneset)s_%(method)s_%(level)s_"
                       "%(track1)s_vs_%(track2)s_significance.png") % locals()

            # ian: Bug fix: moved R.png to after data check so that no
            # plot is started if there is no data this was leading
            # to R falling over from too many open devices
            if len(data) == 0:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            # min(FPKM) on x (log scale) vs log2 fold change on y;
            # significant windows are colored red
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x",
                   pch=20,
                   cex=.1,
                   col=R.ifelse(ro.IntVector(data[2]), "red", "black"))
            R['dev.off']()

    P.touch(outfile)
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png',
                         show=False, title=True, labels=None, colors=None,
                         ann=None, show_legend=True, q=.25, std=None):
    """Draw Kaplan-Meier survival curves for *feature* stratified by *surv*.

    One panel is drawn per unique value of *assignment* (a single panel
    when *assignment* is None).  Real-valued features are binned into
    quantiles before fitting.  The plot is written to *filename* via the
    R graphics device.

    :param feature: series of per-sample feature values
    :param surv: survival frame with 'days' and 'event' columns
    :param assignment: optional series splitting samples into panels
    :param filename: PNG output path
    :param show: when True, return ``Show(filename)`` for display
    :param labels: legend labels; derived automatically when None
    :param colors: curve colors; defaults to ['blue', 'red']
    :param ann: 'p' to annotate with the logrank p-value, or free text
    :param q: quantile cut for real-valued features
    :param std: passed through to to_quants
    """
    # avoid the mutable-default-argument pitfall; the effective default
    # is unchanged (['blue', 'red'])
    if colors is None:
        colors = ['blue', 'red']

    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)

    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow',
                  'black']

    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300,
          res=75)

    fmla = robjects.Formula('Surv(days, event) ~ feature')
    r.par(mfrow=r.c(1, num_panels))
    r.par(mar=r.c(4, 5, 4, 1))
    r.par(xpd=True)

    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        colors = ['blue', 'orange', 'red']
        if q == .5:
            labels = ['Bottom 50%', 'Top 50%']
        else:
            labels = ['Bottom {}%'.format(int(q * 100)), 'Normal',
                      'Top {}%'.format(int(q * 100))]

    ls = r.c(*colors)

    def plot_me(sub_f, label):
        # bin real-valued features into quantiles before fitting
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)
        m = get_cox_ph(surv, sub_f, formula=fmla)
        # recover the data frame used in the fitted model's call
        r_data = m.rx2('call')[2]
        p = log_rank(sub_f, surv)['p']
        ls = r.c(*colors)

        r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4,
               cex=1.25, xlab='Years to Event', ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann is not None:
            r.text(0, labels=ann, pos=4)
        if show_legend == 'out':
            # widen the right margin so the legend fits outside the panel
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))

    for value in sorted(assignment.ix[feature.index].dropna().unique()):
        f = feature.ix[assignment[assignment == value].index]
        if len(f.unique()) > 1:
            plot_me(f, name(value))

    if show_legend == True:
        # place the legend where it is least likely to cover the curves
        mean_s = surv.ix[:, 'event'].ix[
            assignment[assignment == value].index].mean()
        if mean_s < .5:
            r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                     lty=1, col=ls, lwd=3, bty='o')
        else:
            r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    elif show_legend == 'out':
        r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                 lty=1, col=ls, lwd=3, bty='o')

    r('dev.off()')
    if show:
        return Show(filename)
def buildDMRStats(tables, method, outfile, dbhandle):
    """build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    genes in output from (:mod:`scripts/runExpression`).

    This method also creates diagnostic plots in the
    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. Tab separated file summarizing
    """

    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        "tileset", "design",
        "track1", "track2",
        "tested",
        "\t".join(["status_%s" % x for x in keys_status]),
        "significant",
        "up", "down",
        "twofold",
        "twofold_up", "twofold_down",
    )) + "\n")

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            # collapse (key..., count) rows into a defaultdict keyed by
            # the first *l* columns
            return collections.defaultdict(
                int, [(tuple(x[:l]), x[l]) for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        status = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, status, COUNT(*)
            FROM %(tablename)s
            GROUP BY treatment_name,control_name,status"""
            % locals()).fetchall(), 3)
        signif = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE significant
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        fold2 = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE (l2fold >= 1 or l2fold <= -1) AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        up = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold > 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        down = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold < 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        fold2up = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold > 1 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        fold2down = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold < -1 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())

        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(map(str, (
                tileset,
                design,
                treatment_name,
                control_name,
                tested[k],
                "\t".join([str(status[(treatment_name, control_name, x)])
                           for x in keys_status]),
                signif[(k)],
                up[k], down[k],
                fold2[k],
                fold2up[k], fold2down[k],
            ))) + "\n")

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            """SELECT end - start, pvalue
            FROM %(tablename)s
            WHERE significant""" % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            # materialize the transpose: zip() is a one-shot iterator on
            # Python 3 and would not support the data[0]/data[1] indexing
            data = list(zip(*data))

            pngfile = ("%(outdir)s/%(tileset)s_%(design)s_%(method)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab="log10(length)",
                            ylab="log10(pvalue)",
                            log="x", pch=20, cex=0.1,
                            )
            R["dev.off"]()

    outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs a MEDIPS (v1 API) analysis on a single bam/bed sample:
    optional saturation, CpG coverage and calibration analyses plus
    rpm/rms wiggle export, writing a summary table and plots via
    the E.getOutputFile() naming scheme.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e", "--extension", dest="extension", type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-l", "--fragment-length", dest="fragment_length",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-s", "--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis [default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig", dest="bigwig", action="store_true",
                      help=
                      "store wig files as bigwig files - requires a genome file [default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        # contig sizes are needed later for wig -> bigwig conversion
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    # default to running every analysis step
    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)

    # local aliases used for %-interpolation into the R snippets below
    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    # convert input into the MEDIPS text format
    if options.input_format == "bam":
        E.info("converting bam files")
        filename_sample = bamToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))
    elif options.input_format == "bed":
        E.info("converting bed files")
        filename_sample = bedToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))

    E.info("loading data")
    R('''CONTROL.SET = MEDIPS.readAlignedSequences( BSgenome = "%(genome_file)s", file = "%(filename_sample)s" ) ''' % locals())

    # (R slot name, output label, printf-style format) triples used to
    # dump CONTROL.SET slots into the summary table at the end
    slotnames = (("extend", "extend", "%i"),
                 ("distFunction", "distance_function", "%s"),
                 ("slope", "slope", "%f"),
                 ("fragmentLength", "fragment_length", "%i"),
                 ("bin_size", "bin_size", "%i"),
                 ("seq_pattern", "pattern", "%s"),
                 ("number_regions", "nregions", "%i"),
                 ("number_pattern", "npatterns", "%i"),
                 ("cali_chr", "calibration_contig", "%s"),
                 ("genome_name", "genome", "%s"))

    E.info("computing genome vector")
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, bin_size = %(bin_size)i, extend=%(extension)i )''' % locals())

    E.info("computing CpG positions")
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''')

    E.info("compute coupling vector")
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, fragmentLength = %(fragment_length)i, func = "count")''' % locals())

    E.info("compute calibration curve")
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info("normalizing")
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
    outfile.write("category\tvalue\n")

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, bin_size = %(bin_size)i, extend = %(extension)i, no_iterations = %(saturation_iterations)i, no_random_iterations = 1)''' % locals())

        R.png(E.getOutputFile("saturation.png"))
        R('''MEDIPS.plotSaturation(sr.control)''')
        R('''dev.off()''')

        R('''write.csv( sr.control$estimation, file ='%s' )''' %
          E.getOutputFile("saturation_estimation.csv"))

        # NOTE(review): index [1] picks the second element of the R
        # vector (rpy2 vectors are 0-based) — confirm this is intended
        outfile.write("estimated_correlation\t%f\n" %
                      R('''sr.control$maxEstCor''')[1])
        outfile.write("true_correlation\t%f\n" %
                      R('''sr.control$maxTruCor''')[1])

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, extend = %(extension)i, no_iterations = 10)''' % locals())

        R.png(E.getOutputFile("cpg_coverage.png"))
        R('''MEDIPS.plotCoverage(cr.control)''')
        R('''dev.off()''')

        # three rows
        R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
          E.getOutputFile("saturation_coveredpos.csv"))
        # coverage threshold
        # number of CpG covered
        # percentage of CpG covered

        R('''write.csv( cr.control$matrix, file ='%s' )''' %
          E.getOutputFile("saturation_matrix.csv"))

        # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

    if "calibration" in options.toolset or do_all:
        E.info("plotting calibration")
        R.png(E.getOutputFile("calibration.png"))
        R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
        R('''dev.off()''')

    # dump selected CONTROL.SET slots into the summary table; empty
    # slots are skipped
    for slotname, label, pattern in slotnames:
        value = tuple(R('''CONTROL.SET@%s''' % slotname))
        if len(value) == 0:
            continue
        outfile.write("%s\t%s\n" % (label, pattern %
                                    tuple(R('''CONTROL.SET@%s''' % slotname))[0]))

    outfile.close()

    if "rpm" in options.toolset or do_all:
        outputfile = E.getOutputFile("rpm.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")''' % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    if "rms" in options.toolset or do_all:
        outputfile = E.getOutputFile("rms.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")''' % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs MEDIPS (v2 API) analyses over treatment/control BAM files:
    saturation, CpG coverage, CpG enrichment, and differential
    methylation (dmr) via MEDIPS.meth/edgeR, with optional conversion
    of a previous run's table into the Expression result format.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations", dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip", dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata", type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile\
ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:
        # convert a previously computed MEDIPS edgeR table (on stdin)
        # into the generic Expression result format and return early
        results = []
        for line in csv.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" %
                                 (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        # contig sizes are needed for wig -> bigwig conversion
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # optional chromosome restriction, spliced into the R calls below
    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    # local aliases used for %-interpolation into the R snippets below
    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    # R boolean literal for the bwa flag
    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = %(BWA)s, %(chrstring)s nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = iotools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage( file='%(fn)s', BSgenome='%(genome_file)s', pattern='CG', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'), sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = iotools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        # (R slot name, output label, printf-style format) triples
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # NOTE(review): value[0] on the empty string below
                    # raises IndexError — looks like a latent bug for
                    # empty slots; confirm and consider value = ("",)
                    value = ""
                outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)
    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG", refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation( c(treatment_set, control_set))''')

                R('''write.table(cor.matrix, file='%s', sep="\t")''' %
                  E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319
                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth( MSet1 = treatment_set, MSet2 = control_set, CSet = CS, ISet1 = NULL, ISet2 = NULL, p.adj = "%(fdr_method)s", diff.method = "edgeR", MeDIP = %(medip)s, CNV = F, minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth, file=gzfile('%s', 'w'), sep="\t", row.names=F, quote=F)''' %
                  E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth, adj=T, ratio=NULL, p.value=%(fdr_threshold)f, bg.counts=NULL, CNV=F)''' % locals())

        R('''write.table(tested, file=gzfile('%s', 'w'), sep="\t", quote=F)''' %
          E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),]; gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w'); write.table(gain_merged, file=of, sep="\t", quote=F, row.names=FALSE, col.names=FALSE); close(of)''' %
              E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),]; loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))
            R('''of=gzfile('%s', 'w'); write.table(loss_merged, file=of, sep="\t", quote=F, row.names=F, col.names=F); close(of)''' %
              E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
def buildDMRStats(tables, method, outfile):
    '''build dmr summary statistics.

    Creates some diagnostic plots in <exportdir>/<method>
    directory.

    Tables should be labeled <tileset>_<design>_<method>.

    :param tables: list of tables with DMR output
    :param method: method name (suffix of the tables)
    :param outfile: output filename for the tab-separated summary
    '''

    dbhandle = sqlite3.connect(PARAMS["database"])

    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(("tileset", "design",
                          "track1", "track2",
                          "tested",
                          "\t".join(["status_%s" % x for x in keys_status]),
                          "significant",
                          "up", "down",
                          "twofold",
                          "twofold_up", "twofold_down",
                          )) + "\n")

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:
        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            # collapse (key..., count) rows into a defaultdict keyed by
            # the first *l* columns
            return collections.defaultdict(
                int, [(tuple(x[:l]), x[l]) for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        status = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, status, COUNT(*)
            FROM %(tablename)s
            GROUP BY treatment_name,control_name,status"""
            % locals()).fetchall(), 3)
        signif = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE significant
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        fold2 = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE (l2fold >= 1 or l2fold <= -1) AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        up = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold > 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        down = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold < 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        fold2up = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold > 1 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())
        fold2down = toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s
            WHERE l2fold < -1 AND significant
            GROUP BY treatment_name,control_name,significant"""
            % locals()).fetchall())

        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(map(str, (
                tileset,
                design,
                treatment_name,
                control_name,
                tested[k],
                "\t".join([str(status[(treatment_name, control_name, x)])
                           for x in keys_status]),
                signif[(k)],
                up[k], down[k],
                fold2[k],
                fold2up[k], fold2down[k]))) + "\n")

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            '''SELECT end - start, pvalue
            FROM %(tablename)s
            WHERE significant''' % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            # materialize the transpose: zip() is a one-shot iterator on
            # Python 3 and would not support the data[0]/data[1] indexing
            data = list(zip(*data))

            pngfile = ("%(outdir)s/%(tileset)s_%(design)s_%(method)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)
            R['dev.off']()

    outf.close()
def __call__(self, track): region_statement = ''' SELECT MIN(exons.start) as start, MAX(exons.end) as end, exons.contig as contig FROM annotations.exon_stats as exons INNER JOIN annotations.transcript_info as ti ON exons.transcript_id = ti.transcript_id INNER JOIN annotations.gene_info as gi ON gi.gene_id = ti.gene_id WHERE %s ''' if self.track_type == "region": try: chrom, start, end = re.match("(.+):([0-9]+)-([0-9]+)", track).groups() except AttributeError: raise ValueError("%s is not a valid region specification" % track) else: if self.track_type == "gene id": where = "gi.gene_id = '%s'" % track elif self.track_type == "gene name": where = "gi.gene_name = '%s' " % track elif self.track_type == "transcript": where = "ti.transcript_id == '%s'" % track else: raise NotImplementedError("Track type: %s not implemented" % self.track_type) start, end, chrom = self.getFirstRow(region_statement % where) gene_track = Gviz.GeneRegionTrack(self.txdb, chromosome=chrom, start=start, end=end, **self.gene_track_options) data_tracks = self.getDataTracks(track) axisTrack = Gviz.GenomeAxisTrack() all_tracks = [axisTrack, gene_track] + data_tracks if not os.path.exists("export/GenomePlots"): os.makedirs("export/GenomePlots") # Hack to get around problem with user render not being able # to find font "sans" filename = os.path.join("export/GenomePlots", self.__class__.__name__ + track + ".png") R.png(filename, units="in", res=200, height=self.height, width=self.width) Gviz.plotTracks(all_tracks, main=track, **self.plot_options) R["dev.off"]() return odict((('name', track), ('filename', filename)))
if options.logscale: extra_options += ", log='%s'" % options.logscale if options.xrange: extra_options += ", xlim=c(%f,%f)" % tuple( map(float, options.xrange.split(","))) if options.yrange: extra_options += ", ylim=c(%f,%f)" % tuple( map(float, options.yrange.split(","))) if options.hardcopy: if options.hardcopy.endswith(".eps"): R.postscript(options.hardcopy) elif options.hardcopy.endswith(".png"): R.png(options.hardcopy, width=1024, height=768, type="cairo") elif options.hardcopy.endswith(".jpg"): R.jpg(options.hardcopy, width=1024, height=768, type="cairo") for method in options.plot: if ndata < 100: point_size = "1" pch = "o" elif ndata < 1000: point_size = "1" pch = "o" else: point_size = "0.5" pch = "."
extra_options += ", col=colours" if options.logscale: extra_options += ", log='%s'" % options.logscale if options.xrange: extra_options += ", xlim=c(%f,%f)" % tuple(map(float, options.xrange.split(",") ) ) if options.yrange: extra_options += ", ylim=c(%f,%f)" % tuple(map(float, options.yrange.split(",") ) ) if options.hardcopy: if options.hardcopy.endswith(".eps"): R.postscript(options.hardcopy) elif options.hardcopy.endswith(".png"): R.png(options.hardcopy, width=1024, height=768, type="cairo") elif options.hardcopy.endswith(".jpg"): R.jpg(options.hardcopy, width=1024, height=768, type="cairo") for method in options.plot: if ndata < 100: point_size = "1" pch = "o" elif ndata < 1000: point_size = "1" pch = "o" else: point_size = "0.5" pch = "."
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Counts tested/significant/twofold genes per (treatment, control)
    pair for every cuffdiff result table and writes one row per pair
    to *outfile*.

    Creates also diagnostic plots in <exportdir>/<method> directory.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # Parse "<design>_vs_<geneset>_<counting_method>_<method>";
        # fall back to the two-field form without a counting method.
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        try:
            design, geneset, counting_method = re.match(
                "([^_]+)_vs_([^_]+)_(.*)_%s" % method,
                tablename).groups()
        except AttributeError:
            try:
                design, geneset = re.match(
                    "([^_]+)_([^_]+)_%s" % method,
                    tablename).groups()
                counting_method = "na"
            except AttributeError:
                raise ValueError("can't parse tablename %s" % tablename)

        return design, geneset, counting_method

    # status categories output as separate columns
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    # Fix: header previously listed treatment_name before
    # counting_method, while the data rows below are written with
    # counting_method first - columns were mislabeled.
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "counting_method",
         "treatment_name",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    all_tables = set(Database.getTables(dbhandle))

    for level in CUFFDIFF_LEVELS:

        for tablename in tables:

            tablename_diff = "%s_%s_diff" % (tablename, level)
            # Fix: was built with a "_diff" suffix (copy/paste error);
            # the levels table uses a "_levels" suffix.
            tablename_levels = "%s_%s_levels" % (tablename, level)
            design, geneset, counting_method = _split(tablename_diff)
            if tablename_diff not in all_tables:
                continue

            def toDict(vals, l=2):
                # fold (key..., count) rows into a dict keyed on the
                # first l columns; missing keys default to 0
                return collections.defaultdict(
                    int,
                    [(tuple(x[:l]), x[l]) for x in vals])

            tested = toDict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name"
                    % locals()).fetchall())
            status = toDict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, status, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name,status"
                    % locals()).fetchall(), 3)
            signif = toDict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE significant "
                    "GROUP BY treatment_name,control_name"
                    % locals()).fetchall())

            fold2 = toDict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                    "GROUP BY treatment_name,control_name,significant"
                    % locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                outf.write("\t".join(map(str, (
                    design,
                    geneset,
                    level,
                    counting_method,
                    treatment_name,
                    control_name,
                    tested[(treatment_name, control_name)],
                    "\t".join(
                        [str(status[(treatment_name, control_name, x)])
                         for x in keys_status]),
                    signif[(treatment_name, control_name)],
                    fold2[(treatment_name, control_name)]))) + "\n")

            ###########################################
            ###########################################
            ###########################################
            # plot length versus P-Value
            data = Database.executewait(
                dbhandle,
                "SELECT i.sum, pvalue "
                "FROM %(tablename_diff)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter fails
            if len(data) > 10:
                data = zip(*data)

                pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals()
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x", pch=20, cex=.1)

                R['dev.off']()

    outf.close()
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")

    dbhandle = sqlite3.connect(PARAMS["database"])

    # table prefix encodes <geneset>_<method>
    prefix = P.snip(infile, ".load")
    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level
        tablename_levels = prefix + "_%s_levels" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same as only one triangle is stored of the
        # pairwise results. do not plot "undefined" lfold values
        # (where treatment_mean or control_mean = 0) do not plot lfold
        # values where the confidence bounds contain 0.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            # columns: min(expression), l2fold, significant flag
            statement = """
                SELECT CASE WHEN d.treatment_mean < d.control_mean
                            THEN d.treatment_mean
                            ELSE d.control_mean END,
                       d.l2fold, d.significant
                FROM %(tablename_diff)s AS d
                WHERE treatment_name = '%(track1)s' AND
                      control_name = '%(track2)s' AND
                      status = 'OK' AND
                      treatment_mean > 0 AND
                      control_mean > 0
                """ % locals()

            data = zip(*Database.executewait(dbhandle, statement))

            pngfile = "%(outdir)s/%(geneset)s_%(method)s_%(level)s_%(track1)s_vs_%(track2)s_significance.png" % locals()

            # ian: Bug fix: moved R.png to after data check so that no
            # plot is started if there is no data this was leading
            # to R falling over from too many open devices
            if len(data) == 0:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x",
                   pch=20,
                   cex=.1,
                   # significant windows in red, the rest in black
                   col=R.ifelse(ro.IntVector(data[2]),
                                "red", "black"))

            R['dev.off']()

    P.touch(outfile)
if "saturation" in options.toolset or do_all: E.info("saturation analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, nrit = 1)''' % locals()) R.png(E.getOutputFile("%s_saturation.png" % fn)) R('''MEDIPS.plotSaturation(sr)''') R('''dev.off()''') R('''write.table(sr$estimation, file ='%s', sep='\t')''' % E.getOutputFile("%s_saturation_estimation.tsv" % fn)) outfile = IOTools.openFile( E.getOutputFile("%s_saturation.tsv" % fn, "w")) outfile.write("category\tvalues\n") outfile.write( "estimated_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxEstCor''')])) outfile.write( "true_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxTruCor''')])) outfile.write(
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Drives a MEDIPS analysis: saturation, CpG coverage, CpG
    enrichment, and differential methylation (dmr) on the BAM files
    given via --treatment/--control, plus a "convert" mode that
    reformats a MEDIPS edgeR table read from stdin.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata", type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile\
ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # "convert" mode: translate an edgeR result table read from stdin
    # into the generic GeneExpressionResult format and exit.
    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" %
                                 (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # optional restriction to a subset of chromosomes, passed through
    # to the MEDIPS calls as a chr.select argument
    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    # MEDIPS expects R logicals as strings
    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.getOutputFile("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')

            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.getOutputFile("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.openFile(
                E.getOutputFile("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w")
        # (R slot name, output column label, printf pattern)
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # NOTE(review): value becomes "" and value[0] below
                    # would raise IndexError on an empty string - confirm
                    # whether this branch is ever taken.
                    value = ""
                outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.getOutputFile("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319
                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.getOutputFile("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.getOutputFile("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.getOutputFile("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.getOutputFile("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.getOutputFile("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.Stop()
def buildExpressionStats(
        dbhandle,
        outfile,
        tablenames,
        outdir,
        regex_table="(?P<design>[^_]+)_"
        "(?P<geneset>[^_]+)_"
        "(?P<counting_method>[^_]+)_"
        "(?P<method>[^_]+)_"
        "(?P<level>[^_]+)_diff"):
    """compile expression summary statistics from database.

    This method outputs a table with the number of genes tested,
    failed, differentially expressed, etc. for a series of DE tests.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    tablenames : list
        List of tables to process.
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Output directory for diagnostic plots.
    regex_table : string
        Regular expression to extract experimental information
        from table name.
    """
    # status categories output as separate columns
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "counting_method",
         "treatment_name",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    for tablename in tablenames:
        r = re.search(regex_table, tablename)
        if r is None:
            raise ValueError(
                "can't match tablename '%s' to regex" % tablename)
        geneset = r.group("geneset")
        design = r.group("design")
        level = r.group("level")
        counting_method = r.group("counting_method")
        # NOTE(review): redundant repeat of the assignment above
        geneset = r.group("geneset")

        def toDict(vals, l=2):
            # fold (key..., count) rows into a dict keyed on the first
            # l columns; missing keys default to 0
            return collections.defaultdict(
                int,
                [(tuple(x[:l]), x[l]) for x in vals])

        tested = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name" % locals()
        ).fetchall())
        status = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, status, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name,status"
            % locals()).fetchall(), 3)
        signif = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE significant "
            "GROUP BY treatment_name,control_name" % locals()
        ).fetchall())

        fold2 = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
            "GROUP BY treatment_name,control_name,significant"
            % locals()).fetchall())

        for treatment_name, control_name in tested.keys():
            outf.write("\t".join(map(str, (
                design,
                geneset,
                level,
                counting_method,
                treatment_name,
                control_name,
                tested[(treatment_name, control_name)],
                "\t".join(
                    [str(status[(treatment_name, control_name, x)])
                     for x in keys_status]),
                signif[(treatment_name, control_name)],
                fold2[(treatment_name, control_name)]))) + "\n")

        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            "SELECT i.sum, pvalue "
            "FROM %(tablename)s, "
            "%(geneset)s_geneinfo as i "
            "WHERE i.gene_id = test_id AND "
            "significant" % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = ("%(outdir)s/%(design)s_%(geneset)s_%(level)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)

            R['dev.off']()

    outf.close()
# -*- coding: utf-8 -*- from rpy2.robjects import r from rpy2.robjects import IntVector x = IntVector(range(9)) y = IntVector(range(9)) import ipdb ipdb.set_trace() r.png('figura1.png') r.plot(x, y) r['dev.off']() r.png('figura2.jpeg') r.plot(x, y, xlab='x', ylab='y', main='Minha plotagem', type='l') r['dev.off']() r.png('figura3.pdf') normal = r.rnorm(500, 0, 1) r.hist(normal) r['dev.off']()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs a MEDIPS (v1 API) analysis on a single BAM/BED sample:
    genome vector, CpG positions, coupling vector, calibration and
    normalization, followed by the requested toolset steps
    (saturation, coverage, calibration plot, rpm/rms wig export).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e", "--extension", dest="extension",
                      type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b", "--bin-size", dest="bin_size",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-l", "--fragment-length", dest="fragment_length",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-s", "--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis [default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "rms", "rpm", "all"),
                      help = "actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig", dest="bigwig", action="store_true",
                      help="store wig files as bigwig files - requires a genome file [default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    tmpdir = tempfile.mkdtemp()

    E.debug("temporary files are in %s" % tmpdir)

    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    # convert the input into the MEDIPS text format
    if options.input_format == "bam":
        E.info("converting bam files")
        filename_sample = bamToMEDIPS(
            filename_sample, os.path.join(tmpdir, "sample.medips"))
    elif options.input_format == "bed":
        E.info("converting bed files")
        filename_sample = bedToMEDIPS(
            filename_sample, os.path.join(tmpdir, "sample.medips"))

    E.info("loading data")
    R('''CONTROL.SET = MEDIPS.readAlignedSequences(
                       BSgenome = "%(genome_file)s",
                       file = "%(filename_sample)s" ) ''' % locals() )

    # (R slot name, output label, printf pattern) for the summary table
    slotnames = (("extend", "extend", "%i"),
                 ("distFunction", "distance_function", "%s"),
                 ("slope", "slope", "%f"),
                 ("fragmentLength", "fragment_length", "%i"),
                 ("bin_size", "bin_size", "%i"),
                 ("seq_pattern", "pattern", "%s"),
                 ("number_regions", "nregions", "%i"),
                 ("number_pattern", "npatterns", "%i"),
                 ("cali_chr", "calibration_contig", "%s"),
                 ("genome_name", "genome", "%s"))

    E.info("computing genome vector")
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET,
                       bin_size = %(bin_size)i,
                       extend=%(extension)i )''' % locals())

    E.info("computing CpG positions")
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET,
                       pattern = "CG")''' )

    E.info("compute coupling vector")
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET,
                       fragmentLength = %(fragment_length)i,
                       func = "count")''' % locals() )

    E.info("compute calibration curve")
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info("normalizing")
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
    outfile.write("category\tvalue\n")

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET,
                          bin_size = %(bin_size)i,
                          extend = %(extension)i,
                          no_iterations = %(saturation_iterations)i,
                          no_random_iterations = 1)''' % locals() )

        R.png(E.getOutputFile("saturation.png"))
        R('''MEDIPS.plotSaturation(sr.control)''')
        R('''dev.off()''')

        R('''write.csv( sr.control$estimation, file ='%s' )''' %
          E.getOutputFile("saturation_estimation.csv"))

        outfile.write("estimated_correlation\t%f\n" %
                      R('''sr.control$maxEstCor''')[1] )
        outfile.write("true_correlation\t%f\n" %
                      R('''sr.control$maxTruCor''')[1] )

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET,
                          extend = %(extension)i,
                          no_iterations = 10)''' % locals())

        R.png(E.getOutputFile("cpg_coverage.png"))
        R('''MEDIPS.plotCoverage(cr.control)''')
        R('''dev.off()''')

        # three rows
        R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
          E.getOutputFile("saturation_coveredpos.csv"))
        # coverage threshold
        # number of CpG covered
        # percentage of CpG covered

        R('''write.csv( cr.control$matrix, file ='%s' )''' %
          E.getOutputFile("saturation_matrix.csv"))

        # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

    if "calibration" in options.toolset or do_all:
        E.info("plotting calibration")
        R.png(E.getOutputFile("calibration.png"))
        R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
        R('''dev.off()''')

    # dump selected slots of CONTROL.SET into the summary table
    for slotname, label, pattern in slotnames:
        value = tuple(R('''CONTROL.SET@%s''' % slotname ))
        if len(value) == 0:
            continue
        outfile.write("%s\t%s\n" % (label,
                                    pattern % tuple(R('''CONTROL.SET@%s''' % slotname ))[0] ) )

    outfile.close()

    if "rpm" in options.toolset or do_all:
        outputfile = E.getOutputFile("rpm.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s',
                              data = CONTROL.SET, raw = T, descr = "rpm")''' %
          locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    if "rms" in options.toolset or do_all:
        outputfile = E.getOutputFile("rms.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s',
                              data = CONTROL.SET, raw = F, descr = "rms")''' %
          locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one or two distributions of values (from --infile1/--infile2 or
    stdin), runs the statistical test selected with --method via rpy2, and
    optionally draws a 2x2 diagnostic panel (boxplot, qq-plot, relative and
    absolute frequency histograms).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    # NOTE(review): the trailing "" is implicit string concatenation and
    # contributes nothing to the help text.
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms." "")
    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n", "--norm-test", dest="norm_test", action="store_true",
        help=
        """test if a set of values is normally distributed.
Mean and variance are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",  # NOTE(review): set but not used in this function
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Extra positional arguments of the form key=value are forwarded to the
    # R test function as keyword arguments; anything else is passed
    # positionally.
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # With a category map, input values are category labels (str) that get
    # translated to floats by ReadList; otherwise values are parsed as float.
    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(
            open(options.filename_input_map, "r"),
            map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # Normality test mode: distribution 2 is synthesized by sampling a
        # normal distribution with the empirical mean/sd of distribution 1.
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" %
            (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(
            open(options.filename_input2, "r"),
            map_function=f,
            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    # Paired tests require equal sample sizes.
    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    # Open the hardcopy device before any plotting happens.
    # NOTE(review): if --skip-plot is combined with --hardcopy, this device
    # is opened but dev.off() below is guarded by options.plot — confirm
    # whether that pairing is intended.
    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    # Dispatch to the selected R test; result is an R list-like object.
    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1, values2, paired=True, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            # R's shapiro.test is limited in the number of values it accepts,
            # so subsample down to 5000.
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        # 2x2 panel: boxplot, qq-plot, relative and absolute histograms.
        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            # fixed number of bins, let R choose the break positions
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            # fixed number of bins over an explicit value range
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            # fixed bin width, derive the number of bins
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2,
freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    # Emit test result as key/value pairs, then summary statistics for each
    # input distribution (suffixes 1 and 2 on the keys).
    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compare two value distributions with an R statistical test
    (Kolmogorov-Smirnov, Mann-Whitney U, paired variants, or Shapiro-Wilk)
    and optionally plot boxplot/qq-plot/histogram diagnostics.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    # NOTE(review): the adjoined "" is implicit concatenation; it adds nothing.
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms." "")
    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option("-n", "--norm-test", dest="norm_test",
                      action="store_true",
                      help="""test if a set of values is normally distributed.
Mean and variance are calculated from the data.""")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",  # NOTE(review): never read within this function
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Remaining command-line arguments are forwarded to the R test call:
    # key=value pairs become keyword arguments, the rest positional ones.
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # Optional category -> value translation; when active, raw input tokens
    # are strings looked up through the map.
    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(
            open(options.filename_input_map, "r"),
            map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # Normality mode: synthesize distribution 2 from a normal with the
        # empirical mean and standard deviation of distribution 1.
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write("# creating %i samples from normal distribution with mean %f and stddev %f\n" % (
            len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(
            open(options.filename_input2, "r"),
            map_function=f,
            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (
            len(values1), len(errors1),
            len(values2), len(errors2)))

    # Paired tests only make sense on equally sized samples.
    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    # Open the output device before any R plotting calls.
    # NOTE(review): dev_off() at the bottom is inside `if options.plot` —
    # combining --hardcopy with --skip-plot leaves the device open; confirm
    # whether that is intended.
    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    # Run the selected test in R.
    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            # R's shapiro.test has an upper limit on sample size, so the
            # input is subsampled.
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        # Four diagnostic panels in a 2x2 layout.
        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            # bin count only; break positions chosen by R
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            # bin count with an explicit range: precompute break positions
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            # explicit bin width: derive bin count and break positions
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    # Report the R test result as key/value rows, then per-distribution
    # summary statistics (key suffix 1 and 2).
    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
# -*- coding: utf-8 -*-
"""Small rpy2 demo: renders three example plots to disk.

Draws a scatter plot, a labelled line plot and a histogram of 500
standard-normal deviates, writing each to its own graphics file.

Fixes applied:
- removed a leftover ``import ipdb; ipdb.set_trace()`` debugger breakpoint
  that halted every non-interactive run;
- the output device now matches each file extension: previously ``r.png``
  was used for ``figura2.jpeg`` and ``figura3.pdf``, producing PNG data
  under misleading extensions.
"""
from rpy2.robjects import r
from rpy2.robjects import IntVector

x = IntVector(range(9))
y = IntVector(range(9))

# Scatter plot of y against x (PNG).
r.png('figura1.png')
r.plot(x, y)
r['dev.off']()

# Line plot with axis labels and a title ('Minha plotagem') — JPEG device
# to match the .jpeg extension.
r.jpeg('figura2.jpeg')
r.plot(x, y, xlab='x', ylab='y', main='Minha plotagem', type='l')
r['dev.off']()

# Histogram of 500 draws from N(0, 1) — PDF device to match the extension.
r.pdf('figura3.pdf')
normal = r.rnorm(500, 0, 1)
r.hist(normal)
r['dev.off']()
for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = %(BWA)s, %(chrstring)s nrit = 1)''' % locals()) R.png(E.getOutputFile("%s_saturation.png" % fn)) R('''MEDIPS.plotSaturation(sr)''') R('''dev.off()''') R('''write.table(sr$estimation, file ='%s', sep='\t')''' % E.getOutputFile("%s_saturation_estimation.tsv" % fn)) outfile = IOTools.openFile( E.getOutputFile("%s_saturation.tsv" % fn), "w") outfile.write("category\tvalues\n") outfile.write( "estimated_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxEstCor''')])) outfile.write( "true_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxTruCor''')])) outfile.write(
def main(argv=None):
    """script main for r_table2scatter.

    Reads a numeric table, optionally computes statistics (correlation
    matrix, pearson/spearman tests, shared-element counts) and renders a
    variety of R plots (scatter, pairs, bar, boxplot, panels, ...).
    Without --hardcopy, the table is left in the R workspace and an
    interactive console is offered.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c", "--columns", dest="columns", type="string",
        help=
        "columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns."
    )
    parser.add_option(
        "--logscale", dest="logscale", type="string",
        help="log-transform one or both axes [default=%Default].")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")
    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")
    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")
    parser.add_option(
        "-s", "--stats", dest="statistics", type="choice",
        choices=("correlation", "spearman", "pearson", "count"),
        help="statistical quantities to compute [default=%default]",
        action="append")
    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar",
                               "bar-stacked", "bar-besides", "1_vs_x",
                               "matched", "boxplot", "scatter+marginal",
                               "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")
    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="min threshold to use for counting method [default=%default].")
    parser.add_option(
        "-o", "--colours", dest="colours", type="int",
        help="column with colour information [default=%default].")
    parser.add_option(
        "-l", "--plot-labels", dest="labels", type="string",
        help="column labels for x and y in matched plots [default=%default].")
    parser.add_option("-d", "--add-diagonal", dest="add_diagonal",
                      action="store_true",
                      help="add diagonal to plot [default=%default].")
    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")
    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")
    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")
    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")
    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")
    parser.add_option("--allow-empty-file", dest="fail_on_empty",
                      action="store_false",
                      help="do not fail on empty input [default=%default].")
    parser.add_option("--fail-on-empty", dest="fail_on_empty",
                      action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(hardcopy=None,
                        input_filename="",
                        input_filename2=None,
                        columns="all",
                        logscale=None,
                        statistics=[],
                        plot=[],
                        threshold=0.0,
                        labels="x,y",
                        colours=None,
                        diagonal=False,
                        legend=None,
                        title=None,
                        xrange=None,
                        yrange=None,
                        r_options="",
                        fail_on_empty=True,
                        format="full")

    (options, args) = E.Start(parser)

    # A single positional argument may stand in for --file.
    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    # Convert 1-based user column numbers to 0-based indices.
    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = IOTools.openFile(options.input_filename, "r").readlines()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    # drop comment lines
    lines = [x for x in lines if x[0] != "#"]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    # readTable loads the data into the R workspace under the name "matrix".
    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format.
        # NOTE(review): this re-reads `lines` from the first file, not
        # options.input_filename2 — looks like a bug; confirm intent.
        matrix2, headers2, colours2, legend2 = readTable(
            lines,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    # number of rows in the R-side matrix
    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" %
                             (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    # ----- statistics -------------------------------------------------
    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            # pairwise cor.test over all column pairs (upper triangle)
            options.stdout.write("\t".join(("var1", "var2", "coeff",
                                            "passed", "pvalue", "n",
                                            "method", "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" %
                            (x + 1, y + 1))
                    # NOTE(review): rpy.RPyException is the old rpy-1 API —
                    # confirm this exception type still matches the runtime.
                    except rpy.RPyException as msg:
                        E.warn(
                            "correlation not computed for columns %i(%s) and %i(%s): %s" %
                            (x, headers[x], y, headers[y], msg))
                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             "na", "na", "na", 0, "na", "na"))
                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2('cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2('df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2", "coeff",
                                            "passed", "pvalue",
                                            "method", "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" %
                        (x + 1, y + 1))
                    # NOTE(review): dict-style access into the R result is
                    # rpy-1 style; the pearson branch uses rx2() — verify.
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"),
                                             take=options.columns,
                                             headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    # ----- plotting ---------------------------------------------------
    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if take:
                E.warn("removing empty columns %s before plotting" %
                       str(take))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                if legend:
                    legend = [headers[x] for x in take]

        # extra_options accumulates raw R argument text appended to the
        # plotting calls below.
        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))
        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        # choose the output device from the hardcopy file extension
        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768,
                      type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                R.jpg(options.hardcopy, width=1024, height=768,
                      type="cairo")

        for method in options.plot:

            # scale point size/symbol with the number of data points
            if ndata < 100:
                point_size = "1"
                pch = "o"
            elif ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, extra_options))

            if method == "scatter-regression":
                # scatter plus fitted lm() line with confidence and
                # prediction intervals
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, extra_options))
                dat = R(
                    """dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))"""
                  )
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")"""
                  )
                R("""pred.w.clim <- predict(mod, new, interval="confidence")"""
                  )
                R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")"""
                  )
                R.mtext("y = %f * x + %f, r=%6.4f, n=%i" %
                        (mod["coefficients"]["x"],
                         mod["coefficients"]["(Intercept)"],
                         R("""cor( dat )[2]"""), ndata),
                        3,
                        cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R("""panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }"""
                      )
                else:
                    R("""panel.hist <- function( x,y,... ) { points(x,y,...); }"""
                      )

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                extra_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "",
                                           extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % extra_options)

            elif method == "bar" or method == "bar-stacked":
                if not options.colours:
                    extra_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "",
                                           extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), %s)""" % extra_options)

            elif method == "bar-besides":
                if not options.colours:
                    extra_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "",
                                           extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R(""" x <- matrix[,1]; y <- matrix[,2]; xhist <- hist(x, breaks=20, plot=FALSE); yhist <- hist(y, breaks=20, plot=FALSE); top <-
max(c(xhist$counts, yhist$counts)); nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE ); par(mar=c(3,3,1,1)) ; plot(x, y, cex=%s, pch="o" %s) ; par(mar=c(0,3,1,1)) ; barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ; par(mar=c(3,0,1,1)) ; title(main='%s'); barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ; title(main='%s'); """ %
                  (point_size, extra_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                # build the list of column pairs to plot
                if method == "panel":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    pairs = []
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                # print matching columns
                elif method == "matched":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                # grid dimensions for the panel layout
                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                PosInf = 1e300000
                NegInf = -1e300000

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))

                for a, b in pairs:
                    # filter out nan/inf pairs
                    # NOTE(review): new_matrix is built but never used below.
                    new_matrix = [
                        x for x in zip(
                            list(matrix[a].values())[0],
                            list(matrix[b].values())[0])
                        if x[0] not in (float("nan"), PosInf, NegInf) and
                        x[1] not in (float("nan"), PosInf, NegInf)
                    ]
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" %
                          (a + 1, b + 1, headers[b], headers[a], xlabel,
                           ylabel))
                    # NOTE(review): rpy.RException is the old rpy-1 API —
                    # confirm against the runtime environment.
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" %
                              (headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin."
            )

    E.Stop()
def main(argv=None):
    """Entry point for r_table2scatter: read a table and compute statistics
    and/or plots via the embedded R session.

    Parses command line options in sys.argv, unless *argv* is given.
    Statistics (``--stats``) are written to stdout; plots (``--plot``) are
    rendered to an R device (hardcopy file or interactive screen).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")
    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%Default].")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")
    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")
    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")
    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")
    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot",
                               "scatter+marginal", "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")
    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")
    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")
    parser.add_option("-d", "--add-diagonal", dest="add_diagonal", action="store_true",
                      help="add diagonal to plot [default=%default].")
    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")
    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")
    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")
    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")
    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")
    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")
    parser.add_option("--allow-empty-file", dest="fail_on_empty", action="store_false",
                      help="do not fail on empty input [default=%default].")
    parser.add_option("--fail-on-empty", dest="fail_on_empty", action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        # user supplies 1-based column numbers; convert to 0-based
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    # colour/legend column numbers are 1-based on the command line
    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = IOTools.openFile(options.input_filename, "r").readlines()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if x[0] != "#"]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    # readTable assigns the data to the R symbol "matrix" as a side effect
    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format.
        # NOTE(review): this re-parses `lines` from the first file, not
        # options.input_filename2 - looks like a latent bug; confirm intent.
        matrix2, headers2, colours2, legend2 = readTable(
            lines,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    # number of rows in the R-side matrix
    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    # --------------------------------------------------------------
    # statistics: each requested method writes its results to stdout
    # --------------------------------------------------------------
    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff", "passed",
                                            "pvalue", "n",
                                            "method", "alternative")) + "\n")
            # pairwise tests over the upper triangle of column pairs
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % (
                            x, headers[x], y, headers[y], msg))
                        options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                                             (headers[x], headers[y],
                                              "na", "na", "na", 0, "na", "na"))
                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2('cor')[0],
                             Stats.getSignificance(float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2('df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff", "passed",
                                            "pvalue", "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1))
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"),
                                             take=options.columns,
                                             headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    # --------------------------------------------------------------
    # plotting: options are accumulated as an R argument string in
    # `extra_options` and spliced into R commands below
    # --------------------------------------------------------------
    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if take:
                E.warn("removing empty columns %s before plotting" % str(take))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                if legend:
                    legend = [headers[x] for x in take]

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        # open the hardcopy device; it is closed after all plots are drawn
        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768,
                      type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                R.jpg(options.hardcopy, width=1024, height=768,
                      type="cairo")

        for method in options.plot:

            # smaller glyphs for larger data sets
            if ndata < 100:
                point_size = "1"
                pch = "o"
            elif ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))

            if method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))
                # linear fit plus prediction/confidence bands
                dat = R(
                    """dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R(
                    """new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R(
                    """matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"],
                                                        mod["coefficients"][
                                                            "(Intercept)"],
                                                        R("""cor( dat )[2]"""),
                                                        ndata),
                    3,
                    cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R(
                        """panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""")
                else:
                    R(
                        """panel.hist <- function( x,y,... ) { points(x,y,...); }""")
                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                extra_options += ",main='%s'" % options.title
                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % extra_options)

            elif method == "bar" or method == "bar-stacked":
                if not options.colours:
                    extra_options += ", col=rainbow(5)"
                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), %s)""" % extra_options)

            elif method == "bar-besides":
                if not options.colours:
                    extra_options += ", col=rainbow(%i)" % ndata
                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R("""op <- par(mar=c(11,4,4,2))"""
                      )  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" % extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
                x <- matrix[,1];
                y <- matrix[,2];
                xhist <- hist(x, breaks=20, plot=FALSE);
                yhist <- hist(y, breaks=20, plot=FALSE);
                top <- max(c(xhist$counts, yhist$counts));
                nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
                par(mar=c(3,3,1,1)) ;
                plot(x, y, cex=%s, pch="o" %s) ;
                par(mar=c(0,3,1,1)) ;
                barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
                par(mar=c(3,0,1,1)) ;
                title(main='%s');
                barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
                title(main='%s');
                """ % (point_size, extra_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                if method == "panel":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    pairs = []
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                # print matching columns
                elif method == "matched":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                # arrange panels on a near-square grid
                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                PosInf = 1e300000
                NegInf = -1e300000

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" % (
                    w * h, w, h))

                for a, b in pairs:
                    # NOTE(review): new_matrix is built but never used;
                    # the nan-comparison filter also cannot match since
                    # nan != nan - presumably dead code.
                    new_matrix = [x for x in zip(
                        list(matrix[a].values())[0],
                        list(matrix[b].values())[0])
                        if x[0] not in (float("nan"), PosInf, NegInf) and
                        x[1] not in (float("nan"), PosInf, NegInf)]
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a], xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" % (
                            headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin.")

    E.Stop()
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import r
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import psycopg2

# Fetch J and Ks magnitudes of the Sheikhi sample from the local database.
# Bug fix: close the connection when done instead of leaking it.
con = psycopg2.connect("dbname='stars' user='******' host='localhost'")
try:
    cur = con.cursor()
    cur.execute("select jmag, ksmag from sheikhi")
    df = pd.DataFrame(cur.fetchall(), columns=['j', 'ks'])
finally:
    con.close()

# Colour-magnitude diagram: J-Ks colour on x, J magnitude on y.
# The y limits are reversed (max first) so bright stars plot at the top,
# as is conventional for CMDs.
color = df['j'] - df['ks']
r.png("isochroneofsheikhidata.png")
r.plot(color, df['j'],
       xlim=ro.FloatVector([min(color), max(color)]),
       ylim=ro.FloatVector([max(df['j']), min(df['j'])]),
       xlab='color [J-Ks]', ylab='Jmag',
       main='cmd from Sheikhi data')
# Bug fix: close the PNG device so the plot is actually flushed to disk;
# previously the file could be left empty/truncated.
r('dev.off()')

# keep the process alive until the user presses enter (Python 2 script)
v = raw_input()
map_category2value = {} if options.filename_input_map: map_category2value = IOTools.ReadMap( open(options.filename_input_map, "r"), map_functions=(str,float)) values1, errors1 = IOTools.ReadList( open(options.filename_input1, "r"), map_category=map_category2value ) values2, errors2 = IOTools.ReadList( open(options.filename_input2, "r"), map_category=map_category2value ) E.info( "ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1), len(values2), len(errors2)) ) if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test( values1, values2 ) elif options.method == "mwu": result = R.wilcox_test( values1, values2, paired=False) R.assign("v1", values1) R.assign("v2", values2) R.layout(R.matrix((1,2,3,4), 2, 2, byrow = True)) R.boxplot( values1, values2, col=('white','red'), main="Boxplot" ) R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")
def plotGeneLevelReadExtension(infile, outfile):
    '''plot reads extending beyond last exon.

    For every per-gene read-extension table matching ``<infile>.*.tsv.gz``,
    render two images into <exportdir>/utr_extension - a heatmap of raw
    log10 read counts and one of row-scaled counts, each overlaid with the
    annotated UTR-length profile - then touch *outfile*.  All data wrangling
    happens inside the shared global R session.
    '''

    infiles = glob.glob(infile + ".*.tsv.gz")
    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # the bin size , see gtf2table - could be cleaned from column names
    binsize = 100
    # not referenced below in this function (used by buildUTRExtension)
    territory_size = 15000

    for filename in infiles:
        E.info("processing %s" % filename)
        # filename components are reused to name the output images
        parts = os.path.basename(filename).split(".")

        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals())

        ##########################################
        # estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale data
        # and export
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )''')
        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''')
        exons = R('''lraw[,1]''')

        if len(utrs) == 0:
            E.warn("no data for %s" % filename)
            continue

        #######################################################
        # heatmap of genes (rows sorted by UTR length) with the
        # UTR-length step curve plotted on top
        #######################################################
        R('''myplot = function( reads, utrs, ... ) {
           oreads = t(data.matrix( reads )[order(utrs), ] )
           outrs = utrs[order(utrs)]
           image( 1:nrow(oreads), 1:ncol(oreads), oreads ,
                  xlab = "", ylab = "",
                  col=brewer.pal(9,"Greens"),
                  axes=FALSE)
           # axis(BELOW<-1, at=1:nrow(oreads), labels=rownames(oreads), cex.axis=0.7)
           par(new=TRUE)
           plot( outrs, 1:length(outrs), yaxs="i", xaxs="i",
                 ylab="genes", xlab="len(utr) / bp",
                 type="S",
                 xlim=c(0,nrow(oreads)*%(binsize)i)) }''' % locals())

        fn = ".".join((parts[0], parts[4], "raw", "png"))
        outfilename = os.path.join(outdir, fn)

        R.png(outfilename, height=2000, width=1000)
        R('''myplot( lraw, utrs )''')
        R['dev.off']()

        # plot scaled data
        fn = ".".join((parts[0], parts[4], "scaled", "png"))
        outfilename = os.path.join(outdir, fn)

        R.png(outfilename, height=2000, width=1000)
        R('''myplot( lscaled, utrs )''')
        R['dev.off']()

    P.touch(outfile)
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option( "-m", "--method", dest="method", type="string", help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]") parser.add_option( "-a", "--hardcopy", dest="hardcopy", type="string", help="write hardcopy to file.", metavar = "FILE" ) parser.add_option( "-1", "--infile1", dest="filename_input1", type="string" , help="input filename for distribution 1.") parser.add_option( "-2", "--infile2", dest="filename_input2", type="string" , help="input filename for distribution 2.") parser.add_option( "-p", "--infile-map", dest="filename_input_map", type="string" , help="input filename for mapping categories to values.") parser.set_defaults( method = "ks", filename_input1 = None, filename_input2 = None, filename_input_map = None, ) (options, args) = E.Start( parser, add_pipe_options = True, add_psql_options = True,) map_category2value = {} if options.filename_input_map: map_category2value = IOTools.ReadMap( open(options.filename_input_map, "r"), map_functions=(str,float)) values1, errors1 = IOTools.ReadList( open(options.filename_input1, "r"), map_category=map_category2value ) values2, errors2 = IOTools.ReadList( open(options.filename_input2, "r"), map_category=map_category2value ) E.info( "ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1), len(values2), len(errors2)) ) if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test( values1, values2 ) elif options.method == "mwu": result = R.wilcox_test( values1, values2, paired=False) R.assign("v1", values1) R.assign("v2", values2) R.layout(R.matrix((1,2,3,4), 2, 2, byrow = True)) R.boxplot( values1, values2, col=('white','red'), main="Boxplot" ) R("""qqplot( v1, v2, main 
='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""") R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""") R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""") R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") print "## Results for %s" % result['method'] for x in ['p.value', 'statistic', 'alternative', 'method']: print x, result[x] E.Stop()
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM to reads upstream
    and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems:

    * the size of the extension is limited by the window size
    * introns within UTRs are ignored.
    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is a
      good amount of reads still present in the UTR.

    The model is a three-state chain (UTR -> notUTR -> otherTranscript,
    with self-loops) that starts in UTR and ends in notUTR or
    otherTranscript.  The otherTranscript state models peaks within the
    upstream/downstream region of a gene (additional exons or unknown
    transcripts); without it, the UTR might be artificially extended to
    include those peaks.

    Emissions are modelled with beta distributions, permitting both
    bimodal (UTR) and unimodal (notUTR) count distributions.  Emission
    parameters are estimated from known UTRs within full-length
    territories; transitions and emissions for otherTranscript are set
    heuristically (low probability of remaining in the state, emissions
    biased towards high counts).
    '''

    # the bin size , see gtf2table - can be cleaned from column names
    # or better set as options in .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = x['contig'], x['strand'], int(
            x['start']), int(x['end'])
        geneinfos[x['gene_id']] = (contig, strand, start, end)

    infiles = [
        infile + ".readextension_upstream_sense.tsv.gz",
        infile + ".readextension_downstream_sense.tsv.gz"
    ]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)
        parts = os.path.basename(filename).split(".")

        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals())

        ##########################################
        ## estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale data and export
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )''')
        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''')
        exons = R('''lraw[,1]''')

        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        transitions = numpy.zeros((3, 3), numpy.int)

        for x in xrange(len(utrs)):
            utr, exon = utrs[x], exons[x]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1:
                continue

            # first row is column names, so x + 1
            values = list(scaled.rx(x + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([x for x in values[utr_bins:] if x <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([x for x in values[1:utr_bins] if x > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([x for x in values[utr_bins:] if x > 0.5])

        # estimation for
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05
        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info( "counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" % \
                    ( len(within_utr), numpy.mean(within_utr),
                      len(outside_utr), numpy.mean(outside_utr),
                      len(otherTranscript), numpy.mean(otherTranscript)) )

        # export estimation inputs to the R session; emission samples
        # are capped at 10000 values to bound fitdistr runtime
        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''')

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''')
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''')
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''')

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info( "beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" % \
                    (within_a, within_b, outside_a, outside_b, other_a, other_b))

        # diagnostic plot: histogram of each emission sample overlaid
        # with its fitted beta density
        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')

        R['dev.off']()

        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'],
                                         fit_outside_utr$estimate['shape1'],
                                         fit_otherTranscript$estimate['shape1']),
                                shape2=c(fit_within_utr$estimate['shape2'],
                                         fit_outside_utr$estimate['shape2'],
                                         fit_otherTranscript$estimate['shape2'])) ''')
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''')

        E.info("fitting starts")
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')
        nseqs = len(utrs)

        counter = E.Counter()

        for idx in xrange(len(utrs)):

            gene_id = genes[idx]
            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1))
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''')
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError, msg:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # subtract 1 for last exon: first state covers the terminal
            # exon bin, so the new UTR starts at the first notUTR state
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                # state 2 never reached: UTR extends across the full window
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png',
                         show=False, title=True, labels=None,
                         colors=['blue', 'red'], ann=None,
                         show_legend=True, q=.25, std=None):
    # Draw Kaplan-Meier survival curves for *feature* against survival data
    # *surv* (assumed pandas objects with 'days' and 'event' columns - TODO
    # confirm against callers), one panel per unique value of *assignment*,
    # and write the figure to *filename* as a PNG via the R `survival`
    # package.  Real-valued features with many levels are binned into
    # quantiles (fraction *q*) before plotting.  If *show*, returns a
    # Show(filename) handle; otherwise returns None.
    if assignment is None:
        # single panel: a constant dummy assignment covering all samples
        num_panels = 1
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name != None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)

    # many repeated values: derive labels from the sorted unique values
    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow',
                  'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    # one column of width 200px per panel plus margin space
    r.png(filename=filename, width=200 * (num_panels + 1), height=300,
          res=75)

    fmla = robjects.Formula('Surv(days, event) ~ feature')
    r.par(mfrow=r.c(1, num_panels))
    r.par(mar=r.c(4, 5, 4, 1))
    r.par(xpd=True)

    # continuous feature: it will be quantile-binned in plot_me, so fix
    # the colours/labels to the bottom/normal/top grouping
    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        colors = ['blue', 'orange', 'red']
        if q == .5:
            labels = ['Bottom 50%', 'Top 50%']
        else:
            labels = ['Bottom {}%'.format(int(q * 100)), 'Normal',
                      'Top {}%'.format(int(q * 100))]
    ls = r.c(*colors)

    def plot_me(sub_f, label):
        # Plot one survival panel for the sub-feature *sub_f*.
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)

        m = get_cox_ph(surv, sub_f, formula=fmla)
        # third element of the R call object holds the model data frame
        r_data = m.rx2('call')[2]
        p = log_rank(sub_f, surv)['p']
        ls = r.c(*colors)

        r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4,
               cex=1.25, xlab='Years to Event', ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann != None:
            r.text(0, labels=ann, pos=4)

    if show_legend == 'out':
        # widen the right margin to make room for an outside legend
        r.par(xpd=True, mar=r.c(4, 5, 5, 8))

    # one panel per assignment value; skip constant sub-features
    for value in sorted(assignment.ix[feature.index].dropna().unique()):
        f = feature.ix[assignment[assignment == value].index]
        if len(f.unique()) > 1:
            plot_me(f, name(value))

    # NOTE(review): `value` below refers to the last loop value; this
    # raises NameError if the loop above never ran - confirm callers
    # always pass non-empty assignments.
    if show_legend == True:
        # place the legend where it is least likely to cover the curves
        mean_s = surv.ix[:, 'event'].ix[assignment[
            assignment == value].index].mean()
        if mean_s < .5:
            r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                     lty=1, col=ls, lwd=3, bty='o')
        else:
            r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    elif show_legend == 'out':
        r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                 lty=1, col=ls, lwd=3, bty='o')
    r('dev.off()')
    if show:
        return Show(filename)
def plotGeneLevelReadExtension(infile, outfile):
    '''plot reads extending beyond last exon.

    For every ``<infile>.*.tsv.gz`` read-extension table, draws two
    heatmap PNGs (raw log counts and row-scaled counts) of read density
    ordered by annotated UTR length, into <exportdir>/utr_extension.
    Touches *outfile* on completion.
    '''
    infiles = glob.glob(infile + ".*.tsv.gz")
    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # the bin size , see gtf2table - could be cleaned from column names
    binsize = 100
    # NOTE(review): territory_size is unused in this function -- it is
    # used by the sibling buildUTRExtension; confirm before removing.
    territory_size = 15000

    for filename in infiles:
        E.info("processing %s" % filename)
        parts = os.path.basename(filename).split(".")

        # load the per-gene count matrix into the R workspace as 'data'
        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals() )

        ##########################################
        # estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''' )
        # remove length and utr column
        R('''d = d[-c(1,2)]''')

        # remove those which are completely empty, logtransform or scale data
        # and export
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )''')

        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''' )
        # rows scaled to their own maximum so genes are comparable
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''' )
        exons = R('''lraw[,1]''')

        if len(utrs) == 0:
            E.warn("no data for %s" % filename)
            continue

        #######################################################
        # myplot: heatmap of reads ordered by UTR length with the
        # UTR-length step function overlaid on the same axes.
        #######################################################
        R('''myplot = function( reads, utrs, ... ) {
           oreads = t(data.matrix( reads )[order(utrs), ] )
           outrs = utrs[order(utrs)]
           image( 1:nrow(oreads), 1:ncol(oreads), oreads ,
                  xlab = "", ylab = "",
                  col=brewer.pal(9,"Greens"),
                  axes=FALSE)
           # axis(BELOW<-1, at=1:nrow(oreads), labels=rownames(oreads), cex.axis=0.7)
           par(new=TRUE)
           plot( outrs, 1:length(outrs), yaxs="i", xaxs="i",
                 ylab="genes", xlab="len(utr) / bp",
                 type="S",
                 xlim=c(0,nrow(oreads)*%(binsize)i)) }''' % locals())

        # NOTE(review): parts[4] assumes filenames with at least five
        # dot-separated components (e.g. <track>.readextension_..._sense.tsv.gz)
        # -- confirm against buildGeneLevelReadExtension output.
        fn = ".".join((parts[0], parts[4], "raw", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=2000, width=1000)
        R('''myplot( lraw, utrs )''' )
        R['dev.off']()

        # plot scaled data
        fn = ".".join((parts[0], parts[4], "scaled", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=2000, width=1000)
        R('''myplot( lscaled, utrs )''' )
        R['dev.off']()

    P.touch(outfile)
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Creates also diagnostic plots in
    <exportdir>/<method> directory.

    Arguments:
    tables -- base names of per-design result tables in the database
    method -- differential-expression method (suffix of the table names)
    outfile -- path of the tab-separated summary written
    outdir -- directory receiving the diagnostic png files
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # Parse "<design>_vs_<geneset>_<counting>_<method>" or, failing
        # that, "<design>_<geneset>_<method>" (counting method "na").
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        try:
            design, geneset, counting_method = re.match(
                "([^_]+)_vs_([^_]+)_(.*)_%s" % method,
                tablename).groups()
        except AttributeError:
            try:
                design, geneset = re.match(
                    "([^_]+)_([^_]+)_%s" % method,
                    tablename).groups()
                counting_method = "na"
            except AttributeError:
                raise ValueError("can't parse tablename %s" % tablename)
        return design, geneset, counting_method

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "treatment_name",
         "counting_method",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    all_tables = set(Database.getTables(dbhandle))

    for level in CUFFDIFF_LEVELS:
        for tablename in tables:
            tablename_diff = "%s_%s_diff" % (tablename, level)
            # BUG FIX: the levels table name was built with the "_diff"
            # suffix (copy/paste); use "_levels" as buildCuffdiffPlots does.
            tablename_levels = "%s_%s_levels" % (tablename, level)
            design, geneset, counting_method = _split(tablename_diff)
            if tablename_diff not in all_tables:
                continue

            def toDict(vals, l=2):
                # map the first *l* columns (as a tuple) to column l
                return collections.defaultdict(
                    int,
                    [(tuple(x[:l]), x[l]) for x in vals])

            tested = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())

            status = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, status, "
                "COUNT(*) FROM %(tablename_diff)s "
                "GROUP BY treatment_name,control_name,status"
                % locals()).fetchall(), 3)

            signif = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "WHERE significant "
                "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())

            fold2 = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                "GROUP BY treatment_name,control_name,significant"
                % locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                # BUG FIX: column order now matches the header written
                # above (treatment_name before counting_method; the old
                # code wrote counting_method into the treatment column).
                outf.write("\t".join(map(str, (
                    design, geneset, level,
                    treatment_name, counting_method, control_name,
                    tested[(treatment_name, control_name)],
                    "\t".join(
                        [str(status[(treatment_name, control_name, x)])
                         for x in keys_status]),
                    signif[(treatment_name, control_name)],
                    fold2[(treatment_name, control_name)]))) + "\n")

            ###########################################
            # plot length versus P-Value
            ###########################################
            data = Database.executewait(
                dbhandle,
                "SELECT i.sum, pvalue "
                "FROM %(tablename_diff)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter fails
            if len(data) > 10:
                data = zip(*data)
                pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals()
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x", pch=20, cex=.1)
                R['dev.off']()

    outf.close()
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTr or otherTranscript.

    The otherTranscript state models peaks of within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distribution of counts.

    Parameter estimation

    Parameters are derived from known UTRs within full length
    territories.

    Transitions and emissions for the otherTranscript state
    are set heuristically:

       * low probability for remaining in state "otherTranscript".
           * these transcripts should be short.

       * emissions biased towards high counts - only strong signals
           will be considered.

       * these could be estimated from known UTRs, but I am worried
           UTR extensions then will be diluted.

    Alternatives

    The method could be improved.

        * base level resolution?
            * longer chains result in more data and longer running times.
            * the averaging in windows smoothes the data, which might have
              a beneficial effect.

        * raw counts instead of scaled counts?
            * better model, as highly expressed genes should give more
              confident predictions.

    '''
    # the bin size , see gtf2table - can be cleaned from column names
    # or better set as options in .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = x['contig'], x[
            'strand'], int(x['start']), int(x['end'])
        geneinfos[x['gene_id']] = (contig, strand, start, end)

    infiles = [infile + ".readextension_upstream_sense.tsv.gz",
               infile + ".readextension_downstream_sense.tsv.gz"]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}
    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)
        parts = os.path.basename(filename).split(".")

        # load the per-gene window-count matrix into R as 'data'
        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals() )

        ##########################################
        # estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''' )
        # remove length and utr column
        R('''d = d[-c(1,2)]''')

        # remove those which are completely empty, logtransform or scale data
        # and export
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )''')

        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''' )
        # rows scaled to their own maximum so genes are comparable
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''' )
        # first column of the log-raw matrix = terminal exon signal
        exons = R('''lraw[,1]''')

        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        # NOTE(review): numpy.int is deprecated in recent numpy releases
        transitions = numpy.zeros((3, 3), numpy.int)

        for x in xrange(len(utrs)):
            utr, exon = utrs[x], exons[x]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1:
                continue

            # first row is column names, so x + 1
            values = list(scaled.rx(x + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([x for x in values[utr_bins:] if x <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([x for x in values[1:utr_bins] if x > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([x for x in values[utr_bins:] if x > 0.5])

        # estimation for
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05
        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info("counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" %
               (len(within_utr), numpy.mean(within_utr),
                len(outside_utr), numpy.mean(outside_utr),
                len(otherTranscript), numpy.mean(otherTranscript)))

        # push transition counts and (capped) emission samples into R
        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        # (values are nudged off 0/1 because dbeta is undefined there)
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''' )

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''' )
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''' )
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''' )

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info("beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" %
               (within_a, within_b, outside_a, outside_b, other_a, other_b))

        # diagnostic plot: histograms of the three emission samples with
        # the fitted beta densities overlaid
        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)

        R.png(outfilename, height=1000, width=1000)

        R( '''par(mfrow=c(3,1))''' )
        R( '''x=seq(0,1,0.02)''')
        R( '''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''' )
        R( '''par(new=TRUE)''')
        R( '''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R( '''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''' )
        R( '''par(new=TRUE)''')
        R( '''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R( '''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''' )
        R( '''par(new=TRUE)''')
        R( '''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')

        R['dev.off']()

        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'],
                                         fit_outside_utr$estimate['shape1'],
                                         fit_otherTranscript$estimate['shape1']),
                                shape2=c(fit_within_utr$estimate['shape2'],
                                         fit_outside_utr$estimate['shape2'],
                                         fit_otherTranscript$estimate['shape2'])) ''')
        # chain starts deterministically in state 1 (UTR): delta=c(1,0,0)
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''' )

        E.info("fitting starts")

        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')
        nseqs = len(utrs)

        counter = E.Counter()

        for idx in xrange(len(utrs)):

            gene_id = genes[idx]
            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1) )
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''' )
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')

            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError, msg:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # subtract 1 for last exon
            try:
                # first transition into state 2 (notUTR) marks the UTR end
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                # chain never left the UTR state: report the maximum
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compares two value distributions with a Kolmogorov-Smirnov or
    Mann-Whitney U test (via R) and draws diagnostic plots
    (boxplot, qq-plot, relative and absolute frequency histograms).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    # BUG FIX: both input files are mandatory; fail with a clear message
    # instead of crashing on open(None) below.
    if options.filename_input1 is None or options.filename_input2 is None:
        raise ValueError(
            "please specify input files for both distributions "
            "(--infile1 and --infile2)")

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(
            open(options.filename_input_map, "r"),
            map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(
        open(options.filename_input1, "r"),
        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(
        open(options.filename_input2, "r"),
        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)
    else:
        # BUG FIX: an unknown method previously fell through and raised
        # NameError on 'result' further down.
        raise ValueError("unknown method '%s'" % options.method)

    R.assign("v1", values1)
    R.assign("v2", values2)

    # 2x2 panel layout for the four diagnostic plots
    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    # BUG FIX: close the png device so the hardcopy file is flushed
    # (same pattern as the plot() helper elsewhere in this file).
    if options.hardcopy:
        R['dev.off']()

    print("## Results for %s" % result['method'])
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print(x, result[x])

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs MEDIPS analyses (saturation, CpG coverage, CpG enrichment,
    differential methylation) on the supplied treatment/control BAM files.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    # BUG FIX: dest was "extension", but the code below (and set_defaults)
    # uses options.extend, so -e/--extend was silently ignored.
    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")
    parser.add_option("-s", "--shift", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")
    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      help="bin size of genome vector [default=%default].")
    # BUG FIX: help text was a copy/paste of --bin-size
    parser.add_option("-l", "--fragment-length", dest="fragment_length",
                      type="int",
                      help="fragment length of the sequenced library "
                      "[default=%default].")
    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")
    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")
    parser.add_option("-w", "--bigwig", dest="bigwig", action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")
    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")
    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")
    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")
    parser.add_option("--is-not-medip", dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        bin_size=50,
        window_size=300,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        is_medip=True,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    # local aliases so the R templates below can use %(...)s with locals()
    bin_size = options.bin_size
    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations
    uniq = "TRUE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            nrit = 1)''' % locals())

            R.png(E.getOutputFile("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')

            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.getOutputFile("%s_saturation_estimation.tsv" % fn))

            # BUG FIX: route through E.getOutputFile like every other
            # output of this script instead of writing into the cwd.
            outfile = IOTools.openFile(
                E.getOutputFile("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            uniq=%(uniq)s)''' % locals())

            R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w")
        # (R slot name, output column label, format)
        # NOTE(review): several formats look swapped (e.g. "%i" for the
        # relH/GoGe scores, "%s" for counts) -- confirm against the
        # MEDIPS.CpGenrich return slots before changing.
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                # BUG FIX: an empty slot used to be replaced by "" and then
                # indexed with [0], raising IndexError; emit an empty field.
                if len(value) == 0:
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % (pattern % value[0]))
            outfile.write("\n")
        outfile.close()

    if "dmr" in options.toolset or "correlation" in options.toolset \
       or do_all:
        # build four sets
        for x, fn in enumerate(options.treatment_files):
            R('''treatment_R%(x)i = MEDIPS.createSet(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s)''' % locals())
        R('''treatment_set = c(%s)''' %
          ",".join(["treatment_R%i" % x
                    for x in range(len(options.treatment_files))]))

        if options.control_files:
            for x, fn in enumerate(options.control_files):
                R('''control_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                uniq=%(uniq)s)''' % locals())
            R('''control_set = c(%s)''' %
              ",".join(["control_R%i" % x
                        for x in range(len(options.control_files))]))

        # build coupling vector
        R('''CS = MEDIPS.couplingVector(pattern="CG",
        refObj = treatment_set[[1]])''')

        if "correlation" in options.toolset or do_all:
            # NOTE(review): control_set is undefined in R when no
            # --control files were given -- confirm intended usage.
            R('''cor.matrix = MEDIPS.correlation(
            c(treatment_set, control_set))''')
            R('''write.table(cor.matrix, file='%s', sep="\t")''' %
              E.getOutputFile("correlation"))

        if "dmr" in options.toolset or do_all:
            # Data that does not fit the model causes
            # "Error in 1:max_signal_index : argument of length 0"
            # The advice is to set MeDIP=FALSE
            # See: http://comments.gmane.org/gmane.science.biology.informatics.conductor/52319
            if options.is_medip:
                medip = "TRUE"
            else:
                medip = "FALSE"

            R('''meth = MEDIPS.meth(
            MSet1 = treatment_set,
            MSet2 = control_set,
            CSet = CS,
            ISet1 = NULL,
            ISet2 = NULL,
            p.adj = "bonferroni",
            diff.method = "edgeR",
            prob.method = "poisson",
            MeDIP = %(medip)s,
            CNV = F,
            type = "rpkm",
            minRowSum = 1)''' % locals())

            # test windows for differential methylation
            R('''tested = MEDIPS.selectSig(meth,
            adj=T,
            ratio=NULL,
            p.value=0.1,
            bg.counts=NULL,
            CNV=F)''')
            # BUG FIX: 'gzFile(...)' with a missing comma before sep was
            # invalid R; the connection function is gzfile().
            R('''write.table(tested, file=gzfile('%s', 'w'),
            sep="\t", quote=F)''' % E.getOutputFile("windows"))

            # select gain and merge adjacent windows
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            R('''write.table(gain_merged, file=gzfile('%s', 'w'),
            sep="\t", row.names=FALSE, col.names=FALSE)''' %
              E.getOutputFile("gain.bed.gz"))

            # select loss and merge adjacent windows
            # BUG FIX: merged frames=gain (copy/paste); merge the loss set.
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            R('''write.table(loss_merged, file=gzfile('%s', 'w'),
            sep="\t", row.names=FALSE, col.names=FALSE)''' %
              E.getOutputFile("loss.bed.gz"))

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.Stop()