def buildBenchmarkInput(infile, outfile):
    '''build input for the benchmark run: map transcript_id to protein_id
    and substitute identifiers in the variant output.'''

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
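
# The first half of buildBenchmarkInput - dumping a SQLite query into a
# tab-separated temporary file - is a recurring pattern in this module. A
# minimal self-contained sketch using only the standard library; the query
# and header arguments are placeholders, not pipeline names:
def _example_dump_query_to_tsv(database, query, header):
    '''Run *query* against the SQLite file *database* and write the rows
    to a tab-separated temporary file, returning its name. The caller is
    responsible for unlinking the file.'''
    import sqlite3
    import tempfile
    dbhandle = sqlite3.connect(database)
    cc = dbhandle.cursor()
    cc.execute(query)
    with tempfile.NamedTemporaryFile(
            "w", suffix=".tsv", delete=False) as tmpfile:
        tmpfile.write("\t".join(header) + "\n")
        for row in cc:
            tmpfile.write("\t".join(map(str, row)) + "\n")
        return tmpfile.name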
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''given a file of interval names that are present in a fasta file,
    write a table of sequence composition (CpG content, length, etc.)
    for those intervals.'''

    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file:
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s
    | cgat fasta2table
          -s na -s cpg -s length
          --log=%(outfile)s.log
    > %(outfile)s'''

    P.run()
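
# FASTA subsetting as done above, but without the CGAT FastaIterator
# dependency; a minimal sketch with deliberately simple parsing (">"
# header lines, sequence lines in between). Titles are split on
# whitespace so that "chr1 some comment" matches the bare id "chr1":
def _example_filter_fasta(fasta_path, wanted_ids, out_path):
    '''Copy only the records of *fasta_path* whose first title word is
    in *wanted_ids* to *out_path*.'''
    keep = False
    with open(fasta_path) as inf, open(out_path, "w") as outf:
        for line in inf:
            if line.startswith(">"):
                keep = line[1:].split()[0] in wanted_ids
            if keep:
                outf.write(line)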
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into a single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         %(csv2db_options)s
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s '''
    P.run()

    os.unlink(tmpfilename)
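
# The merge idiom used above (and again in loadAlignmentStats and
# loadPicardAlignStats below): the first readable file supplies the
# header and a "track" column is prepended to every data row. A
# standalone sketch; files are assumed to be Picard-style, i.e. "#"
# comment lines followed by a header line and at least one data line:
def _example_merge_stats_tables(paths, out_path, suffix):
    '''Merge single-row Picard-style metric files into one table with a
    leading track column derived by stripping *suffix*.'''
    import os
    first = True
    with open(out_path, "w") as outf:
        for path in paths:
            track = os.path.basename(path)[:-len(suffix)]
            with open(path) as inf:
                lines = [x for x in inf
                         if not x.startswith("#") and x.strip()]
            if len(lines) < 2:
                continue
            if first:
                outf.write("track\t%s" % lines[0])
                first = False
            outf.write("%s\t%s" % (track, lines[1]))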
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    '''

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the UCSC mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(host=PARAMS["ucsc_host"],
                                          user=PARAMS["ucsc_user"],
                                          database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
            '.', strand, '.',
            CONCAT('class \\"', repClass, '\\"; family \\"',
                   repFamily, '\\";')
            FROM %(table)s
            WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    if total_repeats == 0:
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
          --method=sanitize
          --sanitize-method=genome
          --skip-missing
          --genome-file=%(genome)s
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
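
# The SELECT above builds GFF fields directly in SQL, including the
# 0-based to 1-based coordinate shift (genoStart+1) and the CONCAT'ed
# attribute string. The same row-to-GFF conversion in plain Python, for
# clarity (illustrative only, not used by the pipeline):
def _example_repeat_to_gff(contig, start0, end, strand, repclass, repfamily):
    '''Convert one 0-based rmsk row into a 1-based GFF line.'''
    attributes = 'class "%s"; family "%s";' % (repclass, repfamily)
    return "\t".join(map(str, (
        contig, "repeat", "exon", start0 + 1, end,
        ".", strand, ".", attributes)))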
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s
                '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                           --missing-value=0
                           %(filenames)s
                       | cgat csv2db
                             --header-names=%(column)s,%(header)s
                             --replace-header
                             --add-index=track
                             --table=%(tname)s
                       >> %(outfile)s
                    """
        P.run()

    os.unlink(tmpfilename)
def extractEnsemblLincRNA(infile, outfile):
    '''extract lincRNA entries from an ENSEMBL gtf file.'''
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(iotools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
    tmpf.close()
    tmpf_name = tmpf.name

    statement = ("cat %(tmpf_name)s |"
                 " cgat gtf2gtf"
                 " --method=sort --sort-order=gene"
                 " --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf_name)
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()
    tablename = P.toTable(outfile)
    cc = dbh.cursor()
    # sqlite cannot do a full outer join
    cc.execute("""DROP TABLE IF EXISTS %(tablename)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) "
        "FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" %
            locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(
            [str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)

    os.unlink(tmpf.name)
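
# The 0/1 membership matrix built above, reduced to plain Python: one
# column per named set of ids. Names are purely illustrative:
def _example_presence_table(ids, named_sets, outf):
    '''Write a tab-separated 0/1 table to the open file *outf*: one row
    per id in *ids*, one column per (title, idset) pair in *named_sets*.'''
    titles = [title for title, _ in named_sets]
    outf.write("id\t%s\n" % "\t".join(titles))
    for name in ids:
        flags = [str(int(name in idset)) for _, idset in named_sets]
        outf.write("%s\t%s\n" % (name, "\t".join(flags)))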
def loadLncRNAClass(infile, outfile):
    '''load the lncRNA classifications.'''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = iotools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (transcript[0].transcript_id,
                                     transcript[0].gene_id,
                                     transcript[0].source))
    temp.close()

    P.load(temp.name, outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
def importFromSeries(infiles, outfile):
    '''import expression levels from a GEO series.'''
    tablename = P.toTable(outfile)

    tmpf = P.getTempFile()

    infile_data, infile_map = infiles

    map_header = iotools.readMap(open(infile_map, "r"))
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    # open in text mode so lines arrive as str, not bytes
    inf = gzip.open(infile_data, "rt")
    for line in inf:
        if line.startswith("!"):
            continue
        if not line.strip():
            continue
        line = re.sub('"', "", line)
        if line.startswith("ID_REF"):
            line = "\t".join([map_header[x]
                              for x in line[:-1].split("\t")]) + "\n"
        tmpf.write(line)

    tmpf.close()
    tmpname = tmpf.name

    header = map_header["ID_REF"]

    statement = '''cgat csv2db %(csv2db_options)s
                       --add-index=%(header)s
                       --table=%(tablename)s
                   < %(tmpname)s
                   > %(outfile)s'''
    P.run()

    os.unlink(tmpname)
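
# The header-renaming step above assumes every field of the ID_REF line
# is present in the map; a slightly defensive standalone variant that
# leaves unmapped fields untouched:
def _example_remap_header(line, mapping):
    '''Return *line* with each tab-separated field replaced via
    *mapping*, falling back to the original field name.'''
    fields = line.rstrip("\n").split("\t")
    return "\t".join([mapping.get(x, x) for x in fields]) + "\n"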
def loadMissedReadCounts(infiles, outfile):
    '''load summary table of numbers of missed reads.'''

    def _getlines(inf):
        return len(iotools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")
    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track,
                                            mapped_genome,
                                            _getlines(junctions),
                                            _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
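
# loadMissedReadCounts relies on a sorting convention: after sorted(),
# the input files alternate as (junctions, transcriptome) pairs. The
# same pairing, stated explicitly:
def _example_junction_transcriptome_pairs(sorted_files):
    '''Return (junctions, transcriptome) pairs from an even-length,
    sorted list of filenames.'''
    assert len(sorted_files) % 2 == 0, "expected an even number of files"
    return list(zip(sorted_files[0::2], sorted_files[1::2]))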
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into a single table and load into
    SQLite.'''

    # Join data for all tracks into a single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    # Load into database
    P.load(outf.name, outfile, options="--add-index=track")
    os.unlink(outf.name)
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results into *tablename*.

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be
    :file:`<track>_macs_intervals`, where track is derived from
    ``infile`` and assumed to end in :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to create
      a MACS peak-shift plot and save it as :file:`<track>_model.pdf`
      in the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results of
    the filtering.
    '''

    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)
    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s;
                           mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, \
        "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    offsets = [shift // 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with iotools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = iotools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track

    statement = '''cgat csv2db %(csv2db_options)s
                       --allow-empty-file
                       --add-index=interval_id
                       --add-index=contig,start
                       --table=%(tablename)s
                   < %(tmpfilename)s
                   > %(outfile)s
                '''
    P.run()

    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track
        statement = '''
        cat %(filename_diag)s
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/"
        | cgat csv2db %(csv2db_options)s
              --map=fc:str
              --table=%(tablename)s
        >> %(outfile)s
        '''
        P.run()
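
# The filter-and-count pattern at the heart of loadMACS (and loadZinba
# below), using a standard-library Counter in place of E.Counter. Peak
# objects are assumed to expose .fdr and .pvalue, as the MACS peaks
# above do:
def _example_filter_peaks(peaks, max_qvalue, min_pvalue):
    '''Return (kept_peaks, counter) after applying the q-value and
    p-value thresholds.'''
    from collections import Counter
    counter = Counter()
    kept = []
    for peak in peaks:
        if peak.fdr > max_qvalue:
            counter["removed_qvalue"] += 1
        elif peak.pvalue < min_pvalue:
            counter["removed_pvalue"] += 1
        else:
            counter["output"] += 1
            kept.append(peak)
    return kept, counter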
def loadIntervalsFromBed(bedfile, track, outfile, bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into the database.

    Re-evaluate the intervals by counting reads within the interval. In
    contrast to the initial pipeline, the genome is not binned. In
    particular, the meaning of the columns in the table changes to:

    nProbes: number of reads in interval

    PeakCenter: position with maximum number of reads in interval

    AvgVal: average coverage within interval
    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length", "PeakCenter",
               "PeakVal", "Position", "interval_id", "nCpGs", "nGenes",
               "nPeaks", "nProbes", "nPromoters", "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist, length,
     peakcenter, peakval, position, start, interval_id,
     ncpgs, ngenes, npeaks, nprobes, npromoters) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    # open the supplied BAM files for counting; if none are given, fall
    # back to interval-derived default values below
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(iotools.openFile(bedfile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end, samfiles, offsets)

            # nprobes can be 0 if the intervals overlap only slightly
            # and, due to the binning, no reads are actually in the
            # overlap region. However, most of these intervals should
            # be small and have already been deleted via the
            # merge_min_interval_length cutoff, so do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1
                continue
        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start + (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1,
                1,
                1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes,
             npromoters, bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals" % track)

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
        --allow-empty-file
        --add-index=interval_id
        --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
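
# When no read data are available, loadIntervalsFromBed falls back to
# interval-derived defaults; the tuple ordering is easy to misread, so
# here it is spelled out (illustrative only):
def _example_default_peak_stats(start, end):
    '''Return (npeaks, peakcenter, length, avgval, peakval, nprobes)
    for an interval without read-level evidence: one synthetic peak at
    the interval midpoint.'''
    return (1, start + (end - start) // 2, end - start, 1, 1, 1)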
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results into *tablename*.

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`,
    where track is derived from ``infile`` and assumed to end in
    :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results of
    the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside the
    control. Thus this method applies a filtering step removing all
    intervals in which there is a peak of more than readlength / 2
    height in the control.
    '''

    track = P.snip(os.path.basename(infile), ".zinba")

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif iotools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift // 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak "
                   "higher than %i reads" % control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with iotools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = \
                        countPeaks(peak.contig,
                                   peak.refined_start,
                                   peak.refined_end,
                                   controlfiles,
                                   offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = \
                    countPeaks(peak.contig,
                               peak.refined_start,
                               peak.refined_end,
                               samfiles,
                               offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig,
                    peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = iotools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    cgat csv2db %(csv2db_options)s
        --allow-empty-file
        --add-index=interval_id
        --add-index=contig,start
        --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def buildJunctionsDB(infiles, outfile):
    '''build a database of all junctions.'''

    to_cluster = USECLUSTER
    outfile_junctions = outfile + ".junctions.bed.gz"
    min_anchor_length = 3
    read_length = 50

    # the task may pass a single filename; normalize to a tuple so the
    # loop below always iterates over filenames
    infiles = (infiles, )

    tmpfile = P.getTempFile(".")

    for infile in infiles:
        if infile.endswith(".bam"):
            junctions_file = P.snip(infile, ".bam") + ".junctions.bed.gz"
            columns = (0, 1, 2, 5)
        else:
            junctions_file = infile
            columns = (0, 1, 2, 3)

        if not os.path.exists(junctions_file):
            E.warn("can't find junctions file '%s'" % junctions_file)
            continue

        inf = iotools.openFile(junctions_file)
        for line in inf:
            if line.startswith("#"):
                continue
            if line.startswith("track"):
                continue
            data = line[:-1].split("\t")
            try:
                tmpfile.write("\t".join([data[x] for x in columns]) + "\n")
            except IndexError:
                raise IndexError("parsing error in line %s" % line)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    sort %(tmpfilename)s | gzip > %(outfile_junctions)s
    '''

    P.run()

    os.unlink(tmpfilename)

    E.info("building junctions database")
    statement = '''
    juncs_db %(min_anchor_length)i %(read_length)i
             <( zcat %(outfile_junctions)s )
             /dev/null /dev/null
             %(bowtie_genome_dir)s/%(genome)s.fa
    > %(outfile)s
    2> %(outfile)s.log
    '''
    P.run()

    E.info("indexing junctions database")

    prefix = P.snip(outfile, ".fa")

    # build raw index
    statement = '''
    bowtie-build -f %(outfile)s %(prefix)s >> %(outfile)s.log 2>&1
    '''
    P.run()

    # build color space index
    statement = '''
    bowtie-build -C -f %(outfile)s %(prefix)s_cs >> %(outfile)s.log 2>&1
    '''
    P.run()
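
# Column handling in buildJunctionsDB, standalone: junction files are
# reduced to four BED-like columns, skipping comment and track lines.
def _example_extract_junction_columns(lines, columns):
    '''Yield tab-joined column subsets of BED-like *lines*; *columns* is
    e.g. (0, 1, 2, 5) for BAM-derived junction beds and (0, 1, 2, 3)
    otherwise.'''
    for line in lines:
        if line.startswith("#") or line.startswith("track"):
            continue
        data = line.rstrip("\n").split("\t")
        yield "\t".join(data[x] for x in columns)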
def plotRelativeAbundanceCorrelations(infiles, outfile):
    '''plot the correlation between the estimated relative abundance of
    species and the true relative abundances - done on the shared set.'''

    # connect to database
    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()

    true_file = infiles[0]
    temp = P.getTempFile()
    temp.write("true\testimate\tlevel\n")
    for estimate_file in infiles[1:]:
        if os.path.basename(estimate_file)[
                len("metaphlan_"):] == os.path.basename(true_file):
            tablenames = [
                P.toTable(os.path.basename(true_file)),
                P.toTable(os.path.basename(estimate_file))]
            # get data for each taxonomic level
            for taxa in ["phylum", "class", "order",
                         "family", "genus", "species"]:
                statement = """SELECT a.relab, b.rel_abundance, a.level
                FROM %s as a, %s as b
                WHERE b.taxon_level == "%s"
                AND a.taxa == b.taxon""" % (
                    tablenames[0], tablenames[1], taxa)
                for data in cc.execute(statement).fetchall():
                    true, estimate, level = data[0], data[1], data[2]
                    temp.write("%f\t%f\t%s\n" % (true, estimate, level))
    temp.close()

    inf = temp.name
    levels = ["phylum", "class", "order", "family", "genus", "species"]

    R('''data <- read.csv("%s", header = T,
         stringsAsFactors = F, sep = "\t")''' % inf)
    R('''library(ggplot2)''')
    R('''data$estimate <- data$estimate/100''')
    R('''ggplot(data, aes(true, estimate, colour = level)) +
         geom_point() + geom_smooth(method = "lm")''')
    R('''ggsave("%s")''' % outfile)

    # per-level correlations
    out_cors = P.snip(outfile, ".pdf") + ".cors"
    R('''cors <- data.frame("level" = c(%s), "cor" = rep(0, %i))''' % (
        ", ".join('"%s"' % x for x in levels), len(levels)))
    for i, level in enumerate(levels):
        R('''cors[%i, 2] <- cor(data$true[data$level == "%s"],
             data$estimate[data$level == "%s"])''' % (i + 1, level, level))
    R('''write.table(cors, file = "%s", row.names = F, sep = "\t")''' %
      out_cors)

    # do the same at the low end (true abundance < 0.75) - not for
    # species. Note that the estimates were already rescaled above and
    # are not divided by 100 a second time.
    R('''ggplot(data[data$true < 0.75 & data$level != "species",],
         aes(true, estimate, colour = level)) +
         geom_point() + geom_smooth(method = "lm", se = F)''')
    outf = P.snip(outfile, ".pdf") + ".lowest.pdf"
    R('''ggsave("%s")''' % outf)

    out_cors = P.snip(outfile, ".pdf") + ".lowest.cors"
    R('''cors <- data.frame("level" = c(%s), "cor" = rep(0, %i))''' % (
        ", ".join('"%s"' % x for x in levels[:-1]), len(levels) - 1))
    for i, level in enumerate(levels[:-1]):
        R('''cors[%i, 2] <- cor(
             data$true[data$level == "%s" & data$true < 0.75],
             data$estimate[data$level == "%s" & data$true < 0.75])''' % (
            i + 1, level, level))
    R('''write.table(cors, file = "%s", row.names = F, sep = "\t")''' %
      out_cors)

    os.unlink(inf)
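
# The per-level correlation tables above are assembled in R one
# statement per level; the same result can be obtained directly in
# Python. A sketch assuming pandas is available and the TSV has the
# true/estimate/level columns written by the function above:
def _example_correlations_by_level(tsv_path, out_path, max_true=None):
    '''Write a level/cor table of Pearson correlations between true and
    estimated abundances, optionally restricted to rows with
    true < *max_true*.'''
    import pandas as pd
    data = pd.read_csv(tsv_path, sep="\t")
    if max_true is not None:
        data = data[data["true"] < max_true]
    cors = data.groupby("level").apply(
        lambda d: d["true"].corr(d["estimate"]))
    cors.rename("cor").to_csv(out_path, sep="\t", header=True)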