def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run(statement)
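
# Hedged usage sketch for the Picard wrappers in this module (the other
# Collect*Metrics/MarkDuplicates wrappers below follow the same calling
# pattern). Filenames are hypothetical; PICARD_MEMORY is assumed to be a
# module-level constant such as "12G". Note that recent Picard releases
# expect interval_list files for BAIT_INTERVALS/TARGET_INTERVALS, so the
# bed files may first need converting (e.g. with picard BedToIntervalList).
#
#   buildPicardCoverageStats("sample.bam", "sample.hsmetrics",
#                            baits="baits.interval_list",
#                            regions="targets.interval_list")
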
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile, ".fastqc")
    filename = os.path.join(PARAMS["exportdir"], "fastqc",
                            track + "*_fastqc", "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              database_url=PARAMS["database"]["url"])
    IOTools.touch_file(outfile)
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.get_temp_dir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.yml
    with IOTools.open_file(os.path.join(tempdir, "fastq_screen.conf"),
                           "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
def buildPicardGCStats(infile, outfile, genome_file):
    """run picard:CollectGcBiasMetrics

    Collect GC bias metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''
    P.run(statement)
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Arguments
    ---------
    infiles : list
        Input filename in :term:`BAM` format and genome file in
        refflat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    strand : string
        Strand specificity, passed to picard's STRAND option.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)
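
# Usage note (based on Picard's documented STRAND_SPECIFICITY values, not
# on anything in this module): `strand` should be one of NONE,
# FIRST_READ_TRANSCRIPTION_STRAND or SECOND_READ_TRANSCRIPTION_STRAND.
# A hypothetical call:
#
#   buildPicardRnaSeqMetrics(("sample.bam", "refflat.txt"),
#                            strand="NONE",
#                            outfile="sample.rnaseqmetrics")
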
def loadGO(infile, outfile, tablename):
    """import GO results into a table.

    This method concatenates all the `*.overall` result files from a
    GO analysis and uploads them into a single table.
    """
    indir = infile + ".dir"

    if not os.path.exists(indir):
        IOTools.touch_file(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from the UCSC database and save as a
    :term:`bed` formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER BY chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc = dbhandle.execute(sql)
        # do not shadow `outfile` with the file handle - the except
        # branch below needs the filename to create the empty output
        with IOTools.open_file(outfile, "w") as outf:
            for data in cc.fetchall():
                outf.write("\t".join(map(str, data)) + "\n")
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        IOTools.touch_file(outfile)
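
# A minimal sketch of how a suitable `dbhandle` might be built, assuming
# SQLAlchemy 1.x (whose Engine.execute()/fetchall() matches the calls
# above) and UCSC's public read-only MySQL server; the database name
# "hg38" is an assumption.
#
#   import sqlalchemy
#   dbhandle = sqlalchemy.create_engine(
#       "mysql://genome@genome-mysql.soe.ucsc.edu/hg38")
#   getCpGIslandsFromUCSC(dbhandle, "cpg_islands.bed")
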
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run(statement, job_memory=PICARD_MEMORY)
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the deduplicated
    .bam file, which is also indexed. Pair duplication is properly
    handled, including inter-chromosomal cases. SE data is also
    handled. The metrics contain a histogram that estimates the
    return from additional sequencing.

    Note that picard counts reads, but these are in fact alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(infile)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s.duplicate_metrics
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s.log &&
    samtools index %(outfile)s'''
    P.run(statement)
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against databases of known motifs
    using tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        IOTools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''
    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)
    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def runGOFromDatabase(outfile, outdir,
                      statement_fg, statement_bg,
                      go_file, ontology_file=None,
                      samples=1000):
    """check for GO enrichment.

    Gene lists are extracted from a database.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    statement_fg : string
        SQL statement to select genes of foreground set.
    statement_bg : string
        SQL statement to select genes in background set.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use BH FDR.
    """
    dbhandle = sqlite3.connect(PARAMS["database_name"])

    cc = dbhandle.cursor()
    fg = set([x[0] for x in cc.execute(statement_fg).fetchall()])
    bg = set([x[0] for x in cc.execute(statement_bg).fetchall()])

    if len(fg) == 0:
        IOTools.touch_file(outfile)
        return

    fg_file = os.path.join(outdir, "foreground")
    bg_file = os.path.join(outdir, "background")
    outf = open(fg_file, "w")
    outf.write("\n".join(map(str, fg)) + "\n")
    outf.close()
    outf = open(bg_file, "w")
    outf.write("\n".join(map(str, bg)) + "\n")
    outf.close()

    runGOFromFiles(outfile, outdir,
                   fg_file, bg_file,
                   go_file,
                   ontology_file=ontology_file,
                   samples=samples)
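
# Hedged usage sketch: selecting foreground/background gene sets with SQL.
# The table and column names here are hypothetical and depend on the
# pipeline's database schema.
#
#   runGOFromDatabase(
#       "go_results.tsv", "go.dir",
#       statement_fg="SELECT gene_id FROM deg WHERE qvalue < 0.05",
#       statement_bg="SELECT gene_id FROM deg",
#       go_file="gene2go.tsv")
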
def test_touch_file_updates_existing_file(self):
    with IOTools.open_file(self.filename, "w") as outf:
        outf.write("some data\n")
    created = os.stat(self.filename).st_mtime
    time.sleep(1)
    IOTools.touch_file(self.filename)
    modified = os.stat(self.filename).st_mtime
    self.assertGreater(modified, created)
    with IOTools.open_file(self.filename) as inf:
        data = inf.read()
    self.assertEqual(data, "some data\n")
def test_touch_file_creates_empty_file(self):
    self.assertFalse(os.path.exists(self.filename))
    IOTools.touch_file(self.filename)
    self.assertTrue(os.path.exists(self.filename))
    if self.filename.endswith(".gz"):
        self.assertFalse(IOTools.is_empty(self.filename))
    else:
        self.assertTrue(IOTools.is_empty(self.filename))
    with IOTools.open_file(self.filename) as inf:
        data = inf.read()
    self.assertEqual(len(data), 0)
def summarizeFastqScreen(infiles, outfiles):
    '''concatenate fastq_screen results into summary and detail tables.'''
    all_files = []
    for infile in infiles:
        all_files.extend(glob.glob(
            IOTools.snip(infile, "screen") + "*_screen.txt"))
    if len(all_files) == 0:
        E.warn("no fastq_screen results to concatenate")
        for x in outfiles:
            IOTools.touch_file(x)
        return
    df_summary, df_details = PipelineReadqc.read_fastq_screen(all_files)
    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard; the marked records are
    discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap;
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.get_temp_file(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only on the top 10% of intervals (by peakval).
    Also, only the 200 bp segment around the peak is used, not the
    complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker.

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
def buildIntronLevelReadCounts(infiles, outfile):
    '''count reads in gene models.

    Count the reads from a :term:`bam` file which overlap the
    positions of introns in a :term:`gtf` formatted transcripts file
    (see the illustration of the awk step after this function).

    Parameters
    ----------
    infiles : list of str
        infile :term:`str`
            Input filename in :term:`bam` format
        geneset :term:`str`
            Input filename in :term:`gtf` format
    outfile : str
        Output filename in :term:`tsv` format

    .. note::
        In paired-end data sets each mate will be counted. Thus
        the actual read counts are approximately twice the fragment
        counts.
    '''
    infile, exons = infiles

    job_memory = "4G"

    if "transcriptome.dir" in infile:
        IOTools.touch_file(outfile)
        return

    statement = '''
    zcat %(exons)s
    | awk -v OFS="\\t" -v FS="\\t" '{$3="exon"; print}'
    | cgat gtf2table
    --reporter=genes
    --bam-file=%(infile)s
    --counter=length
    --column-prefix="introns_"
    --counter=read-counts
    --column-prefix=""
    --counter=read-coverage
    --column-prefix=coverage_
    | gzip
    > %(outfile)s
    '''
    P.run(statement)
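
# Illustration of the awk step above on a single, hypothetical gtf line:
# the feature in column 3 is rewritten so that gtf2table treats the
# intron intervals as exons when counting reads:
#
#   chr1  source  intron  1000  2000  .  +  .  gene_id "g1"; ...
#   ->
#   chr1  source  exon    1000  2000  .  +  .  gene_id "g1"; ...
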
def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.to_table(outfile)

    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                              "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading")
        IOTools.touch_file(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write(
                "target_name\tquery_id\ttarget_id\toptimal_offset\t"
                "pvalue\tevalue\tqvalue\tOverlap\tquery_consensus\t"
                "target_consensus\torientation\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | cgat bam2bam -v 0
    --method=set-sequence
    --output-sam
    --log=%(outfile)s.bam2bam.log
    | picard %(picard_opts)s CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run(statement)
def compute_file_metrics(infile, outfile, metric, suffixes):
    """apply a tool to compute metrics on a list of files matching
    regex_pattern."""

    if suffixes is None or len(suffixes) == 0:
        E.info("No metrics computed for {}".format(outfile))
        IOTools.touch_file(outfile)
        return

    track = P.snip(infile, ".log")

    # convert regex patterns to a suffix match:
    # prepend a .*
    # append a $
    regex_pattern = " -or ".join(
        ["-regex .*{}$".format(pipes.quote(x)) for x in suffixes])

    E.debug("applying metric {} to files matching {}".format(
        metric, regex_pattern))

    if metric == "file":
        statement = '''find %(track)s.dir
        -type f
        -not -regex '.*\/report.*'
        -not -regex '.*\/_.*'
        \( %(regex_pattern)s \)
        | sort -k1,1
        > %(outfile)s'''
    else:
        statement = '''find %(track)s.dir
        -type f
        -not -regex '.*\/report.*'
        -not -regex '.*\/_.*'
        \( %(regex_pattern)s \)
        -exec %(scriptsdir)s/cgat_file_apply.sh {} %(metric)s \;
        | perl -p -e "s/ +/\\t/g"
        | sort -k1,1
        > %(outfile)s'''

    P.run(statement)
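
# Hedged usage sketch with hypothetical arguments: collect line counts for
# all tsv/bed outputs under "pipeline.dir" (infile "pipeline.log" is
# snipped to derive the directory). `metric` is passed through to
# cgat_file_apply.sh, while metric == "file" only lists matching files.
#
#   compute_file_metrics("pipeline.log", "pipeline.lines",
#                        metric="wc -l",
#                        suffixes=[".tsv", ".bed.gz"])
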
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    # create a fastq_screen config file in a temporary directory
    # using parameters from Pipeline.yml
    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_fn, "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
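
# Illustration (not part of the pipeline) of the config loop above: the
# slice i[22:] strips the 22-character prefix "fastq_screen_database_", so
# a hypothetical Pipeline.yml entry
#
#   fastq_screen_database_human: /path/to/bowtie2/hg38
#
# becomes this line in fastq_screen.conf:
#
#   DATABASE<tab>human<tab>/path/to/bowtie2/hg38
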
def plotDETagStats(infile, composition_file, outfile):
    '''plot differential expression statistics.

    Arguments
    ---------
    infile : string
        Filename with :term:`tsv` formatted list of differential
        methylation results output from :doc:`scripts/runExpression`.
    composition_file : string
        Filename with :term:`tsv` formatted data about nucleotide
        compositions of windows tested.
    outfile : string
        Output filename, used as sentinel only.
    '''
    Expression.plotDETagStats(
        infile, outfile,
        additional_file=composition_file,
        join_columns=("contig", "start", "end"),
        additional_columns=("CpG_density", "length"))
    IOTools.touch_file(outfile)
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only on the top 10% of intervals (by peakval).
    Also, only the 200 bp segment around the peak is used, not the
    complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker.
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
    > %(outfile)s.log
    '''
    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def buildDiscoverySequences(infile, outfile, npeaks, width, masker):
    '''get the peak sequences, masked or unmasked as specified in the
    ini file.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=[masker],
        halfwidth=width,
        maxsize=int(PARAMS["motifs_max_size"]),
        proportion=None,
        num_sequences=npeaks,
        order='peakval')

    if nseq == 0:
        E.warn("%s: no sequences in foreground" % outfile)
        IOTools.touch_file(outfile)
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
        for each bin of expression and fold-change.

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
            level of fdr and power.
        intervals_percent - percentage of intervals in observed data
            at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.
    '''
    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        IOTools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting unspiked counts")
    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert xedges.all() == unspiked_xedges.all()

    tmpfile = IOTools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(
        ("expression", "fold", "fdr", "counts", "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(map(
                str,
                (xedges[x], yedges[y],
                 fdr,
                 spiked_d2hist_fdr[x, y],
                 100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(map(
                str,
                (fdr, power,
                 power_counts.sum().sum(),
                 100.0 * power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
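
# Self-contained sketch of the power bookkeeping above, with toy numbers.
# Per (expression, fold-change) bin, `recovered` plays the role of
# spiked_d2hist_fdr_normed (the fraction of spike-ins called significant
# below the FDR threshold); bins at or above a power cut-off contribute
# their unspiked interval counts to the reported power. All values here
# are made up for illustration.
#
#   import numpy
#   recovered = numpy.array([[0.9, 0.2], [0.5, 0.0]])
#   unspiked_counts = numpy.array([[100, 40], [60, 10]])
#   for power in (0.25, 0.5, 0.75):
#       detectable = unspiked_counts[recovered >= power].sum()
#       print(power, detectable,
#             100.0 * detectable / unspiked_counts.sum())
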
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
        Filenames of ENSEMBL geneset in :term:`gtf` format and
        associated peptide sequences in :term:`fasta` format.
    outfile : filename
        Output in :term:`gtf` format with inferred or annotated
        pseudogenes.
    dbhandle : object
        Database handle for extracting transcript biotypes.
    '''
    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        | awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    tmpfile1 = P.get_temp_filename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''
    P.run(statement)

    if IOTools.is_empty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        IOTools.touch_file(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''
    P.run(statement)

    os.unlink(tmpfile1)

    # for each processed transcript, keep only the best-scoring peptide hit
    inf = IOTools.open_file("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)
    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([x[0] for x in cc.execute(
        """SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype LIKE '%pseudo%' OR
        gene_biotype LIKE '%pseudo%' """)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               len(new_pseudos),
               len(known_pseudos),
               len(new_pseudos.intersection(known_pseudos))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.open_file(outfile, "w")
    inf = GTF.iterator(IOTools.open_file(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against the genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
        Ignored.
    outfile : filename
        Output in :term:`gtf` format with potential NUMTs.
    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" %
               PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    tmpfile_mito = P.get_temp_filename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''
    P.run(statement)

    if IOTools.is_empty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        IOTools.touch_file(outfile)
        return

    format = ("qi", "qS", "qab", "qae",
              "ti", "tS", "tab", "tae",
              "s",
              "pi",
              "C")
    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''
    P.run(statement)

    # convert to gtf
    inf = IOTools.open_file("%s.links.gz" % outfile)
    outf = IOTools.open_file(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand,
         query_start, query_end,
         target_contig, target_strand,
         target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and to guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.
    '''
    tracks = [track]
    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format, datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.1.gz"):
        tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.gz"):
        tracks = [track]

    found_contaminants = []
    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if the sample name starts with a number, the sql table will
        # have a prepended "_"
        if re.match("^\d+.*", table):
            table = "_" + table

        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline from progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if len(found_contaminants) == 0:
        IOTools.touch_file(outfile)
        return

    # read contaminants from existing file
    with IOTools.open_file(contaminants_file, "r") as inf:
        known_contaminants = [l.split() for l in inf
                              if not l.startswith("#") and l.strip()]
        known_contaminants = {" ".join(x[:-1]): x[-1]
                              for x in known_contaminants}

    # output the full sequence of the contaminant if it is found
    # in the list of known contaminants, otherwise don't report it
    matched_contaminants = set()
    with IOTools.open_file(outfile, "w") as outf:
        for found_source, found_seq in found_contaminants:
            possible_source = found_source.split(" (")[0]
            if possible_source in known_contaminants:
                matched_contaminants.add(possible_source)

        if len(matched_contaminants) > 0:
            for match in matched_contaminants:
                outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                          known_contaminants[match]))
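
# Hedged follow-up sketch: the fasta written above is intended for
# cutadapt (>= 1.7, per the docstring), which can read adapters from a
# fasta file via its "file:" syntax; the filenames here are hypothetical.
#
#   cutadapt -a file:contaminants.fasta -o trimmed.fastq.gz input.fastq.gz
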