def runBioProspector(infiles, outfile, dbhandle):
    '''run BioProspector for motif discovery.

    BioProspector is run on only the top 10% of peaks.
    '''
    # BioProspector is currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - BioProspector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run()

    os.unlink(tmpfasta)
def permuteMATS(infile, outfiles, outdir):
    '''create directories for permutation testing.

    Creates directories for permutation testing and leaves a dummy
    init file in each directory (for timestamping).

    Only becomes active if the :term:`PARAMS` option ``permute`` is
    set to 1.

    Parameters
    ----------
    infile: string
        name and path of the design file
    outfiles: list
        list of unknown length, capturing all permutations
        retrospectively
    outdir: string
        directory to generate permutations in
    permutations: string
        :term:`PARAMS`. number of directories to be generated
    '''
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for i in range(0, PARAMS["permutations"]):
        if not os.path.exists("%s/run%i.dir" % (outdir, i)):
            os.makedirs("%s/run%i.dir" % (outdir, i))
            P.touch("%s/run%i.dir/init" % (outdir, i))
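# Illustrative note for permuteMATS above: with PARAMS["permutations"]
# set to 3, the task leaves the following layout behind (<outdir> being
# whatever the caller passed):
#   <outdir>/run0.dir/init
#   <outdir>/run1.dir/init
#   <outdir>/run2.dir/init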
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from a GO analysis and
    uploads them into a single table.
    """
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run()
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:RNASeqMetrics

    Arguments
    ---------
    infiles : list
        Input filename in :term:`BAM` format and
        genome file in refflat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat)
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()
def removeBamfiles(infiles, outfile):
    '''remove BAM files and their indices, then touch a sentinel file.'''
    for bamfile in infiles:
        bam_index = bamfile + ".bai"
        os.unlink(bamfile)
        if os.path.exists(bam_index):
            os.unlink(bam_index)
    P.touch(outfile)
def generatePSP(positives, negatives, outfile):
    '''generate a discriminative PSP file from the positives and
    negatives that can be used to do discriminative MEME.
    '''
    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        E.warn("%s: input files do not have sufficient sequences "
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    statement = '''psp-gen -pos %(positives)s
    -neg %(negatives)s %(psp_options)s
    > %(outfile)s
    '''
    P.run()
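# A hedged sketch of how the PSP file produced above might feed a
# discriminative MEME run. The option values and output directory are
# illustrative, not the pipeline's actual settings.
# statement = '''meme %(positives)s -psp %(outfile)s -oc meme.out.dir
#                -dna -revcomp -nmotifs 5'''
# P.run()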
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard.
    '''
    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
    --method=set-sequence --output-sam
    | CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the deduplicated
    .bam file. Pair duplication is properly handled, including
    inter-chromosomal cases. SE data is also handled. These stats also
    contain a histogram that estimates the return from additional
    sequencing. Note that Picard counts reads, but they are in fact
    alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(infile)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s.duplicate_metrics
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT;
    '''
    statement += '''samtools index %(outfile)s;'''
    P.run()
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics

    Collect GC bias metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''
    P.run()
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from the UCSC database and save as a
    :term:`bed` formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''
    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc.execute(sql)
        # use a separate variable for the file handle so that the
        # filename remains available to the exception handler below
        outf = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outf.write("\t".join(map(str, data)) + "\n")
        outf.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        P.touch(outfile)
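# A minimal sketch of how the ``dbhandle`` argument above might be
# obtained. Illustrative only: it assumes the MySQLdb module is
# installed and that the public UCSC server and the default database
# name below are what the pipeline is configured to use.
def connectToUCSC(host="genome-mysql.cse.ucsc.edu",
                  user="genome",
                  database="hg19"):
    '''connect to the public UCSC MySQL server and return a handle.'''
    import MySQLdb
    dbhandle = MySQLdb.connect(host=host, user=user, db=database)
    return dbhandle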
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using
    tomtom.'''
    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''
    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+',
                             PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))
    job_memory = "8G"

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
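# Illustrative sketch, not part of the pipeline: how the DATABASE lines
# written by runFastqScreen are derived from PARAMS. The keys and index
# paths below are hypothetical.
def _example_fastq_screen_config():
    PARAMS = {
        "fastq_screen_database_human": "/path/to/human_bowtie2_index",
        "fastq_screen_database_mouse": "/path/to/mouse_bowtie2_index",
    }
    lines = []
    for key, path in PARAMS.items():
        if key.startswith("fastq_screen_database"):
            # key[22:] strips the "fastq_screen_database_" prefix,
            # leaving the database label used by fastq_screen
            lines.append("DATABASE\t%s\t%s" % (key[22:], path))
    # returns e.g. ["DATABASE\thuman\t/path/to/human_bowtie2_index", ...]
    return lines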
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectMultipleMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run()
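# Note (assumption, not stated in the source): Picard's CollectHsMetrics
# expects interval_list files for BAIT_INTERVALS/TARGET_INTERVALS, so if
# the baits and regions start life as bed files they would typically be
# converted upstream, e.g. with Picard's BedToIntervalList (paths here
# are hypothetical):
#   picard BedToIntervalList I=baits.bed O=baits.interval_list
#          SD=reference.dict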
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CalculateHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CalculateHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run()
def calculateM3DSpikeClustersPvalue(infiles, outfile):
    '''compute p-values for M3D spike-in clusters.'''
    job_options = "-l mem_free=4G -pe dedicated 1"
    design = infiles[-1]
    infiles = infiles[:-1]
    RRBS.calculateM3DSpikepvalue(infiles, outfile, design,
                                 submit=True, job_options=job_options)
    P.touch(outfile)
def makeSummaryPlots(infile, outfile):
    '''generate summary plots for the input file.'''
    job_options = "-l mem_free=48G"
    RRBS.summaryPlots(infile, outfile, submit=True,
                      job_options=job_options)
    P.touch(outfile)
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics

    Collect GC bias metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''
    P.run()
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def mergeMeanTables(infiles, outfile):
    '''
    Collate and merge all separate tables into a single
    large table for all MZ and DZ twins
    '''
    job_memory = "300G"

    panel = outfile.split("/")[-1].split("-")[1]
    cell_type = outfile.split("/")[-1].split("mean_")[-1]
    cell_type = P.snip(cell_type, ".tsv")
    table_name = "_".join([cell_type, "mean"])
    out_dir = "/".join(outfile.split("/")[:-1])
    twin_id = "twin.id"

    statement = '''
    python /ifs/devel/projects/proj052/flow_pipeline/scripts/flow2twins.py
    --task=merge_flow
    --twin-id-column=%(twin_id)s
    --demographics-file=%(twins_demographics)s
    --demo-id-column=%(twins_demo_header)s
    --database=%(database)s
    --tablename=%(table_name)s
    --filter-gates="(F|S)SC-(A|H)"
    --filter-zero-arrays
    --log=%(outfile)s.log
    --output-directory=%(out_dir)s
    --output-file-pattern=%(table_name)s
    '''
    P.run()
    P.touch(outfile)
def splitFiles(infile, outfile):
    """
    Arbitrarily split files into chunks for parallelisation
    """
    Timeseries.splitFiles(infile=infile,
                          nchunks=PARAMS["resampling_chunks"],
                          out_dir="parallel_files.dir")
    P.touch(outfile)
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from the UCSC database and save as a
    :term:`bed` formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc = dbhandle.execute(sql)
        # use a separate variable for the file handle so that the
        # filename remains available to the exception handler below
        outf = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outf.write("\t".join(map(str, data)) + "\n")
        outf.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        P.touch(outfile)
def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run FIMO to look for occurrences of motifs supplied in a
    sequence database.

    :param:`motifs` is the path to a MEME formatted motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo.
    :param:`exportdir` specifies the directory to put exported files
    (html, gff).
    :param:`options` is a dictionary: {'option': 'value'} will be
    passed as --option=value and will overwrite options specified in
    PARAMS.
    '''
    # if the motifs file is empty, fimo will return an error;
    # this isn't very useful behaviour.
    inlines = IOTools.openFile(motifs).read()

    if not re.search("MOTIF", inlines):
        E.warn("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        fimo_options = re.sub("%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.getTempFilename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)
    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    statement = '''
    fimo --oc %(tmpout)s %(fimo_options)s %(motifs)s %(database)s
    &> %(logfile)s;
    checkpoint;
    mv %(tmpout)s/fimo.txt %(outfile)s;
    checkpoint;
    mv %(tmpout)s/fimo.xml %(xmlout)s;
    checkpoint;
    mv %(tmpout)s/fimo.gff %(gffout)s;
    checkpoint;
    mv %(tmpout)s/fimo.html %(htmlout)s;
    checkpoint;
    rm -r %(tmpout)s
    '''
    P.run()
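# A hedged usage sketch for runFIMO above; the file names are
# hypothetical placeholders and the options shown are standard FIMO
# flags, not values taken from the pipeline configuration.
# runFIMO("meme.dir/track.meme",        # MEME formatted motif file
#         "fasta.dir/peaks.fasta",      # sequence database
#         "fimo.dir/track.fimo.txt",    # text output
#         "export/fimo",                # export directory for html/gff
#         options={"qv-thresh": None,   # passed as --qv-thresh
#                  "thresh": "0.01"})   # passed as --thresh=0.01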
def joint_index_dexseq(infile, outfile):
    '''add a joint index on (groupID, featureID) to the
    dexseq_results table.'''
    db = connect()
    db.executescript('''
    DROP INDEX IF EXISTS dexseq_results_joint;
    CREATE INDEX dexseq_results_joint
    ON dexseq_results(groupID, featureID);''')
    P.touch(outfile)
def subsetSequenceData(infile, outfile):
    """subset fastq files"""
    ignore_pipe_errors = True
    ignore_errors = True

    m = PipelineMapping.SubsetHead(limit=PARAMS["sample_size"])
    statement = m.build((infile,), outfile)
    P.run()
    P.touch(outfile)
def reMergeBamfiles(infiles, sentinel):
    '''merge BAM files, excluding any libraries flagged for removal.'''
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)
def runGOFromDatabase(outfile, outdir,
                      statement_fg,
                      statement_bg,
                      go_file,
                      ontology_file=None,
                      samples=1000):
    """check for GO enrichment.

    Gene lists are extracted from a database.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    statement_fg : string
        SQL statement to select genes of foreground set.
    statement_bg : string
        SQL statement to select genes in background set.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use BH FDR.
    """
    dbhandle = sqlite3.connect(PARAMS["database_name"])

    cc = dbhandle.cursor()
    fg = set([x[0] for x in cc.execute(statement_fg).fetchall()])
    bg = set([x[0] for x in cc.execute(statement_bg).fetchall()])

    if len(fg) == 0:
        P.touch(outfile)
        return

    fg_file = os.path.join(outdir, "foreground")
    bg_file = os.path.join(outdir, "background")
    outf = open(fg_file, "w")
    outf.write("\n".join(map(str, fg)) + "\n")
    outf.close()
    outf = open(bg_file, "w")
    outf.write("\n".join(map(str, bg)) + "\n")
    outf.close()

    runGOFromFiles(outfile, outdir,
                   fg_file, bg_file,
                   go_file,
                   ontology_file=ontology_file,
                   samples=samples)
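# Illustrative only: the kind of SQL statements runGOFromDatabase
# expects. Table and column names here are hypothetical and depend on
# the pipeline's database schema.
# statement_fg = """SELECT gene_id FROM deseq_results
#                   WHERE significant = 1"""
# statement_bg = """SELECT gene_id FROM deseq_results"""
# runGOFromDatabase("go.dir/up.go", "go.dir",
#                   statement_fg, statement_bg,
#                   go_file="go_assignments.tsv.gz",
#                   ontology_file="go_ontology.obo",
#                   samples=1000)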
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)
    P.touch(sentinel)
def callPeaksOnIndividualReplicates(infile, outfile):
    '''call peaks on individual replicates using the configured peak
    caller.'''
    infile = P.snip(infile, ".sentinel") + ".bam"
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])
    # call peaks
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER)
    P.touch(outfile)
def genReplicateData(infile, outfile):
    """
    Split each replicate into a separate file for clustering
    within each replicate. Relies on each replicate being the
    same across the whole time series.
    """
    outdir = outfile.split("/")[0]
    Timeseries.splitReplicates(infile=infile,
                               axis="column",
                               group_var="replicates",
                               outdir=outdir)
    P.touch(outfile)
def splitPooledBamfiles(infile, sentinel):
    '''split a pooled BAM file.'''
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    # P.snip raises a ValueError if the suffix does not match, so fall
    # back to the compiled module filename
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinel)
def plotDETagStats(infiles, outfile):
    '''plot differential expression stats'''
    infile, composition_file = infiles
    Expression.plotDETagStats(
        infile, outfile,
        additional_file=composition_file,
        join_columns=("contig", "start", "end"),
        additional_columns=("CpG_density", "length"))
    P.touch(outfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["compensation", "parse_gating"],
                      help="select method to perform on workspace "
                      "file.")

    parser.add_option("--gating-directory", dest="gate_dir",
                      type="string",
                      help="directory to store gating dummy files")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    if options.method == "compensation":
        split_file = infile.split("/")
        infile = split_file[-1]
        split_file.remove(infile)
        path = "/".join(split_file)
        out_df = P52.get_compensation_matrix(path=path,
                                             infile=infile)
        out_df.to_csv(options.stdout, sep="\t")
    elif options.method == "parse_gating":
        for dfile in P52.parse_gating_file(infile):
            outfile = options.gate_dir + "/" + dfile
            P.touch(outfile)
    else:
        pass

    # write footer and output benchmark information.
    E.Stop()
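# A minimal sketch of the conventional entry point for the main()
# function above, assuming it lives at module level and that sys is
# imported by the module.
if __name__ == "__main__":
    sys.exit(main(sys.argv))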
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard; the marked records are
    discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from
    # gsnap. These can be identified by the custom XT tag and are
    # stripped out first.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    os.environ["CGAT_JAVA_OPTS"] = "-Xmx%s -XX:+UseParNewGC " \
        "-XX:+UseConcMarkSweepGC" % (PICARD_MEMORY)

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()

    os.unsetenv("CGAT_JAVA_OPTS")

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
def plotPathwayGenes(infile, outfile):
    '''
    plot the genes that are differentially expressed
    and fall into pathways
    '''
    # R will not be able to plot anything if none of the
    # differentially expressed genes are associated
    # with a pathway. Plot nothing if this is the case.

    # The colour of the pathways should associate with the
    # track that they come from.

    # Because the plots can get unwieldy with large gene sets,
    # if there are more than 10 genes associated with a pathway
    # then only the top 10 are plotted.
    # This should be explained in the documentation.

    col = random.sample(range(1, 600, 1), 1)[0]
    track = os.path.basename(infile).replace(".genes", "")
    if len(open(infile).readlines()) == 1:
        R('''pdf("%s")
        plot(c(0,1,2,3,4), c(0,1,2,3,4), cex = 0)
        text(2, y = 2,
             labels = "No genes were associated with pathways",
             cex = 1)
        ''' % outfile.replace(".plots", ".pdf"))
        P.touch(outfile)
    else:
        # NB. size of plot should be proportional to the
        # number of genes in the pathways
        R('''
        library("ggplot2")
        dat <- read.csv("%s",
                        header = T,
                        stringsAsFactors = F,
                        sep = "\t")
        pathways <- unique(dat$pathway)
        for (p in pathways){
            toPlot <- aggregate(l2fold~gene, dat[dat$pathway == p,], mean)
            if (regexpr("/", p)[1] != -1){
                # "/" in name not compatible with outfile names
                p <- sub("/", "|", p)}
            outf <- paste(paste("pathways.dir/",
                                paste("%s", p, sep = "."), sep = ""),
                          "genes.pdf", sep = ".")
            cols <- col2rgb(%i)
            col <- rgb(cols[1], cols[2], cols[3], maxColorValue = 255)
            toPlot$col <- col
            if (nrow(toPlot) > 10){
                toPlot <- toPlot[order(abs(toPlot$l2fold),
                                       decreasing = T),][1:10,]}
            plot1 <- ggplot(toPlot, aes(x = gene, y = l2fold,
                                        fill = col,
                                        stat = "identity")) +
                geom_bar(stat = "identity") +
                coord_flip() +
                scale_fill_manual(values = toPlot$col)
            plot1 + ggtitle(p) +
                theme(text = element_text(size = 40, color = "black"),
                      axis.text = element_text(colour = "Black"))
            ggsave(file = outf, width = 11,
                   height = nrow(toPlot), limitsize = F)
        }
        ''' % (infile, track, col))
        P.touch(outfile)
def callPeaksOnPooledReplicates(infile, outfile):
    '''call peaks on pooled replicates using the configured peak
    caller.'''
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks on pooled replicates
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER,
                     pseudoreplicate=False)

    P.touch(outfile)
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile, ".fastqc")
    filename = os.path.join(
        PARAMS["exportdir"], "fastqc", track + "*_fastqc",
        "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              backend=PARAMS["database_backend"],
                              database=PARAMS["database_name"],
                              host=PARAMS["database_host"],
                              username=PARAMS["database_username"],
                              password=PARAMS["database_password"],
                              port=PARAMS["database_port"])
    P.touch(outfile)
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile.replace("processed.dir/", ""), ".fastqc")
    filename = os.path.join(
        PARAMS["exportdir"], "fastqc", track + "*_fastqc",
        "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              backend=PARAMS["database_backend"],
                              database=PARAMS["database_name"],
                              host=PARAMS["database_host"],
                              username=PARAMS["database_username"],
                              password=PARAMS["database_password"],
                              port=PARAMS["database_port"])
    P.touch(outfile)
def loadMATS(infile, outfile):
    '''load rMATS results into a relational database.

    Loads rMATS results into a relational database.
    Continues if the table is empty.

    Parameters
    ----------
    infile: :term:`tsv` file containing one type of rMATS results.
    outfile: .load file
    '''
    try:
        P.load(infile, outfile)
    except Exception:
        P.touch(outfile)