def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Sequences for the track are exported to a temporary fasta file
    (the fraction of intervals used is taken from
    ``PARAMS["bioprospector_proportion"]``) and BioProspector is run
    on that file. If no sequences were exported, `outfile` is only
    touched.

    The temporary fasta file is removed in all cases, including
    failures during sequence export or the BioProspector run.

    `infiles` is not used. The track name is `outfile` with the last
    ``len(".bioprospector")`` characters removed.
    '''

    # bioprospector currently not working on the nodes
    # NOTE: P.run() is called without arguments, so ``to_cluster``
    # and ``statement`` are presumably read from this scope - do not
    # rename or remove them.
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    try:
        track = outfile[:-len(".bioprospector")]
        nseq = writeSequencesForIntervals(
            track, tmpfasta, dbhandle,
            full=True,
            masker="dust",
            proportion=PARAMS["bioprospector_proportion"])

        if nseq == 0:
            E.warn("%s: no sequences - bioprospector skipped" % track)
            P.touch(outfile)
        else:
            statement = (
                ''' BioProspector -i %(tmpfasta)s %(bioprospector_options)s'''
                ''' -o %(outfile)s > %(outfile)s.log ''')
            P.run()
    finally:
        # always clean up, even if one of the steps above raised
        os.unlink(tmpfasta)
def loadContigSummary(infile, outfile):
    '''
    load contig summary stats for each assembler

    The load target is built from the directory of `infile` (with its
    ".dir" suffix removed) and the full basename of `infile`.
    `outfile` is touched once loading has finished.
    '''
    directory, basename = os.path.split(infile)
    outname = "%s_%s.load" % (P.snip(directory, ".dir"), basename)
    P.load(infile, outname)
    P.touch(outfile)
def loadFastqc( infile, outfile ):
    '''load FASTQC stats.

    For every ``fastqc_data.txt`` matching
    ``<exportdir>/fastqc/<track>*_fastqc/``, each section apart from
    "Basic Statistics" is loaded into a table called
    ``<prefix>_<section>`` (spaces in the section name replaced by
    underscores), and the section status values are loaded into
    ``<prefix>_status``. ``<prefix>`` is the name of the directory
    that contains the data file.

    `outfile` is touched after all files have been processed.
    '''

    def _load( tablename, lines ):
        # load the given lines into *tablename* through CSV2DB,
        # allowing empty input
        parser = CSV2DB.buildParser()
        (options, args) = parser.parse_args([])
        options.tablename = tablename
        options.allow_empty = True
        CSV2DB.run( cStringIO.StringIO( "\n".join( lines ) + "\n" ),
                    options )

    track = P.snip( infile, ".fastqc" )
    filename = os.path.join( PARAMS["exportdir"],
                             "fastqc",
                             track + "*_fastqc",
                             "fastqc_data.txt" )

    for fn in glob.glob( filename ):
        prefix = os.path.basename( os.path.dirname( fn ) )
        results = []

        # close the data file explicitly instead of leaving it to
        # garbage collection
        inf = IOTools.openFile( fn )
        try:
            for name, status, header, data in FastqcSectionIterator( inf ):
                # do not collect basic stats, see loadFastQCSummary
                if name == "Basic Statistics":
                    continue
                _load( prefix + "_" + re.sub( " ", "_", name ),
                       [header] + data )
                results.append( (name, status) )
        finally:
            inf.close()

        # load status table
        _load( prefix + "_status",
               ["name\tstatus"] + ["\t".join( x ) for x in results] )

    P.touch( outfile )
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.

    tomtom writes into a temporary directory which is afterwards
    moved to ``<exportdir>/tomtom/<outfile>`` (replacing anything that
    is already there); ``tomtom.txt`` from that directory is copied
    to `outfile`.

    If `infile` is empty, no computation is performed and `outfile`
    is only touched.
    '''

    # test for empty input first: creating the temporary directory
    # before this check would leave it behind on the early return
    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    tmpdir = P.getTempDir(".")

    # NOTE: P.run() is called without arguments; ``to_cluster``,
    # ``statement`` and the interpolated names are presumably read
    # from this scope.
    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               "tomtom", outfile)

    statement = (
        ''' tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s'''
        ''' %(databases)s > %(outfile)s.log ''')
    P.run()

    # copy over results
    target_dir = os.path.dirname(target_path)
    try:
        os.makedirs(target_dir)
    except OSError:
        # only ignore the error if the directory already exists
        if not os.path.isdir(target_dir):
            raise

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def buildAssemblyBWAIndices(infile, outfile):
    '''
    build bwa indices

    Runs "bwa index" on `infile` and touches `outfile` afterwards.
    '''
    # P.run() takes no arguments here; ``statement`` and the values
    # it interpolates are presumably read from this scope
    statement = "bwa index %(infile)s"
    P.run()

    P.touch(outfile)
def importGO(infile, outfile, suffix):
    '''import GO results into a table.

    `infile` has to end in ``_expdiff.<suffix>``. Track, method and
    control are obtained from getExpressionMatch(). Nothing is done
    if track and control are the same. If the directory
    ``<infile>.dir`` does not exist, `outfile` is only touched.
    Otherwise all ``*.overall`` tables in that directory are
    concatenated and loaded into the table
    ``<track>_vs_<control>_<method>_<suffix>``.
    '''
    x = "_expdiff.%s" % suffix
    assert infile.endswith(x)

    track, method, control = getExpressionMatch(
        infile[:-len(x)] + ".expdiff")
    if track == control:
        return

    tablename = "%(track)s_vs_%(control)s_%(method)s_%(suffix)s" % {
        "track": track,
        "control": control,
        "method": method,
        "suffix": suffix}

    indir = infile + ".dir"
    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # P.run() takes no arguments; ``statement`` and the names it
    # interpolates are presumably read from this scope
    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall |\
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --allow-empty \
              --index=category \
              --index=goid \
              --table=%(tablename)s \
    > %(outfile)s
    '''
    P.run()
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.

    The tomtom output directory is created as a temporary directory
    and then moved to ``<exportdir>/tomtom/<outfile>``, replacing an
    existing directory of that name. The file ``tomtom.txt`` inside
    it is copied to `outfile`.

    For an empty `infile` nothing is computed; `outfile` is touched.
    '''

    # empty input is handled before the temporary directory is
    # created, so that no stray directory is left behind
    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    tmpdir = P.getTempDir(".")

    # NOTE: ``to_cluster``, ``statement`` and the names used inside
    # the statement are presumably picked up by the argument-less
    # P.run() call from this scope.
    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    statement = (
        ''' tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s'''
        ''' %(databases)s > %(outfile)s.log ''')
    P.run()

    # copy over results
    target_dir = os.path.dirname(target_path)
    try:
        os.makedirs(target_dir)
    except OSError:
        # "file exists" is fine, anything else is a real error
        if not os.path.isdir(target_dir):
            raise

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard

    If `infile` contains no reads, a warning is issued and `outfile`
    is only touched. Otherwise the BAM file is piped through
    bam2bam.py into CollectMultipleMetrics; stdout and stderr of the
    pipeline are redirected to `outfile`.
    '''

    # P.run() takes no arguments; ``to_cluster``, ``job_options`` and
    # ``statement`` are presumably read from this scope
    to_cluster = True
    job_options = getPicardOptions()

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitely.
    statement = (
        '''cat %(infile)s '''
        '''| python %(scriptsdir)s/bam2bam.py -v 0 --set-sequence --sam '''
        '''| CollectMultipleMetrics '''
        '''INPUT=/dev/stdin '''
        '''REFERENCE_SEQUENCE=%(genome_file)s '''
        '''ASSUME_SORTED=true '''
        '''OUTPUT=%(outfile)s '''
        '''VALIDATION_STRINGENCY=SILENT '''
        '''>& %(outfile)s''')

    P.run()
def runGOFromDatabase( outfile,
                       outdir,
                       statement_fg,
                       statement_bg,
                       go_file,
                       ontology_file = None,
                       samples = 1000 ):
    '''Take gene lists from the SQL database using
    ``statement_fg`` and ``statement_bg`` and run the GO analysis
    on them.

    The first column of each query result is used as gene
    identifier. The foreground and background sets are written to
    ``<outdir>/foreground`` and ``<outdir>/background`` and passed on
    to runGOFromFiles() together with the remaining arguments.

    If the foreground query returns nothing, `outfile` is only
    touched.
    '''

    dbhandle = sqlite3.connect( PARAMS["database"] )
    try:
        cc = dbhandle.cursor()
        fg = set( x[0] for x in cc.execute( statement_fg ).fetchall() )
        bg = set( x[0] for x in cc.execute( statement_bg ).fetchall() )
    finally:
        # results are fully fetched, release the connection
        dbhandle.close()

    if not fg:
        P.touch( outfile )
        return

    fg_file = os.path.join( outdir, "foreground" )
    bg_file = os.path.join( outdir, "background" )

    for filename, genes in ( (fg_file, fg), (bg_file, bg) ):
        with open( filename, "w" ) as outf:
            outf.write( "\n".join( map( str, genes ) ) + "\n" )

    runGOFromFiles( outfile, outdir, fg_file, bg_file, go_file,
                    ontology_file = ontology_file,
                    samples = samples )
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on a subset of the intervals only; the
    proportion is read from ``PARAMS["bioprospector_proportion"]``.
    The sequences are written to a temporary fasta file first. When
    no sequences are written, BioProspector is skipped and `outfile`
    is touched.

    The temporary file is deleted on success as well as on failure.

    `infiles` is not used; the track is derived from `outfile` by
    cutting off ``len(".bioprospector")`` trailing characters.
    '''

    # bioprospector currently not working on the nodes
    # (``to_cluster`` and ``statement`` are presumably read from this
    # scope by the argument-less P.run() call - keep the names)
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    try:
        track = outfile[:-len(".bioprospector")]
        nseq = writeSequencesForIntervals(
            track,
            tmpfasta,
            dbhandle,
            full=True,
            masker="dust",
            proportion=PARAMS["bioprospector_proportion"])

        if nseq == 0:
            E.warn("%s: no sequences - bioprospector skipped" % track)
            P.touch(outfile)
        else:
            statement = (
                ''' BioProspector -i %(tmpfasta)s %(bioprospector_options)s'''
                ''' -o %(outfile)s > %(outfile)s.log ''')
            P.run()
    finally:
        # remove the temporary fasta file whatever happened above
        os.unlink(tmpfasta)
def mergeEffectsPerGene( infile, outfile ):
    '''summarize effects on a per-gene level.

    (Re-)creates the table named after `outfile` (without the
    trailing ``len(".load")`` characters) by joining
    ``annotations.transcript_info`` with ``effects`` and grouping by
    gene and track, adds an index on ``gene_id`` and touches
    `outfile`. `infile` is not used.
    '''

    tablename = outfile[:-len(".load")]
    names = { "tablename" : tablename }

    create_table = (
        ''' CREATE TABLE %(tablename)s AS '''
        '''SELECT DISTINCT track, gene_id, '''
        '''COUNT(*) AS ntranscripts, '''
        '''MIN(e.nalleles) AS min_nalleles, '''
        '''MAX(e.nalleles) AS max_nalleles, '''
        '''MIN(e.stop_min) AS min_stop_min, '''
        '''MAX(e.stop_min) AS max_stop_min, '''
        '''MIN(e.stop_max) AS min_stop_max, '''
        '''MAX(e.stop_max) AS max_stop_max, '''
        '''SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1 ELSE 0 END) AS nmd_knockout, '''
        '''SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1 ELSE 0 END) AS nmd_affected '''
        '''FROM annotations.transcript_info as i, effects AS e '''
        '''WHERE i.transcript_id = e.transcript_id '''
        '''GROUP BY i.gene_id, track ''') % names
    drop_table = "DROP TABLE IF EXISTS %(tablename)s" % names
    create_index = "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % names

    dbhandle = connect()

    # drop, rebuild and index the table - in this order
    for sql in ( drop_table, create_table, create_index ):
        Database.executewait( dbhandle, sql )

    dbhandle.commit()
    P.touch(outfile)
def exportMotifDiscoverySequences( infile, outfile ):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    If no sequences are exported, a warning is issued and `outfile`
    is touched.
    '''
    track = P.snip( infile, "_intervals.load" )
    dbhandle = connect()

    # same keyword arguments that ``**locals()`` would supply here
    p = P.substituteParameters( infile = infile,
                                outfile = outfile,
                                track = track,
                                dbhandle = dbhandle )

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full = False,
        masker = P.asList( p['motifs_masker'] ),
        halfwidth = int( p["motifs_halfwidth"] ),
        maxsize = int( p["motifs_max_size"] ),
        proportion = p["motifs_proportion"],
        min_sequences = p["motifs_min_sequences"],
        num_sequences = p["motifs_num_sequences"],
        order = p['motifs_score'] )

    if nseq != 0:
        return

    E.warn( "%s: no sequences - meme skipped" % outfile)
    P.touch( outfile )
def removeBamfiles(infiles, outfile):
    '''delete BAM files together with their ".bai" indices.

    Every file in `infiles` is removed; the matching index file is
    removed only if it exists. `outfile` is touched afterwards.
    '''
    for bamfile in infiles:
        os.unlink(bamfile)

        index_file = bamfile + ".bai"
        if not os.path.exists(index_file):
            continue
        os.unlink(index_file)

    P.touch(outfile)
def plotFalsePositiveRates(infile, outfile):
    '''
    barplot the false positive rates across taxonomic levels

    For each cutoff (0 and 1) two plots are written next to
    `outfile`: ``.<cutoff>.specificity.pdf`` (fp_rate) and
    ``.<cutoff>.sensitivity.pdf`` (tp_rate). `outfile` itself is
    touched at the end.
    '''
    # (output name pattern, ggplot command) for the two plot types
    panels = (
        # specificity
        (".%i.specificity.pdf",
         '''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = fp_rate, fill = track, stat = "identity"))'''),
        # sensitivity
        (".%i.sensitivity.pdf",
         '''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = tp_rate, fill = track, stat = "identity"))'''),
    )

    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % infile)

    for i in [0, 1]:
        for pattern, ggplot_command in panels:
            outf = P.snip(outfile, ".pdf") + pattern % i
            R(ggplot_command % i)
            R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
            R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))''')
            R('''ggsave("%s")''' % outf)

    P.touch(outfile)
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    `outfile` is touched (with a warning) when nothing was exported.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())

    # selection options, all taken from the substituted parameters
    selection = dict(
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track, outfile, dbhandle, **selection)

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def loadFilteredContigLengths(infile, outfile):
    '''
    load contig lengths

    The load target is ``<dir>_<name>.load``, where ``<dir>`` is the
    directory of `infile` without ".dir" and ``<name>`` its basename
    without ".tsv". `outfile` is touched after loading.
    '''
    directory, basename = os.path.split(infile)
    parts = (P.snip(directory, ".dir"), P.snip(basename, ".tsv"))
    outname = "_".join(parts) + ".load"
    P.load(infile, outname)
    P.touch(outfile)
def buildPicardAlignmentStats( infile, outfile, genome_file ):
    '''gather BAM file alignment statistics using Picard

    BAM files without reads are skipped with a warning; in that case
    `outfile` is touched. For all other files the reads are passed
    through bam2bam.py and on to CollectMultipleMetrics, with the
    output of the command pipeline redirected to `outfile`.
    '''

    # names below are presumably read from this scope by the
    # argument-less P.run() call
    to_cluster = True
    job_options = getPicardOptions()

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn( "no reads in %s - no metrics" % infile )
        P.touch( outfile )
        return

    # Picard seems to have problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitely.
    statement = " ".join( (
        '''cat %(infile)s''',
        '''| python %(scriptsdir)s/bam2bam.py -v 0 --set-sequence --sam''',
        '''| CollectMultipleMetrics''',
        '''INPUT=/dev/stdin''',
        '''REFERENCE_SEQUENCE=%(genome_file)s''',
        '''ASSUME_SORTED=true''',
        '''OUTPUT=%(outfile)s''',
        '''VALIDATION_STRINGENCY=SILENT''',
        '''>& %(outfile)s''' ) )

    P.run()
def plotFalsePositiveRates(infile, outfile):
    '''
    barplot the false positive rates across taxonomic levels

    Reads `infile` into R and, for the cutoffs 0 and 1, saves a
    specificity (fp_rate) and a sensitivity (tp_rate) barplot. The
    plot files are named after `outfile` with ".pdf" replaced by
    ".<cutoff>.specificity.pdf" / ".<cutoff>.sensitivity.pdf".
    `outfile` is touched when done.
    '''

    def save_barplot(ggplot_command, outf):
        # build the dodged barplot from an aes definition and save it
        R(ggplot_command)
        R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % infile)

    for i in [0, 1]:
        # specificity
        save_barplot(
            '''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = fp_rate, fill = track, stat = "identity"))''' % i,
            P.snip(outfile, ".pdf") + ".%i.specificity.pdf" % i)

        # sensitivity
        save_barplot(
            '''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = tp_rate, fill = track, stat = "identity"))''' % i,
            P.snip(outfile, ".pdf") + ".%i.sensitivity.pdf" % i)

    P.touch(outfile)
def importGO( infile, outfile, suffix ):
    '''import GO results into a table.

    The name of `infile` must end in ``_expdiff.<suffix>``; track,
    method and control come from getExpressionMatch(). The function
    returns without doing anything when track equals control, and
    only touches `outfile` when ``<infile>.dir`` is missing. In all
    other cases the ``*.overall`` tables in that directory are
    loaded into ``<track>_vs_<control>_<method>_<suffix>``.
    '''

    x = "_expdiff.%s" % suffix
    assert infile.endswith( x )

    track, method, control = getExpressionMatch(
        infile[:-len(x)] + ".expdiff" )

    # nothing to import for a track compared with itself
    if track == control:
        return

    tablename = "%(track)s_vs_%(control)s_%(method)s_%(suffix)s" % dict(
        track = track, control = control, method = method, suffix = suffix )

    indir = infile + ".dir"

    if os.path.exists( indir ):
        # ``statement`` and its interpolated names are presumably
        # read from this scope by P.run()
        statement = '''
        python %(toolsdir)s/cat_tables.py %(indir)s/*.overall |\
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
                  --allow-empty \
                  --index=category \
                  --index=goid \
                  --table=%(tablename)s \
        > %(outfile)s
        '''
        P.run()
    else:
        P.touch( outfile )
def runMACS( infile, outfile ):
    '''run MACS for peak detection.

    The output bed files contain the P-value as their score field.

    `infile` has to end in ``.norm.bam``, ``.bam`` or ``.bed.gz``;
    any other name raises a ValueError. Tracks whose name starts with
    "control" are not processed, only an (empty) `outfile` is
    created. If getControl() returns a control for the track, it is
    passed to MACS with ``-c`` after appending the suffix selected
    for the input type.
    '''
    to_cluster = True

    # NOTE: ``format`` shadows the builtin, but the name is used as
    # ``%(format)s`` in the statement and therefore kept.
    if infile.endswith( ".norm.bam" ):
        track = infile[:-len(".norm.bam")]
        format = "bam"
        suffix = ".norm.bam"
    elif infile.endswith( ".bam" ):
        track = infile[:-len(".bam")]
        format = "bam"
        # NOTE(review): plain .bam input also uses the ".norm.bam"
        # control suffix - kept as in the original, confirm intent
        suffix = ".norm.bam"
    elif infile.endswith( ".bed.gz" ):
        track = infile[:-len(".bed.gz")]
        format = "bed"
        suffix = ".bed.gz"
    else:
        # previously this fell through to an unbound ``track``
        raise ValueError( "unsupported input file for MACS: %s" % infile )

    if track.startswith( "control" ):
        if format == "bed":
            # bed input: create/truncate the output file
            outs = open( outfile, "w" )
            outs.close()
        else:
            P.touch( outfile )
        return

    control = getControl( track )
    if control is not None:
        control = "-c %s" % ( control + suffix )
    else:
        E.info( "%s: no control for track %s" % (outfile, track ) )
        control = ""

    statement = '''
    macs -t %(infile)s %(control)s \
    --diag \
    --name=%(outfile)s \
    --format=%(format)s \
    %(macs_options)s >& %(outfile)s'''

    # local variables first, PARAMS entries take precedence; built
    # without list concatenation so it does not depend on items()
    # returning lists
    params = dict( locals() )
    params.update( PARAMS.items() )

    P.run( **params )
def loadContigLengths(infile, outfile):
    '''
    load contig lengths

    Loads `infile` into ``<dir>_<name>.load`` (directory of `infile`
    without ".dir", basename without ".tsv"), passing
    "--index=scaffold_name" to the loader, and touches `outfile`.
    '''
    directory, basename = os.path.split(infile)
    outname = "%s_%s.load" % (P.snip(directory, ".dir"),
                              P.snip(basename, ".tsv"))
    P.load(infile, outname, "--index=scaffold_name")
    P.touch(outfile)
def buildAssemblyBowtie2Indices(infile, outfile):
    '''
    build bowtie indices

    Runs "bowtie2-build" on `infile`, using the file name without
    its ".fa" suffix as index base, and touches `outfile`.
    '''
    # ``outbase`` is interpolated into the statement; P.run() is
    # called without arguments and presumably reads it from here
    outbase = P.snip(infile, ".fa")
    statement = "bowtie2-build -f %(infile)s %(outbase)s"
    P.run()

    P.touch(outfile)
def loadContigSummary(infile, outfile):
    '''
    load contig summary stats for each assembler

    Target of the load is the directory name of `infile` without
    ".dir", an underscore, the basename of `infile` and ".load".
    `outfile` is touched at the end.
    '''
    prefix = P.snip(os.path.dirname(infile), ".dir")
    outname = "_".join((prefix, os.path.basename(infile))) + ".load"
    P.load(infile, outname)
    P.touch(outfile)
def loadContigLengths(infile, outfile):
    '''
    load contig lengths

    `infile` is loaded into ``<dir>_<name>.load``; ``<dir>`` is the
    directory of `infile` with ".dir" removed, ``<name>`` the
    basename with ".tsv" removed. `outfile` is touched afterwards.
    '''
    prefix = P.snip(os.path.dirname(infile), ".dir")
    name = P.snip(os.path.basename(infile), ".tsv")
    P.load(infile, prefix + "_" + name + ".load")
    P.touch(outfile)
def loadContigGCContent(infile, outfile):
    '''
    load contig GC content

    The table file name is derived from the directory of `infile`
    (without ".dir") and its basename (without ".tsv"); the loader
    is called with "--index=id". `outfile` is touched afterwards.
    '''
    directory, basename = os.path.split(infile)
    outname = "_".join(
        (P.snip(directory, ".dir"), P.snip(basename, ".tsv"))) + ".load"
    P.load(infile, outname, "--index=id")
    P.touch(outfile)
def estimateInsertSizes( infiles, outfile):
    """
    Plots the internal insert size distribution and calculates the
    average and standard deviation based on the FWHM

    The insert size histogram (tab-separated rows of integers; the
    first two columns are used as size and frequency) is written to
    `outfile` by return_insert_sizes.py. From it the full width at
    half maximum (FWHM) is determined. The resulting insert size
    (reduced by ``PARAMS["remove_bases_from_right"]``) and standard
    deviation are written to ``<outfile>.txt``. If ``PLOT`` is set, a
    scatter plot of the region around the peak is saved as
    ``<outfile>.png``.
    """

    infiles = " ".join(infiles)

    # ``to_cluster``, ``statement`` and ``infiles`` are presumably
    # read from this scope by the argument-less P.run() call
    to_cluster = USECLUSTER

    statement = (
        ''' zcat %(infiles)s |'''
        ''' python %(rmaadir)s/return_insert_sizes.py > %(outfile)s ''')

    P.run()
    # required to resolve strange timing issues
    # when trying to open the file in the next command
    P.touch( outfile )

    # read the histogram; the file is closed as soon as it is parsed.
    # x[:-1] drops the trailing newline of each line.
    with open(outfile, "r") as inf:
        ins_sizes_array = numpy.array(
            [ [int(v) for v in line[:-1].split("\t")] for line in inf ] )

    max_freq = ins_sizes_array[:,1].max()
    half_max = float(max_freq) / 2.0
    E.info( "maximum frequency=%i, halfwidth=%i" % (max_freq, half_max))

    # get half width coordinates: first and last size whose
    # frequency reaches half of the maximum
    for size, value in ins_sizes_array:
        if value < half_max:
            continue
        FWHMmin = size
        break

    for size, value in ins_sizes_array[::-1]:
        if value < half_max:
            continue
        FWHMmax = size
        break

    FWHM = FWHMmax - FWHMmin
    std_dev = int(float(FWHM) / 2.3548)
    ins_size = int(FWHMmin + float(FWHM) / 2.0) - PARAMS["remove_bases_from_right"]

    E.info( "".join(["For ", infiles, " FWHM is ", str(FWHM), " ranging from ", str(FWHMmin), " to ", str(FWHMmax), ". std dev ",
                     str(std_dev), " and ins size ", str(ins_size)] ) )

    if PLOT:
        # only plot the region within two standard deviations
        # of the FWHM boundaries
        x, y = [], []
        for size, value in ins_sizes_array:
            if FWHMmin - 2 * std_dev < size < FWHMmax + 2 * std_dev:
                x.append(size)
                y.append(value)

        pylab.title("Insert size")
        pylab.xlabel('inner distance between sequenced ends')
        pylab.ylabel('frequency based on unique eland mappings')
        pylab.scatter(x, y)
        pylab.savefig(outfile + ".png")

    with open(outfile + ".txt", 'w') as fwhm_file:
        fwhm_file.write('%s\t%s\n' % (ins_size, std_dev))
def reMergeBamfiles(infiles, sentinal):
    '''merge BAM files again after filtering out unwanted libraries.

    The ".sentinal" suffix of the input and output names is replaced
    by ".bam". Libraries listed (comma-separated) in
    ``PARAMS["options_to_remove"]`` are passed to
    IDR.filterBadLibraries() before merging. `sentinal` is touched
    when done.
    '''
    bamfiles = []
    for sentinal_file in infiles:
        bamfiles.append(P.snip(sentinal_file, ".sentinal") + ".bam")
    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    bad_samples = PARAMS["options_to_remove"].split(",")
    IDR.mergeBams(IDR.filterBadLibraries(bamfiles, bad_samples), outfile)

    P.touch(sentinal)
def buildBwaIndices(infile, outfile):
    '''
    build bwa indices

    Runs ``bwa index`` on `infile` and touches `outfile` afterwards.
    '''
    # ``to_cluster`` and ``statement`` are presumably read from this
    # scope by the argument-less P.run() call
    to_cluster = True
    statement = '''bwa index %(infile)s'''
    P.run()
    P.touch(outfile)
def buildAssemblyBowtieIndices(infile, outfile):
    '''
    build bowtie indices

    Runs "bowtie-build" on `infile`. The index is written into the
    directory of `infile` and named after the first track returned
    by ``TRACKS.getTracks()``. `outfile` is touched afterwards.
    '''
    # both names are interpolated into the statement, which P.run()
    # presumably reads from this scope
    directory = os.path.dirname(infile)
    outbase = TRACKS.getTracks()[0]

    statement = "bowtie-build -f %(infile)s %(directory)s/%(outbase)s"
    P.run()

    P.touch(outfile)
def buildBowtie2Indices(infile, outfile):
    '''
    build bowtie indices

    Runs "bowtie2-build" on `infile`; the index base name is
    `infile` without its ".fa" suffix. `outfile` is touched at the
    end.
    '''
    # names below are presumably read from this scope by P.run()
    to_cluster = True
    outbase = P.snip(infile, ".fa")

    statement = "bowtie2-build -f %(infile)s %(outbase)s"
    P.run()

    P.touch(outfile)
def buildAssemblyBowtieIndices(infile, outfile):
    '''
    build bowtie indices

    Runs "bowtie-build" on `infile` with `infile` minus ".fa" as
    index base name and touches `outfile` afterwards.
    '''
    outbase = P.snip(infile, ".fa")

    # not referenced by the statement below; kept so that the local
    # scope seen by the argument-less P.run() call is unchanged
    directory = os.path.dirname(infile)

    statement = "bowtie-build -f %(infile)s %(outbase)s"
    P.run()

    P.touch(outfile)
def poolSampleBamfiles(infiles, sentinal):
    """
    Merge filtered sample files for each tissue

    Input and output BAM names are obtained by replacing the
    ".sentinal" suffix with ".bam"; `sentinal` is touched after the
    merge.
    """
    def as_bam(filename):
        # <name>.sentinal -> <name>.bam
        return P.snip(filename, ".sentinal") + ".bam"

    bamfiles = [as_bam(x) for x in infiles]
    IDR.mergeBams(bamfiles, as_bam(sentinal))
    P.touch(sentinal)
def callPeaksOnIndividualReplicates(infile, outfile):
    '''call peaks on a single replicate.

    The BAM file is `infile` with ".sentinel" replaced by ".bam".
    Peak caller and control type are taken from
    ``PARAMS["options_peak_caller"]`` and
    ``PARAMS["options_control_type"]``. `outfile` is touched when
    peak calling has finished.
    '''
    bamfile = P.snip(infile, ".sentinel") + ".bam"

    # fetch peak calling parameters
    caller_parameters = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    IDR.callIDRPeaks(bamfile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     caller_parameters)

    P.touch(outfile)
def splitPooledBamfiles(infile, sentinel):
    '''split a pooled BAM file by submitting ``splitBam`` of the IDR
    module.

    The BAM file is `infile` with ".sentinel" replaced by ".bam";
    the output name is `sentinel` without ".sentinel". `sentinel` is
    touched after submission.
    '''
    bamfile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")

    # the module may have been loaded from source or from bytecode
    module_file = IDR.__file__
    try:
        module = P.snip(module_file, ".py")
    except ValueError:
        module = P.snip(module_file, ".pyc")

    P.submit(module, "splitBam", '2', bamfile, outfile)
    P.touch(sentinel)
def plotRNASEQTagData( infiles, outfile ):
    '''plot tag statistics for the feature counts of a gene set.

    `infiles` holds the design file, the gene set file and the BAM
    files (in this order). The counts are read from
    ``feature_counts.dir/<geneset>.feature_counts.tsv.gz`` and passed
    to Expression.plotTagStats() together with the design file.
    `outfile` is touched afterwards.
    '''
    design_file, geneset_file = infiles[0], infiles[1]
    # not used further, see the comment below
    bamfiles = infiles[2]

    #IMS: now running on feature counts
    counts_name = P.snip( geneset_file, ".gtf.gz" ) + ".feature_counts.tsv.gz"
    infile = os.path.join( "feature_counts.dir", counts_name )

    Expression.plotTagStats( infile, design_file, outfile )

    P.touch( outfile )
def poolInputBamfiles(infiles, sentinal):
    """
    Merge filtered input files for each tissue, with the option of
    excluding undesirable libraries.

    The libraries to exclude are given as a comma-separated list in
    ``PARAMS["filter_remove_inputs"]``. File names are derived by
    replacing ".sentinal" with ".bam"; `sentinal` is touched after
    merging.
    """
    def as_bam(filename):
        # <name>.sentinal -> <name>.bam
        return P.snip(filename, ".sentinal") + ".bam"

    bamfiles = [as_bam(x) for x in infiles]
    outfile = as_bam(sentinal)

    bad_samples = PARAMS["filter_remove_inputs"].split(",")
    to_merge = IDR.filterBadLibraries(bamfiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
def splitPooledBamfiles(infile, sentinal):
    '''split a pooled BAM file by submitting ``splitBam`` of the IDR
    module.

    The BAM file is `infile` with ".sentinal" replaced by ".bam";
    the output name is `sentinal` without ".sentinal". `sentinal` is
    touched after submission.
    '''
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'

    # IDR.__file__ ends in ".pyc" or ".py" depending on how the
    # module was loaded; accept both instead of failing on ".py"
    try:
        module = P.snip(IDR.__file__, ".pyc")
    except ValueError:
        module = P.snip(IDR.__file__, ".py")

    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def splitPooledBamfiles(infile, sentinal):
    '''submit ``splitBam`` of the IDR module for a pooled BAM file.

    `infile` and `sentinal` both end in ".sentinal"; the BAM file is
    `infile` with that suffix replaced by ".bam", the output name is
    `sentinal` without the suffix. `sentinal` is touched after the
    job has been submitted.
    '''
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'

    # the module file name ends in ".py" or ".pyc"; fall back to
    # ".pyc" instead of raising when running from bytecode
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def callPeaksOnPooledReplicates(infile, outfile):
    '''call peaks on a pooled replicate file.

    Peak caller and control type come from
    ``PARAMS["options_peak_caller"]`` and
    ``PARAMS["options_control_type"]``; IDR.callIDRPeaks() is called
    with ``pseudoreplicate=False``. `outfile` is touched afterwards.
    '''
    # fetch peak calling parameters
    caller_parameters = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks (not flagged as pseudoreplicate)
    IDR.callIDRPeaks(
        infile,
        outfile,
        PARAMS["options_peak_caller"],
        PARAMS["options_control_type"],
        caller_parameters,
        pseudoreplicate=False)

    P.touch(outfile)
def loadEdgeR( infile, outfile ):
    '''load EdgeR per-chunk summary stats.

    Every file matching ``<infile>*_summary.tsv`` is loaded into
    ``<chunk>.deseq_summary.load``, where ``<chunk>`` is the part of
    the file name between `infile` (plus one separator character)
    and "_summary.tsv". `outfile` is touched afterwards.
    '''
    # result unused below - the call is kept as the original also
    # snips ".load" from outfile before looking for summary files
    P.snip( outfile, ".load" )

    offset = len(infile) + 1
    for fn in glob.glob( infile + "*_summary.tsv" ):
        chunk = P.snip( fn[offset:], "_summary.tsv" )
        P.load( fn,
                chunk + ".deseq_summary.load",
                collapse = 0,
                transpose = "sample" )

    P.touch( outfile )
def callPeaksOnIndividualReplicates(infile, outfile):
    '''run the configured peak caller on one replicate.

    `infile` ends in ".sentinal"; the BAM file that is analysed has
    the same name with ".bam" instead. The peak caller
    (``PARAMS["options_peak_caller"]``), the control type
    (``PARAMS["options_control_type"]``) and the caller specific
    parameters are handed to IDR.callIDRPeaks(). `outfile` is
    touched at the end.
    '''
    bamfile = "%s.bam" % P.snip(infile, ".sentinal")

    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    call_arguments = (bamfile,
                      outfile,
                      PARAMS["options_peak_caller"],
                      PARAMS["options_control_type"],
                      PARAMS_PEAKCALLER)
    IDR.callIDRPeaks(*call_arguments)

    P.touch(outfile)
def makeAnnotatorGeneSets( infile, outfile, slice ):
    '''compute annotator overlap between sets.

    Annotations are built from the annotator tracks (tracks related
    to `infile` are removed), segments from the track of `infile`
    for the given `slice`. If there are no segments, a warning is
    issued and `outfile` is only touched. All intermediate files are
    kept in a temporary directory below the current working
    directory, which is removed afterwards - also when one of the
    steps fails.
    '''

    workspaces = ("genomic", "alignable", slice )

    track = infile[:-len(".gtf.gz")]

    infiles = ANNOTATOR_TRACKS

    related = getRelatedTracks( infile, infiles )

    if related:
        E.info("removing related tracks %s from %s" % \
                   ( related, infile ) )
        related = set(related)
        # NOTE(review): filters TRACKS although ``infiles`` was taken
        # from ANNOTATOR_TRACKS above - confirm this is intended
        infiles = [x for x in TRACKS if x not in related ]

    tmpdir = tempfile.mkdtemp( dir = os.getcwd() )
    try:
        annotations = os.path.join( tmpdir, "annotations")
        PAnnotator.buildGeneSetAnnotations( infiles, annotations, slice )

        segments = PAnnotator.buildAnnotatorSlicedSegments( tmpdir,
                                                            outfile,
                                                            track,
                                                            slice )

        if not segments:
            E.warn( "no segments for %s - no annotator results" % outfile )
            P.touch( outfile )
            return

        workspaces, synonyms = PAnnotator.buildAnnotatorWorkSpace(
            tmpdir, outfile,
            workspaces = workspaces,
            gc_control = True )

        PAnnotator.runAnnotator( tmpdir, outfile,
                                 annotations, segments,
                                 workspaces, synonyms )
    finally:
        # remove the temporary directory on every exit path
        shutil.rmtree( tmpdir )
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.

    Proportion, halfwidth, maximum size and minimum number of
    sequences are read from the ``meme_*`` entries of PARAMS, the
    masker from ``motifs_masker``. If no sequences are exported,
    MEME is skipped and `outfile` is only touched.
    '''
    # names below (and ``statement``) are presumably read from this
    # scope by the argument-less P.run() call - they are not renamed
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    statement = (
        ''' meme %(tmpfasta)s -dna -revcomp'''
        ''' -mod %(meme_model)s'''
        ''' -nmotifs %(meme_nmotifs)s'''
        ''' -oc %(tmpdir)s'''
        ''' -maxsize %(meme_max_size)s'''
        ''' %(meme_options)s'''
        ''' > %(outfile)s.log ''')
    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam

    The work is submitted as function ``splitBam`` of the IDR module
    with parameter '2'. The BAM file is `infile` with ".sentinel"
    replaced by ".bam"; the output name is `sentinel` without
    ".sentinel". `sentinel` is touched after submission.
    """
    bamfile = "%s.bam" % P.snip(infile, ".sentinel")
    outfile = P.snip(sentinel, ".sentinel")

    # strip the extension of the module file: ".py" first, ".pyc"
    # if that does not match
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    params = '2'
    P.submit(module, "splitBam", params, bamfile, outfile)

    P.touch(sentinel)
def loadGeneSummary(infile, outfile):
    '''summarize binding information per gene.

    (Re-)creates the table named by ``P.toTable(outfile)`` with the
    per-gene sums of ``tata`` and ``cpg`` from
    ``promotorinfo_transcripts`` joined to
    ``annotations.transcript_info``. `infile` is not used; `outfile`
    is touched at the end.
    '''
    names = {"table": P.toTable(outfile)}

    drop_table = """DROP TABLE IF EXISTS %(table)s """ % names
    create_table = (
        """CREATE TABLE %(table)s AS """
        """SELECT gene_id, """
        """SUM( tata ) AS tata, """
        """SUM( cpg ) AS cpg """
        """FROM promotorinfo_transcripts AS p, """
        """annotations.transcript_info as i """
        """WHERE i.transcript_id = p.transcript_id """
        """GROUP BY gene_id""") % names

    dbh = connect()
    cc = dbh.cursor()
    for sql in (drop_table, create_table):
        cc.execute(sql)
    cc.close()

    P.touch(outfile)