def generatePSP(positives, negatives, outfile):
    '''Generate a discriminative PSP file for discriminative MEME.

    The sequences in both FASTA inputs are counted with
    ``FastaIterator.count``. If either file holds fewer than two
    sequences a warning is logged, *outfile* is touched and ``psp-gen``
    is not run. Otherwise ``psp-gen`` is run through ``P.run`` with its
    standard output redirected to *outfile*.

    :param positives: path of the positive sequence set (``-pos``).
    :param negatives: path of the negative sequence set (``-neg``).
    :param outfile: path receiving the ``psp-gen`` output.
    '''
    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        # Adjacent literals are concatenated verbatim, so the separating
        # space has to be part of one of them.
        E.warn("%s: input files do not have sufficient sequences "
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    # The %(name)s placeholders refer to the local variables above; they
    # are filled in when the statement is executed by P.run.
    statement = '''psp-gen -pos %(positives)s
                           -neg %(negatives)s
                           %(psp_options)s
                   > %(outfile)s '''

    P.run(statement)
def calculateM3DSpikeClustersPvalue(infiles, outfile):
    '''Submit ``RRBS.calculateM3DSpikepvalue`` and touch *outfile*.

    :param infiles: sequence whose last element is passed as the design
        argument; all preceding elements are passed as the inputs.
    :param outfile: output path handed to the RRBS call and touched
        afterwards.
    '''
    design_file, data_files = infiles[-1], infiles[:-1]

    RRBS.calculateM3DSpikepvalue(
        data_files,
        outfile,
        design_file,
        submit=True,
        job_options="-l mem_free=4G -pe dedicated 1")

    P.touch(outfile)
def removeBamfiles(infiles, outfile):
    '''Delete every file in *infiles* together with its ``.bai`` index.

    The index is only removed when it exists; the file itself is
    unlinked unconditionally. *outfile* is touched once all files have
    been processed.
    '''
    for bam_path in infiles:
        os.unlink(bam_path)

        index_path = bam_path + ".bai"
        if os.path.exists(index_path):
            os.unlink(index_path)

    P.touch(outfile)
def makeSummaryPlots(infile, outfile):
    '''Submit ``RRBS.summaryPlots`` for *infile* and touch *outfile*.

    The call is made with ``submit=True`` and the job options
    ``-l mem_free=48G``.
    '''
    RRBS.summaryPlots(
        infile,
        outfile,
        submit=True,
        job_options="-l mem_free=48G")

    P.touch(outfile)
def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against the configured tomtom databases.

    If *infile* is empty a warning is logged, *outfile* is touched and
    nothing is run. Otherwise ``tomtom`` writes into a temporary
    directory, which is then moved to ``<exportdir>/tomtom/<outfile>``
    (replacing any existing directory there); ``tomtom.txt`` from that
    directory is copied to *outfile*.

    :param infile: motif file passed to ``tomtom``.
    :param outfile: path receiving a copy of ``tomtom.txt``; the run log
        is written to ``<outfile>.log``.
    '''
    databases = " ".join(P.as_list(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    # Create the temporary directory only once it is certain that tomtom
    # will run, so the early return above does not leave it behind.
    tmpdir = P.get_temp_dir(".")

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    target_dir = os.path.dirname(target_path)
    try:
        os.makedirs(target_dir)
    except OSError:
        # Only an already existing directory is harmless; anything else
        # (e.g. permission problems) must not be hidden.
        if not os.path.isdir(target_dir):
            raise

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def splitFiles(infile, outfile):
    '''Split *infile* into chunks for parallelisation.

    Delegates to ``Timeseries.splitFiles`` with
    ``nchunks=PARAMS['resampling_chunks']`` and
    ``out_dir="parallel_files.dir"``, then touches *outfile*.
    '''
    n_chunks = PARAMS['resampling_chunks']

    Timeseries.splitFiles(
        infile=infile,
        nchunks=n_chunks,
        out_dir="parallel_files.dir")

    P.touch(outfile)
def reMergeBamfiles(infiles, sentinel):
    '''Merge the BAM files behind *infiles*, minus configured libraries.

    Every input and the *sentinel* have their ``.sentinel`` suffix
    replaced by ``.bam``. The inputs are passed through
    ``IDR.filterBadLibraries`` together with the comma-separated entries
    of ``PARAMS["options_to_remove"]``, and the result is merged with
    ``IDR.mergeBams``. Finally *sentinel* is touched.
    '''
    bam_inputs = []
    for sentinel_file in infiles:
        bam_inputs.append(P.snip(sentinel_file, ".sentinel") + ".bam")

    merged_bam = P.snip(sentinel, ".sentinel") + ".bam"

    excluded = PARAMS["options_to_remove"].split(",")
    retained = IDR.filterBadLibraries(bam_inputs, excluded)

    IDR.mergeBams(retained, merged_bam)
    P.touch(sentinel)
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue.

    The ``.sentinel`` suffix of every input and of *sentinel* is
    replaced by ``.bam``; the resulting inputs are merged into the
    resulting output with ``IDR.mergeBams`` and *sentinel* is touched.
    """
    def as_bam(path):
        # <name>.sentinel -> <name>.bam
        return P.snip(path, ".sentinel") + ".bam"

    bam_inputs = [as_bam(path) for path in infiles]
    pooled_bam = as_bam(sentinel)

    IDR.mergeBams(bam_inputs, pooled_bam)
    P.touch(sentinel)
def callPeaksOnIndividualReplicates(infile, outfile):
    '''Call IDR peaks on the BAM file belonging to a sentinel.

    The ``.sentinel`` suffix of *infile* is replaced by ``.bam`` and the
    result is passed to ``IDR.callIDRPeaks`` together with the peak
    caller named in ``PARAMS["options_peak_caller"]``, its parameters
    and ``PARAMS["options_control_type"]``. *outfile* is touched
    afterwards.
    '''
    bamfile = P.snip(infile, ".sentinel") + ".bam"

    # fetch peak calling parameters
    peak_caller = PARAMS["options_peak_caller"]
    caller_params = get_peak_caller_parameters(peak_caller)

    # call peaks
    IDR.callIDRPeaks(
        bamfile,
        outfile,
        peak_caller,
        PARAMS["options_control_type"],
        caller_params)

    P.touch(outfile)
def splitPooledBamfiles(infile, sentinel):
    '''Submit ``splitBam`` from the IDR module for a pooled BAM file.

    The input is *infile* with ``.sentinel`` replaced by ``.bam``; the
    output argument is *sentinel* without its ``.sentinel`` suffix. The
    parameter string ``'2'`` is passed through to ``P.submit``.
    *sentinel* is touched afterwards.
    '''
    bam_input = P.snip(infile, ".sentinel") + ".bam"
    output_stub = P.snip(sentinel, ".sentinel")

    # IDR.__file__ may point at the source or at the compiled module.
    module_file = IDR.__file__
    try:
        module = P.snip(module_file, ".py")
    except ValueError:
        module = P.snip(module_file, ".pyc")

    P.submit(module, "splitBam", '2', bam_input, output_stub)

    P.touch(sentinel)
def runFIMO(motifs, database, outfile, exportdir, options=None):
    '''Run fimo to look for occurrences of motifs in a sequence database.

    If *motifs* does not contain the text ``MOTIF`` a warning is logged,
    *outfile* is touched and fimo is not run (fimo itself would exit
    with an error on an empty motif file).

    :param motifs: path to a MEME formatted motif file.
    :param database: a fasta file.
    :param outfile: the text output from fimo; the xml and log files are
        written next to it (``.txt`` replaced by ``.xml`` / ``.log``).
    :param exportdir: directory to put exported files (html, gff).
    :param options: dictionary ``{'option': 'value'}``. Each entry is
        passed as ``--option=value`` (or ``--option`` when the value is
        ``None``) and replaces an ``--option=...`` setting found in
        ``PARAMS["fimo_options"]``. Defaults to no overrides.
    '''
    # A fresh dict per call; a mutable default would be shared by calls.
    if options is None:
        options = {}

    # if the motifs file is empty, then fimo will return an error
    # this isn't very useful behaviour.
    with IOTools.open_file(motifs) as motif_handle:
        inlines = motif_handle.read()

    nmotifs = len(re.findall("MOTIF", inlines))
    if nmotifs == 0:
        E.warning("No motifs found in %s" % motifs)
        P.touch(outfile)
        return

    E.debug("%s: %i motifs found" % (motifs, nmotifs))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        # Remove the complete configured token, including its leading
        # "--", and only when it is a whole word: otherwise a dangling
        # "--" is left behind or the tail of a longer option name is
        # clipped.
        pattern = r"(?<!\S)(?:--)?%s=\S+" % re.escape(option)
        fimo_options = re.sub(pattern, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.get_temp_filename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)

    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    # fimo writes all its outputs into the --oc directory; they are then
    # moved to their final locations and the directory is removed.
    statement = ''' fimo --oc %(tmpout)s
                         %(fimo_options)s
                         %(motifs)s
                         %(database)s &> %(logfile)s;
                    mv %(tmpout)s/fimo.txt %(outfile)s;
                    mv %(tmpout)s/fimo.xml %(xmlout)s;
                    mv %(tmpout)s/fimo.gff %(gffout)s;
                    mv %(tmpout)s/fimo.html %(htmlout)s;
                    rm -r %(tmpout)s '''

    P.run(statement)
def callPeaksOnPooledReplicates(infile, outfile):
    '''Call IDR peaks on *infile* with ``pseudoreplicate=False``.

    The peak caller is taken from ``PARAMS["options_peak_caller"]`` and
    its parameters from ``get_peak_caller_parameters``; the control type
    comes from ``PARAMS["options_control_type"]``. *outfile* is touched
    afterwards.
    '''
    # fetch peak calling parameters
    peak_caller = PARAMS["options_peak_caller"]
    caller_params = get_peak_caller_parameters(peak_caller)

    # call peaks
    IDR.callIDRPeaks(
        infile,
        outfile,
        peak_caller,
        PARAMS["options_control_type"],
        caller_params,
        pseudoreplicate=False)

    P.touch(outfile)
def genReplicateData(infile, outfile):
    '''Split each replicate into a separate file.

    The files are meant for clustering within each replicate; this
    relies on each replicate being the same across the whole time
    series.

    The output directory is the part of *outfile* before its first
    ``/``. The split itself is done by ``Timeseries.splitReplicates``
    with ``axis="column"`` and ``group_var="replicates"``; *outfile* is
    touched afterwards.
    '''
    target_dir = outfile.partition("/")[0]

    Timeseries.splitReplicates(
        infile=infile,
        axis="column",
        group_var="replicates",
        outdir=target_dir)

    P.touch(outfile)
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam.

    Submits ``splitBam`` from the IDR module via ``P.submit`` with the
    parameter string ``'2'``. The input is *infile* with ``.sentinel``
    replaced by ``.bam``; the output argument is *sentinel* without its
    ``.sentinel`` suffix. *sentinel* is touched afterwards.
    """
    source_bam = "%s.bam" % P.snip(infile, ".sentinel")
    split_prefix = P.snip(sentinel, ".sentinel")
    n_splits = '2'

    # Strip whichever extension IDR.__file__ carries (.py or .pyc).
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")

    P.submit(idr_module, "splitBam", n_splits, source_bam, split_prefix)
    P.touch(sentinel)
def runMEME(track, outfile, dbhandle):
    '''Run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on all
    intervals but only the top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak is used and not the
    complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    The sequences are written to a temporary directory by
    ``writeSequencesForIntervals``. If none are written, the temporary
    directory is removed, a warning is logged and *outfile* is touched.
    Otherwise ``meme`` is run and its output is collected with
    ``collectMEMEResults``.

    This method is deprecated - use runMEMEOnSequences instead.

    :param track: track whose intervals are exported.
    :param outfile: result file; the run log goes to ``<outfile>.log``.
    :param dbhandle: database handle passed on to
        ``writeSequencesForIntervals``.
    '''
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)

    # NOTE(review): ``fasta`` is not used below. It is retained in case
    # opening the genome is relied upon as an early check - confirm and
    # remove.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        # Nothing will consume the temporary directory in this branch,
        # so remove it instead of leaving it behind.
        import shutil
        shutil.rmtree(tmpdir)

        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''

        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
def poolInputBamfiles(infiles, sentinel):
    """
    Merge filtered input files for each tissue, with the option of
    excluding undesirable libraries.

    The ``.sentinel`` suffix of every input and of *sentinel* is
    replaced by ``.bam``. With more than one input, the inputs are
    passed through ``IDR.filterBadLibraries`` (using the comma-separated
    entries of ``PARAMS["filter_remove_inputs"]``) and merged with
    ``IDR.mergeBams``. With a single input, the output and its ``.bai``
    index are created as symbolic links to the input instead; in that
    case no filtering is applied. *sentinel* is touched afterwards.
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    if len(infiles) > 1:
        to_merge = IDR.filterBadLibraries(infiles, bad_samples)
        IDR.mergeBams(to_merge, outfile)
    else:
        source = os.path.abspath(infiles[0])
        links = ((source, outfile),
                 (source + ".bai", outfile + ".bai"))
        for link_target, link_name in links:
            # os.symlink refuses to overwrite; drop a link (or file)
            # left over from an earlier run so the task can be re-run.
            # lexists is used so that dangling links are caught as well.
            if os.path.lexists(link_name):
                os.unlink(link_name)
            os.symlink(link_target, link_name)

    P.touch(sentinel)
def exportIntervalSequences(infile, outfile, track, method):
    '''Export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    All settings are read from ``PARAMS`` under keys prefixed with
    *method*. If ``<method>_halfwidth`` cannot be converted to an
    integer, full intervals are requested (``full=True``,
    ``halfwidth=None``); if ``<method>_max_size`` cannot be converted,
    ``maxsize=None`` is passed. When no sequence is written a warning is
    logged and *outfile* is touched.
    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method + "_halfwidth"])
    except ValueError:
        halfwidth = None
    # Without a usable half-width the complete interval is exported.
    full = halfwidth is None

    try:
        maxsize = int(PARAMS[method + "_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.as_list(PARAMS[method + '_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method + "_num_sequences"],
        proportion=PARAMS[method + "_proportion"],
        min_sequences=PARAMS[method + "_min_sequences"],
        order=PARAMS[method + '_score'])

    if nseq != 0:
        return

    E.warn("%s: no sequences - %s skipped" % (outfile, method))
    P.touch(outfile)
def loadGeneSummary(infile, outfile):
    '''Summarize binding information per gene.

    Drops and re-creates the table named by ``P.toTable(outfile)``. The
    new table holds, per ``gene_id``, the sums of ``tata`` and ``cpg``
    from ``promotorinfo_transcripts`` joined to
    ``annotations.transcript_info`` on ``transcript_id``. *outfile* is
    touched afterwards; *infile* is not read.
    '''
    dbh = connect()

    table = P.toTable(outfile)
    names = {"table": table}

    drop_sql = """DROP TABLE IF EXISTS %(table)s """ % names
    create_sql = """CREATE TABLE %(table)s AS
             SELECT gene_id,
                    SUM( tata ) AS tata,
                    SUM( cpg ) AS cpg
             FROM promotorinfo_transcripts AS p,
                  annotations.transcript_info as i
             WHERE i.transcript_id = p.transcript_id
             GROUP BY gene_id""" % names

    cc = dbh.cursor()
    for sql in (drop_sql, create_sql):
        cc.execute(sql)
    cc.close()

    P.touch(outfile)
def summariseReadStart(infile, outfile):
    '''Count the first three bases of the reads in a gzipped fastq file.

    The counts for CGG, TGG, CGA and TGA are printed individually, all
    remaining reads are summed up as "others".

    This only works for fastq files and fails with .sra files, so for
    ``.sra`` input *outfile* is merely touched. This function should
    eventually call fastq-dump for .sra files and then use the functions
    of the fastq module to count the first bases of the fastq records.
    '''
    # for now, create empty outfile
    if infile.endswith(".sra"):
        P.touch(outfile)
        return

    # NOTE(review): `OFS"\t"` inside the awk program is an expression
    # without effect (it looks like `OFS="\t"` was intended), so the
    # columns are separated by awk's default OFS. Left untouched here -
    # confirm what downstream readers expect before changing it.
    names = {"infile": infile, "outfile": outfile}
    statement = '''zcat %(infile)s |
    paste - - - - |
    cut -f2 |
    cut -c1-3 |
    sort |
    uniq -c |
    sort -nk1 |
    awk -F' ' 'BEGIN{total=0; sum=0} {total+=$1; OFS"\\t"; if($2=="CGG"||$2=="TGG"||$2=="CGA"||$2=="TGA") {sum+=$1; print $1, $2}} END {print total-sum,"others"}'
    > %(outfile)s ''' % names

    # P.run() is called without arguments and picks up ``statement``.
    P.run()
def timePointDiffExpression(infile, outfile):
    '''Test for differentially expressed genes against the baseline.

    Within each condition, genes are tested against the baseline time
    point. Uses DESeq2.

    Runs ``cgat timeseries2diffgenes --method=timepoint`` on *infile*
    with ``diff_timepoints.dir`` as the results directory, logging to
    ``<outfile>.log``, then touches *outfile*.
    '''
    # ``statement`` is picked up by the argument-less P.run() call, so
    # the variable name must not change.
    statement = '''
    cgat timeseries2diffgenes
      --log=%(outfile)s.log
      --method=timepoint
      --alpha=%(deseq_alpha)s
      --results-directory=diff_timepoints.dir
      %(infile)s
    '''

    P.run()

    P.touch(outfile)
def loadTomTom(infile, outfile):
    '''Load tomtom results.

    The ``alt`` name of every target motif is read from ``tomtom.xml``
    in ``<exportdir>/tomtom/<infile>`` and prepended as an additional
    first column to each row of the tomtom text output *infile*. The
    augmented table is written to a temporary file and loaded with
    ``P.load``. If the xml file does not exist a warning is logged and
    *outfile* is touched instead.

    :param infile: tomtom text output; the line starting with
        ``#Query`` is replaced by a fixed header.
    :param outfile: load target passed to ``P.load``.
    :raises KeyError: if a row names a target id that is absent from
        the xml file.
    '''
    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    # Element.iter() replaces getiterator(), which no longer exists in
    # Python 3.9 and later.
    name2alt = {}
    for motif in motifs.iter("motif"):
        name2alt[motif.get("id")] = motif.get("alt")

    header = '\t'.join(
        ("target_name", "query_id", "target_id",
         "optimal_offset", "pvalue", "evalue", "qvalue",
         "Overlap", "query_consensus",
         "target_consensus", "orientation")) + "\n"

    tmpfile = P.get_temp_file(".")
    try:
        # parse the text file
        with IOTools.open_file(infile) as inf:
            for line in inf:
                if line.startswith("#Query"):
                    tmpfile.write(header)
                    continue
                data = line[:-1].split("\t")
                target_name = name2alt[data[1]]
                tmpfile.write("%s\t%s" % (target_name, line))
        tmpfile.close()

        P.load(tmpfile.name, outfile)
    finally:
        # Always remove the temporary file, also when parsing or
        # loading fails. Closing an already closed file is harmless.
        tmpfile.close()
        os.unlink(tmpfile.name)
def collectMEMEResults(tmpdir, target_path, outfile, method="meme"):
    '''Collect output from a MEME run in tmpdir and copy all over to
    target_path.

    *tmpdir* is moved to *target_path* (an existing *target_path* is
    removed first) and the main result file of the run is copied to
    *outfile*. Images output by MEME (.eps files) are converted to .png
    files with ``convert``.

    :param tmpdir: directory holding the output of the run.
    :param target_path: final location of that directory.
    :param outfile: receives ``dreme.txt``, ``meme.txt`` or
        ``combined.meme`` depending on *method*.
    :param method: one of ``"dreme"``, ``"meme"`` or ``"memechip"``.
        For ``"memechip"`` a missing ``combined.meme`` is reported with
        a warning and *outfile* is touched. For any other value nothing
        is copied to *outfile*.
    '''
    # copy over results
    target_dir = os.path.dirname(target_path)
    try:
        os.makedirs(target_dir)
    except OSError:
        # Only an already existing directory is harmless; anything else
        # (e.g. permission problems) must not be hidden.
        if not os.path.isdir(target_dir):
            raise

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    E.debug("tmpdir is" + tmpdir + " target_path is " + target_path)
    shutil.move(tmpdir, target_path)

    if method == "dreme":
        shutil.copyfile(os.path.join(target_path, "dreme.txt"), outfile)
    elif method == "meme":
        shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)
    elif method == "memechip":
        try:
            shutil.copyfile(
                os.path.join(target_path, "combined.meme"), outfile)
        except IOError:
            E.warn("%s: No motifs found" % outfile)
            P.touch(outfile)

    # convert images to png
    epsfiles = glob.glob(os.path.join(target_path, "*.eps"))

    commands = []
    for epsfile in epsfiles:
        pngfile = os.path.splitext(epsfile)[0] + ".png"
        # File names are filled in here, per file. Queuing the same
        # %(epsfile)s template for every file would make all commands
        # resolve to the last file once the statement is interpolated.
        command = "convert %s %s" % (epsfile, pngfile)
        # The statement is %-interpolated again when it is run, so a
        # literal percent sign in a path has to be doubled.
        commands.append(command.replace("%", "%%"))

    if commands:
        statement = " && ".join(commands)
        P.run(statement)
def subtractBedFiles(infile, subtractfile, outfile):
    '''Subtract intervals in *subtractfile* from *infile* and store in
    *outfile*.

    If *subtractfile* is empty, *infile* is copied to *outfile*
    unchanged. If *infile* is empty, *outfile* is touched. Otherwise the
    intervals of *infile* without overlap in *subtractfile* are kept
    (``intersectBed -v``), cut to the first five columns, renumbered in
    column 4, compressed with bgzip and indexed with tabix.
    '''
    # Nothing to subtract: the input is the result.
    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return

    # Nothing to subtract from.
    if IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    # ``statement`` is picked up by the argument-less P.run() call.
    statement = '''
    intersectBed -v -a %(infile)s -b %(subtractfile)s
    | cut -f 1,2,3,4,5
    | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
    | bgzip > %(outfile)s ;
    tabix -p bed %(outfile)s
    '''

    P.run()
def conditionDiffExpression(infile, outfile):
    '''Call differentially expressed genes between conditions.

    DEGs are called on statistically significant interaction terms
    between condition and time point. Uses DESeq2.

    The decompressed *infile* is piped into
    ``cgat timeseries2diffgenes --method=condition`` with
    ``diff_condition.dir`` as the results directory, logging to
    ``<outfile>.log``. *outfile* is touched afterwards.
    '''
    # Both locals below are read by name by the argument-less P.run()
    # call, so they must keep their names.
    statement = '''
    zcat %(infile)s |
    cgat timeseries2diffgenes
      --log=%(outfile)s.log
      --method=condition
      --alpha=%(deseq_alpha)s
      --results-directory=diff_condition.dir
    '''
    job_options = "-l mem_free=4G"

    P.run()

    P.touch(outfile)
def intersectBedFiles(infiles, outfile):
    '''Merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    With more than two files, bed files are normalized (overlapping
    intervals within a file are merged) before intersection.

    Intervals are renumbered starting from 1.

    A single input file is copied to *outfile* unchanged. If any input
    of a multi-file intersection is empty, the intersection is empty as
    well and *outfile* is only touched.
    '''
    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            # The file names are inserted here; %%(outfile)s survives
            # as %(outfile)s for the interpolation done by P.run().
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            | bgzip > %%(outfile)s
            ''' % (infiles[0], infiles[1])
            P.run()

    else:
        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(fn):
            P.touch(outfile)
            return

        # Created only after the check above so that the early return
        # does not leave a temporary file behind.
        tmpfile = P.getTempFilename(".")
        try:
            statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
            P.run()

            for fn in infiles[1:]:
                # Test the file of this iteration; the first file has
                # already been checked above.
                if IOTools.isEmpty(fn):
                    P.touch(outfile)
                    return

                statement = '''
                mergeBed -i %(fn)s
                | intersectBed -u -a %(tmpfile)s -b stdin
                > %(tmpfile)s.tmp;
                mv %(tmpfile)s.tmp %(tmpfile)s'''
                P.run()

            statement = '''cat %(tmpfile)s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            | bgzip
            > %(outfile)s '''
            P.run()
        finally:
            # Remove the intermediate file on every exit path.
            os.unlink(tmpfile)
def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-CHiP pipeline on the input files.

    If *infile* holds no sequences a warning is logged, *outfile* is
    touched and nothing is run. Otherwise ``meme-chip`` writes into a
    temporary directory which is then collected with
    ``collectMEMEResults`` (``method="memechip"``).

    :param infile: fasta file with the input sequences.
    :param outfile: result file; the run log goes to ``<outfile>.log``.
    :param motifs: optional list of motif files, each passed to
        ``meme-chip`` with ``-db``.
    '''
    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        # The message needs its argument, otherwise a literal "%s" is
        # logged.
        E.warn("%s: no sequences - meme-chip skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    # NOTE(review): -p is filled from ``meme_threads`` while the job
    # settings below are driven by ``memechip_threads`` - confirm that
    # this is intended.
    statement = '''
    meme-chip %(infile)s
              -p %(meme_threads)s
              -oc %(tmpdir)s
              -nmeme %(memechip_nmeme)s
              %(memechip_options)s
              %(motifs)s > %(outfile)s.log '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["memechip_threads"]) != 1:
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")
def summarizeEffectsPerGene(infile, outfile):
    '''Summarize effects on a per-gene level.

    The table name is *outfile* without ``.load`` and the track is
    *infile* without ``_effects.load``. The table is dropped if it
    exists and re-created from ``<track>_effects`` joined to
    ``annotations.transcript_info``, grouped by gene; an index on
    ``gene_id`` is added and the transaction is committed. *outfile* is
    touched afterwards.
    '''
    names = {
        "tablename": outfile[:-len(".load")],
        "track": infile[:-len("_effects.load")],
    }

    drop_sql = "DROP TABLE IF EXISTS %(tablename)s" % names

    create_sql = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT
           gene_id,
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i, %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % names

    index_sql = (
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)"
        % names)

    dbhandle = connect()
    for sql in (drop_sql, create_sql, index_sql):
        Database.executewait(dbhandle, sql)
    dbhandle.commit()

    P.touch(outfile)
def genResampleData(infile, outfile):
    '''Resample the data n-times with replacement.

    Generates n flat files which are then propagated at later stages.
    Files are generally small though.

    Time points and replicates are taken from the keys of
    ``TIME.track2groups`` and ``REPLICATE.track2groups``: the second
    and third ``-`` separated field of each key, respectively. The
    condition and the reference gtf are the first and second ``-``
    separated field of *infile*, the condition without a leading
    ``deseq.dir/``. ``cgat data2resamples`` is then run with
    ``clustering.dir`` as output directory and *outfile* is touched.
    '''
    time_agg = list(TIME.__dict__['track2groups'].keys())
    # De-duplicate first and sort afterwards: a set does not keep the
    # order of a list that was sorted before the conversion.
    time_points = sorted(set(int(str(x).split("-")[1]) for x in time_agg))

    rep_agg = list(REPLICATE.__dict__['track2groups'].keys())
    replicates = [str(x).split("-")[2] for x in rep_agg]

    name_fields = str(infile).split("-")
    ref_gtf = name_fields[1]

    # Remove the directory prefix as a prefix. str.strip() treats its
    # argument as a set of characters and would also eat matching
    # letters of the condition name itself (e.g. "resting" -> "ting").
    condition = name_fields[0]
    dir_prefix = "deseq.dir/"
    if condition.startswith(dir_prefix):
        condition = condition[len(dir_prefix):]

    time_points = ",".join(str(i) for i in time_points)
    replicates = ",".join(replicates)

    # ``statement`` and the locals named in it are picked up by the
    # argument-less P.run() call.
    statement = '''
    cgat data2resamples
      --log=%(outfile)s.log
      --time=%(time_points)s
      --replicates=%(replicates)s
      --condition=%(condition)s
      --resamples=%(resampling_resample)s
      --input-gtf=%(ref_gtf)s
      --output-file-directory=clustering.dir
      --seed=%(resampling_seed)s
      %(infile)s
    '''

    P.run()

    P.touch(outfile)
def runDREME(infile, outfile, neg_file="", options=""):
    '''Run DREME on a fasta file.

    If a *neg_file* is passed then DREME will use it as the negative
    set, otherwise the default is to shuffle the input.

    DREME is skipped (warning logged, *outfile* touched) when *infile*
    or a given *neg_file* holds fewer than two sequences. Otherwise
    ``dreme`` writes into a temporary directory which is collected with
    ``collectMEMEResults`` (``method="dreme"``); the run log goes to
    ``<outfile>.log``.

    :param options: additional command line options for ``dreme``.
    '''
    if int(FastaIterator.count(infile)) < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        if int(FastaIterator.count(neg_file)) < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped"
                % outfile)
            P.touch(outfile)
            return
        # From here on neg_file holds the complete command line option.
        neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    dreme -p %(infile)s
          %(neg_file)s
          -png
          -oc %(tmpdir)s
          %(dreme_options)s
          %(options)s
    > %(logfile)s
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''Run MEME on fasta sequences to find motifs.

    By default MEME calculates a zero-th order background model from
    the nucleotide frequencies in the input set. To use a different
    background set, a background file created by fasta-get-markov must
    be supplied.

    To perform discriminative analysis a position specific prior (psp)
    file must be provided. This can be generated using generatePSP.

    MEME is skipped (warning logged, *outfile* touched) when *infile*
    holds fewer than two sequences. Otherwise ``meme`` writes into a
    temporary directory which is collected with ``collectMEMEResults``;
    its standard error goes to ``<outfile>.log``.

    :param background: optional background file, passed as ``-bfile``.
    :param psp: optional psp file, passed as ``-psp``.
    '''
    if int(FastaIterator.count(infile)) < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory
    with IOTools.open_file(infile, "r") as fasta_reader:
        total_seqs_length = sum(
            len(record.sequence)
            for record in FastaIterator.iterate(fasta_reader))

    # If the length of all sequences is higher than 160,000bp
    # Up the memory
    job_memory = "4G" if total_seqs_length > 160000 else "2G"

    revcomp = "-revcomp" if PARAMS.get("meme_revcomp", True) else ""

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    background_model = ("-bfile %s" % background) if background else ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = '''
    meme %(infile)s -dna %(revcomp)s
         -p %(meme_threads)s
         -mod %(meme_model)s
         -nmotifs %(meme_nmotifs)s
         -oc %(tmpdir)s
         -maxsize %(meme_max_size)s
         %(background_model)s
         %(psp_file)s
         %(meme_options)s
    2> %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)