def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        IOTools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def runGLAM2(infile, outfile, dbhandle):
    '''run glam2 on all intervals and motifs.

    In order to increase the signal/noise ratio, GLAM2 is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    to_cluster = True

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "glam2", outfile)
    track = infile[:-len(".fasta")]

    tmpdir = tempfile.mkdtemp()
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"])

    min_sequences = int(nseq / 10.0)

    # the trailing "n" is glam2's alphabet argument (nucleotide)
    statement = '''
    %(execglam2)s -2 -O %(tmpdir)s %(glam2_options)s -z %(min_sequences)i n %(tmpfasta)s > %(outfile)s.log
    '''
    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "glam2.txt"), outfile)
def main(argv=None):

    workflow_options = []
    if "--local" in argv:
        workflow_options.append("--local")
        workflow_options.append("-p {}".format(
            P.get_params()["cluster"]["num_jobs"]))

    P.get_params()["workflow_options"] = " ".join(workflow_options)

    # manually set location of test scripts - this needs to be better
    # organized
    # 1. make scripts live alongside pipeline_testing.py
    # 2. make scripts available via cgatflow CLI
    # 3. include scripts in pipeline_testing
    P.get_params()["scriptsdir"] = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "scripts")

    P.main(argv)
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    # writeSequencesForIntervals expects a list of maskers
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker=["dust"],
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
def buildGeneSetAnnotations(infiles, outfile, slice):
    '''build annotations of all sets from database.

    ``slice`` can be any of the slices in the ``annotation``
    tables.'''

    statement = '''SELECT gene_id FROM %(track)s_annotation as a
                   WHERE %(where)s'''

    if slice == "all":
        where = "'1'"
    else:
        where = "is_%(slice)s" % locals()

    dbhandle = sqlite3.connect(P.get_params()["database_name"])

    subsets = []
    for f in infiles:
        assert f.endswith(".gtf.gz")
        track = f[:-len(".gtf.gz")]
        key = "%s.%s" % (track, slice)

        cc = dbhandle.cursor()
        data = [x[0] for x in cc.execute(statement % locals()).fetchall()]
        E.info("%s: adding %i genes" % (key, len(data)))

        filename = outfile + ".tmp.%s" % key
        outf = open(filename, "w")
        outf.write("gene_id\n%s\n" % "\n".join(map(str, data)))
        outf.close()
        subsets.append("--subset=%s" % ",".join((track, key, filename)))

    infiles = " ".join(infiles)
    subsets = " ".join(subsets)

    statement = '''
    cgat gff2annotator2tsv
    --section=annotations-genes
    --log=%(outfile)s.log
    --remove-regex='%(annotator_remove_pattern)s'
    %(subsets)s %(infiles)s > %(outfile)s
    '''
    P.run(statement)

    statement = '''
    rm -f %(outfile)s.tmp*
    '''
    P.run(statement)
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.open_file(infile, "r"))

    tmpfile = P.get_temp_file()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:

            distance = abs(match.start + match.width1 -
                           (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id, x, match.start, match.end, strand,
                           arrangement))

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
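
# Illustration of the arrangement encoding produced by
# ``loadBioProspector`` above (values are made up): a match whose two
# half-sites lie on strands "+-" or "-+" and are 25 bp apart is
# written as "ER25"; strands "++" or "--" give "DR" plus the distance;
# any other strand combination is written as "SM0" with the distance
# forced to zero.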
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''

        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
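
# ``collectMEMEResults`` is called by ``runMEME`` and
# ``runMEMEOnSequences`` but is not defined in this module. The
# sketch below is only an illustration of what such a helper might
# do, modelled on the copy-over logic used in ``runTomTom``; the real
# implementation may differ (for example, it may also parse the MEME
# output). It reuses the module-level ``os`` and ``shutil`` imports.
def _collectMEMEResults_sketch(tmpdir, target_path, outfile):
    '''hypothetical sketch: move MEME output from *tmpdir* to
    *target_path* and copy ``meme.txt`` to *outfile*.'''
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)
    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)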
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))
    outs = IOTools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
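
# ``maskSequences`` is used above and in ``writeSequencesForIntervals``
# but is not defined in this module. The sketch below illustrates only
# the "softmask" case described in the docstrings (soft-masked,
# lower-case residues are hard-masked to ``N``); it is an assumption
# for orientation, not the actual helper, which also supports
# dust/dustmasker via the external ``dustmasker`` tool.
def _maskSequences_softmask_sketch(sequences):
    '''hypothetical sketch: hard-mask soft-masked (lower-case) bases.'''
    return ["".join("N" if c.islower() else c for c in s)
            for s in sequences]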
def buildAnnotatorSegmentsROI(tmpdir, roi_class, outfile, overlap=None):
    '''convert segments in bed format to annotator format
    from infile to outfile.
    '''

    tmpsegments = os.path.join(tmpdir, "segments")
    to_cluster = True

    dbhandle = sqlite3.connect(P.get_params()["database_name"])

    if overlap:
        statement = '''
        SELECT roi.contig, roi.start, roi.end
        FROM regions_of_interest AS roi,
             %(overlap)s_intervals AS i
        WHERE roi.class='%(roi_class)s' AND
              i.contig = roi.contig AND
              min(roi.end, i.end) - max(roi.start, i.start) > 0
        '''
    else:
        statement = '''
        SELECT roi.contig, roi.start, roi.end
        FROM regions_of_interest AS roi
        WHERE class='%(roi_class)s'
        '''

    cc = dbhandle.cursor()
    cc.execute(statement % locals())

    noutput = 0
    contigs = collections.defaultdict(list)
    for result in cc:
        contig, start, end = result
        contigs[contig].append((start, end))
        noutput += 1

    E.info("segments for roi_class `%s` and overlap `%s`: %i" %
           (roi_class, overlap, noutput))

    outs = open(tmpsegments, "w")
    gff2annotator.outputSegments(outs, contigs, section="segments")
    outs.close()

    if noutput == 0:
        return None
    else:
        return tmpsegments
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.get_temp_filename()

    statement = '''
    cgat annotator2tsv
    --method=fdr-table
    --fdr-method=%(fdr_method)s
    --log=%(outfile)s.log
    --regex-identifier="(.*)%(suffix)s"
    %(infile)s > %(tmpfilename)s
    '''
    P.run(statement)

    tmpfile = P.get_temp_file()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()

    tmpfilename2 = tmpfile.name

    statement = '''
    cgat csv2db %(csv2db_options)s
    --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s
    '''
    P.run(statement)

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
    > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header +
                  "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(
                r":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:

            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(
                r"(\S+):(\d+)..(\d+)", match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                E.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(P.get_params()["mast_evalue"]),
                                     1, 0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(P.get_params()["mast_evalue"]),
                                     1, 0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(str(match) +
                          "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                          (motif_fg,
                           contig,
                           "\t".join(map(str, controls[id]["l"])),
                           "\t".join(map(str, controls[id]["r"])),
                           str(min_evalue),
                           str(min_pvalue),
                           str(max_nmatches),
                           ) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
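
# Illustration of the id convention handled by ``splitId`` in
# ``loadMAST`` (values are made up): a background id such as
# ``mytrack_12_l`` splits into (track="mytrack", id="12", pos="l"),
# while a foreground id such as ``mytrack_12`` splits into
# (track="mytrack", id="12"). Underscores within the track name are
# preserved because only the trailing components are stripped.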
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the
    database *dbhandle* and saved to *filename* in :term:`fasta`
    format.

    If *shuffled* is set, the sequences are shuffled before being
    written out. Shuffling is applied to the unmasked sequence;
    masking is applied afterwards.

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize*
    characters in order to avoid creating jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are
    output (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be
    used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order) and 'max'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.
    '''
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
                   FROM %(tablename)s
                   ''' % locals() + orderby

    cc = dbhandle.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    E.info("writeSequencesForIntervals %s: using at most %i sequences for pattern finding" %
           (track, cutoff))

    data = data[:cutoff]

    E.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend(
                [(contig, end, end + (end - start),
                  str(interval_id) + "_right", peakcenter)
                 for contig, start, end, interval_id, peakcenter in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            E.info("writeSequencesForIntervals %s: sequence %s is empty: "
                   "start=%i, end=%i, offset=%i - ignored" %
                   (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            E.info("writeSequencesForIntervals %s: maximum size (%i) reached - "
                   "only %i sequences output (%i ignored)" %
                   (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.open_file(filename, "w")

    for m in masker:
        if m not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, m)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end)
        outs.write(">%s\n%s\n" % (id, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
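
# Illustrative call of ``writeSequencesForIntervals`` only -- the track
# name, output file and parameter values below are made up:
#
#   dbhandle = sqlite3.connect(P.get_params()["database_name"])
#   nseq = writeSequencesForIntervals(
#       "mytrack", "mytrack.fasta", dbhandle,
#       full=False,
#       halfwidth=100,        # 200 bp window around the peak
#       maxsize=100000,
#       proportion=0.1,       # top 10% of intervals by peakval
#       masker=["dust"])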
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run CGATreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"]
        )

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          IOTools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = IOTools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))