def checkFileExistence(infile, outfile):
    '''check whether files exist.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="file",
        suffixes=P.as_list(PARAMS.get('%s_regex_exist' % track, "")))
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum,
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="md5sum",
        suffixes=P.as_list(PARAMS.get('%s_regex_md5' % track, "")))
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="wc -l",
        suffixes=P.as_list(PARAMS.get('%s_regex_linecount' % track, "")))
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''
    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest
    S/N ratio are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
def run_test(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''
    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get(
        "%s_pipeline" % track,
        track[len("test_"):])
    pipeline_targets = P.as_list(
        PARAMS.get("%s_target" % track, "full"))

    # do not run on the cluster, mirroring how a pipeline
    # is started from the head node
    # to_cluster = False

    template_statement = (
        "cd %%(track)s.dir; "
        "xvfb-run -d cgatflow %%(pipeline_name)s "
        "%%(pipeline_options)s "
        "%%(workflow_options)s make %s "
        "-L ../%%(outfile)s "
        "-S ../%%(outfile)s.stdout "
        "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(statement, ignore_errors=True, job_memory="unlimited")
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(statements, ignore_errors=True, job_memory="unlimited")
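
# A minimal sketch (hypothetical, never called by the pipeline) of how the
# template statement in run_test() expands when a test defines several
# targets. The key names mirror the PARAMS lookups above; the test name and
# values are invented for illustration.
def _example_run_test_statements():
    params = {"test_mapping_pipeline": "mapping",
              "test_mapping_target": "full,build_report"}
    track = "test_mapping"
    pipeline_name = params.get("%s_pipeline" % track, track[len("test_"):])
    targets = params["%s_target" % track].split(",")
    # one statement per target; run_test() passes the list to P.run()
    return ["cgatflow %s make %s" % (pipeline_name, target)
            for target in targets]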
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetitive RNA types.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.as_list(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"],
        job_memory=PARAMS["job_memory"])
def importRepeatsFromUCSC(outfile):
    """This task downloads UCSC repeat types as identified in the
    configuration file.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.as_list(PARAMS["ucsc_repeattypes"]),
        outfile=outfile,
        job_memory=PARAMS["job_memory"])
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals; only the top 10% of intervals (ranked by peakval)
    are used. Also, only the 200 bp segment around the peak is used,
    not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''

        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest
    S/N ratio are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method + "_halfwidth"])
        full = False
    except ValueError:
        full = True
        halfwidth = None

    try:
        maxsize = int(PARAMS[method + "_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.as_list(PARAMS[method + '_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method + "_num_sequences"],
        proportion=PARAMS[method + "_proportion"],
        min_sequences=PARAMS[method + "_min_sequences"],
        order=PARAMS[method + '_score'])

    if nseq == 0:
        E.warn("%s: no sequences - %s skipped" % (outfile, method))
        P.touch(outfile)
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for a ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .yml file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets need
    to be defined in the [offsets] section. If no offsets are
    defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    Returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``.
    The following will associate all tracks with the same bam file::

        [bams]
        %=all.bam
    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.as_list(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", r"\\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.as_list(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", r"\\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(list(map(int, value.split(","))))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the same as "
            "number of offsets: %s" % (str(bamfiles), str(offsets)))

    return bamfiles, offsets
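
# A minimal usage sketch for getAssociatedBAMFiles(), assuming a hypothetical
# track object that exposes asFile() as described above. Nothing here is
# called by the pipeline; it only illustrates how the returned BAM files and
# offsets pair up.
def _example_use_associated_bamfiles(track):
    bamfiles, offsets = getAssociatedBAMFiles(track)
    if not bamfiles:
        E.warn("%s: no BAM files associated" % track)
        return
    for bamfile, offset in zip(bamfiles, offsets):
        E.info("%s: using %s with tag shift %i" % (track, bamfile, offset))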
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''
    outf = IOTools.open_file(outfile, "w")
    outf.write("\t".join((
        "track", "status",
        "job_finished",
        "nfiles", "nref",
        "missing", "extra",
        "different",
        "different_md5",
        "different_lines",
        "same",
        "same_md5",
        "same_lines",
        "same_exist",
        "files_missing",
        "files_extra",
        "files_different_md5",
        "files_different_lines")) + "\n")

    for infile in infiles:
        E.info("working on {}".format(infile))
        track = P.snip(infile, ".stats")

        logfiles = glob.glob(track + "*.log")
        job_finished = True
        for logfile in logfiles:
            is_complete = IOTools.is_complete(logfile)
            E.debug("logcheck: {} = {}".format(logfile, is_complete))
            job_finished = job_finished and is_complete

        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile("|".join(P.as_list(regex_exist)))

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile("|".join(P.as_list(regex_linecount)))

        regex_md5 = PARAMS.get('%s_regex_md5' % track, None)
        if regex_md5:
            regex_md5 = re.compile("|".join(P.as_list(regex_md5)))

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(IOTools.open_file(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(IOTools.open_file(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)

        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those which are only checked for existence
        if regex_exist:
            same_exist = set([x for x in different
                              if regex_exist.search(x)])
            different = set([x for x in different
                             if not regex_exist.search(x)])
        else:
            same_exist = set()

        # select those which are only checked for the number of lines
        if regex_linecount:
            check_lines = [x for x in different
                           if regex_linecount.search(x)]
            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)

            dd = (cmp_data['nlines'][check_lines] ==
                  ref_data['nlines'][check_lines])
            same_lines = set(dd.index[dd])
        else:
            different_lines = set()
            same_lines = set()

        # remainder - check md5
        if regex_md5:
            check_md5 = [x for x in different
                         if regex_md5.search(x)]
            dd = (cmp_data['md5'][check_md5] !=
                  ref_data['md5'][check_md5])
            different_md5 = set(dd.index[dd])

            dd = (cmp_data['md5'][check_md5] ==
                  ref_data['md5'][check_md5])
            same_md5 = set(dd.index[dd])
        else:
            different_md5 = set()
            same_md5 = set()

        if job_finished and (len(missing) + len(extra) +
                             len(different_md5) + len(different_lines) == 0):
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(map(str, (
            track,
            status,
            job_finished,
            len(cmp_data),
            len(ref_data),
            len(missing),
            len(extra),
            len(different_md5) + len(different_lines),
            len(different_md5),
            len(different_lines),
            len(same_md5) + len(same_lines) + len(same_exist),
            len(same_md5),
            len(same_lines),
            len(same_exist),
            ",".join(missing),
            ",".join(extra),
            ",".join(different_md5),
            ",".join(different_lines),
        ))) + "\n")

    outf.close()
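
# A small illustrative sketch (hypothetical data, not called by the pipeline)
# of the classification order used in compareCheckSums(): files matching the
# "exist" regex are only checked for existence, files matching the
# "linecount" regex are compared by line count, and everything else falls
# through to the md5 comparison.
def _example_classify(files, regex_exist, regex_linecount):
    '''partition *files* into existence-only, line-count and md5 checks.'''
    exist_only = set(f for f in files
                     if regex_exist and regex_exist.search(f))
    remainder = set(files) - exist_only
    by_lines = set(f for f in remainder
                   if regex_linecount and regex_linecount.search(f))
    by_md5 = remainder - by_lines
    return exist_only, by_lines, by_md5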
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml"])

# WARNING: pipeline names with underscores in their name are not allowed
TESTS = sorted(set(["test_{}".format(x.split("_")[1])
                    for x in PARAMS.keys()
                    if x.startswith("test_")]))


# obtain prerequisite generic data
@files([(None, "%s.tgz" % x)
        for x in P.as_list(PARAMS.get("prerequisites", ""))])
def setupPrerequisites(infile, outfile):
    '''setup pre-requisites.

    These are tar-balls that are unpacked, but not run.
    '''
    # to_cluster = False
    track = P.snip(outfile, ".tgz")

    # obtain data - should overwrite pipeline.yml file
    statement = '''
    wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz'''
    P.run(statement)

    tf = tarfile.open(outfile)
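
# An illustrative sketch (hypothetical keys, not called by the pipeline) of
# how TESTS above is derived from pipeline.yml, and why pipeline names with
# underscores are not allowed: the test name is taken from the second
# "_"-separated field, so a key for a pipeline called "my_mapping" would be
# truncated to "test_my".
def _example_test_names():
    keys = ["test_mapping_target", "test_mapping_pipeline",
            "test_readqc_target"]
    return sorted(set("test_{}".format(k.split("_")[1]) for k in keys))
    # -> ["test_mapping", "test_readqc"]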
def processReads(infile, outfiles):
    '''process reads from .fastq and other sequence files.
    '''
    trimmomatic_options = PARAMS["trimmomatic_options"]

    if PARAMS["auto_remove"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            "contaminants.fasta",
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"],
            PARAMS["trimmomatic_min_adapter_len"],
            PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options
    elif PARAMS["trimmomatic_adapter"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            PARAMS["trimmomatic_adapter"],
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"],
            PARAMS["trimmomatic_min_adapter_len"],
            PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

    job_threads = PARAMS["threads"]
    job_memory = "12G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = PipelinePreprocess.MasterProcessor(
        save=PARAMS["save"],
        summarize=PARAMS["summarize"],
        threads=PARAMS["threads"],
        qual_format=PARAMS['qual_format'])

    for tool in P.as_list(PARAMS["preprocessors"]):

        if tool == "fastx_trimmer":
            m.add(PipelinePreprocess.FastxTrimmer(
                PARAMS["fastx_trimmer_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimmomatic":
            m.add(PipelinePreprocess.Trimmomatic(
                trimmomatic_options,
                threads=PARAMS["threads"]))
        elif tool == "sickle":
            m.add(PipelinePreprocess.Sickle(
                PARAMS["sickle_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimgalore":
            m.add(PipelinePreprocess.Trimgalore(
                PARAMS["trimgalore_options"],
                threads=PARAMS["threads"]))
        elif tool == "flash":
            m.add(PipelinePreprocess.Flash(
                PARAMS["flash_options"],
                threads=PARAMS["threads"]))
        elif tool == "reversecomplement":
            m.add(PipelinePreprocess.ReverseComplement(
                PARAMS["reversecomplement_options"]))
        elif tool == "pandaseq":
            m.add(PipelinePreprocess.Pandaseq(
                PARAMS["pandaseq_options"],
                threads=PARAMS["threads"]))
        elif tool == "cutadapt":
            cutadapt_options = PARAMS["cutadapt_options"]
            if PARAMS["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            m.add(PipelinePreprocess.Cutadapt(
                cutadapt_options,
                threads=PARAMS["threads"],
                untrimmed=PARAMS['cutadapt_reroute_untrimmed'],
                process_paired=PARAMS["cutadapt_process_paired"]))
        else:
            raise NotImplementedError("tool '%s' not implemented" % tool)

    statement = m.build((infile,), "processed.dir/trimmed-", track)

    P.run(statement)
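
# A hedged configuration sketch for processReads(): the "preprocessors"
# option is read with P.as_list() and each entry selects, in order, one of
# the tools wired up above. The option names below mirror the PARAMS lookups
# in processReads(); the values are invented for illustration and are not the
# pipeline's defaults. Nothing here is called by the pipeline.
def _example_preprocess_config():
    return {
        "preprocessors": "trimmomatic,cutadapt",  # run Trimmomatic, then Cutadapt
        "threads": 4,
        "auto_remove": 0,  # do not build ILLUMINACLIP from contaminants.fasta
        "trimmomatic_options": "LEADING:3 TRAILING:3 MINLEN:36",
        "cutadapt_options": "-q 20",
    }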
regex(".*/(.*).bed.gz"), r"motifs/\1.control.fasta") def exportMotifControlSequences(infile, outfile): '''for each interval, export the left and right sequence segment of the same size. ''' PipelineMotifs.exportSequencesFromBedFile( infile, outfile, masker=PARAMS['motifs_masker'], mode="leftright") ############################################################ ############################################################ ############################################################ @active_if("meme" in P.as_list(PARAMS["methods"]) or "disc_meme" in P.as_list(PARAMS["methods"])) @transform(loadIntervals, suffix("_intervals.load"), ".meme.fasta") def exportMemeIntervalSequences(infile, outfile): track = os.path.basename(P.snip(infile, "_intervals.load")) exportIntervalSequences(infile, outfile, track, "meme") ############################################################ @follows(mkdir("meme.dir")) @active_if("meme" in P.as_list(PARAMS["methods"])) @transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"),
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml"])

dbname = PARAMS['db_name']
unmapped = PipelineEnrichment.getUnmapped(PARAMS)
outfilesuffixes = ["_genestoterms.tsv",
                   "_termstogenes.tsv",
                   "_termstodetails.tsv",
                   "_termstoont.tsv"]

unmappedouts = [["annotations.dir/%s%s" % (u, s)
                 for s in outfilesuffixes]
                for u in unmapped]

hpatissues = P.as_list(PARAMS.get('hpa_tissue', {}))
hpatissues = ['clean_backgrounds.dir/%s_hpa_background.tsv'
              % tissue.replace(" ", "_") for tissue in hpatissues]


########################################################
# Set up database connection
########################################################

def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.