def checkFileExistence(infile, outfile):
    '''check whether files exist.

    The track name is `infile` with its ``.log`` suffix removed. The
    file patterns to test are read from the configuration value
    ``<track>_regex_exist`` (no patterns if the value is absent) and
    the work is delegated to ``compute_file_metrics`` with
    ``metric="file"``.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    # P.as_list returns a list unchanged, so one call is sufficient.
    suffixes = P.as_list(PARAMS.get('%s_regex_exist' % track, ""))
    compute_file_metrics(infile, outfile, metric="file", suffixes=suffixes)
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    The track name is `infile` with its ``.log`` suffix removed. The
    file patterns to checksum are read from the configuration value
    ``<track>_regex_md5`` (no patterns if the value is absent) and the
    work is delegated to ``compute_file_metrics`` with
    ``metric="md5sum"``.

    Files are uncompressed before computing the checksum as gzip
    stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    # P.as_list returns a list unchanged, so one call is sufficient.
    suffixes = P.as_list(PARAMS.get('%s_regex_md5' % track, ""))
    compute_file_metrics(infile, outfile, metric="md5sum", suffixes=suffixes)
def buildLineCounts(infile, outfile):
    '''compute line counts.

    The track name is `infile` with its ``.log`` suffix removed. The
    file patterns to count are read from the configuration value
    ``<track>_regex_linecount`` (no patterns if the value is absent)
    and the work is delegated to ``compute_file_metrics`` with
    ``metric="wc -l"``.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    # P.as_list returns a list unchanged, so one call is sufficient.
    suffixes = P.as_list(PARAMS.get('%s_regex_linecount' % track, ""))
    compute_file_metrics(infile, outfile, metric="wc -l", suffixes=suffixes)
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against the tomtom databases.

    If `infile` is empty, a warning is issued, `outfile` is touched
    and nothing else is done. Otherwise ``tomtom`` is run into a
    temporary directory, that directory is moved to
    ``<exportdir>/tomtom/<outfile>`` (replacing any previous result)
    and its ``tomtom.txt`` is copied to `outfile`.
    '''
    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    # Create the temporary directory only once we know tomtom will run,
    # so that the early return above does not leave an orphaned directory.
    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))
    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "tomtom", outfile)

    # NOTE: tmpdir, databases, infile and outfile are referenced by name
    # in the statement below - do not rename them.
    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results; an already existing parent directory is fine,
    # any other failure (e.g. permissions) is reported.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
       to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    The track name is `infile` without the ``_intervals.load`` suffix.
    If no sequences are written, a warning is issued and `outfile` is
    touched.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    # Same keyword set that ``**locals()`` would provide at this point.
    settings = P.substitute_parameters(
        infile=infile,
        outfile=outfile,
        track=track,
        dbhandle=dbhandle)

    nseq = motifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(settings['motifs_masker']),
        halfwidth=int(settings["motifs_halfwidth"]),
        maxsize=int(settings["motifs_max_size"]),
        proportion=settings["motifs_proportion"],
        min_sequences=settings["motifs_min_sequences"],
        num_sequences=settings["motifs_num_sequences"],
        order=settings['motifs_score'])

    if nseq != 0:
        return

    E.warn("%s: no sequences - meme skipped" % outfile)
    iotools.touch_file(outfile)
def run_test(infile, outfile):
    '''run a test.

    The track is `outfile` without its ``.log`` suffix. The pipeline
    to run is taken from ``<track>_pipeline`` (default: the track name
    without the leading ``test_``) and the targets from
    ``<track>_target`` (default: ``full``).

    Multiple targets are run iteratively.
    '''
    # NOTE: track, pipeline_name and outfile are referenced by name in
    # the statement template below - do not rename them.
    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])
    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    #to_cluster = False

    # ``%%(...)s`` survives the target substitution below as ``%(...)s``
    # and is filled in later when the statement is run.
    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(statement, ignore_errors=True, job_memory="unlimited")
    else:
        statements = [template_statement % pipeline_target
                      for pipeline_target in pipeline_targets]
        # Pass the list that was just built; the previous code passed
        # ``statement``, which is unbound in this branch.
        P.run(statements, ignore_errors=True, job_memory="unlimited")
def get_repeat_gff(outfile):
    """Download UCSC repetitive RNA types into `outfile`.

    Delegates to ``ModuleTrna.getRepeatDataFromUCSC`` using a handle
    from ``connectToUCSC()``, the repeat classes listed in the
    ``ucsc_rnatypes`` configuration value and the contig filter in
    ``ucsc_remove_contigs``. The job memory is fixed at 3G.
    """
    ucsc_handle = connectToUCSC()
    rna_classes = P.as_list(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ucsc_remove_contigs"]

    ModuleTrna.getRepeatDataFromUCSC(
        dbhandle=ucsc_handle,
        repclasses=rna_classes,
        outfile=outfile,
        remove_contigs_regex=contig_filter,
        job_memory="3G")
def importRepeatsFromUCSC(outfile):
    """Download the UCSC repeat types named in the configuration file.

    Delegates to ``gtfsubset.getRepeatDataFromUCSC`` using a handle
    from ``connectToUCSC()``, the repeat classes listed in the
    ``ucsc_repeattypes`` configuration value and the memory setting
    in ``job_memory``.
    """
    ucsc_handle = connectToUCSC()
    repeat_classes = P.as_list(PARAMS["ucsc_repeattypes"])
    memory = PARAMS["job_memory"]

    gtfsubset.getRepeatDataFromUCSC(
        dbhandle=ucsc_handle,
        repclasses=repeat_classes,
        outfile=outfile,
        job_memory=memory)
def importRNAAnnotationFromUCSC(outfile):
    """Download UCSC repetitive RNA types into `outfile`.

    Delegates to ``gtfsubset.getRepeatDataFromUCSC`` using a handle
    from ``connectToUCSC()``, the repeat classes listed in the
    ``ucsc_rnatypes`` configuration value, the contig filter in
    ``ncbi_remove_contigs`` and the memory setting in ``job_memory``.
    """
    ucsc_handle = connectToUCSC()
    rna_classes = P.as_list(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ncbi_remove_contigs"]
    memory = PARAMS["job_memory"]

    gtfsubset.getRepeatDataFromUCSC(
        dbhandle=ucsc_handle,
        repclasses=rna_classes,
        outfile=outfile,
        remove_contigs_regex=contig_filter,
        job_memory=memory)
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    Sequences are written to ``in.fa`` inside a temporary directory.
    If none are written, a warning is issued and `outfile` is touched;
    otherwise ``meme`` is run and its results are handed to
    ``collectMEMEResults``.

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]),
        "meme",
        outfile)

    # Not used further down; the construction is kept as is because it
    # may have side effects - TODO confirm and remove if it has none.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    # NOTE: tmpdir, tmpfasta and outfile are referenced by name in the
    # statement below - do not rename them.
    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    # nothing to search in: leave an empty output and stop
    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    statement = '''
    meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
       to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    All settings are read from ``PARAMS`` under keys prefixed with
    `method`. A ``<method>_halfwidth`` that is not an integer switches
    to full-interval export; a ``<method>_max_size`` that is not an
    integer disables the size limit. If no sequences are written, a
    warning is issued and `outfile` is touched.
    '''
    dbhandle = connect()

    # non-integer halfwidth -> export the full interval
    try:
        halfwidth = int(PARAMS[method+"_halfwidth"])
    except ValueError:
        halfwidth = None
        full = True
    else:
        full = False

    # non-integer max_size -> no upper limit
    try:
        maxsize = int(PARAMS[method+"_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.as_list(PARAMS[method+'_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method+"_num_sequences"],
        proportion=PARAMS[method+"_proportion"],
        min_sequences=PARAMS[method+"_min_sequences"],
        order=PARAMS[method+'_score'])

    if nseq != 0:
        return

    E.warn("%s: no sequences - %s skipped" % (outfile, method))
    P.touch_file(outfile)
def processReads(infile, outfiles):
    '''process reads from .fastq and other sequence files.

    Builds a ``preprocess.MasterProcessor`` from the configuration,
    adds one processing step per entry of the ``preprocessors``
    configuration value (in the listed order) and runs the resulting
    statement on `infile`, writing to ``processed.dir/trimmed-``.

    Raises NotImplementedError for an unknown preprocessor name.
    '''
    trimmomatic_options = P.get_params()["trimmomatic_options"]

    # Pick the adapter file for ILLUMINACLIP: automatic contaminant
    # removal takes precedence over an explicitly configured adapter.
    if P.get_params()["auto_remove"]:
        adapter_fasta = "contaminants.fasta"
    elif P.get_params()["trimmomatic_adapter"]:
        adapter_fasta = P.get_params()["trimmomatic_adapter"]
    else:
        adapter_fasta = None

    if adapter_fasta is not None:
        illuminaclip = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            adapter_fasta,
            P.get_params()["trimmomatic_mismatches"],
            P.get_params()["trimmomatic_p_thresh"],
            P.get_params()["trimmomatic_c_thresh"],
            P.get_params()["trimmomatic_min_adapter_len"],
            P.get_params()["trimmomatic_keep_both_reads"])
        trimmomatic_options = illuminaclip + trimmomatic_options

    # presumably picked up by P.run from this function's local
    # variables - keep these names unchanged (TODO confirm)
    job_threads = P.get_params()["threads"]
    job_memory = "12G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = preprocess.MasterProcessor(
        save=P.get_params()["save"],
        summarize=P.get_params()["summarize"],
        threads=P.get_params()["threads"],
        qual_format=P.get_params()['qual_format'])

    for tool in P.as_list(P.get_params()["preprocessors"]):

        if tool == "fastx_trimmer":
            step = preprocess.FastxTrimmer(
                P.get_params()["fastx_trimmer_options"],
                threads=P.get_params()["threads"])
        elif tool == "trimmomatic":
            step = preprocess.Trimmomatic(
                trimmomatic_options,
                threads=P.get_params()["threads"])
        elif tool == "sickle":
            step = preprocess.Sickle(
                P.get_params()["sickle_options"],
                threads=P.get_params()["threads"])
        elif tool == "trimgalore":
            step = preprocess.Trimgalore(
                P.get_params()["trimgalore_options"],
                threads=P.get_params()["threads"])
        elif tool == "flash":
            step = preprocess.Flash(
                P.get_params()["flash_options"],
                threads=P.get_params()["threads"])
        elif tool == "reversecomplement":
            # the only step constructed without a thread count
            step = preprocess.ReverseComplement(
                P.get_params()["reversecomplement_options"])
        elif tool == "pandaseq":
            step = preprocess.Pandaseq(
                P.get_params()["pandaseq_options"],
                threads=P.get_params()["threads"])
        elif tool == "cutadapt":
            cutadapt_options = P.get_params()["cutadapt_options"]
            if P.get_params()["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            step = preprocess.Cutadapt(
                cutadapt_options,
                threads=P.get_params()["threads"],
                untrimmed=P.get_params()['cutadapt_reroute_untrimmed'],
                process_paired=P.get_params()["cutadapt_process_paired"])
        else:
            raise NotImplementedError("tool '%s' not implemented" % tool)

        m.add(step)

    statement = m.build((infile,), "processed.dir/trimmed-", track)
    P.run(statement)
def checkFile(infile, outfile):
    """Write a short tab-separated description of a sequence file.

    `infile` is wrapped in ``PipelineAssembly.SequencingData`` and its
    ``filename``, ``fileformat``, ``compressed``, ``paired`` and
    ``interleaved`` attributes are written to `outfile`, one
    ``key<TAB>value`` pair per line.
    """
    seqdat = PipelineAssembly.SequencingData(infile)
    # the context manager closes the file even if formatting or
    # writing fails
    with open(outfile, 'w') as outf:
        outf.write(
            "name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".
            format(seqdat.filename, seqdat.fileformat, seqdat.compressed,
                   seqdat.paired, seqdat.interleaved))


##################################################
#Run Selected Assemblers
##################################################
#get the list of assemblers to run on the data
ASSEMBLERS = P.as_list(PARAMS.get("Assembler_assemblers", ""))


###################################################
# Run Megahit
###################################################
@active_if("megahit" in ASSEMBLERS)
@follows(checkFile)
@follows(mkdir("megahit_out.dir"))
@transform(SEQUENCEFILES, SEQUENCEFILES_REGEX,
           r"megahit_out.dir/\1_complete.log")
def runMegahit(infile, outfile):
    # NOTE(review): only the setup part of this task is visible here;
    # the remainder of the body may follow elsewhere - TODO confirm.
    job_memory = str(PARAMS["Megahit_clus_memory"]) + "G"
    job_threads = int(PARAMS["Megahit_clus_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    assembler = PipelineAssembly.Megahit(seqdat, "megahit_out.dir", PARAMS)
import subprocess ################################################### ################################################### ################################################### # Pipeline configuration ################################################### # load options from the config file import cgatcore.pipeline as P P.get_parameters([ "%s/pipeline.yml" % __file__[:-len(".py")], "../pipeline.yml", "pipeline.yml" ]) PARAMS = P.PARAMS FEATURES = P.as_list(PARAMS.get("General_feature_list")) FEATUREPAIRS = P.as_list(PARAMS.get("General_feature_pairs")) FEATUREPAIRS = [ "{}_BY_{}".format(x.split(":")[0], x.split(":")[1]) for x in FEATUREPAIRS ] ALLFEATURES = FEATURES + FEATUREPAIRS from pipeline_assembly import PipelineAssembly from pipeline_enumerate import PipelineEnumerate from pipeline_filter import PipelineFilter #get all files within the directory to process SEQUENCEFILES = ("*.fasta", "*.fasta.gz", "*.fasta.1.gz", "*.fasta.1", "*.fna", "*.fna.gz", "*.fna.1.gz", "*.fna.1", "*.fa", "*.fa.gz", "*.fa.1.gz", "*.fa.1", "*.fastq", "*.fastq.gz",
["%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml", "pipeline.yml"]) dbname = PARAMS['db_name'] unmapped = enrichment.getUnmapped(PARAMS) outfilesuffixes = ["_genestoterms.tsv", "_termstogenes.tsv", "_termstodetails.tsv", "_termstoont.tsv"] unmappedouts = [["annotations.dir/%s%s" % (u, s) for s in outfilesuffixes] for u in unmapped] hpatissues = P.as_list(PARAMS.get('hpa_tissue', [])) hpatissues = ['clean_backgrounds.dir/%s_hpa_background.tsv' % tissue.replace(" ", "_") for tissue in hpatissues] ######################################################## # Set up database connection ######################################################## def connect(): '''utility function to connect to database. Use this method to connect to the pipeline database. Additional databases can be attached here as well. Returns an sqlite3 database handle.
regex(".*/(.*).bed.gz"), r"motifs/\1.control.fasta") def exportMotifControlSequences(infile, outfile): '''for each interval, export the left and right sequence segment of the same size. ''' PipelineMotifs.exportSequencesFromBedFile( infile, outfile, masker=PARAMS['motifs_masker'], mode="leftright") ############################################################ ############################################################ ############################################################ @active_if("meme" in P.as_list(PARAMS["methods"]) or "disc_meme" in P.as_list(PARAMS["methods"])) @transform(loadIntervals, suffix("_intervals.load"), ".meme.fasta") def exportMemeIntervalSequences(infile, outfile): track = os.path.basename(P.snip(infile, "_intervals.load")) exportIntervalSequences(infile, outfile, track, "meme") ############################################################ @follows(mkdir("meme.dir")) @active_if("meme" in P.as_list(PARAMS["methods"])) @transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"),
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .yml file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    Default tracks and offsets can be specified using a placeholder
    ``%``. The following will associate all tracks with the same bam
    file::

        [bams]
        %=all.bam

    Returns a tuple ``(bamfiles, offsets)`` of two lists of equal
    length.

    Raises ValueError if the number of BAM files differs from the
    number of offsets.
    '''

    def placeholder_matches(pattern, name):
        # ``%`` stands for any run of non-whitespace characters. The
        # substitution uses str.replace with a raw string because
        # ``\S`` is not a valid escape in an re.sub replacement
        # template (re.error on Python >= 3.7).
        expression = pattern.replace("%", r"\S+")
        return re.search(expression, name, re.IGNORECASE) is not None

    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        bams_key = "bams_%s" % fn.lower()
        if bams_key in PARAMS:
            # explicit association; each entry may be a glob expression
            for ff in P.as_list(PARAMS[bams_key]):
                bamfiles.extend(glob.glob(ff))
        else:
            # fall back to placeholder entries in the [bams] section
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern and placeholder_matches(pattern, fn):
                    bamfiles.extend(glob.glob(value))

    offsets = []
    offsets_key = "offsets_%s" % fn.lower()
    if offsets_key in PARAMS:
        offsets = list(map(int, P.as_list(PARAMS[offsets_key])))
    else:
        # fall back to placeholder entries in the [offsets] section
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern and placeholder_matches(pattern, fn):
                offsets.extend(map(int, value.split(",")))

    # no offsets configured: one zero offset per BAM file
    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
PARAMS = P.get_parameters([ "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml", "pipeline.yml" ]) # WARNING: pipeline names with underscores in their name are not allowed TESTS = sorted( set([ "test_{}".format(x.split("_")[1]) for x in PARAMS.keys() if x.startswith("test_") ])) # obtain prerequisite generic data @files([(None, "%s.tgz" % x) for x in P.as_list(PARAMS.get("prerequisites", ""))]) def setupPrerequisites(infile, outfile): '''setup pre-requisites. These are tar-balls that are unpacked, but not run. ''' #to_cluster = False track = P.snip(outfile, ".tgz") # obtain data - should overwrite pipeline.yml file statement = ''' wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz''' P.run(statement) tf = tarfile.open(outfile)
def _compile_track_regex(track, kind):
    '''return the compiled ``<track>_regex_<kind>`` patterns or None.

    The configured patterns are joined with ``|`` into one expression.
    None is returned if the configuration value is missing or empty.
    '''
    patterns = PARAMS.get('%s_regex_%s' % (track, kind), None)
    if not patterns:
        return None
    return re.compile("|".join(P.as_list(patterns)))


def _partition_by_column(cmp_data, ref_data, column, names):
    '''split `names` into (different, same) by comparing one column.

    `column` is looked up for every name in both tables; names whose
    values differ go into the first set, those that agree into the
    second.
    '''
    observed = cmp_data[column][names]
    expected = ref_data[column][names]
    differs = (observed != expected)
    matches = (observed == expected)
    return set(differs.index[differs]), set(matches.index[matches])


def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.

    For every ``<track>.stats`` file in `infiles` the table is compared
    with ``<track>.ref``. Files present in both tables are tested in
    this order: files matching ``<track>_regex_exist`` only need to
    exist, files matching ``<track>_regex_linecount`` are compared on
    the ``nlines`` column, and of the remainder those matching
    ``<track>_regex_md5`` are compared on the ``md5`` column.

    A track is ``OK`` if all its ``<track>*.log`` files are complete
    and there are no missing, extra or different files; otherwise it is
    ``FAIL``. One tab-separated row per track is written to `outfile`;
    file names within a column are comma-separated and sorted.

    Raises ValueError if a track has no reference file.
    '''
    header = ("track", "status", "job_finished", "nfiles", "nref",
              "missing", "extra", "different", "different_md5",
              "different_lines", "same", "same_md5", "same_lines",
              "same_exist", "files_missing", "files_extra",
              "files_different_md5", "files_different_lines")

    # the context manager closes the output even when a track without
    # reference data aborts the comparison
    with iotools.open_file(outfile, "w") as outf:
        outf.write("\t".join(header) + "\n")

        for infile in infiles:
            E.info("working on {}".format(infile))
            track = P.snip(infile, ".stats")

            # every log file of the track has to be complete
            job_finished = True
            for logfile in glob.glob(track + "*.log"):
                is_complete = iotools.is_complete(logfile)
                E.debug("logcheck: {} = {}".format(logfile, is_complete))
                job_finished = job_finished and is_complete

            reffile = track + ".ref"

            # regular expression of files to test only for existence
            regex_exist = _compile_track_regex(track, "exist")
            regex_linecount = _compile_track_regex(track, "linecount")
            regex_md5 = _compile_track_regex(track, "md5")

            if not os.path.exists(reffile):
                raise ValueError('no reference data defined for %s' % track)

            with iotools.open_file(infile) as inf:
                cmp_data = pandas.read_csv(inf, sep="\t", index_col=0)
            with iotools.open_file(reffile) as inf:
                ref_data = pandas.read_csv(inf, sep="\t", index_col=0)

            shared_files = set(cmp_data.index).intersection(ref_data.index)
            missing = set(ref_data.index).difference(cmp_data.index)
            extra = set(cmp_data.index).difference(ref_data.index)

            # files still awaiting a content check
            different = set(shared_files)

            # remove those for which only check for existence
            if regex_exist:
                same_exist = {x for x in different if regex_exist.search(x)}
                different = different.difference(same_exist)
            else:
                same_exist = set()

            # select those for which only check for number of lines
            if regex_linecount:
                check_lines = [x for x in different
                               if regex_linecount.search(x)]
                different_lines, same_lines = _partition_by_column(
                    cmp_data, ref_data, 'nlines', check_lines)
                different = different.difference(check_lines)
            else:
                different_lines = set()
                same_lines = set()

            # remainder - check md5
            if regex_md5:
                check_md5 = [x for x in different if regex_md5.search(x)]
                different_md5, same_md5 = _partition_by_column(
                    cmp_data, ref_data, 'md5', check_md5)
            else:
                different_md5 = set()
                same_md5 = set()

            nproblems = (len(missing) + len(extra) +
                         len(different_md5) + len(different_lines))
            if job_finished and nproblems == 0:
                status = "OK"
            else:
                status = "FAIL"

            # sets have no defined order; sort the names so that the
            # report is reproducible between runs
            outf.write("\t".join(
                map(str, (
                    track,
                    status,
                    job_finished,
                    len(cmp_data),
                    len(ref_data),
                    len(missing),
                    len(extra),
                    len(different_md5) + len(different_lines),
                    len(different_md5),
                    len(different_lines),
                    len(same_md5) + len(same_lines) + len(same_exist),
                    len(same_md5),
                    len(same_lines),
                    len(same_exist),
                    ",".join(sorted(missing)),
                    ",".join(sorted(extra)),
                    ",".join(sorted(different_md5)),
                    ",".join(sorted(different_lines)),
                ))) + "\n")
(entry.gene_id, transcript2gene_dict[entry.transcript_id])) else: transcript2gene_dict[entry.transcript_id] = entry.gene_id with iotools.open_file(outfile, "w") as outf: outf.write("transcript_id\tgene_id\n") for key, value in sorted(transcript2gene_dict.items()): outf.write("%s\t%s\n" % (key, value)) ################################################### # count-based quantifiers ################################################### @active_if("featurecounts" in P.as_list(PARAMS["quantifiers"])) @follows(mkdir("featurecounts.dir")) @transform(["%s.bam" % x.asFile() for x in BAM_TRACKS], regex("(\S+).bam"), add_inputs(PARAMS['geneset']), [ r"featurecounts.dir/\1/transcripts.tsv.gz", r"featurecounts.dir/\1/genes.tsv.gz" ]) def runFeatureCounts(infiles, outfiles): ''' Counts reads falling into "features" - in each transcript and each gene. A read is counted as overlapping with a feature if at least one bp overlaps. Pairs and strandedness can be used to resolve reads falling into