def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="wc -l",
        suffixes=P.asList(PARAMS.get('%s_regex_linecount' % track, "")))
def checkFileExistence(infile, outfile):
    '''check whether file exists.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="file",
        suffixes=P.asList(PARAMS.get('%s_regex_exist' % track, "")))
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="md5sum",
        suffixes=P.asList(PARAMS.get('%s_regex_md5' % track, "")))
def __call__(self, track, slice=None):
    exp_statement = """
    SELECT TPM, gene_id, sample_name
    FROM sailfish_genes AS A
    JOIN samples AS B
    ON A.sample_id = B.id"""

    exp_df = self.getDataFrame(exp_statement)

    factors_statement = '''
    SELECT factor, factor_value, sample_name
    FROM samples AS A
    JOIN factors AS B
    ON A.id = B.sample_id
    WHERE factor != 'genome' '''

    factors_df = self.getDataFrame(factors_statement)

    merged_df = pd.merge(exp_df, factors_df,
                         left_on="sample_name", right_on="sample_name")

    genes = Pipeline.asList(
        Pipeline.peekParameters(
            ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

    interest_df = merged_df[merged_df['gene_id'].isin(genes)]
    interest_df['TPM'] = interest_df['TPM'].astype(float)

    return interest_df.reset_index().set_index("factor")
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''
    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def runTest(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''
    track = P.snip(outfile, ".log")

    pipeline_name = PARAMS.get(
        "%s_pipeline" % track,
        "pipeline_" + track[len("test_"):])

    pipeline_targets = P.asList(
        PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    to_cluster = False

    template_statement = '''
    (cd %%(track)s.dir;
    python %%(pipelinedir)s/%%(pipeline_name)s.py
    %%(pipeline_options)s make %s)
    >& %%(outfile)s
    '''

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(ignore_errors=True)
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(ignore_errors=True)
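# A minimal, self-contained sketch (illustration only, not pipeline code) of
# the two-stage interpolation used in runTest above, under the assumption
# that P.run() performs the second, named substitution: the single "%s" is
# filled immediately with the make target, while the escaped "%%(...)s"
# placeholders survive as "%(...)s" and are expanded later from context.
template = '(cd %%(track)s.dir; make %s) >& %%(outfile)s'
statement = template % "full"           # first pass: insert the target
print(statement)                        # (cd %(track)s.dir; make full) >& %(outfile)s
print(statement % dict(track="test_mapping",       # second pass: named values
                       outfile="test_mapping.log"))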
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track,
        PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} md5sum \;
    | perl -p -e "s/ +/\\t/g"
    | sort -k1,1
    > %(outfile)s'''

    P.run()
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track,
        PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} wc -l \;
    | sort -k1,1
    > %(outfile)s'''

    P.run()
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''
    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True
    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
            pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True
    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            statement.append(
                '''gunzip < %(filename)s''' % locals())
        else:
            statement.append(
                '''
                pslMap stdin <(gunzip < %(filename)s) stdout
                ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
def importRepeatsFromUCSC(outfile):
    """This task downloads the UCSC repeat types identified
    in the configuration file.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_repeattypes"]),
        outfile=outfile)
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetitive RNA types.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"])
def importRepeatsFromUCSC(outfile):
    """This task downloads the UCSC repeat types identified
    in the configuration file.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_repeattypes"]),
        outfile=outfile,
        job_memory=PARAMS["job_memory"])
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetitive RNA types.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.asList(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"],
        job_memory=PARAMS["job_memory"])
def runMemeChIP(infile, outfile):
    '''run MemeChIP'''

    outdir = outfile.replace(".memechip", "")
    bfile = infile.replace(".foreground.fasta", ".background.bfile")

    # MEME-ChIP needs each db in the list to be preceded by a "-db" flag
    motifDb = " -db ".join(P.asList(PARAMS["meme_motif_db"]))

    nmotifs = PARAMS["meme_nmotif"]
    meme_max_jobs = PARAMS["meme_meme_maxsize"]

    # nmeme - the upper bound on the number of sequences that are passed to
    #   MEME. This is required because MEME takes too long to run for very
    #   large sequence sets. All input sequences are passed to MEME if there
    #   are no more than this limit. Default nmeme = 600.
    # ccut - the maximum length of a sequence to use before it is trimmed to
    #   a central region of this size. A value of 0 indicates that sequences
    #   should not be trimmed.
    # meme-maxsize - change the largest allowed dataset size.
    #   Default meme-maxsize is 100,000.
    # With the default settings for -nmeme (600) and -ccut (100), the largest
    # possible dataset size would be 60,000.
    # meme-maxsize of 10x10^6 is far too large - runs take >24 hrs.
    # Will try 600,000, equivalent to a maximum of 600 sequences of 1000 bp.
    # In order to check 2,000 sequences of <= 1,000 bp, a meme-maxsize of
    # 2x10^6 would be needed.
    job_memory = "5G"
    job_threads = "2"

    statement = '''meme-chip
    -oc %(outdir)s
    -db %(motifDb)s
    -bfile %(bfile)s
    -ccut 0
    -meme-mod zoops
    -meme-minw 5
    -meme-maxw 30
    -meme-nmotifs %(nmotifs)s
    -meme-maxsize %(meme_max_jobs)s
    %(infile)s > %(outfile)s
    ''' % locals()

    print(statement)

    P.run()
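# A small, self-contained sketch (illustration only) of the sizing arithmetic
# discussed in the comments above, assuming the dataset passed to MEME is
# bounded by roughly nmeme * ccut characters, so -meme-maxsize must be at
# least that large.
def required_meme_maxsize(nmeme, ccut):
    """Return the approximate dataset size MEME must be allowed to accept."""
    return nmeme * ccut

print(required_meme_maxsize(600, 100))    # 60000   (MEME-ChIP defaults)
print(required_meme_maxsize(600, 1000))   # 600000  (600 sequences of 1 kb)
print(required_meme_maxsize(2000, 1000))  # 2000000 (2,000 sequences of <= 1 kb)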
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''

        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''

        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method + "_halfwidth"])
        full = False
    except ValueError:
        full = True
        halfwidth = None

    try:
        maxsize = int(PARAMS[method + "_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.asList(PARAMS[method + '_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method + "_num_sequences"],
        proportion=PARAMS[method + "_proportion"],
        min_sequences=PARAMS[method + "_min_sequences"],
        order=PARAMS[method + '_score'])

    if nseq == 0:
        E.warn("%s: no sequences - %s skipped" % (outfile, method))
        P.touch(outfile)
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def publish():
    '''publish files.'''

    # directory, files
    export_files = {"bigwigfiles": glob.glob("*/*.bigwig")}

    if PARAMS['ucsc_exclude']:
        for filetype, files in export_files.items():
            new_files = set(files)
            for f in files:
                for regex in P.asList(PARAMS['ucsc_exclude']):
                    if re.match(regex, f):
                        new_files.remove(f)
                        break
            export_files[filetype] = list(new_files)

    # publish web pages
    E.info("publishing report")
    P.publish_report(export_files=export_files)

    E.info("publishing UCSC data hub")
    P.publish_tracks(export_files)
def processReads(infile, outfiles):
    """process reads from .fastq and other sequence files.
    """
    trimmomatic_options = PARAMS["trimmomatic_options"]

    if PARAMS["trimmomatic_adapter"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s " % (
            PARAMS["trimmomatic_adapter"],
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"]) + trimmomatic_options

    if PARAMS["auto_remove"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s " % (
            "contaminants.fasta",
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"]) + trimmomatic_options

    job_threads = PARAMS["threads"]
    job_memory = "7G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = PipelinePreprocess.MasterProcessor(
        save=PARAMS["save"],
        summarize=PARAMS["summarize"],
        threads=PARAMS["threads"])

    for tool in P.asList(PARAMS["preprocessors"]):

        if tool == "fastx_trimmer":
            m.add(PipelinePreprocess.FastxTrimmer(
                PARAMS["fastx_trimmer_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimmomatic":
            m.add(PipelinePreprocess.Trimmomatic(
                trimmomatic_options,
                threads=PARAMS["threads"]))
        elif tool == "sickle":
            m.add(PipelinePreprocess.Sickle(
                PARAMS["sickle_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimgalore":
            m.add(PipelinePreprocess.Trimgalore(
                PARAMS["trimgalore_options"],
                threads=PARAMS["threads"]))
        elif tool == "flash":
            m.add(PipelinePreprocess.Flash(
                PARAMS["flash_options"],
                threads=PARAMS["threads"]))
        elif tool == "cutadapt":
            cutadapt_options = PARAMS["cutadapt_options"]
            if PARAMS["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            m.add(PipelinePreprocess.Cutadapt(
                cutadapt_options,
                threads=PARAMS["threads"],
                untrimmed=PARAMS["cutadapt_reroute_untrimmed"]))

    statement = m.build((infile,), "processed.dir/trimmed-", track)

    P.run()
    > %(outfile)s
    '''
    P.run()


##########################################################################
##########################################################################
##########################################################################
# extracting alignments from maf files
##########################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:
    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]),
             "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track],
                                    PARAMS["maf_master"]),
             track)
            for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:
            E.info("adding %s" % infile)
"../pipeline.ini", "pipeline.ini"], defaults={ 'annotations_dir': "", 'paired_end': False}) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters( PARAMS["annotations_dir"], "pipeline_annotations.py") # get options that are to be tested cufflinks_options = {} if "cufflinks_test_options" in PARAMS: options = P.asList(PARAMS["cufflinks_test_options"]) for option in options: if option == "--pre-mrna-fraction" \ or option == "--small-anchor-fraction" \ or option == "--max-multiread-fraction": cufflinks_options[option] = [0, 0.5, 0.75, 1] elif option == "--min-isoform-fraction": cufflinks_options[option] = [0.05, 0.1, 0.5, 1] elif option == "--junc-alpha": cufflinks_options[option] = [0.001, 0.01, 0.1] elif option == "--min-frags-per-transfrag": cufflinks_options[option] = [1, 5, 10] elif option == "--overhang-tolerance": cufflinks_options[option] = [0, 2, 5, 8] elif option == "--overlap-radius": cufflinks_options[option] = [50, 100, 200]
    | %(cmd-farm)s
    --split-at-regex="^chain"
    --chunk-size=1000
    --max-lines=1000000
    --log=%(outfile)s.log
    " cgat chain2psl --log=%(outfile)s.log
      | pslSwap stdin stdout "
    | gzip
    > %(outfile)s
    '''
    P.run()


##########################################################################
##########################################################################
##########################################################################
# extracting alignments from maf files
##########################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:
    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]),
             "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track],
                                    PARAMS["maf_master"]),
             track)
            for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:
            E.info("adding %s" % infile)
"pipeline.ini" ], defaults={ 'annotations_dir': "", 'paired_end': False }) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") # get options that are to be tested cufflinks_options = {} if "cufflinks_test_options" in PARAMS: options = P.asList(PARAMS["cufflinks_test_options"]) for option in options: if option == "--pre-mrna-fraction" \ or option == "--small-anchor-fraction" \ or option == "--max-multiread-fraction": cufflinks_options[option] = [0, 0.5, 0.75, 1] elif option == "--min-isoform-fraction": cufflinks_options[option] = [0.05, 0.1, 0.5, 1] elif option == "--junc-alpha": cufflinks_options[option] = [0.001, 0.01, 0.1] elif option == "--min-frags-per-transfrag": cufflinks_options[option] = [1, 5, 10] elif option == "--overhang-tolerance": cufflinks_options[option] = [0, 2, 5, 8] elif option == "--overlap-radius": cufflinks_options[option] = [50, 100, 200]
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for a ``track.bam`` file in the
    data directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``.
    The following will associate all tracks with the same bam file::

        [bams]
        %=all.bam
    '''
    fn = os.path.basename(track.asFile())
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", "\S+", pattern.lower())
                    if re.search(p, fn.lower()):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.asList(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", "\S+", pattern)
                if re.search(p, fn):
                    offsets.extend(map(int, value.split(",")))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
    > %(outfile)s.log.gz'''

    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; | gzip
    > %(outfile)s'''

    P.run()


###################################################################
###################################################################
###################################################################
@files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y)
        for x, y in itertools.product(
            glob.glob("*.features.gz"),
            P.asList(PARAMS["polyphen_models"]))])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''
    to_cluster = False

    # need to run in chunks for large feature files
    statement = """gunzip
    < %(infile)s
    | %(cmd-farm)s
    --split-at-lines=10000
    --output-header
    "perl %(polyphen_home)s/bin/run_weka_cpp.pl
    -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
    -p
def processReads(infile, outfiles):
    '''process reads from .fastq and other sequence files.
    '''
    trimmomatic_options = PARAMS["trimmomatic_options"]

    if PARAMS["auto_remove"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            "contaminants.fasta",
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"],
            PARAMS["trimmomatic_min_adapter_len"],
            PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

    elif PARAMS["trimmomatic_adapter"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            PARAMS["trimmomatic_adapter"],
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"],
            PARAMS["trimmomatic_min_adapter_len"],
            PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

    job_threads = PARAMS["threads"]
    job_memory = "12G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = PipelinePreprocess.MasterProcessor(
        save=PARAMS["save"],
        summarize=PARAMS["summarize"],
        threads=PARAMS["threads"],
        qual_format=PARAMS['qual_format'])

    for tool in P.asList(PARAMS["preprocessors"]):

        if tool == "fastx_trimmer":
            m.add(PipelinePreprocess.FastxTrimmer(
                PARAMS["fastx_trimmer_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimmomatic":
            m.add(PipelinePreprocess.Trimmomatic(
                trimmomatic_options,
                threads=PARAMS["threads"]))
        elif tool == "sickle":
            m.add(PipelinePreprocess.Sickle(
                PARAMS["sickle_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimgalore":
            m.add(PipelinePreprocess.Trimgalore(
                PARAMS["trimgalore_options"],
                threads=PARAMS["threads"]))
        elif tool == "flash":
            m.add(PipelinePreprocess.Flash(
                PARAMS["flash_options"],
                threads=PARAMS["threads"]))
        elif tool == "reversecomplement":
            m.add(PipelinePreprocess.ReverseComplement(
                PARAMS["reversecomplement_options"]))
        elif tool == "pandaseq":
            m.add(PipelinePreprocess.Pandaseq(
                PARAMS["pandaseq_options"],
                threads=PARAMS["threads"]))
        elif tool == "cutadapt":
            cutadapt_options = PARAMS["cutadapt_options"]
            if PARAMS["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            m.add(PipelinePreprocess.Cutadapt(
                cutadapt_options,
                threads=PARAMS["threads"],
                untrimmed=PARAMS['cutadapt_reroute_untrimmed'],
                process_paired=PARAMS["cutadapt_process_paired"]))
        else:
            raise NotImplementedError("tool '%s' not implemented" % tool)

    statement = m.build((infile,), "processed.dir/trimmed-", track)

    P.run()
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''
    to_cluster = False

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        ("track", "status",
         "job_finished",
         "nfiles", "nref",
         "missing", "extra",
         "different",
         "different_md5",
         "different_lines",
         "same",
         "same_md5",
         "same_lines",
         "same_exist",
         "files_missing",
         "files_extra",
         "files_different_md5",
         "files_different_lines"))) + "\n")

    for infile in infiles:
        E.info("working on {}".format(infile))
        track = P.snip(infile, ".stats")

        logfiles = glob.glob(track + "*.log")
        job_finished = True
        for logfile in logfiles:
            is_complete = IOTools.isComplete(logfile)
            E.debug("logcheck: {} = {}".format(logfile, is_complete))
            job_finished = job_finished and is_complete

        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile("|".join(P.asList(regex_exist)))

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile("|".join(P.asList(regex_linecount)))

        regex_md5 = PARAMS.get('%s_regex_md5' % track, None)
        if regex_md5:
            regex_md5 = re.compile("|".join(P.asList(regex_md5)))

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(IOTools.openFile(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(IOTools.openFile(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)
        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            same_exist = set([x for x in different
                              if regex_exist.search(x)])
            different = set([x for x in different
                             if not regex_exist.search(x)])
        else:
            same_exist = set()

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different
                           if regex_linecount.search(x)]

            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)

            dd = (cmp_data['nlines'][check_lines] ==
                  ref_data['nlines'][check_lines])
            same_lines = set(dd.index[dd])
        else:
            different_lines = set()
            same_lines = set()

        # remainder - check md5
        if regex_md5:
            check_md5 = [x for x in different
                         if regex_md5.search(x)]

            dd = (cmp_data['md5'][check_md5] !=
                  ref_data['md5'][check_md5])
            different_md5 = set(dd.index[dd])

            dd = (cmp_data['md5'][check_md5] ==
                  ref_data['md5'][check_md5])
            same_md5 = set(dd.index[dd])
        else:
            different_md5 = set()
            same_md5 = set()

        if job_finished and (len(missing) + len(extra) +
                             len(different_md5) +
                             len(different_lines) == 0):
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(map(str, (
            track,
            status,
            job_finished,
            len(cmp_data),
            len(ref_data),
            len(missing),
            len(extra),
            len(different_md5) + len(different_lines),
            len(different_md5),
            len(different_lines),
            len(same_md5) + len(same_lines) + len(same_exist),
            len(same_md5),
            len(same_lines),
            len(same_exist),
            ",".join(missing),
            ",".join(extra),
            ",".join(different_md5),
            ",".join(different_lines),
        ))) + "\n")

    outf.close()
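# A toy, standalone illustration (not pipeline code) of the partitioning used
# in compareCheckSums above: shared files are first set aside if they only
# need to exist, then those matching the line-count regex are compared by
# line count, and whatever remains is compared by md5. Names below are
# hypothetical.
import re

shared = {"report.html", "counts.tsv", "results.bed"}
regex_exist = re.compile("html")
regex_linecount = re.compile("tsv")

same_exist = {f for f in shared if regex_exist.search(f)}
remaining = shared - same_exist
check_lines = {f for f in remaining if regex_linecount.search(f)}
check_md5 = remaining - check_lines

print(same_exist, check_lines, check_md5)
# {'report.html'} {'counts.tsv'} {'results.bed'}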
    --log=%(outfile)s.log
    --fdr=%(edger_fdr)f"
    | grep -v "warnings"
    | gzip
    > %(outfile)s
    '''

    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design),
         "diff_methylation/%s_%s.deseq.gz" % (
             P.snip(os.path.basename(data), ".counts.tsv.gz"),
             P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that it
    contains a similar output and similar fdr compared to cuffdiff.
    '''
    runDE(infiles, outfile, "deseq")


#########################################################################
#########################################################################
#########################################################################
@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
    %(options)s
    -
    -o %(outfiles)s
    --too-short-o too_short.dir/%(track)s_tooshort.fastq.gz
    --untrimmed-output untrimmed.dir/%(track)s_untrimmed.fastq.gz
    >> %(track)s.log
    '''
    P.run()


###############################################################################
# Read alignment to library (with bowtie or bowtie2)
###############################################################################
mapper = PARAMS['mapper']
library_files = P.asList(PARAMS['libraryfiles'])
library_names = P.asList(PARAMS['librarynames'])
library_dict = dict(zip(library_names, library_files))

if mapper == 'bowtie':
    @follows(mkdir("library.dir"))
    @subdivide(library_files,
               regex(r"(\S+).fasta"),
               r"library.dir/\1.*.ebwt")
    def BuildBowtieIndex(infiles, outfiles):
        basename = 'library.dir/' + P.snip(os.path.basename(infiles), ".fasta")
        statement = '''
        bowtie-build -f %(infiles)s %(basename)s
        '''
        P.run()
--fdr=%(edger_fdr)f" | grep -v "warnings" | gzip > %(outfile)s ''' P.run() @follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation"))) @files([((data, design), "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv"))) for data, design in itertools.product( glob.glob("diff_methylation/*.counts.tsv.gz"), P.asList(PARAMS["deseq_designs"]))]) def runDESeq(infiles, outfile): '''estimate differential expression using DESeq. The final output is a table. It is slightly edited such that it contains a similar output and similar fdr compared to cuffdiff. ''' runDE(infiles, outfile, "deseq") ######################################################################### ######################################################################### #########################################################################
defaults={"annotations_dir": "", "genesets_abinitio_coding": "pruned.gtf.gz", "genesets_abinitio_lncrna": "pruned.gtf.gz", "genesets_reference": "reference.gtf.gz", "genesets_refcoding": "refcoding.gtf.gz", "genesets_previous": ""}) PARAMS = P.PARAMS PARAMS.update(P.peekParameters( PARAMS["annotations_dir"], "pipeline_annotations.py", prefix="annotations_", update_interface=True)) PREVIOUS = P.asList(PARAMS["genesets_previous"]) def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close()
@follows(mkdir("motifs")) @transform(BEDFILES, regex(".*/(.*).bed.gz"), r"motifs/\1.control.fasta") def exportMotifControlSequences(infile, outfile): '''for each interval, export the left and right sequence segment of the same size. ''' PipelineMotifs.exportSequencesFromBedFile(infile, outfile, masker=PARAMS['motifs_masker'], mode="leftright") ############################################################ ############################################################ ############################################################ @active_if("meme" in P.asList(PARAMS["methods"]) or "disc_meme" in P.asList(PARAMS["methods"])) @transform(loadIntervals, suffix("_intervals.load"), ".meme.fasta") def exportMemeIntervalSequences(infile, outfile): track = os.path.basename(P.snip(infile, "_intervals.load")) exportIntervalSequences(infile, outfile, track, "meme") ############################################################ @follows(mkdir("meme.dir")) @active_if("meme" in P.asList(PARAMS["methods"])) @transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"), r"meme.dir/\1.meme") def runMeme(infile, outfile):
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \;
    > %(outfile)s'''

    P.run()


###################################################################
###################################################################
###################################################################
# do not run in parallel. run_weka.pl creates a $testfile
# that is not unique. run_weka.pl and pph2arff.pl could either
# be patched or the following jobs run in sequence.
@jobs_limit(1, "polyphen")
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''
    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl
    -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
    %(infile)s
    | gzip
    > %(outfile)s
    2> %(outfile)s.log
def quantifySequins():
    pass


##############################################################################
# Sequins Quantify - End                                                     #
##############################################################################

##############################################################################
# Sequins Add Models - Start                                                 #
##############################################################################

add_models_gtfs = []
for add_type in ["skip_exons", "incomplete", "3prime"]:
    for fraction in P.asList(PARAMS['%s_fractions' % add_type]):
        for iteration in range(0, PARAMS['%s_iterations' % add_type]):
            add_models_gtfs.append(
                "sequins/add_models/%s/transcripts_%s_%s.gtf.gz" % (
                    add_type, fraction, iteration))


@mkdir('sequins/add_models/skip_exons',
       'sequins/add_models/incomplete',
       'sequins/add_models/3prime')
@originate(add_models_gtfs)
def buildAddModels(outfile):
    ''' build a set of reference transcriptomes with additional
    transcripts with skipped exons, incomplete transcripts and
    transcripts with alternative 3' ends '''

    # how to avoid hardcoding this?
    infile = 'annotations/sequins.gtf.gz'
with IOTools.openFile(outfile, "w") as outf: outf.write("%s\n" % "\t".join( ("target_id", "length", "tpm", "est_counts"))) for line in lines: if not line.startswith("# "): outf.write(line) # define simulation targets SIMTARGETS = [] mapToSimulationTargets = {'kallisto': (extractKallistoCountSimulation, ), 'salmon': (extractSalmonCountSimulation, ), 'sailfish': (extractSailfishCountSimulation, )} for x in P.asList(PARAMS["quantifiers"]): SIMTARGETS.extend(mapToSimulationTargets[x]) @follows(*SIMTARGETS) def quantifySimulation(): pass @transform(SIMTARGETS, regex("simulation.dir/quant.dir/(\S+)/simulated_reads_(\d+)/abundance.tsv"), r"simulation.dir/quant.dir/\1/simulated_reads_\2/results.tsv", r"simulation.dir/simulated_read_counts_\2.tsv") def mergeAbundanceCounts(infile, outfile, counts): ''' merge the abundance and simulation counts files for each simulation '''
def processReads(infile, outfiles):
    '''process reads from .fastq and other sequence files.
    '''
    trimmomatic_options = PARAMS["trimmomatic_options"]

    if PARAMS["trimmomatic_adapter"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            PARAMS["trimmomatic_adapter"],
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"],
            PARAMS["trimmomatic_min_adapter_len"],
            PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

    if PARAMS["auto_remove"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            "contaminants.fasta",
            PARAMS["trimmomatic_mismatches"],
            PARAMS["trimmomatic_p_thresh"],
            PARAMS["trimmomatic_c_thresh"],
            PARAMS["trimmomatic_min_adapter_len"],
            PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

    job_threads = PARAMS["threads"]
    job_memory = "12G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = PipelinePreprocess.MasterProcessor(
        save=PARAMS["save"],
        summarize=PARAMS["summarize"],
        threads=PARAMS["threads"])

    for tool in P.asList(PARAMS["preprocessors"]):

        if tool == "fastx_trimmer":
            m.add(PipelinePreprocess.FastxTrimmer(
                PARAMS["fastx_trimmer_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimmomatic":
            m.add(PipelinePreprocess.Trimmomatic(
                trimmomatic_options,
                threads=PARAMS["threads"]))
        elif tool == "sickle":
            m.add(PipelinePreprocess.Sickle(
                PARAMS["sickle_options"],
                threads=PARAMS["threads"]))
        elif tool == "trimgalore":
            m.add(PipelinePreprocess.Trimgalore(
                PARAMS["trimgalore_options"],
                threads=PARAMS["threads"]))
        elif tool == "flash":
            m.add(PipelinePreprocess.Flash(
                PARAMS["flash_options"],
                threads=PARAMS["threads"]))
        elif tool == "reversecomplement":
            m.add(PipelinePreprocess.ReverseComplement(
                PARAMS["reversecomplement_options"]))
        elif tool == "pandaseq":
            m.add(PipelinePreprocess.Pandaseq(
                PARAMS["pandaseq_options"],
                threads=PARAMS["threads"]))
        elif tool == "cutadapt":
            cutadapt_options = PARAMS["cutadapt_options"]
            if PARAMS["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            m.add(PipelinePreprocess.Cutadapt(
                cutadapt_options,
                threads=PARAMS["threads"],
                untrimmed=PARAMS['cutadapt_reroute_untrimmed'],
                process_paired=PARAMS["cutadapt_process_paired"]))
        else:
            raise NotImplementedError("tool '%s' not implemented" % tool)

    statement = m.build((infile,), "processed.dir/trimmed-", track)

    P.run()
r"filesummaries.dir/\1.seqsummary") def checkFile(infile, outfile): seqdat=PipelineMetaAssemblyKit.SequencingData(infile) outf=open(outfile,'w') outf.write("name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".format( seqdat.filename,seqdat.fileformat,seqdat.compressed,seqdat.paired,seqdat.interleaved)) seqdat.readCount() outf.write("read_count\t{}\n".format(seqdat.readcount)) outf.close() ################################################## #Run Selected Assemblers ################################################## #get the list of assemblers to run on the data ASSEMBLERS = P.asList(PARAMS.get("Assembler_assemblers", "")) ################################################### # Run Megahit ################################################### @active_if("megahit" in ASSEMBLERS) @follows(checkFile) @follows(mkdir("megahit_out.dir")) @transform(SEQUENCEFILES, SEQUENCEFILES_REGEX, r"megahit_out.dir/\1/\1.contigs.fa") def runMegahit(infile, outfile): job_memory = str(PARAMS["Megahit_clus_memory"])+"G" job_threads = PARAMS["Megahit_clus_threads"] seqdat=PipelineMetaAssemblyKit.SequencingData(infile) assembler = PipelineMetaAssemblyKit.Megahit(seqdat,"megahit_out.dir",PARAMS)
defaults={"annotations_dir": "", "genesets_abinitio_coding": "pruned.gtf.gz", "genesets_abinitio_lncrna": "pruned.gtf.gz", "genesets_reference": "reference.gtf.gz", "genesets_refcoding": "refcoding.gtf.gz", "genesets_previous": ""}) PARAMS = P.PARAMS PARAMS.update(P.peekParameters( PARAMS["annotations_annotations_dir"], "pipeline_annotations.py", prefix="annotations_", update_interface=True)) PREVIOUS = P.asList(PARAMS["genesets_previous"]) def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close()
# Pipeline configuration
###################################################
# load options from the config file
import CGATPipelines.Pipeline as P

P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS


# obtain prerequisite generic data
@files([(None, "%s.tgz" % x)
        for x in P.asList(PARAMS.get("prerequisites", ""))])
def setupPrerequisites(infile, outfile):
    '''setup pre-requisites.

    These are tar-balls that are unpacked, but not run.
    '''
    to_cluster = False
    track = P.snip(outfile, ".tgz")

    # obtain data - should overwrite pipeline.ini file
    statement = '''
    wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz'''
    P.run()

    tf = tarfile.open(outfile)
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for a ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``.
    The following will associate all tracks with the same bam file::

        [bams]
        %=all.bam
    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", "\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.asList(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", "\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(map(int, value.split(",")))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the "
            "same as number of offsets: %s" % (
                str(bamfiles), str(offsets)))

    return bamfiles, offsets
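# A standalone sketch (illustration only, not the pipeline's code) of the
# ``%`` placeholder convention described in the docstring above: a pattern
# such as "%_input" is turned into a regular expression by replacing "%" with
# "\S+", and any track whose name matches inherits that entry. The config
# entries and track names below are hypothetical.
import re

config_bams = {"%_input": "all_inputs.bam",
               "track1": "track1.bam,track2.bam"}


def lookup(track):
    """Return the bam entry for *track*, honouring '%' placeholders."""
    if track in config_bams:
        return config_bams[track]
    for pattern, value in config_bams.items():
        if "%" in pattern:
            p = re.sub("%", r"\S+", pattern)
            if re.search(p, track, re.IGNORECASE):
                return value
    return None

print(lookup("track1"))       # track1.bam,track2.bam
print(lookup("liver_input"))  # all_inputs.bam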