def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking via intermediate targets.

    The chain of intermediate genomes is read from the configuration
    value ``<track>_path``.  Each element of the path must have a
    pre-computed ``<part>.over.psl.gz`` file; the first stage simply
    decompresses its file and every later stage is projected through it
    with ``pslMap``.  The piped command is executed via ``P.run()`` and
    the result is written gzip-compressed to ``outfile``.

    Raises a :class:`ValueError` if a required ``.over.psl.gz`` file for
    any stage is missing.
    '''
    # NOTE: P.run() collects ``statement`` (and cluster options such as
    # ``to_cluster``) from this function's local variables, so these
    # names must not be renamed.
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            # fixed grammar of the error message ("not exist" -> "does not exist")
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            # seed the pipe with the decompressed first mapping
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            # project the accumulated mapping through the next stage
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
def publish():
    '''publish files.'''

    # directory, files
    export_files = {"bigwigfiles": glob.glob("*/*.bigwig")}

    if PARAMS['ucsc_exclude']:
        # drop every file matching one of the configured exclusion patterns
        exclude_patterns = P.asList(PARAMS['ucsc_exclude'])
        for filetype, files in export_files.items():
            kept = set(files)
            for fname in files:
                if any(re.match(pattern, fname)
                       for pattern in exclude_patterns):
                    kept.discard(fname)
            export_files[filetype] = list(kept)

    # publish web pages
    E.info("publishing report")
    P.publish_report(export_files=export_files)

    E.info("publishing UCSC data hub")
    P.publish_tracks(export_files)
P.run() statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; | gzip > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### @files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y) for x, y in itertools.product(glob.glob("*.features.gz"), P.asList(PARAMS["polyphen_models"]))]) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' to_cluster = False # need to run in chunks for large feature files statement = """gunzip < %(infile)s | %(cmd-farm)s --split-at-lines=10000 --output-header "perl %(polyphen_home)s/bin/run_weka_cpp.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model -p
--fdr=%(edger_fdr)f" | grep -v "warnings" | gzip > %(outfile)s ''' P.run() @follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation"))) @files([((data, design), "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv"))) for data, design in itertools.product( glob.glob("diff_methylation/*.counts.tsv.gz"), P.asList(PARAMS["deseq_designs"]))]) def runDESeq(infiles, outfile): '''estimate differential expression using DESeq. The final output is a table. It is slightly edited such that it contains a similar output and similar fdr compared to cuffdiff. ''' runDE(infiles, outfile, "deseq") ######################################################################### ######################################################################### #########################################################################
> %(outfile)s ''' P.run() ########################################################################## ########################################################################## ########################################################################## # extracting alignments from maf files ########################################################################## if "maf_dir" in PARAMS and "maf_tracks" in PARAMS: @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track], PARAMS["maf_master"]), track) for track in P.asList(PARAMS["maf_tracks"])]) def extractPairwiseAlignmentSingleFile(infiles, outfile, track): '''build pairwise genomic aligment from maf files.''' try: os.remove(outfile) except OSError: pass genomefile = PARAMS["%s_genome" % track] to_cluster = True for infile in infiles: E.info("adding %s" % infile)
P.run() statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### # do not run in parallel. run_weka.pl creates a $testfile # that is not unique. run_weka.pl and pph2arff.pl could either # be patched or the following jobs run in sequence. @jobs_limit(1, "polyphen") @files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x) for x in P.asList(PARAMS["polyphen_models"])]) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' # options # -f: feature set, default is F11 # -c: classifier, default is NBd (Naive Bayes with discretization) # -l: model name, default is HumDiv statement = ''' %(polyphen_home)s/bin/run_weka.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model %(infile)s | gzip > %(outfile)s 2> %(outfile)s.log
"genesets_abinitio_coding": "pruned.gtf.gz", "genesets_abinitio_lncrna": "pruned.gtf.gz", "genesets_reference": "reference.gtf.gz", "genesets_refcoding": "refcoding.gtf.gz", "genesets_previous": "" }) PARAMS = P.PARAMS PARAMS.update( P.peekParameters(PARAMS["annotations_annotations_dir"], "pipeline_annotations.py", prefix="annotations_", update_interface=True)) PREVIOUS = P.asList(PARAMS["genesets_previous"]) def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close()