def __call__(self, track, slice=None):
    '''Return TPM expression values for the configured genes of interest.

    Joins gene-level sailfish TPM estimates with the per-sample factor
    annotations, restricts the result to the genes listed under
    ``genes_of_interest`` in the pipeline_rnaseqqc.py configuration and
    returns the merged table indexed by factor.

    ``track`` and ``slice`` are part of the tracker call interface and
    are not used by this query.
    '''
    exp_statement = """
    SELECT TPM, gene_id, sample_name
    FROM sailfish_genes AS A
    JOIN samples AS B
    ON A.sample_id = B.id"""

    exp_df = self.getDataFrame(exp_statement)

    factors_statement = '''
    SELECT factor, factor_value, sample_name
    FROM samples AS A
    JOIN factors AS B
    ON A.id = B.sample_id
    WHERE factor != 'genome'
    '''

    factors_df = self.getDataFrame(factors_statement)

    merged_df = pd.merge(exp_df, factors_df,
                         left_on="sample_name", right_on="sample_name")

    genes = Pipeline.asList(Pipeline.peekParameters(
        ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

    # Take an explicit copy: boolean-mask selection can return a
    # view-like slice, and assigning the float-cast TPM column back
    # into it would raise pandas' SettingWithCopyWarning and may
    # silently fail to write.
    interest_df = merged_df[merged_df['gene_id'].isin(genes)].copy()
    interest_df['TPM'] = interest_df['TPM'].astype(float)

    return interest_df.reset_index().set_index("factor")
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking via intermediate
    targets.

    Chains the per-stage ``*.over.psl.gz`` files configured in
    ``PARAMS["<track>_path"]`` through ``pslMap`` and writes the
    gzip-compressed composite map to *outfile*.

    :param infile: unused; present for the ruffus task signature.
    :param outfile: path of the gzipped psl output file.
    :param track: key used to look up the mapping path in PARAMS.
    :raises ValueError: if an intermediate psl file is missing.
    '''
    # NOTE(review): to_cluster looks unused but is presumably picked up
    # from locals() by P.run() to request cluster submission -- confirm
    # before removing.
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    # Fail early: every stage of the chain must already exist on disk.
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            # first stage: just decompress the initial map
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            # subsequent stages: project through the next map with pslMap
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
def publish():
    '''publish files.'''
    # mapping of export category -> list of files to publish
    export_files = {"bigwigfiles": glob.glob("*/*.bigwig")}

    if PARAMS['ucsc_exclude']:
        for category, filenames in export_files.items():
            kept = set(filenames)
            for fname in filenames:
                # drop a file as soon as any exclusion pattern matches it
                if any(re.match(pattern, fname)
                       for pattern in P.asList(PARAMS['ucsc_exclude'])):
                    kept.discard(fname)
            export_files[category] = list(kept)

    # publish web pages
    E.info("publishing report")
    P.publish_report(export_files=export_files)

    E.info("publishing UCSC data hub")
    P.publish_tracks(export_files)
> %(outfile)s ''' P.run() ########################################################################## ########################################################################## ########################################################################## # extracting alignments from maf files ########################################################################## if "maf_dir" in PARAMS and "maf_tracks" in PARAMS: @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track], PARAMS["maf_master"]), track) for track in P.asList(PARAMS["maf_tracks"])]) def extractPairwiseAlignmentSingleFile(infiles, outfile, track): '''build pairwise genomic aligment from maf files.''' try: os.remove(outfile) except OSError: pass genomefile = PARAMS["%s_genome" % track] to_cluster = True for infile in infiles: E.info("adding %s" % infile)
"genesets_abinitio_coding": "pruned.gtf.gz", "genesets_abinitio_lncrna": "pruned.gtf.gz", "genesets_reference": "reference.gtf.gz", "genesets_refcoding": "refcoding.gtf.gz", "genesets_previous": "" }) PARAMS = P.PARAMS PARAMS.update( P.peekParameters(PARAMS["annotations_annotations_dir"], "pipeline_annotations.py", prefix="annotations_", update_interface=True)) PREVIOUS = P.asList(PARAMS["genesets_previous"]) def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close()
"../pipeline.ini", "pipeline.ini"], defaults={ 'annotations_dir': "", 'paired_end': False}) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters( PARAMS["annotations_dir"], "pipeline_annotations.py") # get options that are to be tested cufflinks_options = {} if "cufflinks_test_options" in PARAMS: options = P.asList(PARAMS["cufflinks_test_options"]) for option in options: if option == "--pre-mrna-fraction" \ or option == "--small-anchor-fraction" \ or option == "--max-multiread-fraction": cufflinks_options[option] = [0, 0.5, 0.75, 1] elif option == "--min-isoform-fraction": cufflinks_options[option] = [0.05, 0.1, 0.5, 1] elif option == "--junc-alpha": cufflinks_options[option] = [0.001, 0.01, 0.1] elif option == "--min-frags-per-transfrag": cufflinks_options[option] = [1, 5, 10] elif option == "--overhang-tolerance": cufflinks_options[option] = [0, 2, 5, 8] elif option == "--overlap-radius": cufflinks_options[option] = [50, 100, 200]
--fdr=%(edger_fdr)f" | grep -v "warnings" | gzip > %(outfile)s ''' P.run() @follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation"))) @files([((data, design), "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv"))) for data, design in itertools.product( glob.glob("diff_methylation/*.counts.tsv.gz"), P.asList(PARAMS["deseq_designs"]))]) def runDESeq(infiles, outfile): '''estimate differential expression using DESeq. The final output is a table. It is slightly edited such that it contains a similar output and similar fdr compared to cuffdiff. ''' runDE(infiles, outfile, "deseq") ######################################################################### ######################################################################### #########################################################################
P.run() statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; | gzip > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### @files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y) for x, y in itertools.product(glob.glob("*.features.gz"), P.asList(PARAMS["polyphen_models"]))]) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' to_cluster = False # need to run in chunks for large feature files statement = """gunzip < %(infile)s | %(cmd-farm)s --split-at-lines=10000 --output-header "perl %(polyphen_home)s/bin/run_weka_cpp.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model -p
statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### # do not run in parallel. run_weka.pl creates a $testfile # that is not unique. run_weka.pl and pph2arff.pl could either # be patched or the following jobs run in sequence. @jobs_limit(1, "polyphen") @files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x) for x in P.asList(PARAMS["polyphen_models"])]) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' # options # -f: feature set, default is F11 # -c: classifier, default is NBd (Naive Bayes with discretization) # -l: model name, default is HumDiv statement = ''' %(polyphen_home)s/bin/run_weka.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model %(infile)s | gzip > %(outfile)s 2> %(outfile)s.log