def __call__(self, track, slice=None):
    """Return expression values for the configured genes of interest.

    Per-gene TPM values from ``sailfish_genes`` are joined to their sample
    names and then merged, on ``sample_name``, with every factor recorded
    for that sample except ``genome``.  Only genes listed under
    ``genes_of_interest`` in the parameters peeked from
    ``pipeline_rnaseqqc.py`` (looked up in the current directory) are kept.

    ``track`` and ``slice`` are accepted but not used by this method.

    Returns a data frame indexed by ``factor`` with ``TPM`` cast to float;
    the row index of the merged table is kept as a regular column.
    """
    # The SQL text is assembled from adjacent literals so that the
    # statements sent to the database stay exactly as before.
    exp_statement = (
        " SELECT TPM, gene_id, sample_name"
        " FROM sailfish_genes AS A"
        " JOIN samples AS B"
        " ON A.sample_id = B.id")
    exp_df = self.getDataFrame(exp_statement)

    factors_statement = (
        " SELECT factor, factor_value, sample_name"
        " FROM samples AS A"
        " JOIN factors AS B"
        " ON A.id = B.sample_id"
        " WHERE factor != 'genome' ")
    factors_df = self.getDataFrame(factors_statement)

    merged_df = pd.merge(exp_df, factors_df, on="sample_name")

    genes = Pipeline.asList(Pipeline.peekParameters(
        ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

    # Take an explicit copy of the filtered rows: assigning a column on a
    # boolean-mask selection of ``merged_df`` is chained assignment and
    # raises SettingWithCopyWarning.
    interest_df = merged_df[merged_df['gene_id'].isin(genes)].copy()
    interest_df['TPM'] = interest_df['TPM'].astype(float)

    return interest_df.reset_index().set_index("factor")
import sys
import os
import glob
import sqlite3

from CGATCore import Pipeline as P
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
#
# The first candidate is "pipeline.ini" inside the directory named after
# this module.  os.path.splitext() strips whatever extension __file__
# carries (".py", ".pyc", ...) instead of blindly cutting three characters.
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"
])
PARAMS = P.PARAMS

# parameters peeked from pipeline_annotations.py in the annotations directory
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

Sample = PipelineTracks.Sample

# Tracks are built from the "medip_*" entries in the working directory.
# The pattern is a raw string so that "\S" reaches the regex engine
# without relying on Python's deprecated invalid-escape fallback.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("medip_*"), r"medip_(\S+)")


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
"pipeline.ini" ], defaults={ "annotations_dir": "", "genesets_abinitio_coding": "pruned.gtf.gz", "genesets_abinitio_lncrna": "pruned.gtf.gz", "genesets_reference": "reference.gtf.gz", "genesets_refcoding": "refcoding.gtf.gz", "genesets_previous": "" }) PARAMS = P.PARAMS PARAMS.update( P.peekParameters(PARAMS["annotations_annotations_dir"], "pipeline_annotations.py", prefix="annotations_", update_interface=True)) PREVIOUS = P.asList(PARAMS["genesets_previous"]) def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor()
"pipeline.ini" ]) PARAMS["projectsrc"] = os.path.dirname(__file__) #for key, value in PARAMS.iteritems(): # print "%s:\t%s" % (key,value) # add configuration values from associated pipelines # # 1. pipeline_annotations: any parameters will be added with the # prefix "annotations_". The interface will be updated with # "annotations_dir" to point to the absolute path names. PARAMS.update( P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py", on_error_raise=__name__ == "__main__", prefix="annotations_", update_interface=True)) # if necessary, update the PARAMS dictionary in any modules file. # e.g.: # # import CGATPipelines.PipelineGeneset as PipelineGeneset # PipelineGeneset.PARAMS = PARAMS # # Note that this is a hack and deprecated, better pass all # parameters that are needed by a function explicitely. # ----------------------------------------------- # Utility functions
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])
PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(
    PARAMS["ancestral_repeats_dir"],
    "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# collect tracks from the *.gtf.gz files in the working directory; the
# three file names passed as ``exclude`` are handed to loadFromDirectory.
# The pattern is a raw string so that "\S" reaches the regex engine
# without relying on Python's deprecated invalid-escape fallback.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz",
             "introns.gtf.gz",
             "merged.gtf.gz"))
from CGATCore import Pipeline as Pipeline
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################################
###################################################################
# parameterization

# Each setting prefers the "calling_"-prefixed key and falls back to the
# generic key, then to a hard-coded default.
EXPORTDIR = P.get('calling_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('calling_datadir', P.get('datadir', '.'))
DATABASE = P.get('calling_backend',
                 P.get('report_sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

Sample = PipelineTracks.Sample3

suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# One Tracks collection per suffix, built from the files in DATADIR whose
# path does not contain "input".  The pattern is a raw string so that "\S"
# reaches the regex engine without relying on Python's deprecated
# invalid-escape fallback.
_tracks_per_suffix = [
    PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
         if "input" not in x],
        r"%s/(\S+).%s" % (DATADIR, s))
    for s in suffixes
]

# Merge the per-suffix collections, starting from an empty collection.
# (The list is summed directly; wrapping it in itertools.chain() added
# nothing, since chaining a single list just yields its elements.)
TRACKS = sum(_tracks_per_suffix, PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

ALL = PipelineTracks.Aggregate(TRACKS)