예제 #1
0
    def __call__(self, track, slice=None):
        """Return TPM expression values for the configured genes of interest.

        Expression values (``sailfish_genes`` joined to ``samples``) are
        merged with the per-sample factors (``factors`` joined to
        ``samples``, excluding the ``genome`` factor) on ``sample_name``.
        The result is restricted to the gene ids listed under
        ``genes_of_interest`` in the parameters of ``pipeline_rnaseqqc.py``
        looked up in the current directory.

        ``track`` and ``slice`` are accepted but not used by this method.

        Returns a DataFrame indexed by ``factor`` with ``TPM`` cast to float.
        """

        exp_statement = """
        SELECT TPM, gene_id, sample_name
        FROM sailfish_genes AS A
        JOIN samples AS B
        ON A.sample_id = B.id"""

        exp_df = self.getDataFrame(exp_statement)

        factors_statement = '''
        SELECT factor, factor_value, sample_name
        FROM samples AS A
        JOIN factors AS B
        ON A.id = B.sample_id
        WHERE factor != 'genome'
        '''

        factors_df = self.getDataFrame(factors_statement)

        merged_df = pd.merge(exp_df,
                             factors_df,
                             left_on="sample_name",
                             right_on="sample_name")

        genes = Pipeline.asList(
            Pipeline.peekParameters(
                ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

        # Take an explicit copy of the filtered rows: assigning a column on
        # the result of a boolean selection otherwise raises pandas'
        # SettingWithCopyWarning.
        interest_df = merged_df[merged_df['gene_id'].isin(genes)].copy()

        interest_df['TPM'] = interest_df['TPM'].astype(float)

        return interest_df.reset_index().set_index("factor")
예제 #2
0

# load options from the config file
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))


# Gene sets: one track per *.gtf.gz file in the working directory.
# The patterns are raw strings so that "\S" reaches the regex engine
# without relying on Python passing through an invalid string escape.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz")

# Read tracks: one track per *.bam file in the working directory.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS.loadFromDirectory(glob.glob("*.bam"),
                                  r"(\S+).bam")
# labels must be a sequence of label names; ("replicate") without the
# trailing comma is just the string "replicate".
# NOTE(review): confirm Aggregate does not special-case a bare string.
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate",))
CONDITION = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))

# if necessary, update the PARAMS dictionary in any modules file.
예제 #3
0
import pysam

# load options from the config file
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# define some tracks if needed

# One track per *.ini file in the working directory. The pattern is a raw
# string so that "\S" is not treated as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.ini"), r"(\S+).ini")


# --------------------------< utility functions >---------------------------- #

def connect():
    '''Connect to database.
       Use this method to connect to additional databases.
       Returns an sqlite3 database handle.
import CGAT.Stats as Stats
import pysam
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################################
###################################################################
###################################################################
# Pipeline configuration
import CGATPipelines.Pipeline as P
# Hand P.getParameters three candidate configuration files: one inside a
# directory named after this script (its path without the extension), one
# in the parent directory and one in the current directory.
P.getParameters(["%s/pipeline.ini" %
                 os.path.splitext(__file__)[0], "../pipeline.ini",
                 "pipeline.ini"])

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS
# Parameters of pipeline_annotations.py, looked up in the directory
# configured under "annotations_dir". Kept separate from PARAMS.
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# Separator character; its use is not visible in this section.
SEPARATOR = "|"

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc


class TracksVCF (PipelineTracks.Tracks):

    def load(self, filename, exclude=None):
        '''load tracks from a vcf file.'''
        tracks = []
import CGAT.Stats as Stats
import pysam
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################################
###################################################################
###################################################################
# Pipeline configuration
import CGATPipelines.Pipeline as P
# Hand P.getParameters three candidate configuration files: one inside a
# directory named after this script (its path without the extension), one
# in the parent directory and one in the current directory.
P.getParameters(["%s/pipeline.ini" %
                 os.path.splitext(__file__)[0], "../pipeline.ini",
                 "pipeline.ini"])

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS
# Parameters of pipeline_annotations.py, looked up in the directory
# configured under "annotations_dir". Kept separate from PARAMS.
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# Separator character; its use is not visible in this section.
SEPARATOR = "|"

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc


class TracksVCF (PipelineTracks.Tracks):

    def load(self, filename, exclude=None):
        '''load tracks from a vcf file.'''
        tracks = []
예제 #6
0
# Load the pipeline configuration from three candidate files (a directory
# named after this script, the parent directory, the current directory)
# and supply default values for the gene set options listed below.
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"],
    defaults={"annotations_dir": "",
              "genesets_abinitio_coding": "pruned.gtf.gz",
              "genesets_abinitio_lncrna": "pruned.gtf.gz",
              "genesets_reference": "reference.gtf.gz",
              "genesets_refcoding": "refcoding.gtf.gz",
              "genesets_previous": ""})

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS

# Merge in the parameters of pipeline_annotations.py found under
# "annotations_dir"; the call asks for them to be prefixed "annotations_".
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    prefix="annotations_",
    update_interface=True))

# "genesets_previous" converted to a list via P.asList (default is "").
PREVIOUS = P.asList(PARAMS["genesets_previous"])


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
예제 #7
0
###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# Load the pipeline configuration; 'paired_end' defaults to False.
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"],
    defaults={'paired_end': False})

PARAMS = P.PARAMS

# Merge in the parameters of pipeline_annotations.py found under
# "annotations_dir", requested with the prefix "annotations_".
PARAMS.update(
    P.peekParameters(PARAMS["annotations_dir"],
                     "pipeline_annotations.py",
                     prefix="annotations_",
                     update_interface=True))

# Share the same parameter dictionary with the helper modules.
PipelinePeakcalling.PARAMS = PARAMS
PipelineMotifs.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
# determine the location of the input files (reads).
DATADIR = PARAMS.get('input', '.')
if not os.path.exists(DATADIR):
    # The message previously carried an unformatted "%s" placeholder, so
    # the offending path was never reported.
    raise OSError('data directory %s does not exist' % DATADIR)
예제 #8
0
import CGATPipelines.Pipeline as Pipeline
import CGATPipelines.PipelineTracks as PipelineTracks
###################################################################
###################################################################
# parameterization

# Report settings: the "calling_*" option wins, then the generic option,
# then the hard-coded default.
EXPORTDIR = P.get('calling_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('calling_datadir', P.get('datadir', '.'))
DATABASE = P.get('calling_backend',
                 P.get('report_sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

Sample = PipelineTracks.Sample3

suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# Build one Tracks collection per suffix from the files in DATADIR whose
# path does not contain "input", then add them all together starting from
# an empty collection. The pattern is a raw string so that "\S" is not
# treated as an (invalid) string escape; the resulting pattern text is
# unchanged. Summing the list directly is equivalent to wrapping that
# single list in itertools.chain.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

ALL = PipelineTracks.Aggregate(TRACKS)
예제 #9
0
from CGATReport.Utils import PARAMS as P
import CGATPipelines.Pipeline as Pipeline
import CGATPipelines.PipelineTracks as PipelineTracks
###################################################################
###################################################################
# parameterization

# Report settings: the "calling_*" option wins, then the generic option,
# then the hard-coded default.
EXPORTDIR = P.get('calling_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('calling_datadir', P.get('datadir', '.'))
DATABASE = P.get('calling_backend',
                 P.get('report_sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
PARAMS_PIPELINE = Pipeline.peekParameters(".",
                                          "pipeline_chipseq.py")


Sample = PipelineTracks.Sample3

suffixes = ["export.txt.gz",
            "sra",
            "fastq.gz",
            "fastq.1.gz",
            "csfasta.gz"]

# Build one Tracks collection per suffix from the files in DATADIR whose
# path does not contain "input", then add them all together starting from
# an empty collection. The pattern is a raw string so that "\S" is not
# treated as an (invalid) string escape; the resulting pattern text is
# unchanged. Summing the list directly is equivalent to wrapping that
# single list in itertools.chain.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))
예제 #10
0
import CGAT.Database as Database
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"],
                                            "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# collect one track per *.gtf.gz file in the working directory, excluding
# the repeats, introns and merged files. The pattern is a raw string so
# that "\S" is not treated as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(
    PARAMS["ancestral_repeats_dir"],
    "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# collect one track per *.gtf.gz file in the working directory, excluding
# the repeats, introns and merged files. The pattern is a raw string so
# that "\S" is not treated as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))
예제 #12
0
###################################################################
###################################################################
# Load options and annotations
###################################################################

# load options from the config file; candidates are a directory named
# after this script, the parent directory and the current directory
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# Rebind PARAMS to the dictionary held by the Pipeline module, then merge
# in the parameters of pipeline_genesets.py found under "annotations_dir".
# NOTE(review): the assignment above is immediately overwritten here;
# confirm whether P.getParameters returns the same object as P.PARAMS.
PARAMS = P.PARAMS
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_genesets.py",
    prefix="annotations_",
    update_interface=True,
    restrict_interface=True))  # requested with the "annotations_" prefix

# Ask R for the "python_scripts" directory of the installed DEXSeq package
# (system.file) and convert the result with .tostring().
# The DEXSeq R directory contains important python helper functions
PYTHONSCRIPTSDIR = R('''
    f = function(){
    pythonScriptsDir = system.file("python_scripts", package="DEXSeq")
    }
    f()''').tostring()


###################################################################
###################################################################
###################################################################
# Utility functions
예제 #13
0
# load options from the config file; candidates are a directory named
# after this script, the parent directory and the current directory
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS

# Add parameters from the gtf_subset pipeline, but
# only the interface section. All PARAMS options
# will have the prefix `annotations_`
# NOTE(review): the comment above names the gtf_subset pipeline, while the
# call below peeks "pipeline_genesets.py" in PARAMS["gtf_dir"] - confirm
# which name is current.

PARAMS.update(
    P.peekParameters(PARAMS["gtf_dir"],
                     "pipeline_genesets.py",
                     prefix="annotations_",
                     update_interface=True,
                     restrict_interface=True))

# -----------------------------------------------
# Utility functions


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.

    Returns an sqlite3 database handle.
    '''
예제 #14
0
# load options from the config file; candidates are a directory named
# after this script, the parent directory and the current directory
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS

# Add parameters from the gtf_subset pipeline, but
# only the interface section. All PARAMS options
# will have the prefix `annotations_`
# NOTE(review): the comment above names the gtf_subset pipeline, while the
# call below peeks "pipeline_genesets.py" in PARAMS["gtf_dir"] - confirm
# which name is current.

PARAMS.update(P.peekParameters(
    PARAMS["gtf_dir"],
    "pipeline_genesets.py",
    prefix="annotations_",
    update_interface=True,
    restrict_interface=True))

# -----------------------------------------------
# Utility functions


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.

    Returns an sqlite3 database handle.
    '''