Beispiel #1
0
###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################

# Parameters obtained by peeking at pipeline_chipseq.py in the current
# directory.
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

# Sample class used for every track collection built below.
Sample = PipelineTracks.Sample3

# File-name suffixes that are looked up inside DATADIR.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# Build one Tracks collection per suffix and add them all onto an empty
# Tracks(Sample).  Files whose path contains "input" are left out.
# NOTE(review): the "input" test runs on the whole globbed path, so a
# DATADIR that itself contains "input" would drop every file - confirm.
# The pattern is a raw string so that ``\S`` is handed to the regex engine
# unchanged instead of being an invalid Python escape sequence.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

# Aggregates over TRACKS, each built with a different set of labels.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

############################################################################
# The following need to be parameterized in a config file
# TISSUES=["GM00855", "GM00861" ]
# CONDITIONS=["D3", "unstim" ]
# REPLICATES=["R1", "R2" ]
Beispiel #2
0
import sqlite3
from cgatcore import pipeline as P
import cgatPipelines.PipelineTracks as PipelineTracks

# load options from the config file
# NOTE(review): ``__file__[:-len(".py")]`` assumes the script name ends in
# exactly ".py" - confirm this holds for every way the script is launched.
P.getParameters(["%s/pipeline.ini" % __file__[:-len(".py")],
                 "../pipeline.ini",
                 "pipeline.ini"])

PARAMS = P.PARAMS

# Parameters peeked from pipeline_annotations.py in the configured
# annotations directory.
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# Tracks are loaded from everything in the working directory that matches
# "medip_*".  The pattern is a raw string so that ``\S`` is not treated as
# an (invalid) Python escape sequence.
Sample = PipelineTracks.Sample
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(glob.glob("medip_*"),
                                                         r"medip_(\S+)")


def connect():
    '''connect to database.

    This method also attaches to helper databases.

    Opens the sqlite database named by ``PARAMS["database_name"]`` and
    attaches the database named by ``PARAMS["annotations_database"]``
    under the schema name ``annotations``.

    Returns a database connection.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    # hand the connection back to the caller; without this the function
    # returned None and the open connection was unreachable
    return dbh
Beispiel #3
0
# Share this module's parameter dictionary with PipelineMedip.
PipelineMedip.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

# NOTE(review): "cfastq.1.gz" may be a typo for "fastq.1.gz" - confirm
# against the data naming scheme before changing it.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "cfastq.1.gz", "csfasta.gz"]

# One Tracks collection per suffix, all added onto an empty Tracks(Sample).
# Files whose name contains PARAMS["tracks_control"] are left out.
# The pattern is a raw string so that ``\S`` is handed to the regex engine
# unchanged instead of being an invalid Python escape sequence.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("*.%s" % s)
             if PARAMS["tracks_control"] not in x],
            r"(\S+).%s" % s)
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
# NOTE(review): the file is run through exec(); it must come from a trusted
# location.
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # the context manager guarantees the file handle is closed, even if
    # compiling or executing the configuration raises
    with open("pipeline_conf.py") as _conf_handle:
        exec(compile(_conf_handle.read(), "pipeline_conf.py", 'exec'))

###################################################################
###################################################################
###################################################################
# define aggregates
    "*.sra",
    "*.export.txt.gz",
    "*.csfasta.gz",
    "*.csfasta.F3.gz",
)

# Glob patterns for the sequence files, each prefixed with DATADIR.
SEQUENCEFILES = tuple(
    os.path.join(DATADIR, suffix_name) for suffix_name in SEQUENCESUFFIXES)

# Three dash-separated fields followed by a named "suffix" group.
SEQUENCEFILES_REGEX = regex(
    r"(\S+)-(\S+)-(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra)")

Sample = PipelineTracks.AutoSample
Sample.attributes = ('tissue', 'condition', 'replicate')

# NOTE(review): the glob below uses the bare SEQUENCESUFFIXES patterns
# (relative to the working directory), whereas SEQUENCEFILES joins them
# with DATADIR - confirm the difference is intended.
# The pattern is a raw string so that ``\S`` is not treated as an (invalid)
# Python escape sequence.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    [y for x in SEQUENCESUFFIXES for y in glob.glob(x)],
    r"(\S+).(fastq.1.gz|fastq.gz|sra)")

# Aggregates over TRACKS, each built with a different set of labels.
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("tissue", "condition"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
REPLICATES = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))

#########################################################################
# summarise read 3'
#########################################################################


@follows(mkdir("sequence_characteristics.dir"))
@transform(SEQUENCEFILES, SEQUENCEFILES_REGEX,
           r"sequence_characteristics.dir/\1-\2-\3.\g<suffix>_start.tsv")
def summariseReadStart(infile, outfile):
###################################################################
###################################################################
###################################################################
##
###################################################################
# If pipeline_conf.py exists in the working directory, execute it before the
# parameters are read.
# NOTE(review): the file is run through exec(); it must come from a trusted
# location.
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # the context manager guarantees the file handle is closed, even if
    # compiling or executing the configuration raises
    with open("pipeline_conf.py") as _conf_handle:
        exec(compile(_conf_handle.read(), "pipeline_conf.py", 'exec'))

PARAMS = P.getParameters()

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Tracks are loaded from every *.gtf.gz in the working directory, with
# ".mapped.gtf.gz" passed as the exclusion.  The pattern is a raw string so
# that ``\S`` is not treated as an (invalid) Python escape sequence.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz", exclude=(".mapped.gtf.gz", ))

#####################################################################
#####################################################################
#####################################################################


@transform(TRACKS.getTracks("%s.gtf.gz"), suffix(".gtf.gz"), '.psl.gz')
def convertGtf2Psl(infile, outfile):
    """convert a gtf to a psl file.

    This method only takes features of type 'exon' and
    skips all contigs that are not in the genome sequence
    (for example the variant human chromosomes).
    """
# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])
PARAMS = P.PARAMS

# Parameters peeked from pipeline_annotations.py in the configured
# annotations directory.
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################

# Tracks are loaded from every *.fastq.gz in the working directory.  The
# pattern is a raw string so that ``\S`` is not treated as an (invalid)
# Python escape sequence.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.fastq.gz"), r"(\S+).fastq.gz")

USECLUSTER = True

###################################################################
###################################################################
###################################################################


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
Beispiel #7
0
###################################################
# Pipeline configuration
# load options from the config file
from cgatcore import pipeline as P
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
# Both patterns below are raw strings so that ``\S`` is not treated as an
# (invalid) Python escape sequence.

# Gene sets: every *.gtf.gz in the working directory.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz")

# Tracks: every *.bam in the working directory, loaded as Sample3.
TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), r"(\S+).bam")

# Aggregates over TRACKS, each built with a different set of labels.
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
Beispiel #8
0
# NOTE(review): presumably toggles cluster execution - confirm where this
# flag is read.
USECLUSTER = True

# Load options from "<script name>.ini" and from "pipeline.ini".
P.getParameters([
    "%s.ini" % os.path.splitext(__file__)[0],
    "pipeline.ini",
])
PARAMS = P.PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

#TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( [x for x in glob.glob( "*.fastq.gz" ) if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" )
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    [x.replace("../", "")
     for x in glob.glob("*.export.txt.gz") if PARAMS["tracks_control"] not in x],
    "(\S+).export.txt.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.sra") if PARAMS["tracks_control"] not in x],
        "(\S+).sra" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.fastq.gz") if PARAMS["tracks_control"] not in x],
        "(\S+).fastq.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.fastq.1.gz") if PARAMS["tracks_control"] not in x],
        "(\S+).fastq.1.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"],
                                            "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# All patterns below are raw strings so that ``\S`` is not treated as an
# (invalid) Python escape sequence.

# collect *.gtf.gz tracks; the repeats/introns/merged files are passed as
# exclusions and loaded into their own collections below
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))

# control tracks: repeats and introns
TRACKS_CONTROL = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(
        ("repeats.gtf.gz", "introns.gtf.gz"), r"(\S+).gtf.gz")

# meta track: the merged file
TRACKS_META = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    ("merged.gtf.gz", ), r"(\S+).gtf.gz")

# gene set track: genes.gtf.gz
TRACKS_GENESETS = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(("genes.gtf.gz", ),
                                             r"(\S+).gtf.gz")

# collection of all tracks including controls
TRACKS_WITH_CONTROLS = TRACKS + TRACKS_CONTROL
Beispiel #10
0
     "pipeline.ini"])

PARAMS = P.PARAMS

# Merge the parameters peeked from pipeline_annotations.py into PARAMS; the
# call is made with prefix="annotations_" and update_interface=True.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    prefix="annotations_",
    update_interface=True))

# Helper functions mapping tracks to conditions, etc
Sample = PipelineTracks.AutoSample

# define tracks based on all samples in .bamfile that are not input or index
# The directory comes from PARAMS["location_bamfiles"] and falls back to ""
# (a relative glob) when that key is absent.  The pattern is a raw string so
# that ``\S`` is not treated as an (invalid) Python escape sequence.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob(os.path.join(PARAMS.get("location_bamfiles", ""), "*.bam")),
    r"(\S+).bam",
    exclude=[".+input.+"])


@files(None, None)
def printTracks(infile, outfile):
    """Print every entry of EXPERIMENTS to standard output.

    P.warn is called once with a banner message first.  Each entry is then
    printed on its own line, preceded by a line holding a single tab.
    Neither ``infile`` nor ``outfile`` is used by the body.
    """
    P.warn("\n\n\n\nprinting tracks:")
    for experiment in EXPERIMENTS:
        # tab-only line first, then the entry itself on the next line
        print("\t", experiment, sep="\n")


def get_peak_caller_parameters(peak_caller_id):
    """
    Returns a dictionary of config file parameters for the chosen peak caller
    (an attempt to keep access to PARAMS out of associated pipeline script).