Esempio n. 1
0
import CGAT.Pipeline as P

# Load the configuration. Two candidate files are passed: an .ini file
# named after this script, and "pipeline.ini".
# NOTE(review): how P.getParameters resolves conflicts between the two
# files is not visible here.
P.getParameters(["%s.ini" % os.path.splitext(__file__)[0], "pipeline.ini"])
# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

#TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( [x for x in glob.glob( "*.fastq.gz" ) if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" )
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    [x.replace("../", "")
     for x in glob.glob("*.export.txt.gz") if PARAMS["tracks_control"] not in x],
    "(\S+).export.txt.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.sra") if PARAMS["tracks_control"] not in x],
        "(\S+).sra" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.fastq.gz") if PARAMS["tracks_control"] not in x],
        "(\S+).fastq.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.fastq.1.gz") if PARAMS["tracks_control"] not in x],
        "(\S+).fastq.1.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
Esempio n. 2
0
###################################################################
###################################################################
## parameterization

# Each setting is looked up under its "mapping_"-prefixed key; the value
# of the generic key (or, failing that, a literal) is passed as the
# fallback. Assumes P.get behaves like dict.get -- TODO confirm.
EXPORTDIR = P.get('mapping_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('mapping_datadir', P.get('datadir', '.'))
DATABASE = P.get('mapping_backend', P.get('sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Collect one Tracks collection per supported input suffix and add them
# together. The patterns are raw strings so that "\S" reaches the regex
# engine as written instead of being an invalid string escape.
# NOTE(review): the csfasta files are globbed in the current working
# directory, not in DATADIR like the other suffixes -- confirm intent.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.sra" % DATADIR), r"%s/(\S+).sra" % DATADIR)
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.gz" % DATADIR),
        r"%s/(\S+).fastq.gz" % DATADIR)
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.1.gz" % DATADIR),
        r"%s/(\S+).fastq.1.gz" % DATADIR)
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("*.csfasta.gz"), r"(\S+).csfasta.gz")
)


###########################################################################
## tracks for the gene sets
class GenesetTrack(PipelineTracks.Sample):
    """Sample subclass that declares ``geneset`` as its only attribute."""
    # Overrides the ``attributes`` tuple of the base class (presumably the
    # fields a track name is made of -- confirm in PipelineTracks.Sample).
    attributes = ("geneset", )


GENESET_TRACKS = PipelineTracks.Tracks(GenesetTrack).loadFromDirectory(
Esempio n. 3
0
# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS

# Peek at the parameters of pipeline_annotations.py in the directory named
# by PARAMS["annotations_dir"]. on_error_raise is only True when this
# module is run as a script, not when it is imported.
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py",
                                      on_error_raise=__name__ == "__main__")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
# Sample class used for every track of this pipeline.
Sample = PipelineTracks.Sample

# Tracks are derived from the *.bed.gz files in the working directory.
# The pattern is a raw string so that "\S" is not treated as an
# (invalid) string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.bed.gz"), r"(\S+).bed.gz")

# File name expected for every track.
TRACKS_BEDFILES = ["%s.bed.gz" % x for x in TRACKS]

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # exec(compile(...)) replaces execfile(), which only exists in
    # Python 2; the statements still run in this module's namespace.
    # NOTE: this executes arbitrary code found in the working directory.
    with open("pipeline_conf.py") as conf_file:
        exec(compile(conf_file.read(), "pipeline_conf.py", "exec"))

###################################################################
###################################################################
###################################################################
#
# parameterization

# Each setting is looked up under its "rnaseqdiffexpression_"-prefixed
# key; the value of the generic key (or, failing that, a literal) is
# passed as the fallback. Assumes P.get behaves like dict.get -- TODO
# confirm.
EXPORTDIR = P.get('rnaseqdiffexpression_exportdir',
                  P.get('exportdir', 'export'))
DATADIR = P.get('rnaseqdiffexpression_datadir', P.get('datadir', '.'))
DATABASE = P.get('rnaseqdiffexpression_backend',
                 P.get('sql_backend', 'sqlite:///./csvdb'))

# Read by subscript, i.e. without a fallback value.
DATABASE_ANNOTATIONS = P['annotations_database']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################

# All patterns below are raw strings so that "\S" reaches the regex
# engine as written instead of being an invalid string escape.

# NOTE(review): the glob is rooted at DATADIR but the pattern is not --
# confirm that the resulting track names are the intended ones.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR), r"(\S+).bam")

# Groupings of TRACKS by the attributes named in ``labels``.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

# Further collections, each derived from files in the working directory.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

DESIGNS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("design*.tsv"), r"(\S+).tsv")

METHODS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*_stats.tsv"), r"(\S+)_stats.tsv")
Esempio n. 5
0
import CGAT.Pipeline as Pipeline
# Peek at the parameters of pipeline_chipseq.py in the current directory.
PARAMS_PIPELINE = Pipeline.peekParameters( ".",
                                           "pipeline_chipseq.py" )

import CGATPipelines.PipelineTracks as PipelineTracks

# Sample class used for every track of this report.
Sample = PipelineTracks.Sample3

# File suffixes that are searched for input data.
suffixes = ["export.txt.gz",
            "sra",
            "fastq.gz",
            "fastq.1.gz",
            "csfasta.gz"]

# One Tracks collection per suffix (skipping any path that contains
# "input"), summed onto an empty collection. The pattern is a raw string
# so that "\S" is not treated as an (invalid) string escape.
TRACKS = sum(
    [PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("%s/*.%s" % (DATADIR, s)) if "input" not in x],
        r"%s/(\S+).%s" % (DATADIR, s))
     for s in suffixes],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

# Groupings of TRACKS by the attributes named in ``labels``.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

############################################################################
# The following need to be parameterized in a config file
# TISSUES=["GM00855", "GM00861" ]
# CONDITIONS=["D3", "unstim" ]
# REPLICATES=["R1", "R2" ]
Esempio n. 6
0
# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
# Merge the parameters of pipeline_annotations.py (found in
# PARAMS["annotations_dir"]) into PARAMS. on_error_raise is only True
# when this module is run as a script, not when it is imported.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# define some tracks if needed

# Tracks are derived from the *.ini files in the working directory. The
# pattern is a raw string so that "\S" is not an invalid string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.ini"), r"(\S+).ini")


# --------------------------< utility functions >---------------------------- #

def connect():
    '''Connect to database.
       Use this method to connect to additional databases.
       Returns an sqlite3 database handle.
    '''

    dbh = sqlite3.connect(PARAMS["database"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
Esempio n. 7
0
###################################################################
###################################################################
## parameterization

# All three settings are read by subscript, i.e. without a fallback value.
EXPORTDIR=P['rnaseqtranscripts_exportdir']
DATADIR=P['rnaseqtranscripts_datadir']
DATABASE=P['rnaseqtranscripts_backend']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Tracks are derived from the *.bam files in DATADIR. Patterns are raw
# strings so that "\S" is not treated as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR), r"%s/(\S+).bam" % DATADIR)

# Groupings of TRACKS by the attributes named in ``labels``.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

# Gene sets are derived from the *.gtf.gz files in the working directory.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

###########################################################################

CUFFDIFF_LEVELS = ("gene", "isoform", "cds", "tss")

###########################################################################
## shorthand
Esempio n. 8
0
###################################################

# load options from the config file
import CGAT.Pipeline as P
# Read the configuration from pipeline.ini.
P.getParameters("pipeline.ini")

# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS

###################################################################
###################################################################
## Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# collect fastq.gz tracks
# Tracks from *.fastq.gz and *.fastq.1.gz files in the working directory.
# Patterns are raw strings so that "\S" is not treated as an (invalid)
# string escape.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.gz"), r"(\S+).fastq.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), r"(\S+).fastq.1.gz")
)

# NOTE(review): ALL is a Sample3 constructed without arguments, not an
# Aggregate over TRACKS like the groupings below -- confirm intent.
ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
## Global flags
###################################################################
ASSEMBLERS = P.asList(PARAMS["general_assemblers"])
# True when "meta-velvet", "ibda" or "cortex_var" is among ASSEMBLERS.
# NOTE(review): "ibda" may be a misspelling of "idba" -- check it against
# the values actually used in the configuration file.
METAGENOME = "meta-velvet" in ASSEMBLERS or "ibda" in ASSEMBLERS or "cortex_var" in ASSEMBLERS

# NOTE(review): ASSEMBLERS is rebound here from a different key
# ("assemblers") after METAGENOME was computed from "general_assemblers"
# -- confirm that both keys are intended.
ASSEMBLERS = P.asList(PARAMS["assemblers"])
Esempio n. 9
0
###################################################
# Pipeline configuration
# load options from the config file
from CGATCore import Pipeline as P

# Load the configuration. Three candidate files are passed: pipeline.ini
# in a directory named after this script, ../pipeline.ini and
# pipeline.ini in the working directory.
# NOTE(review): how P.getParameters resolves conflicts between them is
# not visible here.
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
# Patterns are raw strings so that "\S" is not treated as an (invalid)
# string escape.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")
TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), r"(\S+).bam")
# Groupings of TRACKS by the attributes named in ``labels``.
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
Esempio n. 10
0
# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS
# Peek at the parameters of pipeline_annotations.py in the directory named
# by PARAMS["annotations_dir"].
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

##########################################################################
##########################################################################
# Helper functions mapping tracks to conditions, etc
##########################################################################

import CGATPipelines.PipelineTracks as PipelineTracks

# Sample class used for every track of this pipeline.
Sample = PipelineTracks.AutoSample

# define tracks based on all samples in .bamfile that are not input or index
# (the pattern is a raw string so that "\S" is not an invalid escape)
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob(os.path.join(PARAMS["location_bamfiles"], "*.bam")),
    r"(\S+).bam",
    exclude=[".+input.+"])


@files(None, None)
def printTracks(infile, outfile):
    """Print every entry of EXPERIMENTS to stdout.

    Both arguments come from the ``@files(None, None)`` decorator and are
    not used by the body.
    """
    P.warn("\n\n\n\nprinting tracks:")
    for track in EXPERIMENTS:
        # Single-argument print(...) produces the same output under
        # Python 2 and Python 3.
        print("\t")
        print(track)


def get_peak_caller_parameters(peak_caller_id):
    """
    Returns a dictionary of config file parameters for the chosen peak caller
    (an attempt to keep access to PARAMS out of associated pipeline script).
Esempio n. 11
0
                    "*.sra",
                    "*.export.txt.gz",
                    "*.csfasta.gz",
                    "*.csfasta.F3.gz",
                    )

# Glob patterns for the input files, rooted at DATADIR.
SEQUENCEFILES = tuple([os.path.join(DATADIR, suffix_name)
                       for suffix_name in SEQUENCESUFFIXES])

# NOTE(review): only the fastq.1.gz, fastq.gz and sra suffixes are
# accepted here although SEQUENCESUFFIXES lists further patterns.
SEQUENCEFILES_REGEX = regex(
    r"(\S+)-(\S+)-(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra)")

# Sample is an alias, so the assignment below changes ``attributes`` on
# PipelineTracks.AutoSample itself.
Sample = PipelineTracks.AutoSample
Sample.attributes = ('tissue', 'condition', 'replicate')
# The pattern is a raw string so that "\S" is not treated as an
# (invalid) string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    [y for x in SEQUENCESUFFIXES for y in glob.glob(x)],
    r"(\S+).(fastq.1.gz|fastq.gz|sra)")

# Groupings of TRACKS by the attributes named in ``labels``.
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("tissue", "condition"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
REPLICATES = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))

#########################################################################
# summarise read 3'
#########################################################################


@follows(mkdir("sequence_characteristics.dir"))
@transform(SEQUENCEFILES,
           SEQUENCEFILES_REGEX,
           r"sequence_characteristics.dir/\1-\2-\3.\g<suffix>_start.tsv")
])

# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS
# Peek at the parameters of pipeline_annotations.py in the directory named
# by PARAMS["annotations_dir"].
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Sample class used for every track of this pipeline.
Sample = PipelineTracks.AutoSample

# collect tracks from the .bam files in the working directory
# (patterns are raw strings so that "\S" is not an invalid string escape)
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.bam"), r"(\S+).bam")

# group by experiment (assume that last field is a replicate identifier)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))

# gene sets are derived from the .gtf.gz files in the working directory
GENESETS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

###################################################################
###################################################################
###################################################################


def connect():
    '''connect to database.
Esempio n. 13
0
## Pipeline configuration
import CGAT.Pipeline as P
# Read the configuration from pipeline_capseq.ini.
P.getParameters("pipeline_capseq.ini")
# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS
# NOTE(review): presumably toggles cluster submission; how the flag is
# consumed is not visible here.
USECLUSTER = True

###################################################################
###################################################################
###################################################################
## Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
# Sample class used for every track of this pipeline.
Sample = PipelineTracks.Sample3

# One Tracks collection per supported suffix; files whose name contains
# PARAMS["tracks_control"] are left out. The csfasta filter previously
# read the non-matching key "track_control"; it now uses the same key as
# the other four filters. Patterns are raw strings so that "\S" is not
# treated as an (invalid) string escape.
TRACKS = (
    PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("*.export.txt.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).export.txt.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.sra")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).sra")
    + PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.fastq.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).fastq.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.fastq.1.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).fastq.1.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.csfasta.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).csfasta.gz")
)

# Debug output listing every track. A single pre-formatted argument makes
# print(...) emit the same text under Python 2 and Python 3.
for X in TRACKS:
    print("TRACK= %s \n" % (X,))
Esempio n. 14
0
# Module-level shortcut to the parameter container of the Pipeline module.
PARAMS = P.PARAMS
# Peek at the parameters of pipeline_annotations.py in the directory named
# by PARAMS["annotations_dir"].
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# Hand both parameter sets to the PipelineiCLIP module.
PipelineiCLIP.PARAMS = PARAMS
PipelineiCLIP.PARAMS_ANNOTATIONS = PARAMS_ANNOTATIONS
# Parent directory of the directory that contains this file.
PARAMS["project_src"] = os.path.join(os.path.dirname(__file__), "..")

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# define some tracks if needed
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3)
# The third tab-separated column of every line of sample_table.tsv is
# used as the track's filename. The table is opened in a ``with`` block
# so the handle is closed once all lines are read.
# NOTE(review): the field is not stripped, so it carries the line's
# trailing newline if the table has exactly three columns -- confirm.
with IOTools.openFile("sample_table.tsv") as sample_table:
    for line in sample_table:
        track = line.split("\t")[2]
        TRACKS.tracks.append(PipelineTracks.Sample3(filename=track))


###################################################################
def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''

    dbh = sqlite3.connect(PARAMS["database"])
Esempio n. 15
0
                                      "pipeline_annotations.py",
                                      on_error_raise=__name__ == "__main__")

# link up with ancestral repeats
# Peek at the parameters of pipeline_ancestral_repeats.py in the directory
# named by PARAMS["ancestral_repeats_dir"].
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"],
                                            "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# collect tracks from the .gtf.gz files in the working directory, leaving
# out the repeats, introns and merged files, which are loaded separately
# below (patterns are raw strings so that "\S" is not an invalid escape)
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))

TRACKS_CONTROL = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(
        ("repeats.gtf.gz", "introns.gtf.gz"), r"(\S+).gtf.gz")

TRACKS_META = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    ("merged.gtf.gz", ), r"(\S+).gtf.gz")

TRACKS_GENESETS = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(("genes.gtf.gz", ),
                                             r"(\S+).gtf.gz")

# collection of all tracks including controls
TRACKS_WITH_CONTROLS = TRACKS + TRACKS_CONTROL
Esempio n. 16
0
###################################################################
###################################################################
###################################################################
##
###################################################################
# If pipeline_conf.py exists in the working directory, execute it in this
# module's namespace.
# NOTE: this executes arbitrary code found in the working directory.
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # Read through a ``with`` block so the file handle is closed
    # immediately instead of whenever it is garbage-collected.
    with open("pipeline_conf.py") as conf_file:
        exec(compile(conf_file.read(), "pipeline_conf.py", 'exec'))

PARAMS = P.getParameters()

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Tracks are derived from the *.gtf.gz files in the working directory;
# ".mapped.gtf.gz" is passed as an exclusion. The pattern is a raw string
# so that "\S" is not treated as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz", exclude=(".mapped.gtf.gz", ))

#####################################################################
#####################################################################
#####################################################################


@transform(TRACKS.getTracks("%s.gtf.gz"), suffix(".gtf.gz"), '.psl.gz')
def convertGtf2Psl(infile, outfile):
    """convert a gtf to a psl file.

    This method only takes features of type 'exon' and
    skips all contigs that are not in the genome sequence
    (for example the variant human chromosomes).
    """
Esempio n. 17
0
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    The ``<table>_fastqc_Overrepresented_sequences`` table of every
    relevant track is queried. Each reported source that is also listed
    in `contaminants_file` is written to `outfile` with the sequence
    given in that file; sources that are not listed are not reported.
    Entries are written in sorted order so the output is reproducible.
    If no overrepresented sequences are found at all, `outfile` is
    created via ``P.touch`` and nothing else is written.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.

    '''
    # An .sra file for which Sra.peek reports two files, and any
    # ".fastq.1.gz" input, have their results stored under two tables.
    paired_tracks = [track + "_fastq_1", track + "_fastq_2"]
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, _fastq_format, _datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = paired_tracks
    elif infile.endswith(".fastq.1.gz"):
        tracks = paired_tracks

    found_contaminants = []
    cc = dbh.cursor()

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if sample name starts with a number, sql table will have
        # prepended "_"
        if re.match(r"^\d+.*", table):
            table = "_" + table

        # The table name is an identifier and therefore cannot be passed
        # as a bound parameter; it is derived from the track name.
        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if not found_contaminants:
        P.touch(outfile)
        return

    # read contaminants from existing file: the last whitespace-separated
    # field of a line is the sequence, everything before it the name
    known_contaminants = {}
    with IOTools.openFile(contaminants_file, "r") as inf:
        for line in inf:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.split()
            known_contaminants[" ".join(fields[:-1])] = fields[-1]

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!
    matched_contaminants = set()
    for found_source, _found_seq in found_contaminants:
        # keep only the text in front of the first " ("
        possible_source = found_source.split(" (")[0]
        if possible_source in known_contaminants:
            matched_contaminants.add(possible_source)

    with IOTools.openFile(outfile, "w") as outf:
        for match in sorted(matched_contaminants):
            # NOTE(review): " ," (space, comma) is removed from the
            # header; confirm that ", " was not the intended pattern.
            outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                      known_contaminants[match]))
Esempio n. 18
0
# Hand the parameters to the PipelineMedip module.
PipelineMedip.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

# File suffixes that are searched for input data.
# NOTE(review): "cfastq.1.gz" was corrected to "fastq.1.gz"; confirm that
# no input relies on the old spelling.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# One Tracks collection per suffix (skipping files whose name contains
# PARAMS["tracks_control"]), summed onto an empty collection. The pattern
# is a raw string so that "\S" is not an invalid string escape.
TRACKS = sum(
    [PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("*.%s" % s)
         if PARAMS["tracks_control"] not in x],
        r"(\S+).%s" % s)
     for s in suffixes],
    PipelineTracks.Tracks(Sample))

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
# NOTE: this executes arbitrary code found in the working directory.
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # Read through a ``with`` block so the file handle is closed
    # immediately instead of whenever it is garbage-collected.
    with open("pipeline_conf.py") as conf_file:
        exec(compile(conf_file.read(), "pipeline_conf.py", 'exec'))

###################################################################
###################################################################
###################################################################
# define aggregates
Esempio n. 19
0
from CGATCore import Pipeline as P
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
# Load the configuration. Three candidate files are passed: pipeline.ini
# in a directory named after this script, ../pipeline.ini and
# pipeline.ini in the working directory.
# NOTE(review): the slice assumes __file__ ends in ".py"; any other
# suffix of a different length would yield a wrong directory name.
P.getParameters([
    "%s/pipeline.ini" % __file__[:-len(".py")], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS = P.PARAMS

# Peek at the parameters of pipeline_annotations.py in the directory named
# by PARAMS["annotations_dir"].
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

Sample = PipelineTracks.Sample
# Tracks are derived from the entries matching "medip_*" in the working
# directory. The pattern is a raw string so that "\S" is not treated as
# an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("medip_*"), r"medip_(\S+)")

def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()
Esempio n. 20
0
# Hand the parameters to the PipelineMotifs module.
PipelineMotifs.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
# determine the location of the input files (reads).
DATADIR = PARAMS.get('input', '.')
if not os.path.exists(DATADIR):
    # The directory name is now interpolated; the message previously
    # contained a literal, unfilled "%s".
    raise OSError('data directory %s does not exist' % DATADIR)

Sample = PipelineTracks.Sample

# Tracks are derived from the *.bed.gz files in DATADIR. The pattern is a
# raw string so that "\S" is not treated as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob(os.path.join(DATADIR, "*.bed.gz")), r"(\S+).bed.gz")

# File path expected for every track.
BEDFILES = [os.path.join(DATADIR, "%s.bed.gz") % x for x in TRACKS]


# create an indicator target
@transform(BEDFILES, suffix(".gz"), ".gz")
def BedFiles(infile, outfile):
    """Indicator target for the files in BEDFILES; the body does nothing."""
    pass


# All *.bam files found directly in DATADIR.
BAMFILES = glob.glob(os.path.join(DATADIR, "*.bam"))


def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.
Esempio n. 21
0
###################################################################
###################################################################
# parameterization

# Each setting is looked up under its "readqc_"-prefixed key; the value of
# the generic key (or, failing that, a literal) is passed as the
# fallback. Assumes P.get behaves like dict.get -- TODO confirm.
EXPORTDIR = P.get('readqc_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('readqc_datadir', P.get('datadir', '.'))
DATABASE = P.get('readqc_backend', P.get('sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Collect one Tracks collection per supported input suffix and add them
# together. The patterns are raw strings so that "\S" reaches the regex
# engine as written instead of being an invalid string escape.
# NOTE(review): the csfasta files are globbed in the current working
# directory, not in DATADIR like the other suffixes -- confirm intent.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.sra" % DATADIR), r"(\S+).sra")
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.gz" % DATADIR), r"(\S+).fastq.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.1.gz" % DATADIR), r"(\S+).fastq.1.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("*.csfasta.gz"), r"(\S+).csfasta.gz")
)

###########################################################################


class ReadqcTracker(TrackerSQL):
    '''Define convenience tracks for plots'''
    def __init__(self, *args, **kwargs):
        # Forward all arguments to TrackerSQL, always supplying the
        # module-level DATABASE as the ``backend`` keyword.
        TrackerSQL.__init__(self, *args, backend=DATABASE, **kwargs)
Esempio n. 22
0
    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    return dbh


class MySample(PipelineTracks.Sample):
    """Sample subclass whose attribute names come from the configuration."""
    # PARAMS["attributes"] is a comma-separated string; it is split once,
    # when the class body is executed.
    attributes = tuple(PARAMS["attributes"].split(","))


# Tracks are derived from the *.bam files in the working directory.
# Patterns are raw strings so that "\S" is not treated as an (invalid)
# string escape.
TRACKS = PipelineTracks.Tracks(MySample).loadFromDirectory(
    glob.glob("*.bam"), r"(\S+).bam")

# Designs are derived from the *.design.tsv files in the working directory.
Sample = PipelineTracks.AutoSample
DESIGNS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.design.tsv"), r"(\S+).design.tsv")

###################################################################
###################################################################
###################################################################
# DEXSeq workflow
###################################################################


@mkdir("results.dir")
@files(PARAMS["annotations_interface_geneset_all_gtf"], "geneset_flat.gff")
def buildGff(infile, outfile):