Example #1
def loadDESeqNormalize(infile, outfile):
    '''load DESeq-normalized counts, transposing the table on upload.'''
    P.load(infile, outfile, transpose=True)
Example #2
def update_report():
    '''update report.'''

    E.info("updating documentation")
    P.run_report(clean=False)
Example #3
def loadGeneSetStats(infile, outfile):
    '''
    load stats on coding and lncRNA gene sets
    '''
    P.load(infile, outfile)
Example #4
def publish_report():
    '''publish report in the CGAT downloads directory.'''

    E.info("publishing report")
    P.publish_report()
Example #5
def loadControlCPCResults(infile, outfile):
    '''load CPC results for the control set, setting the column names
    and indexing on transcript_id.'''
    P.load(infile,
           outfile,
           options="--header-names=transcript_id,feature,C_NC,CP_score "
           "--add-index=transcript_id")
Example #6
def getRepeatsFromUCSC(dbhandle,
                       repclasses,
                       outfile,
                       remove_contigs_regex=None):
    '''download repeats from UCSC database and write to `outfile` in
    :term:`gff` format.

    This method downloads repeats from the repeatmasker track at
    the UCSC.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    repclasses : list
       List of repeat classes to select. If empty, all repeat classes
       will be collected.
    outfile : string
       Filename of output file in :term:`gff` format.
    remove_contigs_regex : list
       If given, remove repeats on contigs matching the regular
       expression given.

    '''

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    # now collect repeats
    tmpfile = P.get_temp_file(".")

    for table in tables:

        sql = """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
        '.', strand, '.',
        CONCAT('class \\"', repClass, '\\"; family \\"',
        repFamily, '\\"; repName \\"', repName, '\\";')
        FROM %(table)s"""

        if repclasses:
            repclasses_str = ",".join(
                ["'" + x.strip() + "'" for x in repclasses])
            sql += ''' WHERE repClass in (%(repclasses_str)s) ''' % locals()

        sql = sql % locals()

        E.debug("executing sql statement: %s" % sql)
        cc = dbhandle.execute(sql)
        for data in cc.fetchall():
            tmpfile.write("\t".join(map(str, data)) + "\n")

    tmpfile.close()

    # sort gff and make sure that names are correct
    tmpfilename = tmpfile.name

    statement = [
        '''cat %(tmpfilename)s
    | sort -t$'\\t' -k1,1 -k4,4n
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log '''
    ]

    if remove_contigs_regex:
        statement.append('--contig-pattern="{}"'.format(
            ",".join(remove_contigs_regex)))

    statement.append('| gzip > %(outfile)s')

    statement = " ".join(statement)

    P.run(statement)

    os.unlink(tmpfilename)
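A minimal usage sketch, assuming an SQLAlchemy engine connected to the
public UCSC MySQL server (``dbhandle.execute`` must return a result object
with ``fetchall``, and the PARAMS entries interpolated inside the function,
``genome_dir`` and ``genome``, must be configured; host, build and repeat
classes below are illustrative):

from sqlalchemy import create_engine

# connect to the public UCSC MySQL server for the hg19 build
dbhandle = create_engine("mysql://genome@genome-mysql.soe.ucsc.edu/hg19")

getRepeatsFromUCSC(dbhandle,
                   repclasses=["LINE", "SINE", "LTR"],
                   outfile="repeats.gff.gz",
                   remove_contigs_regex=["chrUn.*", ".*_random"])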
Example #7
import os
import gzip
from ruffus import *

import CGATCore.Experiment as E
from CGATCore import Pipeline as P
import CGATPipelines.PipelineTracks as PipelineTracks
import CGATCore.IOTools as IOTools

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])
PARAMS = P.PARAMS

###################################################################
###################################################################
###################################################################
##
###################################################################
if os.path.exists("pipeline_conf.py"):
    E.info("reading additional configuration from pipeline_conf.py")
    exec(compile(open("pipeline_conf.py").read(), "pipeline_conf.py", 'exec'))

PARAMS = P.getParameters()

###################################################################
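For context, a brief sketch of how parameters loaded this way are consumed
inside a task; the task, the config key and the file names are hypothetical:

def filterBedSketch(infile, outfile):
    '''hypothetical task: the old-style P.run() interpolates %(...)s
    placeholders from PARAMS and the caller's local variables.'''
    min_size = PARAMS["filter_min_size"]  # hypothetical config key
    statement = '''zcat %(infile)s
    | awk '$3 - $2 >= %(min_size)i'
    | gzip > %(outfile)s'''
    P.run()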
Example #8
import os

import CGATPipelines
from CGATCore import Pipeline as P

themedir = os.path.join(os.path.dirname(CGATPipelines.__file__),
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.yml in the current
# directory and the common one.

# path where the code for pipelines is stored
pipelinesdir = os.path.dirname(CGATPipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(CGATPipelines.__file__),
                       'configuration', 'pipeline.yml')

PARAMS = P.get_parameters([inifile, "pipeline.yml"])

# Definition now part of CGATReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
################################################################
################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
Example #9
import os

import CGATCore.Experiment as E
import CGATCore.IOTools as IOTools

import CGATPipelines.PipelineMotifs as PipelineMotifs
import CGATPipelines.PipelineTracks as PipelineTracks
from CGATPipelines.Report import run_report

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
from CGATCore import Pipeline as P
P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"],
    defaults={'annotations_dir': ""})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peek_parameters(PARAMS["annotations_dir"], "genesets")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample
Example #10
def sortByPosition(infile, outfile):
    '''sort BAM file by genomic position.'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    # samtools < 1.3 syntax: the second argument is an output prefix
    # to which samtools appends ".bam"
    statement = '''samtools sort %(infile)s %(track)s;'''
    P.run()
Example #11
import sys
import os
import shutil
import sqlite3
import subprocess
import glob
from CGATCore import Experiment as E
import CGAT.Sra as Sra
from CGATCore import Pipeline as P
import CGATPipelines.PipelineRnaseq as RnaSeq
import tempfile
from CGATPipelines.Report import run_report

# load options from the config file
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(
    P.peek_parameters(PARAMS["annotations_dir"],
                      'genesets',
                      prefix="annotations_",
                      update_interface=True,
                      restrict_interface=True))

PARAMS["project_src"] = os.path.dirname(__file__)
Example #12
def loadFilteredData(infile, outfile):
    '''load the filtered data table into the database.'''
    P.load(infile, outfile)
Example #13
def loadTimePointDiffExpression(infile, outfile):
    '''load differential expression results across time points.'''
    P.load(infile, outfile)
Example #14
def loadConditionDiffExpression(infile, outfile):
    '''load differential expression results across conditions.'''
    P.load(infile, outfile)
Example #15
def loadRepeats(infile, outfile):
    '''load repeat overlap'''
    P.load(infile, outfile, "--add-index=gene_id --map=gene_id:str")
Example #16
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .yml file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam

    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.as_list(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", r"\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.as_list(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", r"\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(list(map(int, value.split(","))))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
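A short usage sketch, assuming the PipelineTracks and Experiment (E)
imports used elsewhere in these examples (the track name is hypothetical):

# look up BAM files and tag offsets for a track
track = PipelineTracks.Sample(filename="track1")
bamfiles, offsets = getAssociatedBAMFiles(track)
for bamfile, offset in zip(bamfiles, offsets):
    E.info("%s: %s (offset=%i)" % (track, bamfile, offset))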
Example #17
def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.
    '''

    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write(
            "\t".join(("QueryID", "CDSLength", "Score", "Used", "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            r">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}", line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                # emit all five columns declared in the header above
                outf.write(
                    "\t".join((id, str(length), score, used, str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA %(tmpdir)s/blastx.feat
    > %(tmpdir)s/blastx.lsv;
    ''')

    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA %(tmpdir)s/ff.feat
    > %(tmpdir)s/ff.lsv;
    ''')

    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
    > %(tmpdir)s/test.lsv;
    ''')

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
               -r %(cpc_dir)s/data/libsvm.range
               %(tmpdir)s/test.lsv
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
               %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
               %(tmpdir)s/test.lsv.scaled
               %(m_libsvm_model0)s
               %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n"
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)
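The ``" checkpoint; ".join(s)`` idiom chains the per-step fragments into a
single shell statement; in the old CGAT ``Pipeline`` module, ``checkpoint``
is a small shell helper inserted into the command prelude that aborts the
statement as soon as a step fails. A minimal sketch of the pattern (file
names hypothetical):

def sketchCheckpointChaining():
    '''illustration only: chain two shell steps with checkpoints.'''
    s = ["mkdir -p work;", "gzip -t work/input.fastq.gz;"]
    statement = " checkpoint; ".join(s)
    # statement == "mkdir -p work; checkpoint; gzip -t work/input.fastq.gz;"
    P.run()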
Example #18
def indexIntervals(infile, outfile):
    '''index intervals.
    '''
    statement = '''zcat %(infile)s | sort -k1,1 -k2,2n | bgzip > %(outfile)s; tabix -p bed %(outfile)s'''
    P.run(statement)
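Once indexed, intervals can be fetched by region; a sketch assuming pysam
is available (the file name and region are hypothetical):

import pysam

# fetch intervals overlapping chr1:1,000,000-2,000,000 from the
# bgzip-compressed, tabix-indexed BED file produced above
tabixfile = pysam.TabixFile("intervals.bed.gz")
for row in tabixfile.fetch("chr1", 1000000, 2000000):
    print(row)
tabixfile.close()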
Example #19
        P.run(statement)

    elif (os.path.exists(report_dir) and os.path.isdir(report_dir)
          and os.listdir(report_dir)):
        sys.exit(''' {} exists, not overwriting. You can manually run:
                       make html ;
                       ln -sf _build/html/report_pipeline_pq_example.html . ;
                       make latexpdf ;
                       ln -sf _build/latex/pq_example.pdf .
                       Or delete the folder and re-run make_report
                 '''.format(report_dir))

    else:
        sys.exit(''' The directory "pipeline_report" does not exist.
                     Are the paths correct?
                     An attempt was made to copy the template files from:
                     {}
                     You can also copy the files manually and then run
                     "make html" or "make latexpdf".
                 '''.format(report_path))

    return


################

################
if __name__ == "__main__":
    sys.exit(P.main(sys.argv))
################
Example #20
def buildMemeBackgroundFiles(infile, outfile):
    '''prepare the meme background model'''
    # -m 2: estimate a second-order Markov background model
    statement = '''fasta-get-markov -m 2 %(infile)s > %(outfile)s''' % locals()
    P.run(statement)
Example #21
"""
Code
====

"""
from ruffus import *

import sys
import os
import sqlite3
import CGATCore.Experiment as E
from CGATCore import Pipeline as P
import re

# load options from the config file
PARAMS = P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS["projectsrc"] = os.path.dirname(__file__)
#for key, value in PARAMS.iteritems():
#    print "%s:\t%s" % (key,value)

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(
    P.peekParameters(PARAMS["annotations_dir"],
                     "pipeline_annotations.py",
                     on_error_raise=__name__ == "__main__",
                     prefix="annotations_",
                     update_interface=True))
Example #22
import re
import glob
import os
import gzip
from ruffus import *
import sqlite3
import CGATCore.Experiment as E
from CGATCore import Pipeline as P
import CGATCore.IOTools as IOTools
import CGATCore.Database as Database
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(
    PARAMS["ancestral_repeats_dir"],
    "pipeline_ancestral_repeats.py")
Example #23
import os

import CGATCore.Experiment as E
from CGATCore import Pipeline as P
import CGAT.GTF as GTF
import CGATCore.IOTools as IOTools
import CGATPipelines.PipelineLncRNA as PipelineLncRNA

###################################################
# Pipeline configuration
###################################################
P.getParameters(
    [
        "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
        "pipeline.ini"
    ],
    defaults={
        "annotations_dir": "",
        "genesets_abinitio_coding": "pruned.gtf.gz",
        "genesets_abinitio_lncrna": "pruned.gtf.gz",
        "genesets_reference": "reference.gtf.gz",
        "genesets_refcoding": "refcoding.gtf.gz",
        "genesets_previous": ""
    })

PARAMS = P.PARAMS

PARAMS.update(
    P.peekParameters(PARAMS["annotations_annotations_dir"],
                     "pipeline_annotations.py",
                     prefix="annotations_",
                     update_interface=True))
Example #24
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id")

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska")
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
                     ON r.gene_id = ar.gene_id AND
                     ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski")
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
                            ON r.gene_id = ir.gene_id AND
                            ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)
    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
    CAST(r.gene_id AS TEXT) AS gene_id,
    r.exons_sum as length,
    r.exons_pGC as pgc,
    %(stmt_select)s
    FROM
    %(track)s_annotation AS r
    %(stmt_from)s
        WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
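To make the assembled SQL concrete: for a hypothetical track ``lincrna``
where only ``lincrna_rates.load`` exists, and with placeholder values for
``rates_min_aligned`` and ``rates_max_rate``, the generated statement is
roughly:

# CREATE TABLE lincrna_evol AS
# SELECT
#     CAST(r.gene_id AS TEXT) AS gene_id,
#     r.exons_sum AS length,
#     r.exons_pGC AS pgc,
#     a.distance AS ks, a.aligned AS aligned
# FROM lincrna_annotation AS r
# LEFT JOIN lincrna_rates AS a
#     ON r.gene_id = a.gene_id AND
#        a.aligned >= 100 AND
#        a.distance <= 0.5
# WHERE 1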
Example #25
def build_report():
    '''build report from scratch.'''

    E.info("starting documentation build process from scratch")
    P.run_report(clean=True)
Example #26
def loadOverrun(infile, outfile):
    '''load overrun annotations'''
    P.load(infile, outfile, "--add-index=gene_id --map=gene_id:str")
Example #27
def main(argv=None):
    if argv is None:
        argv = sys.argv
    P.main(argv)
Example #28
def loadDistances(infile, outfile):
    '''load distances to the closest genes'''
    P.load(infile, outfile,
           "--add-index=gene_id --map=gene_id:str --add-index=closest_id --map=closest_id:str")
    table = outfile[:-len(".load")]
Example #29
def loadLncRNAPhyloCSF(infile, outfile):
    '''parse PhyloCSF results and load them into the database.'''
    tmpf = P.getTempFilename("/ifs/scratch")
    PipelineLncRNA.parsePhyloCSF(infile, tmpf)
    P.load(tmpf, outfile, options="--add-index=gene_id")
    # remove the intermediate file once it has been loaded
    os.unlink(tmpf)
Example #30
def loadCombinedExpression(infile, outfile):
    '''load combined expression estimates into the database.'''
    P.load(infile, outfile)