コード例 #1
0
def buildAllStats(infiles, outfile):
    '''Paste several stats files side by side into a single file.

    A shell ``paste`` command is assembled from all filenames in
    `infiles` (kept in their given order) with its output redirected
    to `outfile`.

    Arguments
    ---------
    infiles : list
        Filenames to paste together.
    outfile : string
        Filename the pasted output is redirected to.
    '''
    # NOTE(review): P.run() is called without arguments; presumably it
    # picks up the local variable ``statement`` - confirm against the
    # Pipeline module in use.
    statement = '''paste %s > %s''' % (" ".join(infiles), outfile)
    P.run()
コード例 #2
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def buildCDS(infile, outfile):
    '''output CDS features from an ENSEMBL gene set.

    Take all features from a :term:`gtf` file that are of feature type
    ``CDS`` and that are annotated as protein-coding.

    Note that only the coding parts of exons are output - UTRs are
    removed.

    The input is read through ``gunzip`` and the output is written
    through ``gzip``. Both ``gtf2gtf`` calls log to ``<outfile>.log``.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output filename in :term:`gtf` format.

    '''
    # NOTE(review): the %(infile)s / %(outfile)s placeholders are not
    # filled in here; presumably P.run substitutes them from this
    # function's local variables - confirm against the Pipeline module.
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=filter --filter-method=proteincoding
    --log=%(outfile)s.log
    | awk '$3 == "CDS"'
    | cgat gtf2gtf
    --method=remove-duplicates --duplicate-feature=gene
    --log=%(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run(statement)
コード例 #3
0
def buildAnnotatorSlicedSegments(tmpdir, outfile, track, slice):
    '''Write the segments of one slice of a track to a temporary file.

    Rows are selected from ``<track>_gtf`` joined with
    ``<track>_annotation`` on ``gene_id`` and piped through
    ``gtf2tsv --invert`` and ``gff2annotator2tsv --section=segments``
    into ``<tmpdir>/segments``.

    Arguments
    ---------
    tmpdir : string
        Directory in which the file ``segments`` is created.
    outfile : string
        Only used to name the log file ``<outfile>.log``.
    track : string
        Prefix of the ``_gtf`` and ``_annotation`` tables.
    slice : string
        ``"all"`` selects every row, any other value restricts the
        query with the condition ``is_<slice>``.

    Returns
    -------
    string or None
        Path of the segments file, or None if that file is empty.
    '''
    # NOTE(review): to_cluster, where and tmpsegments are not passed to
    # P.run explicitly; presumably they are read from this function's
    # locals by name - do not rename without checking.
    to_cluster = True
    tmpsegments = os.path.join(tmpdir, "segments")

    where = "'1'" if slice == "all" else "is_%(slice)s" % locals()

    statement = '''
    %(cmd-sql)s %(database)s
    "SELECT g.* FROM %(track)s_gtf as g, %(track)s_annotation AS a WHERE a.gene_id = g.gene_id AND %(where)s"
    | cgat gtf2tsv --invert
    | cgat gff2annotator2tsv
    --remove-regex='%(annotator_remove_pattern)s'
    --log=%(outfile)s.log
    --section=segments
    > %(tmpsegments)s
    '''

    P.run(statement)

    # an empty result file means there is nothing to annotate
    if os.path.getsize(tmpsegments) > 0:
        return tmpsegments
    return None
コード例 #4
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def loadPeptideSequences(infile, outfile):
    '''load ENSEMBL peptide file into database

    This method removes empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.

    '''

    # The two option literals are concatenated by Python, so the first
    # one must end in a space.  It previously read "--add-protein_id"
    # with no trailing space, which produced the single bogus option
    # "--add-protein_id--map=protein_id:str".  The index option follows
    # the "--add-index=<column>" form used by the other loaders.
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # NOTE(review): in the first perl step ``if ("^>")`` tests a
    # non-empty string literal, not a regular expression, so the
    # substitution is applied to every line rather than only to header
    # lines.  The second perl step replaces the first "id" on each
    # line; presumably this is only meant to rename the header column.
    statement = '''gunzip
    < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | cgat fasta2fasta --method=filter
    --filter-method=min-length=1
    | cgat fasta2table --section=length
    --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
コード例 #5
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def buildNonCodingExons(infile, outfile):
    '''output non-coding exons from ENSEMBL gene set.

    Keep all features from a :term:`gtf` file that are of feature
    type ``exon`` and that are not annotated as protein-coding (the
    protein-coding filter is applied with ``--invert-filter``).

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output filename in :term:`gtf` format.

    '''

    # NOTE(review): the %(infile)s / %(outfile)s placeholders are
    # presumably substituted by P.run from this function's locals.
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=filter --filter-method=proteincoding --invert-filter
    --log=%(outfile)s.log
    | awk '$3 == "exon"'
    | cgat gtf2gtf
    --method=remove-duplicates --duplicate-feature=gene
    --log=%(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run(statement)
コード例 #6
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def buildPromotorRegions(infile, outfile, promotor_size=1000):
    '''annotate promotor regions from reference gene set.

    This method builds promotor regions for transcripts
    in an ENSEMBL gene set.

    The gene set is first passed through ``gff2gff --method=sanitize``
    and then through ``gtf2gff --method=promotors``; the result is
    gzip-compressed.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Filename in :term:`gff` format.
    promotor_size : int
       Size of the promotor region (nucleotides upstream
       of TSS).
    '''

    # NOTE(review): promotor_size is only used through the
    # %(promotor_size)s placeholder, and genome_dir / genome are not
    # defined in this function; presumably P.run resolves placeholders
    # from the locals and the pipeline configuration.
    # The backslash after --promotor-size is a Python line continuation
    # inside the string literal: it joins that line with the next one.
    statement = """
    gunzip < %(infile)s
    | cgat gff2gff --method=sanitize
    --sanitize-method=genome
    --skip-missing --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | cgat gtf2gff --method=promotors
    --promotor-size=%(promotor_size)s \
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    """
    P.run(statement)
コード例 #7
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def sortGTF(infile, outfile, order="contig+gene", job_memory="8G"):
    '''Sort a :term:`gtf` file with ``gtf2gtf --method=sort``.

    Input ending in ``.gz`` is read with ``zcat``; output ending in
    ``.gz`` is written through ``gzip``.  Otherwise ``cat`` is used on
    the respective side.

    Arguments
    ---------
    infile : string
       Geneset in :term:`gtf` format.
    outfile : string
       Geneset in :term:`gtf` format.
    order : string
       Sort order. See :mod:`scripts/gtf2gtf` for valid options for
       `order`.
    job_memory : string
       Memory setting handed on to ``P.run``.

    '''
    # piping an uncompressed file through "cat" is wasteful, but it
    # keeps the command template the same for both cases
    uncompress = "zcat" if infile.endswith(".gz") else "cat"
    compress = "gzip" if outfile.endswith(".gz") else "cat"

    statement = '''%(uncompress)s %(infile)s
    | cgat gtf2gtf
    --method=sort --sort-order=%(order)s --log=%(outfile)s.log
    | %(compress)s > %(outfile)s'''

    P.run(statement, job_memory=job_memory)
コード例 #8
0
ファイル: pipeline_rrbs.py プロジェクト: logust79/cgat-flow
def buildBigWig(infile, outfile):
    '''build wiggle files from bam files.

    Generate :term:`bigWig` format file from :term:`bam` alignment file
    by calling ``cgat bam2wiggle --output-format=bigwig``.  The
    standard output of the command goes to ``<outfile>.log``.

    Parameters
    ----------
    infile : str
       Input filename in :term:`bam` format
    outfile : str
       Output filename in :term:`bigwig` format

    bigwig_options : str
       Referenced as ``%(bigwig_options)s`` in the command; it is not
       defined in this function (presumably :term:`PARAMS`).

    '''
    # NOTE(review): the original documentation also listed a PARAMS
    # entry ``annotations_interface_contigs`` (a bed file); the command
    # below does not reference it - confirm whether it is still needed.

    # wigToBigWig observed to use 16G
    # NOTE(review): job_memory and statement are never passed on
    # explicitly; P.run() is called without arguments and presumably
    # reads both from this function's locals - confirm that the
    # Pipeline module in use supports the argument-less call.
    job_memory = "16G"
    statement = '''cgat bam2wiggle
    --output-format=bigwig
    %(bigwig_options)s
    %(infile)s
    %(outfile)s
    > %(outfile)s.log'''
    P.run()
コード例 #9
0
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    The input is read with ``zcat``, converted with ``gff2bed
    --set-name=class``, lines containing ``#`` are dropped and the
    first four columns are loaded under the header names
    ``contig,start,stop,class`` with an index on ``class``.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.

    """
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    # NOTE(review): %(infile)s, %(load_statement)s and %(outfile)s are
    # presumably substituted by P.run from this function's locals.
    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, job_memory=PARAMS["job_memory"])
コード例 #10
0
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.

    A ``fastq_screen.conf`` file is written into a temporary directory
    with one ``DATABASE`` line for every PARAMS entry whose name
    starts with ``fastq_screen_database``.  The command built by
    ``PipelineMapping.FastqScreen`` is then run and `outfile` is
    touched once the run has succeeded.  The temporary directory is
    removed whether or not the run succeeds.

    Arguments
    ---------
    infiles : string or list
        Input passed on to ``FastqScreen.build`` as ``(infiles, )``.
    outfile : string
        Marker file that is touched after a successful run.

    Raises
    ------
    ValueError
        If ``PARAMS['fastq_screen_options']`` does not contain exactly
        one ``--threads <n>`` setting.
    '''

    # variables required for statement built by FastqScreen()
    tempdir = P.get_temp_dir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # Everything that can fail after the temporary directory exists is
    # inside try/finally so that the directory is never left behind.
    try:
        # configure job_threads with fastq_screen_options from PARAMS
        job_threads = re.findall(r'--threads \d+',
                                 PARAMS['fastq_screen_options'])
        if len(job_threads) != 1:
            raise ValueError("Wrong number of threads for fastq_screen")

        job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

        # Create fastq_screen config file in temp directory
        # using parameters from Pipeline.yml
        with IOTools.open_file(os.path.join(tempdir, "fastq_screen.conf"),
                               "w") as f:
            for i, k in PARAMS.items():
                if i.startswith("fastq_screen_database"):
                    # i[22:] drops the 22-character prefix
                    # "fastq_screen_database_" to get the database name
                    f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

        m = PipelineMapping.FastqScreen()
        statement = m.build((infiles, ), outfile)
        P.run(statement, job_memory="8G")
    finally:
        shutil.rmtree(tempdir)

    IOTools.touch_file(outfile)
コード例 #11
0
ファイル: pipeline_rrbs.py プロジェクト: logust79/cgat-flow
def callMethylationStatus(infile, outfile):
    '''Run ``bismark_methylation_extractor`` on one alignment file.

    ``--single-end`` is used when `infile` ends in ``bismark_bt2.bam``
    or ``bismark_bt.bam`` and ``--paired-end`` otherwise; a non-empty
    ``PARAMS["bismark_extraction_options"]`` is appended.  After the
    extraction the CpG, CHG and CHH context files in
    ``methylation.dir`` are compressed with ``gzip -f``.

    Arguments
    ---------
    infile : string
        Alignment file handed to the extractor.
    outfile : string
        Its basename without ``.bismark.cov`` names the context files.
    '''
    # NOTE(review): the names options, outdir, index_dir, genome, CG,
    # CHG and CHH are looked up by name through ``% locals()`` below -
    # do not rename them.
    options = (" --single-end "
               if infile.endswith(("bismark_bt2.bam", "bismark_bt.bam"))
               else " --paired-end ")

    extra_options = PARAMS["bismark_extraction_options"]
    if extra_options:
        options += extra_options

    outdir = "methylation.dir"
    index_dir = PARAMS["bismark_index_dir"]
    genome = PARAMS["bismark_genome"]

    # the three context files differ only in the context name
    CG = ("methylation.dir/CpG_context_" +
          P.snip(os.path.basename(outfile), ".bismark.cov") + ".txt")
    CHG = CG.replace("CpG", "CHG")
    CHH = CG.replace("CpG", "CHH")

    statement = '''bismark_methylation_extractor %(options)s
                --comprehensive --output %(outdir)s --counts
                --cytosine_report --bedGraph
                --genome_folder %(index_dir)s/%(genome)s/
                %(infile)s; gzip -f %(CG)s; gzip -f  %(CHG)s; gzip -f %(CHH)s
                ''' % locals()
    P.run()
コード例 #12
0
def runFastqc(infiles, outfile):
    '''Run FastQC on the input through ``PipelineMapping.FastQc``.

    The FastQc object is configured from PARAMS (``readqc_no_group``,
    ``exportdir``, ``qual_format``).  A contaminants file is only
    passed when the user has asked for it.  When
    ``PARAMS["general_reconcile"]`` equals 1 the input path is
    switched from ``processed.dir/trimmed`` to
    ``reconciled.dir/trimmed``.

    Arguments
    ---------
    infiles : string
        Input filename; handed to ``FastQc.build`` as ``(infiles, )``.
    outfile : string
        Output filename handed to ``FastQc.build``.
    '''
    # MM: only pass the contaminants file list if requested by user,
    # do not make this the default behaviour
    # NOTE(review): the key is spelled 'contaiminants'; it is kept as
    # is because it has to match the configuration file.
    use_custom = PARAMS['use_custom_contaiminants']

    fastqc_kwargs = {
        "nogroup": PARAMS["readqc_no_group"],
        "outdir": PARAMS["exportdir"] + "/fastqc",
    }
    if use_custom:
        fastqc_kwargs["contaminants"] = PARAMS['contaminants_path']
    fastqc_kwargs["qual_format"] = PARAMS['qual_format']

    m = PipelineMapping.FastQc(**fastqc_kwargs)

    if PARAMS["general_reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles, ), outfile)
    P.run(statement)
コード例 #13
0
    def quantifyWithSailfish(infiles, outfile):
        '''
        Quantify gene/transcript expression with sailfish

        Runs ``cgat fastq2tpm --program=sailfish --method=quant``.

        Arguments
        ---------
        infiles : sequence
            ``infiles[0]`` is passed to the command as the fastq
            input, ``infiles[1][1]`` as the gene set (``--gene-gtf``)
            and the parent directory of ``infiles[1][0]`` as the index
            (``--index-file``).
        outfile : string
            ``<outfile>.log`` is the log file; `outfile` without its
            last ``.``-separated suffix is the output directory.
        '''

        fastqs = infiles[0]
        geneset = infiles[1][1]
        # TODO: need to check that fastq2 file exists
        # if not, run as single-end
        # directory part of the index path (everything before the last "/")
        index_dir = "/".join(infiles[1][0].split("/")[:-1])
        # outfile with its last "."-separated suffix removed
        out_dir = ".".join(outfile.split(".")[:-1])
        # NOTE(review): job_threads and job_memory are not passed to
        # P.run explicitly; presumably they are read from the locals.
        job_threads = 8
        job_memory = "6G"

        # not referenced in the command below
        count_file = "/".join([out_dir, "quant.sf"])

        statement = '''
        cgat fastq2tpm
        --log=%(outfile)s.log
        --program=sailfish
        --method=quant
        --gene-gtf=%(geneset)s
        --index-file=%(index_dir)s
        --output-directory=%(out_dir)s
        --library-type=%(sailfish_library)s
        --threads=%(job_threads)s
        %(fastqs)s;
        '''

        P.run(statement)
コード例 #14
0
    def quantifyWithSailfish(infiles, outfile):
        '''
        Quantify gene/transcript expression with sailfish

        Runs ``cgat fastq2tpm --program=sailfish --method=quant`` and
        always passes ``--paired-end``.

        Arguments
        ---------
        infiles : sequence
            ``infiles[0]`` is the first fastq file, ``infiles[1][1]``
            the second fastq file, ``infiles[1][2]`` the gene set
            (``--gene-gtf``) and ``infiles[1][0]`` the index
            (``--index-file``).
        outfile : string
            Its directory part is used as the output directory; the
            log file is ``<output directory>.log``.
        '''

        fastq1 = infiles[0]
        # TODO: need to check that fastq2 file exists
        # if not, run as single-end
        fastq2 = infiles[1][1]
        geneset = infiles[1][2]

        index_dir = infiles[1][0]
        # directory part of outfile (everything before the last "/")
        out_dir = "/".join(outfile.split("/")[:-1])
        # NOTE(review): job_threads and job_memory are not passed to
        # P.run explicitly; presumably they are read from the locals.
        job_threads = 6
        # both mates are handed over as one comma-separated argument
        fastqs = ",".join([fastq1, fastq2])
        job_memory = "1.5G"

        statement = '''
        cgat fastq2tpm
        --log=%(out_dir)s.log
        --program=sailfish
        --method=quant
        --paired-end
        --gene-gtf=%(geneset)s
        --index-file=%(index_dir)s
        --output-directory=%(out_dir)s
        --library-type=%(sailfish_library)s
        --threads=%(job_threads)s
        %(fastqs)s'''

        P.run(statement)
コード例 #15
0
def buildCoverageStats(infile, outfile):
    '''Generate coverage statistics for regions of interest from a
       bed file using Picard.

    A modified baits file ``<infile>_temp_baits_final.bed`` is built
    first: the header of `infile` (``samtools view -H``) followed by
    the baits file without its first two lines, reformatted to the
    columns 1, 2, 3, "+", 4.  The file is handed to
    ``PipelineMappingQC.buildPicardCoverageStats`` and zapped
    afterwards.

    Arguments
    ---------
    infile : string
        Alignment file; its header is read with ``samtools view -H``.
    outfile : string
        Output filename passed to ``buildPicardCoverageStats``.
    '''

    # TS check whether this is always required or specific to current baits
    # file

    # baits file requires modification to make picard accept it
    # this is performed before CalculateHsMetrics
    to_cluster = USECLUSTER
    baits = PARAMS["roi_baits"]
    modified_baits = infile + "_temp_baits_final.bed"
    # NOTE(review): regions is assigned but not used below - the
    # modified baits file is passed twice to buildPicardCoverageStats.
    # Confirm whether the regions file was meant to be used.
    regions = PARAMS["roi_regions"]
    statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt;
                awk 'NR>2' %(baits)s |
                awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}'
                > %(infile)s_temp_baits.bed;
                cat  %(infile)s_temp_header.txt %(infile)s_temp_baits.bed
                > %(modified_baits)s;
                rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt
                '''
    P.run(statement)

    PipelineMappingQC.buildPicardCoverageStats(infile, outfile, modified_baits,
                                               modified_baits)

    # NOTE(review): not reached if one of the steps above raises, in
    # which case the modified baits file is left in place.
    IOTools.zap_file(modified_baits)
コード例 #16
0
def buildCpGBed(infile, outfile):
    '''
    Output a :term:`BED` file that contains the location of all CpGs
    in the input genome using `CGAT` script `fasta2bed`.

    The output is compressed with ``bgzip`` and then indexed with
    ``tabix -p bed`` in a second command.  Both commands are run with
    ``PARAMS["job_highmemory"]`` as memory setting.  The function
    itself returns nothing.

    Parameters
    ----------
    infile: str
      Genome :term:`fasta` file, read by ``fasta2bed`` from standard
      input.
    outfile: str
      A bgzip-compressed :term:`BED` format file containing location
      of CpGs across the genome.
    '''

    # NOTE(review): %(infile)s / %(outfile)s are presumably substituted
    # by P.run from this function's locals.
    statement = '''
    cgat fasta2bed
        --method=cpg
        --log=%(outfile)s.log
    < %(infile)s
    | bgzip
    > %(outfile)s
    '''

    P.run(statement, job_memory=PARAMS["job_highmemory"])

    # index the compressed bed file
    statement = '''
    tabix -p bed %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_highmemory"])
コード例 #17
0
def run_test(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively: one ``cgatflow ... make
    <target>`` command is executed per target, in the order given.
    Every command is run with ``ignore_errors=True``, so a failing
    target does not stop the remaining ones.

    Arguments
    ---------
    infile : string
        Not used inside this function.
    outfile : string
        Log file of the test, expected to end in ``.log``.  The name
        without ``.log`` is the track; the pipeline is run inside
        ``<track>.dir`` and writes its log, stdout and stderr to
        `outfile`, ``<outfile>.stdout`` and ``<outfile>.stderr``.
    '''

    track = P.snip(outfile, ".log")
    # default pipeline name: the track without its "test_" prefix
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])

    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # NOTE: "to_cluster = False" used to be set here so that the test
    # mirrors a pipeline started from the head node; it was disabled
    # in the original code and is left disabled.

    # %% placeholders survive the target substitution below and are
    # left for P.run to fill in.
    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    # The multi-target branch used to collect the commands in a list
    # and then run the undefined name ``statement``, which raised
    # UnboundLocalError.  Running one command per target covers the
    # single-target and the multi-target case alike.
    for pipeline_target in pipeline_targets:
        statement = template_statement % pipeline_target
        P.run(statement, ignore_errors=True, job_memory="unlimited")
コード例 #18
0
def buildTranscriptRegions(infile, outfile):
    """build a :term:`bed` file of transcript regions.

    The gene set is passed through ``gtf2gtf --method=join-exons``
    (presumably joining the exons of each transcript into one
    interval - confirm against gtf2gtf) and converted to :term:`bed`
    format with ``gff2bed``.

    The name column of the :term:`bed` file is set to the
    `transcript_id`.  The output is gzip-compressed.

    Arguments
    ---------
    infile : string
       Input filename with geneset in :term:`gtf` format.
    outfile : string
       Output filename with genomic regions in :term:`bed` format.

    """

    # NOTE(review): %(infile)s / %(outfile)s are presumably substituted
    # by P.run from this function's locals.
    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf --method=join-exons
    --log=%(outfile)s.log
    | cgat gff2bed --is-gtf
    --set-name=transcript_id
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s """
    P.run(statement, job_memory=PARAMS["job_memory"])
コード例 #19
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def buildOverlapWithEnsembl(infile, outfile, filename_bed):
    '''compute overlap of genes with intervals.

    If `filename_bed` has multiple tracks the overlap will
    be computed for each track separately.

    The output is a tab-separated table with pairs of
    overlapping features between `infile` and `filename_bed`.

    The gene set is merged with ``gtf2gtf --method=merge-transcripts``
    and converted to :term:`bed` format before it is compared with
    `filename_bed` by ``bed2graph``.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output file in :term:`tsv` format.
    filename_bed : string
       Filename in :term:`bed` format.
    '''

    # "-" makes bed2graph read the converted gene set from the pipe.
    # NOTE(review): the placeholders, including %(filename_bed)s, are
    # presumably substituted by P.run from this function's locals.
    statement = '''gunzip
        < %(infile)s
        | cgat gtf2gtf --method=merge-transcripts
        | cgat gff2bed --is-gtf
        | cgat bed2graph
            --output-section=name
            --log=%(outfile)s.log
            - %(filename_bed)s
        > %(outfile)s
    '''
    P.run(statement)
コード例 #20
0
def buildGeneRegions(infile, outfile):
    """build a :term:`bed` file of regions spanning whole gene models.

    This method outputs a single interval spanning the genomic region
    that covers all transcripts within a particular gene.

    The name column of the :term:`bed` file is set to the `gene_id`.
    The output is gzip-compressed.

    Arguments
    ---------
    infile : string
       Input filename with geneset in :term:`gtf` format.
    outfile : string
       Output filename with genomic regions in :term:`bed` format.

    """
    # NOTE(review): %(infile)s / %(outfile)s are presumably substituted
    # by P.run from this function's locals.
    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=merge-transcripts
    --log=%(outfile)s.log
    | cgat gff2bed --is-gtf --set-name=gene_id
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s """
    P.run(statement, job_memory=PARAMS["job_memory"])
コード例 #21
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def buildPeptideFasta(infile, outfile):
    '''index an ENSEMBL peptide FASTA file

    The descriptions in the fasta file are truncated at the
    first space to contain only the sequence identifier.

    The database name handed to ``index_fasta`` is `outfile` without
    its last six characters; the standard output of the command goes
    to ``<database name>.log``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        indexed file in :term:`fasta` format
    '''
    # NOTE(review): this removes len(".fasta") characters without
    # checking that outfile actually ends in ".fasta".
    dbname = outfile[:-len(".fasta")]

    # NOTE(review): in the perl step ``if ("^>")`` tests a non-empty
    # string literal, not a regular expression, so the truncation is
    # applied to every line rather than only to header lines.
    statement = '''gunzip
    < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | cgat index_fasta
       --force-output
    %(dbname)s -
    > %(dbname)s.log
    '''

    P.run(statement)
コード例 #22
0
def buildTranscriptTTS(infile, outfile):
    """build a :term:`bed` file with transcription termination sites.

    This method outputs all transcription termination sites within a
    geneset. The transcription termination site is derived from the
    most downstream coordinate of each transcript.

    The sites are computed by ``gtf2gff --method=tts`` with
    ``--promotor-size=1``.

    The name column of the :term:`bed` file is set to the
    `transcript_id`.

    Arguments
    ---------
    infile : string
       Input filename with geneset in :term:`gtf` format.
    outfile : string
       Output filename with genomic regions in :term:`bed` format.

    """

    # NOTE(review): genome_dir and genome are not defined in this
    # function; presumably P.run resolves the placeholders from the
    # locals and the pipeline configuration.
    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf --method=join-exons
    --log=%(outfile)s.log
    | cgat gtf2gff --method=tts
    --promotor-size=1
    --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log
    | cgat gff2bed --is-gtf --set-name=transcript_id
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s """
    P.run(statement, job_memory=PARAMS["job_memory"])
コード例 #23
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def loadGeneStats(infile, outfile):
    """compute and load gene statistics to database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    The table is loaded with an index on ``gene_id`` and with
    ``gene_name`` mapped to a string column.

    Parameters
    ----------
    infile : string
        A :term:`gtf` file which is output from :meth:`buildGenes`
    outfile : string
        A log file. The table name is derived from `outfile`.
        e.g. bam_stats.load
    """

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--map=gene_name:str")

    # NOTE(review): genome_dir and genome are not defined in this
    # function; presumably P.run resolves the placeholders from the
    # locals and the pipeline configuration.
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
コード例 #24
0
def mapReadsWithTophatFusion(infiles, outfile):
    '''map reads from .fastq or .sra files and find candidate fusions

    A list with known splice junctions is expected from the rnaseq
    pipeline; when ``PARAMS['tophatfusion_reference_junctions']`` is
    set it is added to the tophat options as ``--raw-juncs``.

    Arguments
    ---------
    infiles : string
        Input file, handed to ``TopHat_fusion.build`` as
        ``(infiles,)``.
    outfile : string
        Output filename handed to ``TopHat_fusion.build``.
    '''

    # NOTE(review): job_threads, job_options, to_cluster,
    # tophat_options and tophatfusion_options are not passed on
    # explicitly; presumably the statement builder / P.run read them
    # from this function's locals - do not rename them.
    job_threads = PARAMS["tophat_threads"]

    if "--butterfly-search" in PARAMS["tophat_options"]:
        # for butterfly search - require insane amount of
        # RAM.
        # job_options has no earlier value in this function, so it is
        # assigned here; the former ``+=`` raised UnboundLocalError.
        job_options = " -l mem_free=50G"

    to_cluster = USECLUSTER
    m = PipelineMapping.TopHat_fusion()
    infile = infiles

    # if a file of reference junctions, as generated by the rnaseq pipeline,
    # has been specified in the ini, then pass this to tophat-fusion
    if PARAMS['tophatfusion_reference_junctions'] is not None:
        reffile = PARAMS['tophatfusion_reference_junctions']
        tophat_options = PARAMS["tophat_options"] + \
            " --raw-juncs %(reffile)s" % locals()

    tophatfusion_options = PARAMS["tophatfusion_options"]
    statement = m.build((infile,), outfile)
    P.run()
コード例 #25
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def buildLincRNAExons(infile, outfile):
    """output LincRNA portion of ENSEMBL geneset.

    Take all features from a :term:`gtf` file that are of feature type
    ``exon`` and that are annotated as a lincrna biotype.

    The input is read through ``gunzip`` and the output is written
    through ``gzip``. Both ``gtf2gtf`` calls log to ``<outfile>.log``.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output filename in :term:`gtf` format.

    """

    # NOTE(review): %(infile)s / %(outfile)s are presumably substituted
    # by P.run from this function's locals.
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=filter --filter-method=lincrna
    --log=%(outfile)s.log
    | awk '$3 == "exon"'
    | cgat gtf2gtf
    --method=remove-duplicates --duplicate-feature=gene
    --log=%(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run(statement)
コード例 #26
0
def postprocessTopHatFusion(infiles, outfile):
    ''' Uses tophat-fusion-post to postprocess and filter all of the
        tophat-fusion output into one report. Slow as it is not
        cluster aware and spawns a large number of blast tasks.

        After the run the directory ``tophatfusion_out`` is moved into
        ``export``; a ``tophatfusion_out`` directory already present
        there is deleted first.

        Arguments
        ---------
        infiles : list
            Not used inside this function.
        outfile : string
            Not used inside this function.
    '''

    # NOTE(review): job_options, job_threads and statement are not
    # passed on explicitly; P.run() presumably reads them from this
    # function's locals.
    job_options = ' -l mem_free=50G'
    job_threads = PARAMS["tophatfusion_postthreads"]

    statement = '''
                  module load bio/tophatfusion;
                  tophat-fusion-post -p %(tophatfusion_postthreads)s
                                   %(tophatfusion_postoptions)s
                                   %(bowtie_index_dir)s/%(genome)s
                  &> tophatfusion_out.log
                '''

    P.run()

    # put the results in the export directory.

    # if the export directory doesn't exist, create it
    if not os.path.exists('export'):
        os.mkdir('export')

    # otherwise if it does, then delete any out directory that is
    # already there.  The path removed must be the one tested: the
    # original removed 'export/tophatfusion.out' (dot instead of
    # underscore), so the stale directory was never deleted.
    elif os.path.isdir('export/tophatfusion_out'):
        shutil.rmtree('export/tophatfusion_out')

    shutil.move('tophatfusion_out', 'export')
コード例 #27
0
ファイル: PipelineGeneset.py プロジェクト: logust79/cgat-flow
def loadTranscriptStats(infile, outfile):
    '''compute and load transcript properties into database.

    The method calls :doc:`gtf2table` with ``--reporter=transcripts``
    and the following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    The table is loaded with indices on ``gene_id`` and
    ``transcript_id`` and with ``gene_id`` mapped to a string column.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--add-index=transcript_id "
                                            "--map=gene_id:str")

    # The trailing backslashes are Python line continuations inside the
    # string literal: they join the gunzip/gtf2table lines into one.
    # NOTE(review): genome_dir and genome are not defined in this
    # function; presumably P.run resolves the placeholders from the
    # locals and the pipeline configuration.
    statement = '''
    gunzip < %(infile)s |\
    cgat gtf2table \
          --log=%(outfile)s.log \
          --genome=%(genome_dir)s/%(genome)s \
          --reporter=transcripts \
          --counter=position \
          --counter=length \
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
コード例 #28
0
def edgeR_analysis(infile, outfile):
    ''' Runs the edgeR GLM analysis script using each of the input
    files as the exon counts, and each of the *design.tsv*  as the
    designs.

    One ``R CMD BATCH`` call of ``edgeR-GLM.R`` is made per design
    file matching ``PARAMS['edgeR_design']``.  Log files are written
    to the directory ``edgeR_output``, which is created if missing.

    Options to the script are stored in the ini.

    Arguments
    ---------
    infile : string
        Exon counts file ending in ``.exon_counts.tsv.gz``.
    outfile : string
        Not used inside this function.
    '''

    to_cluster = USECLUSTER
    R_path = PARAMS['R_path']
    R_script_dir = PARAMS['R_scriptdir']
    # was ``PARAM['R_args']`` - an undefined name
    R_args = PARAMS['R_args']
    # was the literal list ['edgeR_args'], which ended up verbatim in
    # the command line instead of the configured arguments
    edgeR_args = PARAMS['edgeR_args']
    # NOTE(review): the original called a bare ``snip``; P.snip is used
    # here on the assumption that P is the CGAT Pipeline module.
    baseName = P.snip(infile, ".exon_counts.tsv.gz") + "_"

    if not os.path.exists('edgeR_output'):
        os.mkdir('edgeR_output')

    for design in glob.iglob(PARAMS['edgeR_design']):

        # The script path placeholder was ``%(R_scriptdir)/``: it had no
        # conversion type and did not match the local R_script_dir, so
        # the ``% locals()`` formatting could not succeed.
        statement = ''' %(R_path)s CMD BATCH %(R_args)s
        \"--args count_file='%(infile)s'
        conditions_file='%(design)s'
        out_file='%(baseName)s'
        %(edgeR_args)s \"
        %(R_script_dir)s/edgeR-GLM.R
        edgeR_output/%(infile)s.edgeR.log ''' % locals()
        P.run()
コード例 #29
0
def runAnnotator(tmpdir,
                 outfile,
                 tmpannotations,
                 tmpsegments,
                 tmpworkspaces,
                 tmpsynonyms,
                 options=""):
    '''Run the java Annotator application.

    Arguments
    ---------
    tmpdir : string
        Not used inside this function.
    outfile : string
        File the standard output of Annotator is redirected to.
    tmpannotations : string
        Passed as ``-annotation``.
    tmpsegments : string
        Passed as ``-segments``.
    tmpworkspaces : list
        The first entry is passed as ``-workspace``, every further
        entry as ``-workspace<n>`` with n counting from 2.
    tmpsynonyms : string
        Passed as ``-synonyms`` if not empty.
    options : string
        Additional command line options, inserted verbatim.
    '''

    # NOTE(review): these job settings are not passed to P.run
    # explicitly; presumably they are read from the locals by name.
    to_cluster = True
    job_queue = "medium_jobs.q"
    job_options = "-l mem_free=8000M"

    workspace_args = [
        " -workspace %s" % workspace if x == 0
        else " -workspace%i %s" % (x + 1, workspace)
        for x, workspace in enumerate(tmpworkspaces)
    ]
    if tmpsynonyms:
        workspace_args.append(" -synonyms %s" % tmpsynonyms)
    workspace_options = "".join(workspace_args)

    statement = '''
    java -Xmx8000M -cp %(annotator_dir)s/commons-cli-1.0.jar:%(annotator_dir)s/Annotator.jar app.Annotator
    -verbose 4 -iterations %(annotator_iterations)s
    -annotation %(tmpannotations)s
    -segments %(tmpsegments)s
    -bucketsize %(annotator_bucketsize)i
    %(workspace_options)s
    %(options)s
    > %(outfile)s '''

    P.run(statement)
コード例 #30
0
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths

    The table name is derived from `outfile`, with every ``/``
    replaced by ``_`` before the conversion and ``_stats`` appended
    afterwards.  `infile` is loaded with ``cgat csv2db``; the log goes
    to ``<outfile>.log`` and the standard output to `outfile`.

    Arguments
    ---------
    infile : string
        Table to be loaded, read by ``csv2db`` from standard input.
    outfile : string
        File receiving the standard output of ``csv2db``.
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    # NOTE(review): P.run() is called without arguments; presumably it
    # reads ``statement`` and the placeholder values from this
    # function's locals - confirm against the Pipeline module in use.
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()