コード例 #1
0
def buildWorkSpace(outfile, workspace):
    '''write genomic workspace.

    Selects a shell pipeline according to ``workspace`` (matched
    case-insensitively, the name is lower-cased first), redirects its
    output to ``outfile`` and hands over to ``P.run()``.

    Workspaces accepted by the code below:

    genomic
       output of ``cgat index2bed`` for ``genome``
       (``genome`` is checked with ``P.checkParameter``)
    intergenic, intronic, cds
       rows of the gzipped ``enrichment_regions`` file whose third column
       equals the workspace name in lower or upper case
       (``enrichment_regions`` is checked)
    unknown
       rows of ``enrichment_regions`` whose third column is ``intronic``
       or ``intergenic``, piped through ``cgat gff2enrichment``
    known
       rows of ``enrichment_regions`` whose third column is ``CDS`` or
       matches ``UTR`` or ``flank``, piped through ``cgat gff2enrichment``
    alignable
       built from the gzipped ``enrichment_alignment`` file, grouped by
       its 10th column (``enrichment_alignment`` is checked)
    gene-territories
       ``enrichment_geneterritories`` piped through ``cgat gff2enrichment``
       (``enrichment_geneterritories`` is checked)
    mappable
       ``enrichment_mappability`` converted with ``cgat bed2gff`` and piped
       through ``cgat gff2enrichment`` (``enrichment_mappability`` is
       checked)

    Every pipeline also interpolates ``enrichment_remove_pattern``; rows or
    names matching it are filtered out by the respective command.

    Any other workspace name raises ``ValueError``.

    NOTE(review): earlier documentation listed ``exonic`` and
    ``geneterritories`` workspaces, ``annotator_*`` parameter names and a
    ``gc_control`` option; none of these is handled by this function.

    NOTE(review): ``statement`` and ``job_options`` are never passed to
    ``P.run()`` explicitly. Presumably ``P.run()`` reads them, together
    with the values for the ``%(...)s`` placeholders, from this function's
    local variables and the pipeline configuration -- confirm before
    renaming or removing any local name.
    '''

    # Not referenced again in this function; presumably picked up by P.run()
    # as the cluster job options -- TODO confirm.
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()

    if workspace == "genomic":
        P.checkParameter("genome")

        statement = '''
        cgat index2bed
        --genome=%(genome)s
        --log=%(outfile)s.log
        --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace in ("intergenic", "intronic", "cds"):

        P.checkParameter("enrichment_regions")

        # The awk filter below accepts the feature name in either case,
        # e.g. "cds" as well as "CDS".
        workspace_upper = workspace.upper()

        # awk first prints a track line, then for each matching row prints
        # columns 1, 4 (minus one) and 5, tab-separated. Presumably this turns
        # 1-based GFF starts into 0-based BED starts -- TODO confirm.
        statement = '''
        gunzip < %(enrichment_regions)s
        | awk 'BEGIN {printf("track name=%(workspace)s\\n"); }
               ($3 == "%(workspace)s"
               || $3 == "%(workspace_upper)s")
               && !( $1 ~ /%(enrichment_remove_pattern)s/)
               { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }'
        > %(outfile)s
        '''
    elif workspace == "unknown":

        # NOTE(review): unlike the branch above, enrichment_regions is read
        # here without gunzip -- verify which form the file actually has.
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "intronic" || $3 == "intergenic" )'
        < %(enrichment_regions)s
        | cgat gff2enrichment
                --section=workspace
                --max-length=0
                --log=%(outfile)s.log
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "known":
        # NOTE(review): enrichment_regions is read without gunzip here as
        # well -- verify against the "intergenic"/"intronic"/"cds" branch.
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)'
        < %(enrichment_regions)s
        | cgat gff2enrichment
                --section=workspace
                --max-length=0
                --log=%(outfile)s.log
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "alignable":

        # Rows are sorted by column 10 so that awk can start a new
        # "##Work <name>" line whenever column 10 changes and append one
        # "(col12,col13)" pair per row. The trailing backslashes inside the
        # literal are Python line continuations, so the awk program reaches
        # the shell as a single line.
        P.checkParameter("enrichment_alignment")
        statement = '''gunzip
        < %(enrichment_alignment)s
        | sort -k10,10
        | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(outfile)s
        '''

    elif workspace == "gene-territories":

        P.checkParameter("enrichment_geneterritories")
        statement = '''
        cgat gff2enrichment \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s'
        < %(enrichment_geneterritories)s > %(outfile)s
        '''

    elif workspace == "mappable":

        P.checkParameter("enrichment_mappability")
        statement = '''
        cgat bed2gff < %(enrichment_mappability)s
        | cgat gff2enrichment
                --section=workspace
                --max-length=0
                --log=%(outfile)s.log
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''
    else:
        raise ValueError("unknown workspace '%s'" % workspace)

    # Executes the statement selected above (see the review note in the
    # docstring on how P.run() presumably locates it).
    P.run()
コード例 #2
0
def buildWorkSpace(outfile, workspace):
    '''write genomic workspace.

    Selects a shell pipeline according to ``workspace`` (matched
    case-insensitively, the name is lower-cased first), redirects its
    output to ``outfile`` and hands over to ``P.run()``.

    Workspaces accepted by the code below:

    genomic
       output of ``index2bed.py`` for ``genome``
       (``genome`` is checked with ``P.checkParameter``)
    intergenic, intronic, cds
       rows of the gzipped ``enrichment_regions`` file whose third column
       equals the workspace name in lower or upper case
       (``enrichment_regions`` is checked)
    unknown
       rows of ``enrichment_regions`` whose third column is ``intronic``
       or ``intergenic``, piped through ``gff2enrichment.py``
    known
       rows of ``enrichment_regions`` whose third column is ``CDS`` or
       matches ``UTR`` or ``flank``, piped through ``gff2enrichment.py``
    alignable
       built from the gzipped ``enrichment_alignment`` file, grouped by
       its 10th column (``enrichment_alignment`` is checked)
    gene-territories
       ``enrichment_geneterritories`` piped through ``gff2enrichment.py``
       (``enrichment_geneterritories`` is checked)
    mappable
       ``enrichment_mappability`` converted with ``bed2gff.py`` and piped
       through ``gff2enrichment.py`` (``enrichment_mappability`` is checked)

    The helper scripts are invoked as ``python %(scriptsdir)s/<name>.py``.
    Every pipeline also interpolates ``enrichment_remove_pattern``; rows or
    names matching it are filtered out by the respective command.

    Any other workspace name raises ``P.PipelineError``.

    NOTE(review): earlier documentation listed ``exonic`` and
    ``geneterritories`` workspaces, ``annotator_*`` parameter names and a
    ``gc_control`` option; none of these is handled by this function.

    NOTE(review): ``statement`` and ``job_options`` are never passed to
    ``P.run()`` explicitly. Presumably ``P.run()`` reads them, together
    with the values for the ``%(...)s`` placeholders (including
    ``scriptsdir``), from this function's local variables and the pipeline
    configuration -- confirm before renaming or removing any local name.
    '''

    # Not referenced again in this function; presumably picked up by P.run()
    # as the cluster job options -- TODO confirm.
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()

    if workspace == "genomic":
        P.checkParameter("genome")

        statement = '''
        python %(scriptsdir)s/index2bed.py
        --genome=%(genome)s
        --log=%(outfile)s.log
        --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace in ("intergenic", "intronic", "cds"):

        P.checkParameter("enrichment_regions")

        # The awk filter below accepts the feature name in either case,
        # e.g. "cds" as well as "CDS".
        workspace_upper = workspace.upper()

        # awk first prints a track line, then for each matching row prints
        # columns 1, 4 (minus one) and 5, tab-separated. Presumably this turns
        # 1-based GFF starts into 0-based BED starts -- TODO confirm.
        statement = '''
        gunzip < %(enrichment_regions)s
        | awk 'BEGIN {printf("track name=%(workspace)s\\n"); }
               ($3 == "%(workspace)s"
               || $3 == "%(workspace_upper)s")
               && !( $1 ~ /%(enrichment_remove_pattern)s/)
               { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }'
        > %(outfile)s
        '''
    elif workspace == "unknown":

        # NOTE(review): unlike the branch above, enrichment_regions is read
        # here without gunzip -- verify which form the file actually has.
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "intronic" || $3 == "intergenic" )'
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py
                --section=workspace
                --max-length=0
                --log=%(outfile)s.log
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "known":
        # NOTE(review): enrichment_regions is read without gunzip here as
        # well -- verify against the "intergenic"/"intronic"/"cds" branch.
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)'
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py
                --section=workspace
                --max-length=0
                --log=%(outfile)s.log
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "alignable":

        # Rows are sorted by column 10 so that awk can start a new
        # "##Work <name>" line whenever column 10 changes and append one
        # "(col12,col13)" pair per row. The trailing backslashes inside the
        # literal are Python line continuations, so the awk program reaches
        # the shell as a single line.
        P.checkParameter("enrichment_alignment")
        statement = '''gunzip
        < %(enrichment_alignment)s
        | sort -k10,10
        | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(outfile)s
        '''

    elif workspace == "gene-territories":

        P.checkParameter("enrichment_geneterritories")
        statement = '''
        python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s'
        < %(enrichment_geneterritories)s > %(outfile)s
        '''

    elif workspace == "mappable":

        P.checkParameter("enrichment_mappability")
        statement = '''
        python %(scriptsdir)s/bed2gff.py < %(enrichment_mappability)s
        | python %(scriptsdir)s/gff2enrichment.py
                --section=workspace
                --max-length=0
                --log=%(outfile)s.log
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''
    else:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    # Executes the statement selected above (see the review note in the
    # docstring on how P.run() presumably locates it).
    P.run()