def buildWorkSpace(outfile, workspace):
    '''Build the workspace named ``workspace`` and write it to ``outfile``.

    The workspace name is lower-cased and selects one shell pipeline. The
    pipeline is stored in the local variable ``statement`` and executed by
    calling ``P.run()`` without arguments.

    Recognised workspace names, and the parameter that is verified with
    ``P.checkParameter`` before the pipeline is built:

    genomic
       output of ``index2bed.py`` for ``genome`` (checks ``genome``)
    intergenic, intronic, cds
       records of the gunzipped ``enrichment_regions`` file whose third
       column equals the workspace name in lower or upper case; written
       as a ``track name=...`` line followed by tab-separated
       ``$1, $4-1, $5`` (checks ``enrichment_regions``)
    unknown
       records of ``enrichment_regions`` whose third column is
       ``intronic`` or ``intergenic``, piped through
       ``gff2enrichment.py`` (checks ``enrichment_regions``)
    known
       records of ``enrichment_regions`` whose third column is ``CDS`` or
       matches ``UTR`` or ``flank``, piped through ``gff2enrichment.py``
       (checks ``enrichment_regions``)
    alignable
       the gunzipped ``enrichment_alignment`` file sorted on column 10;
       one ``##Work`` line is started per distinct column-10 value and a
       ``(col12,col13)`` pair is appended for every record
       (checks ``enrichment_alignment``)
    gene-territories
       ``enrichment_geneterritories`` passed through
       ``gff2enrichment.py`` (checks ``enrichment_geneterritories``)
    mappable
       ``enrichment_mappability`` converted with ``bed2gff.py`` and then
       passed through ``gff2enrichment.py``
       (checks ``enrichment_mappability``)

    ``enrichment_remove_pattern`` is handed to the helper scripts as
    ``--remove-regex`` or, in the awk based pipelines, used as a regular
    expression to drop records by column 1 (intergenic/intronic/cds) or
    column 10 (alignable).

    Arguments
    ---------
    outfile
       Path substituted for ``outfile`` in the statement: the pipeline
       output is redirected to it and, where a ``--log`` option is
       given, the log goes to ``<outfile>.log``.
    workspace
       Name of the workspace to build (matched case-insensitively).

    Raises
    ------
    P.PipelineError
       If ``workspace`` is none of the names listed above.
    '''

    # NOTE(review): these two locals are never passed anywhere explicitly;
    # presumably P.run() picks them up (together with ``statement`` and the
    # values for the %(...)s placeholders) from this frame - confirm against
    # the pipeline module before renaming or moving any local variable.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    # Matching below is done on the lower-cased name only.
    workspace = workspace.lower()

    if workspace == "genomic":
        P.checkParameter("genome")

        statement = '''
        python %(scriptsdir)s/index2bed.py 
                --genome=%(genome)s 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace in ("intergenic", "intronic", "cds"):

        P.checkParameter("enrichment_regions")

        # The awk filter accepts the feature name in lower or upper case.
        workspace_upper = workspace.upper()

        statement = '''
        gunzip < %(enrichment_regions)s 
        | awk 'BEGIN {printf("track name=%(workspace)s\\n"); } 
               ($3 == "%(workspace)s" 
               || $3 == "%(workspace_upper)s") 
               && !( $1 ~ /%(enrichment_remove_pattern)s/)
               { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }'
        > %(outfile)s
        '''
    elif workspace == "unknown":

        P.checkParameter("enrichment_regions")
        # NOTE(review): this branch and the "known" branch read
        # enrichment_regions without gunzip, whereas the branch above
        # decompresses the same file - confirm which one is intended.
        statement = '''
        awk '($3 == "intronic" || $3 == "intergenic" )' 
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "known":
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' 
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "alignable":

        P.checkParameter("enrichment_alignment")
        # The trailing backslashes are Python line continuations inside the
        # string literal, so the awk program ends up on a single line.
        statement = '''gunzip
        < %(enrichment_alignment)s 
        | sort -k10,10 
        | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(outfile)s
        '''

    elif workspace == "gene-territories":

        P.checkParameter("enrichment_geneterritories")
        statement = '''
        python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s'
        < %(enrichment_geneterritories)s > %(outfile)s
        '''

    elif workspace == "mappable":

        P.checkParameter("enrichment_mappability")
        statement = '''
        python %(scriptsdir)s/bed2gff.py < %(enrichment_mappability)s 
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''
    else:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    # Execute the pipeline selected above.
    P.run()
# Example #2
# 0
def buildAnnotatorWorkSpace(tmpdir,
                            outfile,
                            workspaces=("genomic", ),
                            gc_control=False):
    '''Build one workspace file per entry of ``workspaces`` in ``tmpdir``.

    Each workspace is written to ``<tmpdir>/workspace_<name>`` by a shell
    pipeline that is stored in the local variable ``statement`` and
    executed by calling ``P.run()`` without arguments. Names are matched
    exactly (case-sensitive).

    Recognised workspace names, and the parameter that is verified with
    ``P.checkParameter`` before the pipeline is built:

    all
       is ignored (no file is built and nothing is added to the result)
    genomic
       output of ``index2gff.py`` for ``genome`` piped through
       ``gff2annotator2tsv.py`` (checks ``genome``)
    intergenic, intronic, CDS
       records of ``annotator_regions`` whose third column equals the
       workspace name (checks ``annotator_regions``)
    unknown
       records of ``annotator_regions`` whose third column is
       ``intronic`` or ``intergenic`` (checks ``annotator_regions``)
    known
       records of ``annotator_regions`` whose third column is ``CDS`` or
       matches ``UTR`` or ``flank`` (checks ``annotator_regions``)
    alignable
       the gunzipped ``annotator_alignment`` file sorted on column 10;
       one ``##Work`` line is started per distinct column-10 value and a
       ``(col12,col13)`` pair is appended for every record
       (checks ``annotator_alignment``)
    gene-territories
       ``annotator_geneterritories`` passed through
       ``gff2annotator2tsv.py`` (checks ``annotator_geneterritories``)
    mappable
       ``annotator_mappability`` converted with ``bed2gff.py`` and then
       passed through ``gff2annotator2tsv.py``
       (checks ``annotator_mappability``)

    ``annotator_remove_pattern`` is handed to ``gff2annotator2tsv.py`` as
    ``--remove-regex`` or, for ``alignable``, used as an awk regular
    expression to drop records by column 10.

    If ``gc_control`` is true, the parameter ``annotator_gc_workspace``
    must be set: its value becomes the first entry of the returned
    workspace list and the same value with ``.synonyms`` appended is
    returned as the synonyms file name.

    Arguments
    ---------
    tmpdir
       Directory in which the workspace files are created.
    outfile
       Only used to name the log file (``<outfile>.log``) of the helper
       scripts.
    workspaces
       Iterable of workspace names to build.
    gc_control
       See above.

    Returns
    -------
    tuple
       ``(tmpworkspaces, tmpsynonyms)``: the list of workspace file names
       and the synonyms file name (``None`` unless ``gc_control`` is
       true).

    Raises
    ------
    P.PipelineError
       If a workspace name is none of the names listed above.
    '''

    # NOTE(review): these two locals are never passed anywhere explicitly;
    # presumably P.run() picks them up (together with ``statement`` and the
    # values for the %(...)s placeholders) from this frame - confirm against
    # the pipeline module before renaming or moving any local variable.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpworkspaces = []

    if gc_control:
        P.checkParameter("annotator_gc_workspace")

        # The configured GC workspace is used as is; only its name and the
        # derived synonyms file name are recorded here.
        tmpsynonyms = PARAMS["annotator_gc_workspace"] + ".synonyms"
        tmpworkspaces.append(PARAMS["annotator_gc_workspace"])
    else:
        tmpsynonyms = None

    for workspace in workspaces:

        # Output file of this workspace; referenced as %(tmpworkspace)s in
        # the statements below.
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "all":
            continue
        elif workspace == "genomic":
            P.checkParameter("genome")

            statement = '''
            python %(scriptsdir)s/index2gff.py 
                    --genome=%(genome)s 
                    --log=%(outfile)s.log 
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace in ("intergenic", "intronic", "CDS"):
            P.checkParameter("annotator_regions")
            statement = '''
            awk '$3 == "%(workspace)s"' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''
        elif workspace == "unknown":
            P.checkParameter("annotator_regions")
            statement = '''
            awk '($3 == "intronic" || $3 == "intergenic" )' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace == "known":
            P.checkParameter("annotator_regions")
            statement = '''
            awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace == "alignable":
            P.checkParameter("annotator_alignment")
            # The trailing backslashes are Python line continuations inside
            # the string literal, so the awk program ends up on one line.
            statement = '''gunzip
            < %(annotator_alignment)s 
            | sort -k10,10 
            | awk '$10 !~ /%(annotator_remove_pattern)s/ \
		{if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
		printf("\\t(%%i,%%i)", $12,$13); }; \
	    END {printf ("\\n");}'\
	    > %(tmpworkspace)s
            '''

        elif workspace == "gene-territories":
            P.checkParameter("annotator_geneterritories")
            statement = '''
            python %(scriptsdir)s/gff2annotator2tsv.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
                    --remove-regex='%(annotator_remove_pattern)s'
            < %(annotator_geneterritories)s > %(tmpworkspace)s
            '''

        elif workspace == "mappable":
            P.checkParameter("annotator_mappability")
            statement = '''
            python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s 
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''
        else:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        # Execute the pipeline for this workspace and record its output file.
        P.run()
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms
# Example #3
# 0
def buildWorkSpace(outfile, workspace):
    '''Build the workspace named ``workspace`` and write it to ``outfile``.

    The workspace name is lower-cased and selects one shell pipeline. The
    pipeline is stored in the local variable ``statement`` and executed by
    calling ``P.run()`` without arguments.

    Recognised workspace names, and the parameter that is verified with
    ``P.checkParameter`` before the pipeline is built:

    genomic
       output of ``index2bed.py`` for ``genome`` (checks ``genome``)
    intergenic, intronic, cds
       records of the gunzipped ``enrichment_regions`` file whose third
       column equals the workspace name in lower or upper case; written
       as a ``track name=...`` line followed by tab-separated
       ``$1, $4-1, $5`` (checks ``enrichment_regions``)
    unknown
       records of ``enrichment_regions`` whose third column is
       ``intronic`` or ``intergenic``, piped through
       ``gff2enrichment.py`` (checks ``enrichment_regions``)
    known
       records of ``enrichment_regions`` whose third column is ``CDS`` or
       matches ``UTR`` or ``flank``, piped through ``gff2enrichment.py``
       (checks ``enrichment_regions``)
    alignable
       the gunzipped ``enrichment_alignment`` file sorted on column 10;
       one ``##Work`` line is started per distinct column-10 value and a
       ``(col12,col13)`` pair is appended for every record
       (checks ``enrichment_alignment``)
    gene-territories
       ``enrichment_geneterritories`` passed through
       ``gff2enrichment.py`` (checks ``enrichment_geneterritories``)
    mappable
       ``enrichment_mappability`` converted with ``bed2gff.py`` and then
       passed through ``gff2enrichment.py``
       (checks ``enrichment_mappability``)

    ``enrichment_remove_pattern`` is handed to the helper scripts as
    ``--remove-regex`` or, in the awk based pipelines, used as a regular
    expression to drop records by column 1 (intergenic/intronic/cds) or
    column 10 (alignable).

    Arguments
    ---------
    outfile
       Path substituted for ``outfile`` in the statement: the pipeline
       output is redirected to it and, where a ``--log`` option is
       given, the log goes to ``<outfile>.log``.
    workspace
       Name of the workspace to build (matched case-insensitively).

    Raises
    ------
    P.PipelineError
       If ``workspace`` is none of the names listed above.
    '''

    # NOTE(review): these two locals are never passed anywhere explicitly;
    # presumably P.run() picks them up (together with ``statement`` and the
    # values for the %(...)s placeholders) from this frame - confirm against
    # the pipeline module before renaming or moving any local variable.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    # Matching below is done on the lower-cased name only.
    workspace = workspace.lower()

    if workspace == "genomic":
        P.checkParameter("genome")

        statement = '''
        python %(scriptsdir)s/index2bed.py 
                --genome=%(genome)s 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace in ("intergenic", "intronic", "cds"):

        P.checkParameter("enrichment_regions")

        # The awk filter accepts the feature name in lower or upper case.
        workspace_upper = workspace.upper()

        statement = '''
        gunzip < %(enrichment_regions)s 
        | awk 'BEGIN {printf("track name=%(workspace)s\\n"); } 
               ($3 == "%(workspace)s" 
               || $3 == "%(workspace_upper)s") 
               && !( $1 ~ /%(enrichment_remove_pattern)s/)
               { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }'
        > %(outfile)s
        '''
    elif workspace == "unknown":

        P.checkParameter("enrichment_regions")
        # NOTE(review): this branch and the "known" branch read
        # enrichment_regions without gunzip, whereas the branch above
        # decompresses the same file - confirm which one is intended.
        statement = '''
        awk '($3 == "intronic" || $3 == "intergenic" )' 
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "known":
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' 
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "alignable":

        P.checkParameter("enrichment_alignment")
        # The trailing backslashes are Python line continuations inside the
        # string literal, so the awk program ends up on a single line.
        statement = '''gunzip
        < %(enrichment_alignment)s 
        | sort -k10,10 
        | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(outfile)s
        '''

    elif workspace == "gene-territories":

        P.checkParameter("enrichment_geneterritories")
        statement = '''
        python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s'
        < %(enrichment_geneterritories)s > %(outfile)s
        '''

    elif workspace == "mappable":

        P.checkParameter("enrichment_mappability")
        statement = '''
        python %(scriptsdir)s/bed2gff.py < %(enrichment_mappability)s 
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''
    else:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    # Execute the pipeline selected above.
    P.run()
def buildAnnotatorWorkSpace(tmpdir,
                            outfile,
                            workspaces=("genomic",),
                            gc_control = False):
    '''Build one workspace file per entry of ``workspaces`` in ``tmpdir``.

    Each workspace is written to ``<tmpdir>/workspace_<name>`` by a shell
    pipeline that is stored in the local variable ``statement`` and
    executed by calling ``P.run()`` without arguments. Names are matched
    exactly (case-sensitive).

    Recognised workspace names, and the parameter that is verified with
    ``P.checkParameter`` before the pipeline is built:

    all
       is ignored (no file is built and nothing is added to the result)
    genomic
       output of ``index2gff.py`` for ``genome`` piped through
       ``gff2annotator2tsv.py`` (checks ``genome``)
    intergenic, intronic, CDS
       records of ``annotator_regions`` whose third column equals the
       workspace name (checks ``annotator_regions``)
    unknown
       records of ``annotator_regions`` whose third column is
       ``intronic`` or ``intergenic`` (checks ``annotator_regions``)
    known
       records of ``annotator_regions`` whose third column is ``CDS`` or
       matches ``UTR`` or ``flank`` (checks ``annotator_regions``)
    alignable
       the gunzipped ``annotator_alignment`` file sorted on column 10;
       one ``##Work`` line is started per distinct column-10 value and a
       ``(col12,col13)`` pair is appended for every record
       (checks ``annotator_alignment``)
    gene-territories
       ``annotator_geneterritories`` passed through
       ``gff2annotator2tsv.py`` (checks ``annotator_geneterritories``)
    mappable
       ``annotator_mappability`` converted with ``bed2gff.py`` and then
       passed through ``gff2annotator2tsv.py``
       (checks ``annotator_mappability``)

    ``annotator_remove_pattern`` is handed to ``gff2annotator2tsv.py`` as
    ``--remove-regex`` or, for ``alignable``, used as an awk regular
    expression to drop records by column 10.

    If ``gc_control`` is true, the parameter ``annotator_gc_workspace``
    must be set: its value becomes the first entry of the returned
    workspace list and the same value with ``.synonyms`` appended is
    returned as the synonyms file name.

    Arguments
    ---------
    tmpdir
       Directory in which the workspace files are created.
    outfile
       Only used to name the log file (``<outfile>.log``) of the helper
       scripts.
    workspaces
       Iterable of workspace names to build.
    gc_control
       See above.

    Returns
    -------
    tuple
       ``(tmpworkspaces, tmpsynonyms)``: the list of workspace file names
       and the synonyms file name (``None`` unless ``gc_control`` is
       true).

    Raises
    ------
    P.PipelineError
       If a workspace name is none of the names listed above.
    '''

    # NOTE(review): these two locals are never passed anywhere explicitly;
    # presumably P.run() picks them up (together with ``statement`` and the
    # values for the %(...)s placeholders) from this frame - confirm against
    # the pipeline module before renaming or moving any local variable.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpworkspaces = []

    if gc_control:
        P.checkParameter("annotator_gc_workspace")

        # The configured GC workspace is used as is; only its name and the
        # derived synonyms file name are recorded here.
        tmpsynonyms = PARAMS["annotator_gc_workspace"] + ".synonyms"
        tmpworkspaces.append(PARAMS["annotator_gc_workspace"])
    else:
        tmpsynonyms = None

    for workspace in workspaces:

        # Output file of this workspace; referenced as %(tmpworkspace)s in
        # the statements below.
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "all":
            continue
        elif workspace == "genomic":
            P.checkParameter("genome")

            statement = '''
            python %(scriptsdir)s/index2gff.py 
                    --genome=%(genome)s 
                    --log=%(outfile)s.log 
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace in ("intergenic", "intronic", "CDS"):
            P.checkParameter("annotator_regions")
            statement = '''
            awk '$3 == "%(workspace)s"' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''
        elif workspace == "unknown":
            P.checkParameter("annotator_regions")
            statement = '''
            awk '($3 == "intronic" || $3 == "intergenic" )' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace == "known":
            P.checkParameter("annotator_regions")
            statement = '''
            awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace == "alignable":
            P.checkParameter("annotator_alignment")
            # The trailing backslashes are Python line continuations inside
            # the string literal, so the awk program ends up on one line.
            statement = '''gunzip
            < %(annotator_alignment)s 
            | sort -k10,10 
            | awk '$10 !~ /%(annotator_remove_pattern)s/ \
		{if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
		printf("\\t(%%i,%%i)", $12,$13); }; \
	    END {printf ("\\n");}'\
	    > %(tmpworkspace)s
            '''

        elif workspace == "gene-territories":
            P.checkParameter("annotator_geneterritories")
            statement = '''
            python %(scriptsdir)s/gff2annotator2tsv.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
                    --remove-regex='%(annotator_remove_pattern)s'
            < %(annotator_geneterritories)s > %(tmpworkspace)s
            '''

        elif workspace == "mappable":
            P.checkParameter("annotator_mappability")
            statement = '''
            python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s 
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''
        else:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        # Execute the pipeline for this workspace and record its output file.
        P.run()
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms