def buildWorkSpace(outfile, workspace):
    '''write a genomic workspace of the requested type to *outfile*.

    A shell statement is assembled for the chosen workspace and then
    executed through ``P.run()``. The workspace name is lower-cased
    before it is matched, so it is case-insensitive.

    Available workspaces (the parameter handed to
    ``P.checkParameter`` for each one is given in brackets):

    genomic
       built with ``index2bed.py`` from the genome [``genome``]
    intergenic, intronic, cds
       rows of the regions file whose third column equals the
       workspace name in lower or upper case [``enrichment_regions``]
    unknown
       rows of the regions file labelled ``intronic`` or
       ``intergenic`` [``enrichment_regions``]
    known
       rows of the regions file labelled ``CDS`` or matching ``UTR``
       or ``flank`` [``enrichment_regions``]
    alignable
       built from the alignment file [``enrichment_alignment``]
    gene-territories
       built from the gene territories file
       [``enrichment_geneterritories``]
    mappable
       built from the mappability file [``enrichment_mappability``]

    ``enrichment_remove_pattern`` is interpolated into every statement
    as the pattern of sequence names to drop.

    Arguments
    ---------
    outfile : string
        File the statement redirects its output to. ``<outfile>.log``
        is passed as log file to the helper scripts.
    workspace : string
        Name of the workspace to build (see above).

    Raises
    ------
    P.PipelineError
        If *workspace* is not one of the names listed above.
    '''

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement``, ``to_cluster``, ``job_options`` and the
    # values interpolated into the statement from this function's local
    # variables. Keep these names; they are not dead code.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()

    if workspace == "genomic":
        P.checkParameter("genome")
        statement = (
            " python %(scriptsdir)s/index2bed.py"
            " --genome=%(genome)s"
            " --log=%(outfile)s.log"
            " --remove-regex='%(enrichment_remove_pattern)s'"
            " > %(outfile)s "
        )

    elif workspace in ("intergenic", "intronic", "cds"):
        P.checkParameter("enrichment_regions")
        # the awk filter accepts the label in either case
        workspace_upper = workspace.upper()
        # NOTE(review): this branch reads enrichment_regions through
        # gunzip, while "unknown" and "known" below read the same file
        # uncompressed - confirm which one is intended.
        statement = (
            " gunzip < %(enrichment_regions)s"
            ''' | awk 'BEGIN {printf("track name=%(workspace)s\\n"); }'''
            ''' ($3 == "%(workspace)s" || $3 == "%(workspace_upper)s")'''
            " && !( $1 ~ /%(enrichment_remove_pattern)s/)"
            ''' { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }' > %(outfile)s '''
        )

    elif workspace == "unknown":
        P.checkParameter("enrichment_regions")
        statement = (
            ''' awk '($3 == "intronic" || $3 == "intergenic" )' < %(enrichment_regions)s'''
            " | python %(scriptsdir)s/gff2enrichment.py"
            " --section=workspace"
            " --max-length=0"
            " --log=%(outfile)s.log"
            " --remove-regex='%(enrichment_remove_pattern)s'"
            " > %(outfile)s "
        )

    elif workspace == "known":
        P.checkParameter("enrichment_regions")
        statement = (
            ''' awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' < %(enrichment_regions)s'''
            " | python %(scriptsdir)s/gff2enrichment.py"
            " --section=workspace"
            " --max-length=0"
            " --log=%(outfile)s.log"
            " --remove-regex='%(enrichment_remove_pattern)s'"
            " > %(outfile)s "
        )

    elif workspace == "alignable":
        P.checkParameter("enrichment_alignment")
        # The statement is assembled from adjacent literals instead of
        # backslash continuations: a backslash followed by a blank is
        # not a continuation and would end up inside the awk program.
        statement = (
            "gunzip < %(enrichment_alignment)s"
            " | sort -k10,10"
            " | awk '$10 !~ /%(enrichment_remove_pattern)s/"
            ''' {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;}'''
            ''' printf("\\t(%%i,%%i)", $12,$13); };'''
            ''' END {printf ("\\n");}' > %(outfile)s '''
        )

    elif workspace == "gene-territories":
        P.checkParameter("enrichment_geneterritories")
        # No backslash continuations here either: "\ " would be passed
        # to the shell as an escaped blank glued to the next option.
        statement = (
            " python %(scriptsdir)s/gff2enrichment.py"
            " --section=workspace"
            " --max-length=0"
            " --log=%(outfile)s.log"
            " --remove-regex='%(enrichment_remove_pattern)s'"
            " < %(enrichment_geneterritories)s"
            " > %(outfile)s "
        )

    elif workspace == "mappable":
        P.checkParameter("enrichment_mappability")
        statement = (
            " python %(scriptsdir)s/bed2gff.py < %(enrichment_mappability)s"
            " | python %(scriptsdir)s/gff2enrichment.py"
            " --section=workspace"
            " --max-length=0"
            " --log=%(outfile)s.log"
            " --remove-regex='%(enrichment_remove_pattern)s'"
            " > %(outfile)s "
        )

    else:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    P.run()
def buildAnnotatorWorkSpace(tmpdir, outfile, workspaces=("genomic", ), gc_control=False):
    '''write one annotator workspace file per requested workspace.

    For every entry of *workspaces* a shell statement is assembled and
    executed through ``P.run()``; its output is redirected to
    ``<tmpdir>/workspace_<name>``.

    Available workspaces (the parameter handed to
    ``P.checkParameter`` for each one is given in brackets):

    all
       is ignored - no statement is run and no file is recorded
    genomic
       built with ``index2gff.py`` from the genome [``genome``]
    intergenic, intronic, CDS
       rows of the regions file whose third column equals the
       workspace name exactly [``annotator_regions``]
    unknown
       rows of the regions file labelled ``intronic`` or
       ``intergenic`` [``annotator_regions``]
    known
       rows of the regions file labelled ``CDS`` or matching ``UTR``
       or ``flank`` [``annotator_regions``]
    alignable
       built from the alignment file [``annotator_alignment``]
    gene-territories
       built from the gene territories file
       [``annotator_geneterritories``]
    mappable
       built from the mappability file [``annotator_mappability``]

    If ``gc_control`` is True, the parameter ``annotator_gc_workspace``
    has to be set: its value becomes the first entry of the returned
    workspace list and ``<annotator_gc_workspace>.synonyms`` is
    returned as synonyms file.

    Arguments
    ---------
    tmpdir : string
        Directory in which the workspace files are created.
    outfile : string
        Only used to derive the log file name ``<outfile>.log``.
    workspaces : sequence of strings
        Names of the workspaces to build (see above).
    gc_control : bool
        Whether to add the GC workspace and its synonyms file.

    Returns
    -------
    tmpworkspaces : list
        Paths of the workspace files, in the order they were requested
        (preceded by the GC workspace if ``gc_control`` is set).
    tmpsynonyms : string or None
        Path of the synonyms file, None without ``gc_control``.

    Raises
    ------
    P.PipelineError
        If an entry of *workspaces* is not one of the names above.
    '''

    # NOTE(review): an identical definition of this function appears
    # again further down in this file; being later, that one is the
    # definition bound at import time.

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement``, ``to_cluster``, ``job_options`` and the
    # values interpolated into the statement from this function's local
    # variables. Keep these names; they are not dead code.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpworkspaces = []

    if gc_control:
        P.checkParameter("annotator_gc_workspace")
        tmpsynonyms = PARAMS["annotator_gc_workspace"] + ".synonyms"
        tmpworkspaces.append(PARAMS["annotator_gc_workspace"])
    else:
        tmpsynonyms = None

    for workspace in workspaces:
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "all":
            continue

        elif workspace == "genomic":
            P.checkParameter("genome")
            statement = (
                " python %(scriptsdir)s/index2gff.py"
                " --genome=%(genome)s"
                " --log=%(outfile)s.log"
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace in ("intergenic", "intronic", "CDS"):
            P.checkParameter("annotator_regions")
            statement = (
                ''' awk '$3 == "%(workspace)s"' < %(annotator_regions)s'''
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace == "unknown":
            P.checkParameter("annotator_regions")
            statement = (
                ''' awk '($3 == "intronic" || $3 == "intergenic" )' < %(annotator_regions)s'''
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace == "known":
            P.checkParameter("annotator_regions")
            statement = (
                ''' awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' < %(annotator_regions)s'''
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace == "alignable":
            P.checkParameter("annotator_alignment")
            # The statement is assembled from adjacent literals instead
            # of backslash continuations: a backslash followed by a
            # blank is not a continuation and would end up inside the
            # awk program.
            statement = (
                "gunzip < %(annotator_alignment)s"
                " | sort -k10,10"
                " | awk '$10 !~ /%(annotator_remove_pattern)s/"
                ''' {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;}'''
                ''' printf("\\t(%%i,%%i)", $12,$13); };'''
                ''' END {printf ("\\n");}' > %(tmpworkspace)s '''
            )

        elif workspace == "gene-territories":
            P.checkParameter("annotator_geneterritories")
            # No backslash continuations here either: "\ " would be
            # passed to the shell as an escaped blank glued to the next
            # option.
            statement = (
                " python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " < %(annotator_geneterritories)s"
                " > %(tmpworkspace)s "
            )

        elif workspace == "mappable":
            P.checkParameter("annotator_mappability")
            statement = (
                " python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s"
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        else:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        # run the statement for this workspace before moving on, then
        # record the file it wrote
        P.run()
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms
def buildAnnotatorWorkSpace(tmpdir, outfile, workspaces=("genomic",), gc_control=False):
    '''write one annotator workspace file per requested workspace.

    For every entry of *workspaces* a shell statement is assembled and
    executed through ``P.run()``; its output is redirected to
    ``<tmpdir>/workspace_<name>``.

    Available workspaces (the parameter handed to
    ``P.checkParameter`` for each one is given in brackets):

    all
       is ignored - no statement is run and no file is recorded
    genomic
       built with ``index2gff.py`` from the genome [``genome``]
    intergenic, intronic, CDS
       rows of the regions file whose third column equals the
       workspace name exactly [``annotator_regions``]
    unknown
       rows of the regions file labelled ``intronic`` or
       ``intergenic`` [``annotator_regions``]
    known
       rows of the regions file labelled ``CDS`` or matching ``UTR``
       or ``flank`` [``annotator_regions``]
    alignable
       built from the alignment file [``annotator_alignment``]
    gene-territories
       built from the gene territories file
       [``annotator_geneterritories``]
    mappable
       built from the mappability file [``annotator_mappability``]

    If ``gc_control`` is True, the parameter ``annotator_gc_workspace``
    has to be set: its value becomes the first entry of the returned
    workspace list and ``<annotator_gc_workspace>.synonyms`` is
    returned as synonyms file.

    Arguments
    ---------
    tmpdir : string
        Directory in which the workspace files are created.
    outfile : string
        Only used to derive the log file name ``<outfile>.log``.
    workspaces : sequence of strings
        Names of the workspaces to build (see above).
    gc_control : bool
        Whether to add the GC workspace and its synonyms file.

    Returns
    -------
    tmpworkspaces : list
        Paths of the workspace files, in the order they were requested
        (preceded by the GC workspace if ``gc_control`` is set).
    tmpsynonyms : string or None
        Path of the synonyms file, None without ``gc_control``.

    Raises
    ------
    P.PipelineError
        If an entry of *workspaces* is not one of the names above.
    '''

    # NOTE(review): an identical definition of this function appears
    # earlier in this file; this later one replaces it at import time.

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement``, ``to_cluster``, ``job_options`` and the
    # values interpolated into the statement from this function's local
    # variables. Keep these names; they are not dead code.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpworkspaces = []

    if gc_control:
        P.checkParameter("annotator_gc_workspace")
        tmpsynonyms = PARAMS["annotator_gc_workspace"] + ".synonyms"
        tmpworkspaces.append(PARAMS["annotator_gc_workspace"])
    else:
        tmpsynonyms = None

    for workspace in workspaces:
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "all":
            continue

        elif workspace == "genomic":
            P.checkParameter("genome")
            statement = (
                " python %(scriptsdir)s/index2gff.py"
                " --genome=%(genome)s"
                " --log=%(outfile)s.log"
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace in ("intergenic", "intronic", "CDS"):
            P.checkParameter("annotator_regions")
            statement = (
                ''' awk '$3 == "%(workspace)s"' < %(annotator_regions)s'''
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace == "unknown":
            P.checkParameter("annotator_regions")
            statement = (
                ''' awk '($3 == "intronic" || $3 == "intergenic" )' < %(annotator_regions)s'''
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace == "known":
            P.checkParameter("annotator_regions")
            statement = (
                ''' awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' < %(annotator_regions)s'''
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        elif workspace == "alignable":
            P.checkParameter("annotator_alignment")
            # The statement is assembled from adjacent literals instead
            # of backslash continuations: a backslash followed by a
            # blank is not a continuation and would end up inside the
            # awk program.
            statement = (
                "gunzip < %(annotator_alignment)s"
                " | sort -k10,10"
                " | awk '$10 !~ /%(annotator_remove_pattern)s/"
                ''' {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;}'''
                ''' printf("\\t(%%i,%%i)", $12,$13); };'''
                ''' END {printf ("\\n");}' > %(tmpworkspace)s '''
            )

        elif workspace == "gene-territories":
            P.checkParameter("annotator_geneterritories")
            # No backslash continuations here either: "\ " would be
            # passed to the shell as an escaped blank glued to the next
            # option.
            statement = (
                " python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " < %(annotator_geneterritories)s"
                " > %(tmpworkspace)s "
            )

        elif workspace == "mappable":
            P.checkParameter("annotator_mappability")
            statement = (
                " python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s"
                " | python %(scriptsdir)s/gff2annotator2tsv.py"
                " --section=workspace"
                " --max-length=0"
                " --log=%(outfile)s.log"
                " --remove-regex='%(annotator_remove_pattern)s'"
                " > %(tmpworkspace)s "
            )

        else:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        # run the statement for this workspace before moving on, then
        # record the file it wrote
        P.run()
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms