def buildWorkSpace(outfile, workspace):
    '''Write the workspace called *workspace* to *outfile*.

    The workspace name is matched case-insensitively. Accepted names
    and the configuration parameter each one checks are:

    genomic
        output of ``cgat index2bed`` on the genome
        (requires ``genome``)
    intergenic, intronic, cds
        records of the gzipped ``enrichment_regions`` file whose third
        column equals the workspace name (lower or upper case) and
        whose first column does not match
        ``enrichment_remove_pattern``; written after a ``track`` header
        line as column 1, column 4 minus one and column 5
        (requires ``enrichment_regions``)
    unknown
        records labelled ``intronic`` or ``intergenic``, passed through
        ``cgat gff2enrichment`` (requires ``enrichment_regions``)
    known
        records labelled ``CDS`` or matching ``UTR`` or ``flank``,
        passed through ``cgat gff2enrichment``
        (requires ``enrichment_regions``)
    alignable
        ``##Work`` lines built from columns 10, 12 and 13 of the
        gzipped alignment file (requires ``enrichment_alignment``)
    gene-territories
        output of ``cgat gff2enrichment`` on the gene territories file
        (requires ``enrichment_geneterritories``)
    mappable
        the mappability file converted with ``cgat bed2gff`` and passed
        through ``cgat gff2enrichment``
        (requires ``enrichment_mappability``)

    Every statement also interpolates ``enrichment_remove_pattern``.

    Arguments
    ---------
    outfile : str
        Path the shell statement redirects its output to. Most
        statements also write a log to ``<outfile>.log``.
    workspace : str
        Name of the workspace to build (see above).

    Raises
    ------
    ValueError
        If *workspace* is not one of the names listed above.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `job_options` and the %(name)s interpolation
    # values (outfile, workspace, workspace_upper, ...) from this
    # function's local variables - confirm before renaming any local.
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()

    # The shell templates below use backslash-newline continuations, so
    # the resulting strings contain no newlines inside an awk program or
    # a pipeline.
    if workspace == "genomic":
        P.checkParameter("genome")
        statement = '''cgat index2bed \
            --genome=%(genome)s \
            --log=%(outfile)s.log \
            --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    elif workspace in ("intergenic", "intronic", "cds"):
        P.checkParameter("enrichment_regions")
        # Interpolated into the awk filter so that both spellings of the
        # feature label are accepted.
        workspace_upper = workspace.upper()
        statement = '''gunzip < %(enrichment_regions)s \
            | awk 'BEGIN {printf("track name=%(workspace)s\\n"); } \
                   ($3 == "%(workspace)s" || $3 == "%(workspace_upper)s") \
                   && !( $1 ~ /%(enrichment_remove_pattern)s/) \
                   { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }' \
            > %(outfile)s'''

    elif workspace == "unknown":
        P.checkParameter("enrichment_regions")
        # NOTE(review): this branch and the "known" branch read
        # enrichment_regions without gunzip, unlike the branch above -
        # confirm whether the file is compressed.
        statement = '''awk '($3 == "intronic" || $3 == "intergenic" )' \
            < %(enrichment_regions)s \
            | cgat gff2enrichment \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    elif workspace == "known":
        P.checkParameter("enrichment_regions")
        statement = '''awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' \
            < %(enrichment_regions)s \
            | cgat gff2enrichment \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    elif workspace == "alignable":
        P.checkParameter("enrichment_alignment")
        # Sorting on column 10 groups the records so that awk can start
        # a new "##Work" line whenever the value in that column changes.
        statement = '''gunzip < %(enrichment_alignment)s \
            | sort -k10,10 \
            | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
                   {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
                    printf("\\t(%%i,%%i)", $12,$13); }; \
                   END {printf ("\\n");}' \
            > %(outfile)s'''

    elif workspace == "gene-territories":
        P.checkParameter("enrichment_geneterritories")
        statement = '''cgat gff2enrichment \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
            --remove-regex='%(enrichment_remove_pattern)s' \
            < %(enrichment_geneterritories)s \
            > %(outfile)s'''

    elif workspace == "mappable":
        P.checkParameter("enrichment_mappability")
        statement = '''cgat bed2gff < %(enrichment_mappability)s \
            | cgat gff2enrichment \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    else:
        raise ValueError("unknown workspace '%s'" % workspace)

    P.run()
def buildWorkSpace(outfile, workspace):
    '''Write the workspace called *workspace* to *outfile*.

    The workspace name is matched case-insensitively. Accepted names
    and the configuration parameter each one checks are:

    genomic
        output of ``index2bed.py`` on the genome
        (requires ``genome``)
    intergenic, intronic, cds
        records of the gzipped ``enrichment_regions`` file whose third
        column equals the workspace name (lower or upper case) and
        whose first column does not match
        ``enrichment_remove_pattern``; written after a ``track`` header
        line as column 1, column 4 minus one and column 5
        (requires ``enrichment_regions``)
    unknown
        records labelled ``intronic`` or ``intergenic``, passed through
        ``gff2enrichment.py`` (requires ``enrichment_regions``)
    known
        records labelled ``CDS`` or matching ``UTR`` or ``flank``,
        passed through ``gff2enrichment.py``
        (requires ``enrichment_regions``)
    alignable
        ``##Work`` lines built from columns 10, 12 and 13 of the
        gzipped alignment file (requires ``enrichment_alignment``)
    gene-territories
        output of ``gff2enrichment.py`` on the gene territories file
        (requires ``enrichment_geneterritories``)
    mappable
        the mappability file converted with ``bed2gff.py`` and passed
        through ``gff2enrichment.py``
        (requires ``enrichment_mappability``)

    The helper scripts are run as ``python %(scriptsdir)s/<script>``,
    so those statements also interpolate ``scriptsdir``. Every
    statement interpolates ``enrichment_remove_pattern``.

    Arguments
    ---------
    outfile : str
        Path the shell statement redirects its output to. Most
        statements also write a log to ``<outfile>.log``.
    workspace : str
        Name of the workspace to build (see above).

    Raises
    ------
    P.PipelineError
        If *workspace* is not one of the names listed above.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `job_options` and the %(name)s interpolation
    # values (outfile, workspace, workspace_upper, ...) from this
    # function's local variables - confirm before renaming any local.
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()

    # The shell templates below use backslash-newline continuations, so
    # the resulting strings contain no newlines inside an awk program or
    # a pipeline.
    if workspace == "genomic":
        P.checkParameter("genome")
        statement = '''python %(scriptsdir)s/index2bed.py \
            --genome=%(genome)s \
            --log=%(outfile)s.log \
            --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    elif workspace in ("intergenic", "intronic", "cds"):
        P.checkParameter("enrichment_regions")
        # Interpolated into the awk filter so that both spellings of the
        # feature label are accepted.
        workspace_upper = workspace.upper()
        statement = '''gunzip < %(enrichment_regions)s \
            | awk 'BEGIN {printf("track name=%(workspace)s\\n"); } \
                   ($3 == "%(workspace)s" || $3 == "%(workspace_upper)s") \
                   && !( $1 ~ /%(enrichment_remove_pattern)s/) \
                   { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }' \
            > %(outfile)s'''

    elif workspace == "unknown":
        P.checkParameter("enrichment_regions")
        # NOTE(review): this branch and the "known" branch read
        # enrichment_regions without gunzip, unlike the branch above -
        # confirm whether the file is compressed.
        statement = '''awk '($3 == "intronic" || $3 == "intergenic" )' \
            < %(enrichment_regions)s \
            | python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    elif workspace == "known":
        P.checkParameter("enrichment_regions")
        statement = '''awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' \
            < %(enrichment_regions)s \
            | python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    elif workspace == "alignable":
        P.checkParameter("enrichment_alignment")
        # Sorting on column 10 groups the records so that awk can start
        # a new "##Work" line whenever the value in that column changes.
        statement = '''gunzip < %(enrichment_alignment)s \
            | sort -k10,10 \
            | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
                   {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
                    printf("\\t(%%i,%%i)", $12,$13); }; \
                   END {printf ("\\n");}' \
            > %(outfile)s'''

    elif workspace == "gene-territories":
        P.checkParameter("enrichment_geneterritories")
        statement = '''python %(scriptsdir)s/gff2enrichment.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
            --remove-regex='%(enrichment_remove_pattern)s' \
            < %(enrichment_geneterritories)s \
            > %(outfile)s'''

    elif workspace == "mappable":
        P.checkParameter("enrichment_mappability")
        statement = '''python %(scriptsdir)s/bed2gff.py \
            < %(enrichment_mappability)s \
            | python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s' \
            > %(outfile)s'''

    else:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    P.run()