def importRNAAnnotationFromUCSC(outfile):
    """Download the repetitive RNA types from UCSC.

    The repeat classes requested are read from ``PARAMS["ucsc_rnatypes"]``
    and the value of ``PARAMS["ncbi_remove_contigs"]`` is forwarded as
    ``remove_contigs_regex``.

    Arguments
    ---------
    outfile :
        Output target forwarded unchanged to
        ``PipelineGtfsubset.getRepeatDataFromUCSC``.
    """
    # Resolve the callee first so the evaluation order matches a direct call.
    fetch_repeats = PipelineGtfsubset.getRepeatDataFromUCSC
    ucsc_handle = connectToUCSC()
    rna_classes = P.asList(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ncbi_remove_contigs"]
    fetch_repeats(
        dbhandle=ucsc_handle,
        repclasses=rna_classes,
        outfile=outfile,
        remove_contigs_regex=contig_filter,
    )
def importRepeatsFromUCSC(outfile):
    """Download the UCSC repeat types named in the configuration.

    The repeat classes are taken from ``PARAMS["ucsc_repeattypes"]``.

    Arguments
    ---------
    outfile :
        Output target forwarded unchanged to
        ``PipelineGtfsubset.getRepeatDataFromUCSC``.
    """
    fetch_repeats = PipelineGtfsubset.getRepeatDataFromUCSC
    # Values are evaluated top to bottom, i.e. the connection is opened
    # before the configuration entry is read.
    request = {
        "dbhandle": connectToUCSC(),
        "repclasses": P.asList(PARAMS["ucsc_repeattypes"]),
        "outfile": outfile,
    }
    fetch_repeats(**request)
def importRNAAnnotationFromUCSC(outfile):
    """Download the repetitive RNA types from UCSC.

    Configuration entries used:

    * ``PARAMS["ucsc_rnatypes"]`` - repeat classes to request
    * ``PARAMS["ncbi_remove_contigs"]`` - forwarded as
      ``remove_contigs_regex``
    * ``PARAMS["job_memory"]`` - forwarded as ``job_memory``

    Arguments
    ---------
    outfile :
        Output target forwarded unchanged to
        ``PipelineGtfsubset.getRepeatDataFromUCSC``.
    """
    fetch_repeats = PipelineGtfsubset.getRepeatDataFromUCSC
    # Dict values are evaluated in order, matching a direct keyword call.
    request = {
        "dbhandle": connectToUCSC(),
        "repclasses": P.asList(PARAMS["ucsc_rnatypes"]),
        "outfile": outfile,
        "remove_contigs_regex": PARAMS["ncbi_remove_contigs"],
        "job_memory": PARAMS["job_memory"],
    }
    fetch_repeats(**request)
def importRepeatsFromUCSC(outfile):
    """Download the UCSC repeat types named in the configuration.

    The repeat classes come from ``PARAMS["ucsc_repeattypes"]`` and
    ``PARAMS["job_memory"]`` is forwarded as ``job_memory``.

    Arguments
    ---------
    outfile :
        Output target forwarded unchanged to
        ``PipelineGtfsubset.getRepeatDataFromUCSC``.
    """
    # Resolve the callee first so the evaluation order matches a direct call.
    fetch_repeats = PipelineGtfsubset.getRepeatDataFromUCSC
    ucsc_handle = connectToUCSC()
    repeat_classes = P.asList(PARAMS["ucsc_repeattypes"])
    memory_request = PARAMS["job_memory"]
    fetch_repeats(
        dbhandle=ucsc_handle,
        repclasses=repeat_classes,
        outfile=outfile,
        job_memory=memory_request,
    )
def buildNonCodingExonTranscript(infile, outfile):
    '''Output the non-coding exon features from an ENSEMBL gene set.

    Filters a :term:`gtf` file on two columns: the feature column must
    be ``exon`` "and not" have the gene biotype ``protein_coding``.

    Arguments
    ---------
    infile : from ruffus
       ENSEMBL geneset, filename named in pipeline.yml
    outfile : from ruffus
       Output filename named in pipeline.yml

    Notes
    -----
    The two column names are not arguments; they are read from
    ``PARAMS['ensembl_cgat_feature']`` and
    ``PARAMS['ensembl_cgat_gene_biotype']``.
    '''
    gtf_subset = PipelineGtfsubset.SubsetGTF(infile)

    feature_column = PARAMS['ensembl_cgat_feature']
    biotype_column = PARAMS['ensembl_cgat_gene_biotype']

    gtf_subset.filterGTF(
        outfile,
        [feature_column, biotype_column],
        ["exon", "protein_coding"],
        operators="and not")
def buildCdsTranscript(infile, outfile):
    '''Output the CDS features from an ENSEMBL gene set.

    Filters a :term:`gtf` file for records whose feature type is
    ``CDS``.

    Note - there is deliberately no filter on gene_biotype because
    some of the CDS are classified as polymorphic_pseudogene.

    Arguments
    ---------
    infile : from ruffus
       ENSEMBL geneset, filename named in pipeline.yml
    outfile : from ruffus
       Output filename named in pipeline.yml

    Notes
    -----
    The column to filter on is not an argument; it is read from
    ``PARAMS['ensembl_cgat_feature']``.
    '''
    gtf_subset = PipelineGtfsubset.SubsetGTF(infile)
    feature_column = PARAMS['ensembl_cgat_feature']

    # Single-column filter, so no combining operator is supplied.
    gtf_subset.filterGTF(outfile, feature_column, ["CDS"], operators=None)
def buildLincRNAExonTranscript(infile, outfile):
    '''Output the lincRNA exon features from an ENSEMBL gene set.

    Filters a :term:`gtf` file on two columns: the feature column must
    be ``exon`` "and" the gene biotype must be ``lincRNA``.

    Arguments
    ---------
    infile : from ruffus
       ENSEMBL geneset, filename named in pipeline.yml
    outfile : from ruffus
       Output filename named in pipeline.yml

    Notes
    -----
    The two column names are not arguments; they are read from
    ``PARAMS['ensembl_cgat_feature']`` and
    ``PARAMS['ensembl_cgat_gene_biotype']``.
    '''
    gtf_subset = PipelineGtfsubset.SubsetGTF(infile)

    columns = [
        PARAMS[key]
        for key in ('ensembl_cgat_feature', 'ensembl_cgat_gene_biotype')
    ]
    wanted = ["exon", "lincRNA"]

    gtf_subset.filterGTF(outfile, columns, wanted, operators="and")
def buildmiRNonPrimaryTranscript(infile, outfile):
    '''Subset a miRBase annotation GFF3 file to its ``miRNA`` records.

    The GFF3 file can be downloaded from miRBase. Make sure the
    annotation matches the genome build that you are using.

    Annotations labelled "miRNA" are selected, which subsets all of
    the non-primary transcripts.

    Arguments
    ---------
    infile :
       GFF3 file handed to ``PipelineGtfsubset.SubsetGFF3``.
    outfile :
       Output target handed to ``filterGFF3``.

    Notes
    -----
    The column to filter on is read from
    ``PARAMS['ensembl_cgat_feature']``.
    '''
    gff3_subset = PipelineGtfsubset.SubsetGFF3(infile)
    feature_column = PARAMS['ensembl_cgat_feature']

    gff3_subset.filterGFF3(outfile, feature_column, ["miRNA"])
def buildExonTranscript(infile, outfile):
    '''Output the exon features from an ENSEMBL gene set.

    Filters a :term:`gtf` file for records whose feature type is
    ``exon``.

    Arguments
    ---------
    infile : from ruffus
       ENSEMBL geneset, filename named in the pipeline configuration
    outfile : from ruffus
       Output filename named in the pipeline configuration

    Notes
    -----
    The column to filter on is not an argument; it is read from
    ``PARAMS['ensembl_cgat_feature']``.
    '''
    exon_only = ["exon"]

    gtf_subset = PipelineGtfsubset.SubsetGTF(infile)
    feature_column = PARAMS['ensembl_cgat_feature']

    # Single-column filter, so no combining operator is supplied.
    gtf_subset.filterGTF(outfile, feature_column, exon_only, operators=None)
def buildGenomicContext(infiles, outfile):
    """Delegate to ``PipelineGtfsubset.buildGenomicContext``.

    ``infiles`` and ``outfile`` are forwarded unchanged, and
    ``PARAMS["job_highmemory"]`` is passed as ``job_memory``.
    """
    build_context = PipelineGtfsubset.buildGenomicContext
    high_memory = PARAMS["job_highmemory"]
    build_context(infiles, outfile, job_memory=high_memory)
def buildGenomicContext(infiles, outfile):
    """Delegate to ``PipelineGtfsubset.buildGenomicContext``.

    ``infiles`` and ``outfile`` are forwarded unchanged; nothing is
    returned.
    """
    build_context = PipelineGtfsubset.buildGenomicContext
    build_context(infiles, outfile)
def connectToUCSC():
    """Return the result of ``PipelineGtfsubset.connectToUCSC``.

    Host, user and database are read from ``PARAMS["ucsc_host"]``,
    ``PARAMS["ucsc_user"]`` and ``PARAMS["ucsc_database"]``.
    """
    open_connection = PipelineGtfsubset.connectToUCSC
    settings = {
        "host": PARAMS["ucsc_host"],
        "user": PARAMS["ucsc_user"],
        "database": PARAMS["ucsc_database"],
    }
    return open_connection(**settings)
def connectToUCSC():
    """Return the result of ``PipelineGtfsubset.connectToUCSC``.

    The connection settings are taken from the ``ucsc_host``,
    ``ucsc_user`` and ``ucsc_database`` entries of ``PARAMS``.
    """
    # Resolve the callee first so the evaluation order matches a direct call.
    open_connection = PipelineGtfsubset.connectToUCSC
    ucsc_host = PARAMS["ucsc_host"]
    ucsc_user = PARAMS["ucsc_user"]
    ucsc_database = PARAMS["ucsc_database"]
    handle = open_connection(
        host=ucsc_host,
        user=ucsc_user,
        database=ucsc_database,
    )
    return handle
def loadGeneInformation(infile, outfile):
    '''Load the transcript set.

    Delegates to ``PipelineGtfsubset.loadGeneInformation`` and passes
    ``PARAMS["job_highmemory"]`` as ``job_memory``.
    '''
    load_genes = PipelineGtfsubset.loadGeneInformation
    high_memory = PARAMS["job_highmemory"]
    load_genes(infile, outfile, job_memory=high_memory)
def buildFlatGeneSet(infile, outfile):
    """Delegate to ``PipelineGtfsubset.buildFlatGeneSet``.

    ``infile`` and ``outfile`` are forwarded unchanged; nothing is
    returned.
    """
    flatten = PipelineGtfsubset.buildFlatGeneSet
    flatten(infile, outfile)
def buildFlatGeneSet(infile, outfile):
    """Delegate to ``PipelineGtfsubset.buildFlatGeneSet``.

    ``infile`` and ``outfile`` are forwarded unchanged, and
    ``PARAMS["job_highmemory"]`` is passed as ``job_memory``.
    """
    flatten = PipelineGtfsubset.buildFlatGeneSet
    extra_options = {"job_memory": PARAMS["job_highmemory"]}
    flatten(infile, outfile, **extra_options)
def loadGeneInformation(infile, outfile):
    '''Load the transcript set.

    Delegates to ``PipelineGtfsubset.loadGeneInformation`` with
    ``infile`` and ``outfile`` forwarded unchanged.
    '''
    load_genes = PipelineGtfsubset.loadGeneInformation
    load_genes(infile, outfile)