def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
                transcripts,readOrder,genes):
    """Parse one UTR feature line and attach the feature to its transcript.

    The transcript and gene identifiers are extracted from `line`; the
    Transcript and Gene objects are created on first sight and registered
    in `transcripts` / `genes`, which are modified in place.

    Parameters:
      fields -- the columns of the feature line: [0] substrate, [1] source,
                [2] feature type, [3] begin, [4] end, [5] score, [6] strand,
                [7] frame, [8:] extra fields
      line -- the raw text of the line, searched for the identifiers
      transcriptBeginEnd -- mapping of transcript ID to a (begin,end) pair;
                when the ID is present the pair overrides the feature's own
                coordinates as the extent of a newly created transcript
      GFF -- not used by this method
      transcripts -- mapping of transcript ID to Transcript (updated)
      readOrder -- value stored as `readOrder` on a newly created transcript
      genes -- mapping of gene ID to Gene (updated)

    Raises:
      Exception -- if neither a transcript ID nor a gene ID is found in `line`
    """
    # The begin column is decremented by one (presumably converting a
    # 1-based inclusive start to a 0-based one -- confirm against callers).
    exonBegin = int(fields[3]) - 1
    exonEnd = int(fields[4])
    exonScore = fields[5]
    strand = fields[6]
    frame = fields[7]

    # Patterns are raw strings so that \s, \S and \; reach the regex engine
    # without relying on Python passing unknown escapes through (which
    # triggers a SyntaxWarning on recent interpreters).
    rex = Rex()
    transcriptId = None
    if rex.find(r'transgrp[:=]\s*(\S+)', line):
        transcriptId = rex[1]
    elif rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?', line):
        transcriptId = rex[1]
    elif rex.find(r'Parent=([^;,\s]+)', line):
        transcriptId = rex[1]

    geneId = None
    if rex.find(r'genegrp=(\S+)', line):
        geneId = rex[1]
    elif rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?', line):
        geneId = rex[1]

    # Each identifier falls back to the other when only one is present.
    if transcriptId is None:
        transcriptId = geneId
    if geneId is None:
        geneId = transcriptId
    if transcriptId is None:
        raise Exception(line + " : no transcript ID found")

    # Drop a trailing semicolon captured along with the identifier.
    if rex.find(r"(\S+);$", transcriptId):
        transcriptId = rex[1]
    if rex.find(r"(\S+);$", geneId):
        geneId = rex[1]

    # Every extra column is followed by a single space, including the last.
    extra = "".join(field + " " for field in fields[8:])

    if exonBegin > exonEnd:
        exonBegin, exonEnd = exonEnd, exonBegin

    transcript = transcripts.get(transcriptId, None)
    if transcript is None:  # same test as the CDS loader uses
        transcript = Transcript(transcriptId, strand)
        transcripts[transcriptId] = transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder = readOrder
        # NOTE(review): this only rebinds the local name; the caller's
        # counter is not advanced by this statement -- confirm the caller
        # increments it itself.
        readOrder += 1
        transcript.substrate = fields[0]
        transcript.source = fields[1]
        if transcriptBeginEnd.get(transcriptId, None) is not None:
            (begin, end) = transcriptBeginEnd[transcriptId]
            transcript.setBegin(begin)
            transcript.setEnd(end)
        else:
            transcript.setBegin(exonBegin)
            transcript.setEnd(exonEnd)

    transcript.geneId = geneId
    gene = genes.get(geneId, None)
    if gene is None:
        gene = Gene()
        genes[geneId] = gene
        gene.setId(geneId)
    transcript.setGene(gene)

    exon = Exon(exonBegin, exonEnd, transcript)
    exon.extraFields = extra
    if transcript.rawExons is not None:
        # A transcript that keeps raw exons takes the feature unconditionally.
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.rawExons.append(exon)
    elif not transcript.exonOverlapsExon(exon):
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.UTR.append(exon)  # OK -- we sort later
    gene.addTranscript(transcript)
def loadGFF_CDS(self,fields,line,transcriptBeginEnd,GFF,
                transcripts,readOrder,genes):
    """Parse one coding feature line and attach the exon to its transcript.

    The transcript and gene identifiers are extracted from `line`; the
    Transcript and Gene objects are created on first sight and registered
    in `transcripts` / `genes`, which are modified in place.

    Parameters:
      fields -- the columns of the feature line: [0] substrate, [1] source,
                [2] feature type, [3] begin, [4] end, [5] score, [6] strand,
                [7] frame, [8:] extra fields
      line -- the raw text of the line, searched for the identifiers
      transcriptBeginEnd -- mapping of transcript ID to a (begin,end) pair;
                when the ID is present the pair is applied to a newly
                created transcript (otherwise no extent is set here)
      GFF -- not used by this method
      transcripts -- mapping of transcript ID to Transcript (updated)
      readOrder -- value stored as `readOrder` on a newly created transcript
      genes -- mapping of gene ID to Gene (updated)

    Raises:
      Exception -- if neither a transcript ID nor a gene ID is found in `line`
    """
    # The begin column is decremented by one (presumably converting a
    # 1-based inclusive start to a 0-based one -- confirm against callers).
    exonBegin = int(fields[3]) - 1
    exonEnd = int(fields[4])
    exonScore = fields[5]
    strand = fields[6]
    frame = fields[7]

    # Patterns are raw strings so that \s, \S and \; reach the regex engine
    # without relying on Python passing unknown escapes through (which
    # triggers a SyntaxWarning on recent interpreters).
    rex = Rex()
    transcriptId = None
    if rex.find(r'transgrp[:=]\s*(\S+)', line):
        transcriptId = rex[1]
    elif rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?', line):
        transcriptId = rex[1]
    elif rex.find(r'Parent=([^;,\s]+)', line):
        transcriptId = rex[1]

    geneId = None
    if rex.find(r'genegrp=(\S+)', line):
        geneId = rex[1]
    elif rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?', line):
        geneId = rex[1]

    # Each identifier falls back to the other when only one is present.
    if transcriptId is None:
        transcriptId = geneId
    if geneId is None:
        geneId = transcriptId
    if transcriptId is None:
        raise Exception(line + " : no transcript ID found")

    # Drop a trailing semicolon captured along with the identifier.
    if rex.find(r'(\S+);$', transcriptId):
        transcriptId = rex[1]
    if rex.find(r'(\S+);$', geneId):
        geneId = rex[1]

    # Every extra column is followed by a single space, including the last.
    extra = "".join(field + " " for field in fields[8:])

    if exonBegin > exonEnd:
        exonBegin, exonEnd = exonEnd, exonBegin

    transcript = transcripts.get(transcriptId, None)
    if transcript is None:
        transcript = Transcript(transcriptId, strand)
        transcripts[transcriptId] = transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder = readOrder
        # NOTE(review): this only rebinds the local name; the caller's
        # counter is not advanced by this statement -- confirm the caller
        # increments it itself.
        readOrder += 1
        transcript.substrate = fields[0]
        transcript.source = fields[1]
        if transcriptBeginEnd.get(transcriptId, None) is not None:
            (begin, end) = transcriptBeginEnd[transcriptId]
            transcript.setBegin(begin)
            transcript.setEnd(end)

    transcript.geneId = geneId
    gene = genes.get(geneId, None)
    if gene is None:
        gene = Gene()
        genes[geneId] = gene
        gene.setId(geneId)
    transcript.setGene(gene)

    exon = Exon(exonBegin, exonEnd, transcript)
    exon.extraFields = extra
    # The exon is dropped when transcript.exonOverlapsExon() reports a clash.
    if not transcript.exonOverlapsExon(exon):
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.exons.append(exon)  # OK -- we sort later
    gene.addTranscript(transcript)
def __init__(self,id,strand=None):
    """Construct a transcript from an identifier or from an EssexNode.

    Parameters:
      id -- either the transcript identifier, or an EssexNode from which
            the identifier, strand, source, begin, end, gene and substrate
            attributes and the "exons" / "UTR" children are read
      strand -- the strand; only used when `id` is not an EssexNode
    """
    # State shared by both construction paths.
    self.exons = []
    self.UTR = []
    self.rawExons = None
    self.startCodon = None
    self.extraFields = None
    self.stopCodons = {"TAG": 1, "TGA": 1, "TAA": 1}

    if type(id) != EssexNode:
        # Plain identifier: nothing else to load.
        self.transcriptId = id
        self.strand = strand
        return

    node = id
    self.transcriptId = node.getAttribute("ID")
    self.strand = node.getAttribute("strand")
    self.source = node.getAttribute("source")
    self.begin = node.getAttribute("begin")
    self.end = node.getAttribute("end")
    self.geneId = node.getAttribute("gene")
    self.substrate = node.getAttribute("substrate")

    # Each child element holds (begin, end) pairs; "exons" is read before
    # "UTR", and every pair becomes an Exon owned by this transcript.
    for childTag, intervals in (("exons", self.exons), ("UTR", self.UTR)):
        container = node.findChild(childTag)
        if not container:
            continue
        count = container.numElements()
        for index in range(count):
            pair = container.getIthElem(index)
            first = int(pair.getIthElem(0))
            last = int(pair.getIthElem(1))
            intervals.append(Exon(first, last, self))
def makeExon(self, root):
    """Build an Exon from a keyed record.

    `root` is indexed by the keys "begin", "end", "strand", "frame", "type",
    "score", "substrate" and "extra". The begin/end values are converted
    with int(); the "extra" entry is iterated for its keys and serialized
    into `extraFields` as consecutive "key=value;" items.

    Returns the new Exon, which is created with no owning transcript.
    """
    exon = Exon(int(root["begin"]), int(root["end"]), None)

    # These values are copied onto the exon unchanged.
    for attribute in ("strand", "frame", "type", "score", "substrate"):
        setattr(exon, attribute, root[attribute])

    extras = root["extra"]
    exon.extraFields = "".join(
        name + "=" + extras[name] + ";" for name in extras
    )
    return exon
def makeExon(self,root):
    """Create an Exon populated from the entries of `root`.

    Reads the "begin" and "end" entries (converted with int()), copies the
    "strand", "frame", "type", "score" and "substrate" entries as-is, and
    flattens the "extra" entry into a string of "key=value;" items stored
    in `extraFields`. The exon is created with None as its transcript.
    """
    start = int(root["begin"])
    stop = int(root["end"])
    strand = root["strand"]
    frame = root["frame"]
    featureType = root["type"]
    score = root["score"]
    substrate = root["substrate"]
    attributes = root["extra"]

    # One "key=value;" piece per key, in the iteration order of `attributes`.
    pieces = [key + "=" + attributes[key] + ";" for key in attributes]

    result = Exon(start, stop, None)
    result.strand = strand
    result.frame = frame
    result.type = featureType
    result.score = score
    result.substrate = substrate
    result.extraFields = "".join(pieces)
    return result
def organize_features( self, isoform, bool_exon = True ):
    """
    Args:
        isoform = cruzdb object that contains information for a specific isoform
        bool_exon = if True, organize isoform.exons (with reading frames taken from
            isoform.exonFrames, and CDS assigned from isoform.cds); if False, organize
            isoform.introns (no reading frame, no CDS assignment)
    Function: this will organize the feats, cds, & reading frame for a specific gene
    Returns: hash where key = string that is feat range (as built by Isoform.make_key_range()
        from self.chrom & the feat's start & end), value = Exon object built from the feat's info
    """
    hash_feats = {}     #key = string that is feat range, value = feat object

    if bool_exon:
        list_features = isoform.exons
        #get reading frames - built as a real list because it is indexed by feat position below
        #( map() returns a non-subscriptable iterator in Python 3 ); empty entries such as the one
        #produced by a trailing comma are skipped
        feat_frames = [ int( x ) for x in isoform.exonFrames.split(',') if x ]
    else:
        list_features = isoform.introns

    for i, feat in enumerate( list_features ):       #i = position of the feat in list_features
        key_range = Isoform.make_key_range( self.chrom, feat[0], feat[1] )

        #calculate the feat number - 'exon_base' is not defined in this method & is read from the
        #enclosing scope (presumably 0 or 1, the number given to the first feat - confirm).
        #   -plus strand ( self.strand == 1 ): count up from exon_base
        #   -otherwise: count down, so that the last feat in the list receives exon_base
        #       -for 0-based feats this is: len(list_features) - i - 1
        #       -for 1-based feats this is: len(list_features) - i
        feat_num = i + exon_base if self.strand == 1 else ( len(list_features) - i - ( 1 - exon_base ) )

        #NOTE: UCSC has 0-based genome, meaning the first position of the feat is actually the last position in the previous intron, that is why I add '+1'
        if bool_exon:
            feat_info = self.get_feat_info( feat[0], feat[1], feat_frames[i], feat_num, True, 'exon' )
        else:
            feat_info = self.get_feat_info( feat[0], feat[1], None, feat_num, True, 'intron' )

        hash_feats[ key_range ] = Exon( feat_info )

    #go through all cds, assign CDS to each feat
    if bool_exon:
        for each_cds in isoform.cds:        #each_cds = tuple where [0] = start position & [1] = end position
            hash_feats = self.organize_exons_cds( each_cds[0], each_cds[1], hash_feats )

    return hash_feats