def inferUTRort(self):
    '''
    Split the annotated UTR intervals into 5' (fp_utr) and 3' (tp_utr) lists.

    Each [start, end] pair in self.utr is compared against the start of the
    first CDS interval (after sorting self.cds):

      * "+" strand: UTRs starting before the first CDS are 5', the rest 3'.
      * any other strand value: the assignment is mirrored.

    Returns None without touching anything when there is no UTR annotation
    or no CDS to compare against, True otherwise.
    '''
    # Without a CDS there is no reference point; bail out the same way
    # inferUTR does instead of failing on self.cds[0].
    if not self.utr or not self.cds:
        return

    # presumably sortArr(arr, 0) orders the intervals by their first column
    # -- confirm against its definition.
    self.cds = sortArr(self.cds, 0)
    firstcdsStart = self.cds[0][0]
    on_plus = self.strand == "+"

    for utrs, utre in self.utr:
        before_cds = utrs < firstcdsStart
        # Before the CDS on "+" (or after it otherwise) means 5' UTR.
        if before_cds == on_plus:
            self.fp_utr.append([utrs, utre])
        else:
            self.tp_utr.append([utrs, utre])

    self.fp_utr = sortArr(self.fp_utr, 0)
    self.tp_utr = sortArr(self.tp_utr, 0)
    return True
def inferUTR(self):
    '''
    Infer fp_utr and tp_utr from exons and CDSs alone, without needing any
    explicit UTR annotation for the transcript.

    Exon parts lying below the lowest CDS coordinate or above the highest
    CDS coordinate are recorded as UTR. On the "+" strand the low side is
    the 5' UTR and the high side the 3' UTR; for any other strand value the
    roles are swapped.

    Returns None when both UTR lists are already populated or when there is
    no CDS; returns True after the lists have been rebuilt.
    '''
    if self.fp_utr and self.tp_utr:
        return
    if not self.cds:
        return

    # presumably sortArr(arr, 0) orders intervals by their first column
    # -- confirm against its definition.
    self.cds = sortArr(self.cds, 0)
    self.exons = sortArr(self.exons, 0)

    cds_low = self.cds[0][0]
    cds_high = self.cds[-1][-1]

    if self.strand == "+":
        low_side, high_side = self.fp_utr, self.tp_utr
    else:
        low_side, high_side = self.tp_utr, self.fp_utr

    for tmps, tmpe in self.exons:
        # The two boundaries are tested independently so that an exon which
        # spans the whole CDS (e.g. a single-exon transcript) contributes a
        # UTR on both sides, not only the first one matched.
        if tmpe < cds_low:
            low_side.append([tmps, tmpe])
        elif tmps < cds_low < tmpe:
            # NOTE(review): the boundary coordinate itself is kept in the
            # UTR interval, exactly as before -- confirm this is intended.
            low_side.append([tmps, cds_low])

        if tmps > cds_high:
            high_side.append([tmps, tmpe])
        elif tmps < cds_high < tmpe:
            high_side.append([cds_high, tmpe])
        # Exons lying entirely inside the CDS contribute nothing.

    # rmExtra is defined elsewhere; presumably it drops duplicate intervals
    # that appear when one list was already partly filled -- confirm.
    self.fp_utr = sortArr(rmExtra(self.fp_utr), 0)
    self.tp_utr = sortArr(rmExtra(self.tp_utr), 0)
    return True
def inferIntron(self):
    '''
    Rebuild self.introns from the gaps between consecutive intervals.

    Exons are used when present, otherwise CDSs. Each intron is stored as
    [end of previous interval, start of next interval]. The intervals are
    taken in their current order, so they are expected to be sorted already.

    Returns None (leaving self.introns empty) when the transcript has
    neither exons nor CDSs, True otherwise.
    '''
    self.introns = []
    if self.exons:
        intervals = self.exons
    elif self.cds:
        intervals = self.cds
    else:
        return

    # Intervals are plain [start, end] lists, so index them directly.
    self.introns = [
        [previous[1], following[0]]
        for previous, following in zip(intervals, intervals[1:])
    ]
    self.introns = sortArr(self.introns, 0)
    return True
class transcript(BaseFeature):
    """
    An mRNA acts like an isoform in that it is associated with a parent gene
    and contains a number of coding sequences (CDS).
    """

    def __init__(self, chromosome, start, end, strand, feature, id, attr=None):
        """
        Create an empty transcript record.

        chromosome, start, end, strand, feature and attr are forwarded to
        BaseFeature.__init__; id is the transcript identifier. attr must
        contain a 'biotype' key (KeyError otherwise).
        """
        # Use a fresh dict per instance: the mapping is stored on self.attrs,
        # so a shared default would leak state between transcripts.
        if attr is None:
            attr = {}
        BaseFeature.__init__(self, chromosome, start, end, strand, feature, attr)
        self.id = id
        self.exons = []          # [start, end] lists
        self.biotype = attr['biotype']
        self.feature = feature
        self.cds = []            # [start, end] lists
        self.cdsMap = {}
        self.start_codon = None
        self.stop_codon = None
        self.utr = []
        self.fp_utr = []
        self.tp_utr = []
        self.utrMap = {}
        self.exonMap = {}        # (start, end) -> exon object
        self.introns = []
        self.attrs = attr
        # self.start / self.end are presumably set by BaseFeature.__init__.
        self.length = self.end - self.start

    def addexon(self, exon):
        """
        Register an exon on this transcript.

        Returns False when an exon with the same (start, end) is already
        known, True after a new exon has been stored.
        Raises Exception when the exon's strand or chromosome differs from
        the transcript's.
        """
        if exon.strand != self.strand:
            raise Exception("ERROR: strand '%s' of exon from transcript %s does not match gene strand '%s'" % (exon.strand, exon.parent, self.strand))
        if exon.chromosome != self.chromosome:
            raise Exception("ERROR: chromosome '%s' of exon from transcript %s does not match gene chromosome '%s'" % (exon.chromosome, exon.parent, self.chromosome))

        exonTuple = (exon.start, exon.end)
        if exonTuple in self.exonMap:
            return False

        self.exonMap[exonTuple] = exon
        self.exons.append(list(exonTuple))
        # presumably sorts by start, then end -- confirm against sortArr.
        self.exons = sortArr(self.exons, 0, 1)
        return True
def getCDSs(self):
    '''
    Split every exon into UTR and CDS parts using CDSStart / CDSEnd.

    Appends [start, end] lists to self.utr and self.cds and finally sorts
    self.cds. Returns None without doing anything when either CDS boundary
    status is 'unk', True otherwise.
    '''
    # With an unknown status CDSStart equals CDSEnd, so no CDS boundary can
    # be inferred. Compare by value: identity ("is") is not reliable for
    # strings that were built at runtime.
    if self.CDSStartStat == 'unk' or self.CDSEndStat == 'unk':
        return

    for exonStart, exonEnd in self.exons:
        if exonEnd < self.CDSStart:
            # Exon lies entirely before the coding region.
            self.utr.append([exonStart, exonEnd])
        elif exonStart < self.CDSStart < exonEnd:
            # Exon contains the CDS start.
            # NOTE(review): the 3 bp shift on the non-"+" strand presumably
            # accounts for the stop codon -- confirm.
            # NOTE(review): an exon that also contains CDSEnd is handled
            # here only, so its CDS part runs to exonEnd -- confirm.
            tmpStart = self.CDSStart if self.strand == '+' else self.CDSStart - 3
            self.utr.append([exonStart, tmpStart])
            self.cds.append([self.CDSStart, exonEnd])
        elif self.CDSStart < exonEnd < self.CDSEnd:
            # Exon ends inside the coding region: recorded as CDS in full.
            self.cds.append([exonStart, exonEnd])
        elif exonStart < self.CDSEnd < exonEnd:
            # Exon contains the CDS end.
            tmpEnd = self.CDSEnd + 3 if self.strand == '+' else self.CDSEnd
            self.utr.append([tmpEnd, exonEnd])
            self.cds.append([exonStart, self.CDSEnd])
        else:
            self.utr.append([exonStart, exonEnd])

    self.cds = sortArr(self.cds, 0)
    return True
def addCDS(self, cds):
    '''
    Register a CDS on this transcript.

    Returns False when a CDS with the same (start, end) is already known,
    True after a new CDS has been stored.
    Raises Exception when the CDS strand or chromosome differs from the
    transcript's.
    '''
    if cds.strand != self.strand:
        raise Exception("ERROR: strand '%s' of CDS from transcript %s does not match gene strand '%s'" % (cds.strand, cds.parent, self.strand))
    if cds.chromosome != self.chromosome:
        raise Exception("ERROR: chromosome '%s' of CDS from transcript %s does not match gene chromosome '%s'" % (cds.chromosome, cds.parent, self.chromosome))

    cdsTuple = (cds.start, cds.end)
    if cdsTuple in self.cdsMap:
        return False

    self.cdsMap[cdsTuple] = cds
    self.cds.append(list(cdsTuple))
    # presumably sorts by start, then end -- confirm against sortArr.
    self.cds = sortArr(self.cds, 0, 1)
    return True


def addUTR(self, utr):
    '''
    Register a UTR on this transcript.

    Returns False when a UTR with the same (start, end) is already known,
    True after a new UTR has been stored (self.utr is left unsorted).
    Raises Exception when the UTR strand or chromosome differs from the
    transcript's.
    '''
    # The messages must be built from `utr`; no `cds` name exists here.
    if utr.strand != self.strand:
        raise Exception("ERROR: strand '%s' of UTR from transcript %s does not match gene strand '%s'" % (utr.strand, utr.parent, self.strand))
    if utr.chromosome != self.chromosome:
        raise Exception("ERROR: chromosome '%s' of UTR from transcript %s does not match gene chromosome '%s'" % (utr.chromosome, utr.parent, self.chromosome))

    utrTuple = (utr.start, utr.end)
    if utrTuple in self.utrMap:
        return False

    self.utrMap[utrTuple] = utr
    self.utr.append(list(utrTuple))
    return True
def getIntrons(self):
    '''
    Append one intron per pair of neighbouring exons to self.introns.

    Intron i is [txExonsEnd[i], txExonsStart[i + 1] - 1]; the list is then
    passed through sortArr(..., 0). Always returns True.
    '''
    intron_total = self.txExonCount - 1
    for idx in range(intron_total):
        left_edge = self.txExonsEnd[idx]
        right_edge = self.txExonsStart[idx + 1] - 1
        self.introns.append([left_edge, right_edge])
    self.introns = sortArr(self.introns, 0)
    return True
#print a #print b sys.exit(0) import bamio ''' idx=bamio.Tabix(sys.argv[1]) for item in idx.fetch('1',3100,5000): print item idx.close() a,b,c,d,e,f,g=bamio.mappingstat(sys.argv[1]) print a,b,c,d,e,f,g #matplot.densityplot([a1,a2,a3,a4,a5,a6,a7,a8],['s','e','f','g','h','a','b','c']) ''' arr = [['1', 42, 52], ['11', 45, 78], ['2', 25, 100], ['1', 23, 78], ['1', 56, 89]] print utils.sortArr(arr, 0, 1) from format import * f = sys.argv[1] recs = fasta_itr(f) print dir(recs) print type(recs) for rec in recs: print rec.id print rec.seq