def clusterTEentries(entries, mmpenalty):
    """
    Cluster non-overlapping TE insertions of the same family.

    The entries are sorted by start position and walked once. Each cluster
    begins with a seed entry; the following entry is absorbed as long as the
    running score survives the gap and absorbing it does not lower the score.

    entries   -- iterable of entries providing start, end, score, chr, source,
                 feature, strand, frame, comment and target
    mmpenalty -- penalty charged per unit of distance between the current
                 cluster end and the start of the next entry

    Returns a list with one new GTFEntry per cluster. A cluster keeps the
    seed's start, takes the end of the last absorbed entry and the
    accumulated score; every other field, including target, is copied from
    the seed.

    NOTE(review): the input is expected to be non-overlapping (presumably
    produced by mergeTEentries). An overlapping neighbour yields a negative
    gap, which raises the score instead of lowering it -- confirm with caller.
    """
    clustered = []
    tes = sorted(entries, key=lambda e: e.start)
    total = len(tes)
    # Walk the sorted list with a cursor; popping from the front of a list
    # shifts all remaining elements and makes the whole walk quadratic.
    idx = 0
    while idx < total:
        a = tes[idx]
        idx += 1
        score = a.score
        end = a.end
        while idx < total:
            totest = tes[idx]
            gap = totest.start - end
            gappen = gap * mmpenalty
            scorewithgap = score - gappen
            if scorewithgap < 0:
                # the gap alone pushes the score below zero: stop extending
                # (similar to a local-alignment style dynamic programming)
                break
            totestscore = scorewithgap + totest.score
            if totestscore >= score:
                # novel high score: absorb the entry into the cluster
                idx += 1
                score = totestscore
                end = totest.end
            else:
                # no new high score: this entry seeds the next cluster
                break
        ne = GTFEntry(a.chr, a.source, a.feature, a.start, end, score,
                      a.strand, a.frame, a.comment)
        ne.target = a.target
        clustered.append(ne)
    return clustered
def clusterTEentries(entries, mmpenalty):
    """
    Cluster non-overlapping TE insertions of the same family.

    The entries are sorted by start position and walked once. Each cluster
    begins with a seed entry; the following entry is absorbed as long as the
    running score survives the gap and absorbing it does not lower the score.

    entries   -- iterable of entries providing start, end, score, chr, source,
                 feature, strand, frame, comment and target
    mmpenalty -- penalty charged per unit of distance between the current
                 cluster end and the start of the next entry

    Returns a list with one new GTFEntry per cluster. A cluster keeps the
    seed's start, takes the end of the last absorbed entry and the
    accumulated score; every other field, including target, is copied from
    the seed.

    NOTE(review): the input is expected to be non-overlapping (presumably
    produced by mergeTEentries). An overlapping neighbour yields a negative
    gap, which raises the score instead of lowering it -- confirm with caller.
    """
    clustered = []
    tes = sorted(entries, key=lambda e: e.start)
    total = len(tes)
    # Walk the sorted list with a cursor; popping from the front of a list
    # shifts all remaining elements and makes the whole walk quadratic.
    idx = 0
    while idx < total:
        a = tes[idx]
        idx += 1
        score = a.score
        end = a.end
        while idx < total:
            totest = tes[idx]
            gap = totest.start - end
            gappen = gap * mmpenalty
            scorewithgap = score - gappen
            if scorewithgap < 0:
                # the gap alone pushes the score below zero: stop extending
                # (similar to a local-alignment style dynamic programming)
                break
            totestscore = scorewithgap + totest.score
            if totestscore >= score:
                # novel high score: absorb the entry into the cluster
                idx += 1
                score = totestscore
                end = totest.end
            else:
                # no new high score: this entry seeds the next cluster
                break
        ne = GTFEntry(a.chr, a.source, a.feature, a.start, end, score,
                      a.strand, a.frame, a.comment)
        ne.target = a.target
        clustered.append(ne)
    return clustered
def mergeTEentries(entries, matchscore):
    """
    Merge overlapping TE entries of the same family.

    The entries are sorted by start position. Starting from a seed entry,
    every following entry whose start is less than or equal to the highest
    end seen so far is folded into the same interval (so entries that merely
    touch are merged as well).

    entries    -- iterable of entries providing start, end, chr, source,
                  feature, strand, frame, comment and target
    matchscore -- score per unit of length of the merged interval

    Returns a list with one new GTFEntry per merged interval, spanning from
    the seed's start to the highest end. Its score is
    matchscore * float(end - start); every other field, including target,
    is copied from the seed.

    NOTE(review): the length is computed as end - start; if coordinates are
    1-based inclusive this is one short of the covered bases -- confirm.
    """
    merged = []
    tes = sorted(entries, key=lambda e: e.start)
    total = len(tes)
    # Walk the sorted list with a cursor; popping from the front of a list
    # shifts all remaining elements and makes the whole walk quadratic.
    idx = 0
    while idx < total:
        a = tes[idx]
        idx += 1
        start = a.start
        highestend = a.end
        # swallow every entry that begins inside (or right at the end of)
        # the interval collected so far
        while idx < total and tes[idx].start <= highestend:
            b = tes[idx]
            idx += 1
            if b.end > highestend:
                highestend = b.end
        ne = GTFEntry(a.chr, a.source, a.feature, start, highestend,
                      matchscore * float(highestend - start),
                      a.strand, a.frame, a.comment)
        ne.target = a.target
        merged.append(ne)
    return merged
def mergeTEentries(entries, matchscore):
    """
    Merge overlapping TE entries of the same family.

    The entries are sorted by start position. Starting from a seed entry,
    every following entry whose start is less than or equal to the highest
    end seen so far is folded into the same interval (so entries that merely
    touch are merged as well).

    entries    -- iterable of entries providing start, end, chr, source,
                  feature, strand, frame, comment and target
    matchscore -- score per unit of length of the merged interval

    Returns a list with one new GTFEntry per merged interval, spanning from
    the seed's start to the highest end. Its score is
    matchscore * float(end - start); every other field, including target,
    is copied from the seed.

    NOTE(review): the length is computed as end - start; if coordinates are
    1-based inclusive this is one short of the covered bases -- confirm.
    """
    merged = []
    tes = sorted(entries, key=lambda e: e.start)
    total = len(tes)
    # Walk the sorted list with a cursor; popping from the front of a list
    # shifts all remaining elements and makes the whole walk quadratic.
    idx = 0
    while idx < total:
        a = tes[idx]
        idx += 1
        start = a.start
        highestend = a.end
        # swallow every entry that begins inside (or right at the end of)
        # the interval collected so far
        while idx < total and tes[idx].start <= highestend:
            b = tes[idx]
            idx += 1
            if b.end > highestend:
                highestend = b.end
        ne = GTFEntry(a.chr, a.source, a.feature, start, highestend,
                      matchscore * float(highestend - start),
                      a.strand, a.frame, a.comment)
        ne.target = a.target
        merged.append(ne)
    return merged
def get_querygtflist(nucmerentries):
    """
    Build one GTFEntry per nucmer entry, positioned by the entry's q* fields.

    nucmerentries -- iterable of entries providing qname, qstart, qend,
                     rname, rstart, rend and count

    Returns a list of GTFEntry objects in input order. Each uses qname,
    qstart and qend as chr, start and end, "nucmer" as source, "orthologous"
    as feature and "." for score, strand and frame. The comment field holds
    the r* region and the count as "rname:rstart-rend count=N".
    """
    def as_gtf(hit):
        # GTFEntry argument order:
        # chr, source, feature, start, end, score, strand, frame, comment
        # (the comment is stored unparsed)
        region = "{0}:{1}-{2} count={3}".format(
            hit.rname, hit.rstart, hit.rend, hit.count)
        return GTFEntry(hit.qname, "nucmer", "orthologous",
                        hit.qstart, hit.qend, ".", ".", ".", region)

    return [as_gtf(hit) for hit in nucmerentries]