def _find_qp_or_pq_match_on_orfobj(exon, orfObj, min_identity_count=1, max_dissimilar_count=0): """ """ tinyexonmatches = [] protseq = exon.proteinsequence() protlen = len(protseq) for offset in range(0, orfObj.protein_length - protlen): seqpart = orfObj.protein_sequence[offset:offset + protlen] match = make_alignment_match(protseq, seqpart, matrix=TINYEXON_MATRIX.matrix) if match.count(" ") <= max_dissimilar_count and\ match.count("*") >= min_identity_count: aapos = orfObj.protein_startPY + offset dnapos = orfObj.aapos2dnapos(aapos) if exon.acceptor.phase == 2: dnapos -= 1 if exon.acceptor.phase == 1: dnapos -= 2 dnaseq = orfObj.inputgenomicsequence[dnapos - 2:dnapos + exon.length + 2].upper() if dnaseq[0:2] == 'AG' or dnaseq[-2:] in ['GT', 'GC']: tinyexonmatches.append((seqpart, aapos)) # return list of tinyexon match tuples return tinyexonmatches
def _are_tinyexons_similar(exonQ,exonS,min_identity_count=1,max_dissimilar_count=0): """ """ if exonS.length != exonQ.length: return False if exonQ.donor.phase != exonS.donor.phase: return False if exonQ.proteinsequence() == exonS.proteinsequence(): return True # if here: test similarity match = make_alignment_match( exonQ.proteinsequence(), exonS.proteinsequence(), matrix=TINYEXON_MATRIX.matrix ) if match.count(" ") <= max_dissimilar_count and\ match.count("*") >= min_identity_count: return True else: return False
def _find_match_on_orfobj(exon, orfObj, min_identity_count=1, max_dissimilar_count=0): """ """ tinyexonmatches = [] protseq = exon.proteinsequence() protlen = len(protseq) for offset in range(0, orfObj.protein_length - protlen): seqpart = orfObj.protein_sequence[offset:offset + protlen] match = make_alignment_match(protseq, seqpart, matrix=TINYEXON_MATRIX.matrix) if protseq == "SGWNAA" and seqpart in [ 'SGFNSA', 'SGWNAA', 'GLFNSV', 'SGFTSA', 'GGFTSA', 'GDFNAV', 'GKFNTI', 'SGFNSA', 'GNFTTI', 'GGGSTN', 'GDFSAV', 'GKFNTI', 'GAFTSA' ]: maxQ = TINYEXON_MATRIX.scorealignment(protseq, protseq) maxS = TINYEXON_MATRIX.scorealignment(seqpart, seqpart) print True, protseq, "'%s'" % match, seqpart, ( TINYEXON_MATRIX.scorealignment(protseq, seqpart), maxQ, maxS), orfObj #if protseq == "LSPSM": # maxQ = TINYEXON_MATRIX.scorealignment(protseq,protseq) # maxS = TINYEXON_MATRIX.scorealignment(seqpart,seqpart) # print False, protseq, "'%s'" % match, seqpart, (TINYEXON_MATRIX.scorealignment(protseq,seqpart),maxQ,maxS), orfObj if match.count(" ") <= max_dissimilar_count and\ match.count("*") >= min_identity_count: aapos = orfObj.protein_startPY + offset tinyexonmatches.append((seqpart, aapos)) elif len(seqpart) >= 5 and match.count("*") >= min_identity_count and\ TINYEXON_MATRIX.scorealignment(protseq,seqpart) > 0 and\ match.count(" ") <= max_dissimilar_count+1: # escape for longer tinyexons; relax constrain a little bit aapos = orfObj.protein_startPY + offset tinyexonmatches.append((seqpart, aapos)) else: pass # return list of tinyexon match tuples return tinyexonmatches
def _are_tinyexons_similar(exonQ, exonS, min_identity_count=1, max_dissimilar_count=0): """ """ if exonS.length != exonQ.length: return False if exonQ.donor.phase != exonS.donor.phase: return False if exonQ.proteinsequence() == exonS.proteinsequence(): return True # if here: test similarity match = make_alignment_match(exonQ.proteinsequence(), exonS.proteinsequence(), matrix=TINYEXON_MATRIX.matrix) if match.count(" ") <= max_dissimilar_count and\ match.count("*") >= min_identity_count: return True else: return False
def _find_qp_or_pq_match_on_orfobj(exon,orfObj,min_identity_count=1,max_dissimilar_count=0): """ """ tinyexonmatches = [] protseq = exon.proteinsequence() protlen = len(protseq) for offset in range(0,orfObj.protein_length-protlen): seqpart = orfObj.protein_sequence[offset:offset+protlen] match = make_alignment_match(protseq,seqpart,matrix=TINYEXON_MATRIX.matrix) if match.count(" ") <= max_dissimilar_count and\ match.count("*") >= min_identity_count: aapos = orfObj.protein_startPY + offset dnapos = orfObj.aapos2dnapos(aapos) if exon.acceptor.phase == 2: dnapos-=1 if exon.acceptor.phase == 1: dnapos-=2 dnaseq = orfObj.inputgenomicsequence[dnapos-2:dnapos+exon.length+2].upper() if dnaseq[0:2] == 'AG' or dnaseq[-2:] in ['GT','GC']: tinyexonmatches.append( (seqpart,aapos) ) # return list of tinyexon match tuples return tinyexonmatches
def _find_match_on_orfobj(exon,orfObj,min_identity_count=1,max_dissimilar_count=0): """ """ tinyexonmatches = [] protseq = exon.proteinsequence() protlen = len(protseq) for offset in range(0,orfObj.protein_length-protlen): seqpart = orfObj.protein_sequence[offset:offset+protlen] match = make_alignment_match(protseq,seqpart,matrix=TINYEXON_MATRIX.matrix) if protseq == "SGWNAA" and seqpart in ['SGFNSA','SGWNAA','GLFNSV','SGFTSA','GGFTSA','GDFNAV','GKFNTI','SGFNSA','GNFTTI','GGGSTN','GDFSAV','GKFNTI','GAFTSA']: maxQ = TINYEXON_MATRIX.scorealignment(protseq,protseq) maxS = TINYEXON_MATRIX.scorealignment(seqpart,seqpart) print True, protseq, "'%s'" % match, seqpart, (TINYEXON_MATRIX.scorealignment(protseq,seqpart),maxQ,maxS), orfObj #if protseq == "LSPSM": # maxQ = TINYEXON_MATRIX.scorealignment(protseq,protseq) # maxS = TINYEXON_MATRIX.scorealignment(seqpart,seqpart) # print False, protseq, "'%s'" % match, seqpart, (TINYEXON_MATRIX.scorealignment(protseq,seqpart),maxQ,maxS), orfObj if match.count(" ") <= max_dissimilar_count and\ match.count("*") >= min_identity_count: aapos = orfObj.protein_startPY + offset tinyexonmatches.append( (seqpart,aapos) ) elif len(seqpart) >= 5 and match.count("*") >= min_identity_count and\ TINYEXON_MATRIX.scorealignment(protseq,seqpart) > 0 and\ match.count(" ") <= max_dissimilar_count+1: # escape for longer tinyexons; relax constrain a little bit aapos = orfObj.protein_startPY + offset tinyexonmatches.append( (seqpart,aapos) ) else: pass # return list of tinyexon match tuples return tinyexonmatches