def FilterSignalPeptideInTopology(topo, sp_pos):#{{{
    """
    Relabel the first TM helix of a topology when it is a signal peptide.

    topo:   topology string; GAP characters are stripped before the first
            TM helix is located for the coverage test
    sp_pos: location of signal peptide; used as the end of the segment
            (0, sp_pos) handed to myfunc.coverage

    When the value of myfunc.coverage(0, sp_pos, fb, fe) divided by the
    length of the first TM helix (fe - fb) exceeds 0.5, the part of topo up
    to the end of the first TM helix (located again on topo itself, i.e. in
    aligned form) has both its N-terminal state and every 'M' replaced by
    the flipped state ('o' if the N-terminal state is 'i', otherwise 'i').
    In every other case topo is returned unchanged.
    """
    # Position of the N-terminal TM helix in the gap-free topology.
    (tm_beg, tm_end) = myfunc.GetFirstTMPosition(topo.replace(GAP, ''))
    if tm_beg == -1:
        # No TM helix found, nothing to filter.
        return topo

    overlap = myfunc.coverage(0, sp_pos, tm_beg, tm_end)
    if not (overlap / float(tm_end - tm_beg) > 0.5):
        return topo

    # The first TM helix is the signal peptide: find where it ends in the
    # aligned topology and rewrite everything before that point.
    (_, aln_end) = myfunc.GetFirstTMPosition(topo)
    nterm_state = GetNtermState(topo)
    flipped_state = 'o' if nterm_state == 'i' else 'i'
    head = topo[:aln_end].replace(nterm_state, flipped_state)
    head = head.replace('M', flipped_state)
    return head + topo[aln_end:]
def IsOverlappingDomain(thisRangeList, totalRangeList, seqid, hit):#{{{
    """
    Tell whether the ranges of one hit overlap the already collected ranges.

    thisRangeList:  list of (begin, end) pairs of the current hit
    totalRangeList: list of (begin, end) pairs collected so far
    seqid:          sequence identifier, only used in the log message
    hit:            dict; 'pfamid', 'alnBeg' and 'alnEnd' are read for the
                    log message

    The summed pairwise coverage (negative values from myfunc.coverage are
    counted as 0) is expressed as a percentage of the mean of the two total
    range lengths. Returns False if either list is empty or the percentage
    is below 30; otherwise writes an "Overlapping found:" line to stderr
    and returns True.
    """
    if len(totalRangeList) < 1 or len(thisRangeList) < 1:
        return False

    thisSumLen = sum(end - beg for (beg, end) in thisRangeList)
    totalSumLen = sum(end - beg for (beg, end) in totalRangeList)
    sumCover = sum(
        max(myfunc.coverage(b1, e1, b2, e2), 0)
        for (b1, e1) in thisRangeList
        for (b2, e2) in totalRangeList)

    meanLen = float(thisSumLen + totalSumLen) / 2.0
    percentCoverage = (sumCover / meanLen) * 100
    if percentCoverage < 30.0:
        return False

    message = " ".join([
        "Overlapping found:",
        "%s %s %4d %4d" % (
            seqid, hit['pfamid'], hit['alnBeg'], hit['alnEnd']),
        "%4d %4d %4d %.1f" % (
            sumCover, thisSumLen, totalSumLen, percentCoverage),
    ])
    sys.stderr.write(message + "\n")
    return True
def MaskTopologyBySignalPeptide(idList, topoList, signalpDict):
    """
    Mask the first TM helix of each topology that is a signal peptide.

    idList:      sequence identifiers
    topoList:    topology strings, indexed in parallel with idList
    signalpDict: maps a sequence identifier to its signal peptide position,
                 used as the end of the segment (0, pos) given to
                 myfunc.coverage

    For an identifier found in signalpDict, the first TM helix (b, e) from
    myfunc.GetTMPosition is masked when coverage / (e - b) exceeds 0.5: the
    first e characters are replaced by the state found at topo[e]. In all
    other cases, including a KeyError or IndexError during the check (for
    example when no TM helix is present), the topology is kept as it is.

    Returns a new list with one topology per entry of idList.
    """
    maskedTopoList = []
    for (idx, seqid) in enumerate(idList):
        topo = topoList[idx]
        if seqid not in signalpDict:
            maskedTopoList.append(topo)
            continue

        tmPositions = myfunc.GetTMPosition(topo)
        try:
            sigpPos = signalpDict[seqid]
            (b, e) = (tmPositions[0][0], tmPositions[0][1])
            overlap = myfunc.coverage(0, sigpPos, b, e)
            if float(overlap) / (e - b) > 0.5:
                # Overwrite everything up to the end of the helix with the
                # state that directly follows it.
                fillState = topo[e]
                result = fillState * e + topo[e:]
                if DEBUG:
                    print("")
                    print("posTM %s SignalPeptide %s" % ((b, e), sigpPos))
                    print(topo)
                    print(result)
            else:
                result = topo
        except (KeyError, IndexError):
            result = topo
        maskedTopoList.append(result)
    return maskedTopoList
def GetCoveredTM(segment, posTM):#{{{
    """
    Select the TM helices that are mostly covered by a segment.

    segment: pair whose items 0 and 1 are passed to myfunc.coverage as the
             second range
    posTM:   iterable of (begin, end) pairs of TM helices

    A helix is kept when myfunc.coverage(b, e, segment[0], segment[1])
    divided by its length (e - b) is above 0.75.

    Returns (kept, indices): the kept (begin, end) pairs and their
    positions within posTM.
    """
    coveredTM = []
    coveredIndex = []
    for (idx, (b, e)) in enumerate(posTM):
        fraction = myfunc.coverage(b, e, segment[0], segment[1]) / float(e - b)
        if fraction > 0.75:
            coveredTM.append((b, e))
            coveredIndex.append(idx)
    return (coveredTM, coveredIndex)
def IsDuplicated(hitList, seqlen1, seqlen2):#{{{
    """
    Check whether the template is a duplicated form of the query.

    hitList: list of hits retrieved by HHsearch; each hit is a dict with
        posQuery:    (begin, end) of the hit in the query sequence
        posTemplate: (begin, end) of the hit in the template sequence
        numTM1:      numTM of the hit in the query sequence
        numTM2:      numTM of the hit in the template sequence
    seqlen1, seqlen2: accepted but not used by this function

    Returns True as soon as one pair of hits satisfies all of:
      * the query overlap is >= 0.75 of at least one of the two query spans
      * the template overlap is < 0.25 of each of the two template spans
      * numTM1 and numTM2 are positive for both hits
      * the numTM2 values of the two hits differ by at most 2
    Returns False when no pair qualifies (always the case for fewer than
    two hits).
    """
    for (hitA, hitB) in itertools.combinations(hitList, 2):
        (qbegA, qendA) = hitA['posQuery']
        (qbegB, qendB) = hitB['posQuery']
        (tbegA, tendA) = hitA['posTemplate']
        (tbegB, tendB) = hitB['posTemplate']

        # Negative coverage values are treated as "no overlap".
        sharedQuery = max(0, myfunc.coverage(qbegA, qendA, qbegB, qendB))
        sharedTemp = max(0, myfunc.coverage(tbegA, tendA, tbegB, tendB))

        # Both hits must stem from (largely) the same query segment ...
        if not (sharedQuery / float(qendA - qbegA) >= 0.75
                or sharedQuery / float(qendB - qbegB) >= 0.75):
            continue
        # ... but land on two mostly separate template segments.
        if not (sharedTemp / float(tendA - tbegA) < 0.25):
            continue
        if not (sharedTemp / float(tendB - tbegB) < 0.25):
            continue
        # Every aligned segment has to contain at least one TM helix.
        if not (hitA['numTM1'] > 0 and hitA['numTM2'] > 0
                and hitB['numTM1'] > 0 and hitB['numTM2'] > 0):
            continue
        # The two template segments need a similar number of TM helices.
        if abs(hitA['numTM2'] - hitB['numTM2']) <= 2:
            return True
    return False
def IsDuplicatedByHHSearch(hhrfile): #{{{
    """
    Judge duplication from the two best hits of an HHsearch result file.

    hhrfile: path of the HHsearch result (.hhr) file

    The lines following the one that starts with " No Hit" are passed to
    ExtractHit one by one until it returns an empty dict. The hits are
    ranked by 'evalue' (ascending) and the two best ones are examined.

    Returns True when there are at least two hits, the second best has an
    evalue <= 1e-3, and the overlap of the two 'posTemplate' ranges is
    below 0.2 of each range. Returns False otherwise, and also (after a
    message on stderr) when the file cannot be read.

    NOTE(review): this file contains a later definition with the same name
    (taking seqid1, seqid2 and cnt as well); in Python the later definition
    replaces this one at import time -- confirm which one is intended.
    """
    # Only the file access is guarded, so unrelated errors are not
    # misreported as a read failure.
    try:
        with open(hhrfile, "r") as fpin:
            lines = fpin.readlines()
    except IOError:
        sys.stderr.write("Failed to read hhrfile %s\n" % hhrfile)
        return False

    hitList = []
    for (idx, line) in enumerate(lines):
        if not line.startswith(" No Hit"):
            continue
        # Header of the hit table found: collect rows until the first line
        # that ExtractHit does not recognise as a hit.
        for row in lines[idx + 1:]:
            if row == "":
                break
            hit = ExtractHit(row)
            if hit == {}:
                break
            hitList.append(hit)
        break

    if len(hitList) < 2:
        return False

    # Use the ranked list; the original sorted the hits but then read the
    # first two entries of the unsorted list.
    sortedHitList = sorted(hitList, key=lambda x: x['evalue'])
    (hit1, hit2) = sortedHitList[:2]
    if hit2['evalue'] > 1e-3:
        return False

    (b1, e1) = hit1['posTemplate']
    (b2, e2) = hit2['posTemplate']
    overlap = max(0, myfunc.coverage(b1, e1, b2, e2))
    return (overlap / float(e1 - b1) < 0.2
            and overlap / float(e2 - b2) < 0.2)
def IsDuplicatedByHHSearch(hhrfile, seqid1="", seqid2="", cnt=0):#{{{
    """
    Judge from an HHsearch result file whether the template is a
    duplicated form of the query.

    hhrfile: path of the HHsearch result (.hhr) file
    seqid1:  identifier printed in the report line
    seqid2:  identifier printed in the report line
    cnt:     counter printed in the report line

    Parsing: the integer after "Match_columns" is taken as the query
    length; the lines following the one starting with " No Hit" are passed
    to ExtractHit until it returns an empty dict.

    Decision: at least two hits are required and the second best evalue
    must be <= 1e-3. A hit is "good" when its 'posQuery' span is longer
    than half the query length. The result is True if any two good hits
    have 'posTemplate' ranges whose overlap is below 0.2 of each range.

    A report line "cnt: seqid1-seqid2 y|n numHit=.. numGoodHit=.." is
    written to stdout before the result is returned. When the file cannot
    be read or is malformed, a message goes to stderr and False is
    returned without the report line.
    """
    try:
        # Read in hhsearch hits
        with open(hhrfile, "r") as fpin:
            lines = fpin.readlines()
    except IOError:
        sys.stderr.write("Failed to read hhrfile %s\n" % hhrfile)
        return False

    lengthQuery = 0
    hitList = []
    numLine = len(lines)
    i = 0
    while i < numLine:
        line = lines[i]
        if line.startswith("Match_columns"):
            try:
                lengthQuery = int(line.split()[1])
            except (IndexError, ValueError):
                sys.stderr.write("Error in hhrfile %s. Ignore\n" % (hhrfile))
                return False
            i += 1
        elif line.startswith(" No Hit"):
            # Header of the hit table: collect rows until the first line
            # that ExtractHit does not recognise as a hit.
            for row in lines[i + 1:]:
                if row == "":
                    break
                hit = ExtractHit(row)
                if hit == {}:
                    break
                hitList.append(hit)
            break
        else:
            i += 1

    isDup = False
    numHit = len(hitList)
    numGoodHit = 0
    if numHit >= 2:  # there should be at least two hits
        # Rank by evalue and actually use the ranking; the original sorted
        # the hits but kept indexing the unsorted list.
        sortedHitList = sorted(hitList, key=lambda x: x['evalue'])
        # there should be at least two hits with evalue <= threshold
        if sortedHitList[1]['evalue'] <= 1e-3:
            if lengthQuery <= 0:
                # Without a query length the coverage below would divide
                # by zero; treat the file as malformed instead.
                sys.stderr.write(
                    "Error in hhrfile %s: no positive Match_columns value."
                    " Ignore\n" % (hhrfile))
                return False

            goodHitList = []
            for hit in sortedHitList:
                (b1, e1) = hit['posQuery']
                if (e1 - b1) / float(lengthQuery) > 0.5:
                    goodHitList.append(hit)
            numGoodHit = len(goodHitList)

            # if any two good hits are not overlapping in the template,
            # consider it as a duplication
            for (hit1, hit2) in itertools.combinations(goodHitList, 2):
                (b1, e1) = hit1['posTemplate']
                (b2, e2) = hit2['posTemplate']
                overlap = max(0, myfunc.coverage(b1, e1, b2, e2))
                if (overlap / float(e1 - b1) < 0.2
                        and overlap / float(e2 - b2) < 0.2):
                    isDup = True
                    break

    ss_isdup = 'y' if isDup else 'n'
    sys.stdout.write("%d: %s-%s %s numHit=%d numGoodHit=%d\n" % (
        cnt, seqid1, seqid2, ss_isdup, numHit, numGoodHit))
    return isDup