def getKmersWithOneMisMtch(motif1, motifListWithMetrics):
    """
    Collect the motifs that are within one mismatch of motif1.

    Args:
        motif1: a TAMO motif used as the reference.
        motifListWithMetrics: a list of lists of the form
            [TAMOmotif, weightMetric].

    Returns:
        A list of lists in the same [TAMOmotif, weightMetric] form.  The
        motif stored in each entry is the orientation chosen by
        getMinDiffOri, so when the reverse complement of a candidate matches
        motif1 better, the reverse complement is what is returned instead of
        the original motif.  The weightMetric is passed through unchanged.

    Only candidates whose length is within +/- 1 of len(motif1) are
    considered at all.
    """
    motif1Len = len(motif1)

    # The "one mismatch" threshold depends only on the alignment length.  The
    # length filter below allows at most two distinct alignment lengths
    # (len(motif1) - 1 and len(motif1)), so each threshold is computed once
    # and reused instead of being rebuilt for every candidate.
    oneMisMatchByLen = {}

    resultList = []
    for mWithMetric in motifListWithMetrics:
        candidate = mWithMetric[0]
        candidateLen = len(candidate)
        if not (motif1Len - 1 <= candidateLen <= motif1Len + 1):
            continue

        # Use the length of the shorter motif for the mismatch calculation.
        maxAlnLen = min(motif1Len, candidateLen)
        if maxAlnLen not in oneMisMatchByLen:
            # Distance between an all-'A' k-mer and the same k-mer with a
            # single 'T' defines what counts as one mismatch.  Whether the
            # 'T' is at the end or in the middle gives the same align score.
            kMer = Motif('%s' % ('A' * maxAlnLen))
            kMer1mis = Motif('%s%s' % ('A' * (maxAlnLen - 1), 'T'))
            oneMisMatchByLen[maxAlnLen] = \
                MotifCompare.minshortestoverhangdiff(kMer, kMer1mis)
        oneMisMatch = oneMisMatchByLen[maxAlnLen]

        bestOri = getMinDiffOri(motif1, candidate)
        if bestOri[1] <= oneMisMatch:
            # keep [bestOriTAMOmotif, weightMetric]
            resultList.append([bestOri[0], mWithMetric[1]])
    return resultList
def getMinDiffOri(motif1, motif2, minoverlap=6, getOffset=False):
    """
    Pick the orientation of motif2 that is closest to motif1.

    Calls MotifCompare.minshortestoverhangdiff(motif1, motif2) with
    want_DistAndOff=1 and reads the result as
    (distance, offset, reverse-complement flag).

    Args:
        motif1: TAMO motif used as the reference.
        motif2: TAMO motif to compare against motif1.
        minoverlap: forwarded to minshortestoverhangdiff.
        getOffset: when true, the offset is appended to the returned tuple.

    Returns:
        (motif2, distResult)          -OR-  (motif2_rc, distResult)
        if getOffset:
        (motif2, distResult, offset)  -OR-  (motif2_rc, distResult, offset)

        motif2_rc (motif2.revcomp()) is returned when the third element of
        the comparison result is truthy.
    """
    # NOTE: an earlier version of this signature also carried
    # `N=1, keepLen=0`; the reason for those parameters is not recorded.
    comparison = MotifCompare.minshortestoverhangdiff(
        motif1, motif2, minoverlap=minoverlap, want_DistAndOff=1)
    distResult = comparison[0]

    # comparison[2] tells us whether the reverse complement gave the match.
    bestMotif = motif2.revcomp() if comparison[2] else motif2

    if getOffset:
        return (bestMotif, distResult, comparison[1])
    return (bestMotif, distResult)
# Script body: load motifs, build an all-vs-all distance matrix, then cluster
# the motifs with K-medoids and print the members of each cluster.
#
# Every print below takes a single pre-formatted string, so the parenthesised
# form emits exactly the same output as the Python 2 print statement.

print("loading vars...")
# Read one text motif per line.  The `with` block guarantees the file handle
# is closed as soon as the motifs are parsed.
with open('/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_kmerSearch-0.01.8mers.motifs.txt', 'r') as motifFile:
    motifList = [MotifTools.Motif_from_text(line.strip()) for line in motifFile]
dMat = ''

print("constructing distanceMatrix...")
dM_t1 = time()
# distanceMatrix[i][j] holds the smaller of the forward and
# reverse-complement distances between motif i and motif j.
distanceMatrix = {}
for i, motif_i in enumerate(motifList):
    print('motif %s of %s' % (i + 1, len(motifList)))
    distanceMatrix[i] = {}
    # The reverse complement of motif i does not depend on j, so build it
    # once per row instead of once per cell.
    revCmp_i = motif_i.revcomp()
    for j, motif_j in enumerate(motifList):
        # check fwd and revCmp alignments and take the lowest
        fwd_diff = MotifCompare.minshortestoverhangdiff(motif_i, motif_j)
        revCmp_diff = MotifCompare.minshortestoverhangdiff(revCmp_i, motif_j)
        print('Fwd: %s\nRev: %s\n' % (fwd_diff, revCmp_diff))
        distanceMatrix[i][j] = min(fwd_diff, revCmp_diff)
dM_t2 = time()
pprint(distanceMatrix)
print('distanceMatrix took %.4f sec.' % (dM_t2 - dM_t1))

print("discovering clusters...")
# --Using Kmedoids --
clusterOut = Kmedoids.bestaveKMedoids_cluster(distanceMatrix, kmax=30)

# clusterOut[1] is indexed by cluster id; each entry is iterated as a
# collection of indices into motifList.
for c in clusterOut[1]:
    print('cluster_%s:' % (c))
    for m in clusterOut[1][c]:
        print(motifList[m].oneletter)