def clearResidual(returnfmapping, G1, parameterRobot): logging.fmappingSave(returnfmapping, parameterRobot.defaultFolder) UBd = parameterRobot.brachingDepth for index in range(UBd-1,UBd,1): print "index", index parameterRobot.brachingDepth = index returnfmapping, G1= clearResidualMain(returnfmapping,G1,parameterRobot) f2, G2 = returnfmapping, G1 G2 = sorted(G2) eachnodeindex =0 while ( eachnodeindex < len( G2) -1 ): if G2[eachnodeindex] == G2[eachnodeindex +1]: G2.pop(eachnodeindex) else: eachnodeindex = eachnodeindex +1 f2 = sorted(f2) logging.fmapfusedSave(f2, parameterRobot.defaultFolder) logging.storeGraph(G2, parameterRobot.defaultFolder ) return f2, G2
def groupIndelNoisyKmers(noisyReadsDummy, parameterRobot, typeOfOpt = "fast"): ### Setup returnfmapping = [] N = parameterRobot.N L = parameterRobot.L G= parameterRobot.G threshold = parameterRobot.threshold liid = parameterRobot.liid folderName = parameterRobot.defaultFolder #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G) clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds , parameterRobot.fingerPrint, int(parameterRobot.clusterRatio*N*L/G) overhang = 5 # Find fingerPrint K = 10 ### Shared memory objects global noisyReads global noisyReads_base global kmerlinkGraph noisyReads_base = multiprocessing.Array(ctypes.c_double, N*2*L) noisyReads = np.ctypeslib.as_array(noisyReads_base.get_obj()) noisyReads = noisyReads.reshape(N, 2*L) assert noisyReads.base.base is noisyReads_base.get_obj() noisyReads[:] = noisyReadsDummy[:] ### End sharing print "clusterRounds, fingerPrint, clusterTreeSize",clusterRounds, fingerPrint ,clusterTreeSize specification = str(K)+'int8, 3int64' kmerList = np.zeros(N*(L-K+1)*2, dtype=specification) tempKmer = np.zeros(1, dtype=specification) endIndexArray = [] print "len(noisyReads)", len(noisyReads) runningindex =0 for indexN in range(N): indexL = 0 #print indexN, indexL, K #print noisyReads while (noisyReads[indexN][indexL+K-1] !=0 ): tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL+K] tempKmer[0][1][0] = indexN tempKmer[0][1][1] = indexL tempKmer[0][1][2] = runningindex kmerList[runningindex] = tempKmer[0] runningindex += 1 indexL = indexL +1 endIndexArray.append(runningindex - 1) activeKmerUbdd = runningindex -1 activeKmerList = kmerList[0:activeKmerUbdd+1] ### Computing overlap using fingerprint toCompareList = [[] for i in range(N)] print "len(activeKmerList): ",activeKmerList[0] #activeKmerList = sorted(activeKmerList, cmp=multiplier(0,K)) #activeKmerList = sorted(activeKmerList, key = itemgetterkk(range(0,K))) #activeKmerList = sorted(activeKmerList, key = itemgetter(slice(0))) mytime = time.time() activeKmerList.sort() print "len(activeKmerList): ", activeKmerList[0], time.time() - mytime toCompareList = fromCompareList(toCompareList, activeKmerList, liid, threshold) print "activeKmerList[0] : ", activeKmerList[0] print "toCompareList[0] : " , toCompareList[0] ### Alignment and establish K-mer link graphs K = parameterRobot.K runningindex =0 endIndexArray = [] endOfEachReadArr = [] for indexN in range(N): indexL = 0 while (noisyReads[indexN][indexL+K-1] !=0 ): runningindex += 1 indexL = indexL +1 endOfEachReadArr.append(indexL+K-1) endIndexArray.append(runningindex - 1) activeKmerUbdd = runningindex -1 kmerlinkGraph = [[] for i in range(activeKmerUbdd +1 ) ] print "Check the slow checking:" tkk = time.time() numProc = 4 myParaObjList = [] ''' def __init__(self,index): self.index = index def loadParameters(self, toCompareListi,endIndexArray, parameterRobot,endOfEachReadArr): self.toCompareListi , self.endIndexArray, self.parameterRobot, self.endOfEachReadArr = toCompareListi , endIndexArray, parameterRobot,endOfEachReadArr ''' for i in range(N): myNewObj = sharedMemoryWrapper(i) myNewObj.loadParameters(toCompareList[i],endIndexArray, parameterRobot,endOfEachReadArr) myParaObjList.append(myNewObj) print "len(myParaObjList)" , len(myParaObjList) pool = multiprocessing.Pool(processes=numProc) #pool.map(alignParallel,myParaObjList) r = pool.map_async(alignParallel,myParaObjList, callback=myCallBack) #r = pool.map_async(alignParallel2, range(N), callback= myCallBack) r.wait() print kmerlinkGraph[0] print "taken: (sec)", time.time() - tkk ### Form clusters and form fmapping tempfmapping = formClusterMapping(kmerlinkGraph) ### Format Return returnfmapping = [] for i in range(N): if i == 0 : startIndex = 0 else: startIndex = endIndexArray[i-1] + 1 for j in range(endIndexArray[i] - startIndex + 1 ): readNum = i offset = j clusterIndex = tempfmapping[j+startIndex][1] returnfmapping.append([clusterIndex, readNum, offset]) logging.fmappingSave(returnfmapping, folderName) return returnfmapping
def groupNoisyKmers(noisyReads,parameterRobot, typeOfOpt= 'fast'): fmapping = [] N = parameterRobot.N L = parameterRobot.L K = parameterRobot.K G= parameterRobot.G threshold = parameterRobot.threshold liid = parameterRobot.liid #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G) clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds , parameterRobot.fingerPrint, int(parameterRobot.clusterRatio*N*L/G) print "clusterRounds, fingerPrint, clusterTreeSize",clusterRounds, fingerPrint ,clusterTreeSize specification = str(K)+'int8, 3int64' kmerList = np.empty(N*(L-K+1), dtype=specification) tempKmer = np.zeros(1, dtype=specification) folderName = parameterRobot.defaultFolder # list of K mers for indexN in range(N): for indexL in range(L- K+1): tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL+K] tempKmer[0][1][0] = indexN tempKmer[0][1][1] = indexL tempKmer[0][1][2] = indexN*(L-K+1)+indexL kmerList[indexN*(L-K+1)+indexL] = tempKmer[0] clusterList = [] for index in range(N*(L-K+1)): clusterList.append(clusterElem(index)) if typeOfOpt == 'fast': print "Fast Clustering" fastClusteringAlgo(N, L, K, kmerList, clusterList, noisyReads, threshold, clusterRounds, fingerPrint,clusterTreeSize, liid) elif typeOfOpt == 'pair': print "Pairwise Clustering" pairwiseCompareKmers(kmerList,clusterList, N, L, K, threshold, liid) fmapping = formatClusteringMap(clusterList) returnfmapping = [] checksum = 0 for eachitem in fmapping : #print "Group ", eachitem[0] , len(eachitem[1]) checksum = checksum + len(eachitem[1]) for eachsubitem in eachitem[1]: #print "kmer id, read num", eachsubitem.id, int( eachsubitem.id / (L-K+1) ) returnfmapping.append([eachitem[0], int(eachsubitem.id/(L-K+1)), int(np.mod(eachsubitem.id, L-K+1))]) logging.fmappingSave(returnfmapping, folderName) print "correctNum", N*(L-K+1) print "checksum" , checksum return returnfmapping
def groupIndelNoisyKmers(noisyReadsDummy, parameterRobot, typeOfOpt="fast"): ### Setup returnfmapping = [] N = parameterRobot.N L = parameterRobot.L G = parameterRobot.G threshold = parameterRobot.threshold liid = parameterRobot.liid folderName = parameterRobot.defaultFolder #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G) clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds, parameterRobot.fingerPrint, int( parameterRobot.clusterRatio * N * L / G) overhang = 5 # Find fingerPrint K = 10 ### Shared memory objects global noisyReads global noisyReads_base global kmerlinkGraph noisyReads_base = multiprocessing.Array(ctypes.c_double, N * 2 * L) noisyReads = np.ctypeslib.as_array(noisyReads_base.get_obj()) noisyReads = noisyReads.reshape(N, 2 * L) assert noisyReads.base.base is noisyReads_base.get_obj() noisyReads[:] = noisyReadsDummy[:] ### End sharing print "clusterRounds, fingerPrint, clusterTreeSize", clusterRounds, fingerPrint, clusterTreeSize specification = str(K) + 'int8, 3int64' kmerList = np.zeros(N * (L - K + 1) * 2, dtype=specification) tempKmer = np.zeros(1, dtype=specification) endIndexArray = [] print "len(noisyReads)", len(noisyReads) runningindex = 0 for indexN in range(N): indexL = 0 #print indexN, indexL, K #print noisyReads while (noisyReads[indexN][indexL + K - 1] != 0): tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL + K] tempKmer[0][1][0] = indexN tempKmer[0][1][1] = indexL tempKmer[0][1][2] = runningindex kmerList[runningindex] = tempKmer[0] runningindex += 1 indexL = indexL + 1 endIndexArray.append(runningindex - 1) activeKmerUbdd = runningindex - 1 activeKmerList = kmerList[0:activeKmerUbdd + 1] ### Computing overlap using fingerprint toCompareList = [[] for i in range(N)] print "len(activeKmerList): ", activeKmerList[0] #activeKmerList = sorted(activeKmerList, cmp=multiplier(0,K)) #activeKmerList = sorted(activeKmerList, key = itemgetterkk(range(0,K))) #activeKmerList = sorted(activeKmerList, key = itemgetter(slice(0))) mytime = time.time() activeKmerList.sort() print "len(activeKmerList): ", activeKmerList[0], time.time() - mytime toCompareList = fromCompareList(toCompareList, activeKmerList, liid, threshold) print "activeKmerList[0] : ", activeKmerList[0] print "toCompareList[0] : ", toCompareList[0] ### Alignment and establish K-mer link graphs K = parameterRobot.K runningindex = 0 endIndexArray = [] endOfEachReadArr = [] for indexN in range(N): indexL = 0 while (noisyReads[indexN][indexL + K - 1] != 0): runningindex += 1 indexL = indexL + 1 endOfEachReadArr.append(indexL + K - 1) endIndexArray.append(runningindex - 1) activeKmerUbdd = runningindex - 1 kmerlinkGraph = [[] for i in range(activeKmerUbdd + 1)] print "Check the slow checking:" tkk = time.time() numProc = 4 myParaObjList = [] ''' def __init__(self,index): self.index = index def loadParameters(self, toCompareListi,endIndexArray, parameterRobot,endOfEachReadArr): self.toCompareListi , self.endIndexArray, self.parameterRobot, self.endOfEachReadArr = toCompareListi , endIndexArray, parameterRobot,endOfEachReadArr ''' for i in range(N): myNewObj = sharedMemoryWrapper(i) myNewObj.loadParameters(toCompareList[i], endIndexArray, parameterRobot, endOfEachReadArr) myParaObjList.append(myNewObj) print "len(myParaObjList)", len(myParaObjList) pool = multiprocessing.Pool(processes=numProc) #pool.map(alignParallel,myParaObjList) r = pool.map_async(alignParallel, myParaObjList, callback=myCallBack) #r = pool.map_async(alignParallel2, range(N), callback= myCallBack) r.wait() print kmerlinkGraph[0] print "taken: (sec)", time.time() - tkk ### Form clusters and form fmapping tempfmapping = formClusterMapping(kmerlinkGraph) ### Format Return returnfmapping = [] for i in range(N): if i == 0: startIndex = 0 else: startIndex = endIndexArray[i - 1] + 1 for j in range(endIndexArray[i] - startIndex + 1): readNum = i offset = j clusterIndex = tempfmapping[j + startIndex][1] returnfmapping.append([clusterIndex, readNum, offset]) logging.fmappingSave(returnfmapping, folderName) return returnfmapping
def groupNoisyKmers(noisyReads, parameterRobot, typeOfOpt='fast'): fmapping = [] N = parameterRobot.N L = parameterRobot.L K = parameterRobot.K G = parameterRobot.G threshold = parameterRobot.threshold liid = parameterRobot.liid #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G) clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds, parameterRobot.fingerPrint, int( parameterRobot.clusterRatio * N * L / G) print "clusterRounds, fingerPrint, clusterTreeSize", clusterRounds, fingerPrint, clusterTreeSize specification = str(K) + 'int8, 3int64' kmerList = np.empty(N * (L - K + 1), dtype=specification) tempKmer = np.zeros(1, dtype=specification) folderName = parameterRobot.defaultFolder # list of K mers for indexN in range(N): for indexL in range(L - K + 1): tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL + K] tempKmer[0][1][0] = indexN tempKmer[0][1][1] = indexL tempKmer[0][1][2] = indexN * (L - K + 1) + indexL kmerList[indexN * (L - K + 1) + indexL] = tempKmer[0] clusterList = [] for index in range(N * (L - K + 1)): clusterList.append(clusterElem(index)) if typeOfOpt == 'fast': print "Fast Clustering" fastClusteringAlgo(N, L, K, kmerList, clusterList, noisyReads, threshold, clusterRounds, fingerPrint, clusterTreeSize, liid) elif typeOfOpt == 'pair': print "Pairwise Clustering" pairwiseCompareKmers(kmerList, clusterList, N, L, K, threshold, liid) fmapping = formatClusteringMap(clusterList) returnfmapping = [] checksum = 0 for eachitem in fmapping: #print "Group ", eachitem[0] , len(eachitem[1]) checksum = checksum + len(eachitem[1]) for eachsubitem in eachitem[1]: #print "kmer id, read num", eachsubitem.id, int( eachsubitem.id / (L-K+1) ) returnfmapping.append([ eachitem[0], int(eachsubitem.id / (L - K + 1)), int(np.mod(eachsubitem.id, L - K + 1)) ]) logging.fmappingSave(returnfmapping, folderName) print "correctNum", N * (L - K + 1) print "checksum", checksum return returnfmapping