def clearResidual(returnfmapping, G1, parameterRobot):
    
    logging.fmappingSave(returnfmapping, parameterRobot.defaultFolder)
    
    UBd = parameterRobot.brachingDepth               
    for index in range(UBd-1,UBd,1):    
        print "index", index 
        parameterRobot.brachingDepth = index
        returnfmapping, G1= clearResidualMain(returnfmapping,G1,parameterRobot)
    
    
    f2, G2 = returnfmapping, G1
    
    G2 = sorted(G2)
    eachnodeindex =0 
    while ( eachnodeindex < len( G2) -1 ):
        if G2[eachnodeindex] == G2[eachnodeindex +1]:
            G2.pop(eachnodeindex)
        else:
            eachnodeindex = eachnodeindex +1 
            
    f2 = sorted(f2) 
    logging.fmapfusedSave(f2, parameterRobot.defaultFolder)
    logging.storeGraph(G2, parameterRobot.defaultFolder )
    return f2, G2 
def groupIndelNoisyKmers(noisyReadsDummy, parameterRobot, typeOfOpt = "fast"):

    ### Setup 
    returnfmapping = [] 
    N = parameterRobot.N
    L = parameterRobot.L

    G=  parameterRobot.G
    threshold = parameterRobot.threshold
    liid = parameterRobot.liid
    

    folderName = parameterRobot.defaultFolder
    
    #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G)
    clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds , parameterRobot.fingerPrint, int(parameterRobot.clusterRatio*N*L/G)
    overhang = 5

    # Find fingerPrint
    K = 10
    ### Shared memory objects
    global noisyReads
    global noisyReads_base
    global kmerlinkGraph
    
    noisyReads_base = multiprocessing.Array(ctypes.c_double, N*2*L) 
    noisyReads = np.ctypeslib.as_array(noisyReads_base.get_obj())
    noisyReads = noisyReads.reshape(N, 2*L)   
    
    assert noisyReads.base.base is noisyReads_base.get_obj()
    
    noisyReads[:] = noisyReadsDummy[:]
    
    ### End sharing
    
    print "clusterRounds, fingerPrint, clusterTreeSize",clusterRounds, fingerPrint ,clusterTreeSize
    
    specification = str(K)+'int8, 3int64'
    kmerList = np.zeros(N*(L-K+1)*2, dtype=specification)
    tempKmer = np.zeros(1, dtype=specification)
    
    endIndexArray = []
    print "len(noisyReads)", len(noisyReads)
    
    runningindex =0 
    for indexN in range(N):
        indexL = 0
        #print indexN, indexL, K 
        #print noisyReads
        while (noisyReads[indexN][indexL+K-1] !=0 ):
            tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL+K]                
            tempKmer[0][1][0]  = indexN
            tempKmer[0][1][1] =  indexL
            tempKmer[0][1][2] =  runningindex 
            kmerList[runningindex] = tempKmer[0]
            
            runningindex += 1
            indexL = indexL +1 
            
        endIndexArray.append(runningindex - 1)

    activeKmerUbdd = runningindex -1
    activeKmerList = kmerList[0:activeKmerUbdd+1]
    
    ### Computing overlap using fingerprint
    toCompareList = [[] for i in range(N)]
    print "len(activeKmerList): ",activeKmerList[0]
    #activeKmerList = sorted(activeKmerList, cmp=multiplier(0,K))
    #activeKmerList = sorted(activeKmerList, key = itemgetterkk(range(0,K)))
    #activeKmerList = sorted(activeKmerList, key = itemgetter(slice(0)))
    mytime = time.time()
    activeKmerList.sort()
    print "len(activeKmerList): ", activeKmerList[0], time.time() - mytime
    
    
    toCompareList = fromCompareList(toCompareList, activeKmerList, liid, threshold)
    
    print "activeKmerList[0] : ", activeKmerList[0]
    print "toCompareList[0] : " , toCompareList[0]     
    
    ### Alignment and establish K-mer link graphs
    K = parameterRobot.K
    runningindex =0 
    endIndexArray = []
    endOfEachReadArr = []
    for indexN in range(N):
        indexL = 0
        while (noisyReads[indexN][indexL+K-1] !=0 ):
            runningindex += 1
            indexL = indexL +1 
            
        endOfEachReadArr.append(indexL+K-1)  
        endIndexArray.append(runningindex - 1)
        
    activeKmerUbdd = runningindex -1
    
    
    
    kmerlinkGraph =  [[] for i in range(activeKmerUbdd +1 ) ]

    
    print "Check the slow checking:"
    tkk = time.time()
    numProc = 4
    myParaObjList = []
    
    '''
    def __init__(self,index):
        self.index = index
    def loadParameters(self, toCompareListi,endIndexArray, parameterRobot,endOfEachReadArr):
        self.toCompareListi , self.endIndexArray, self.parameterRobot, self.endOfEachReadArr = toCompareListi , endIndexArray, parameterRobot,endOfEachReadArr
    '''
    for i in range(N):
        myNewObj =  sharedMemoryWrapper(i)
        myNewObj.loadParameters(toCompareList[i],endIndexArray, parameterRobot,endOfEachReadArr)
        myParaObjList.append(myNewObj)
        
    
    print "len(myParaObjList)" , len(myParaObjList)
 
    pool = multiprocessing.Pool(processes=numProc)
    #pool.map(alignParallel,myParaObjList)
    r = pool.map_async(alignParallel,myParaObjList, callback=myCallBack)
    #r = pool.map_async(alignParallel2, range(N),  callback= myCallBack)
    
    
    r.wait()
    
    print kmerlinkGraph[0]
    print "taken: (sec)", time.time() - tkk                
    ### Form clusters and form fmapping
    tempfmapping = formClusterMapping(kmerlinkGraph)
    
    ### Format Return  
    returnfmapping = []
    for i in range(N):
        if i == 0 :
            startIndex = 0
        else:
            startIndex = endIndexArray[i-1] + 1
            
        for j in range(endIndexArray[i] - startIndex + 1 ):
            readNum = i 
            offset = j 
            clusterIndex = tempfmapping[j+startIndex][1]
            returnfmapping.append([clusterIndex, readNum, offset])
    
    
    logging.fmappingSave(returnfmapping, folderName)


    return returnfmapping
def groupNoisyKmers(noisyReads,parameterRobot, typeOfOpt= 'fast'):
    fmapping = []
    N = parameterRobot.N
    L = parameterRobot.L
    K = parameterRobot.K
    G=  parameterRobot.G
    threshold = parameterRobot.threshold
    liid = parameterRobot.liid
    

    #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G)
    clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds , parameterRobot.fingerPrint, int(parameterRobot.clusterRatio*N*L/G)

    
    print "clusterRounds, fingerPrint, clusterTreeSize",clusterRounds, fingerPrint ,clusterTreeSize
    
    specification = str(K)+'int8, 3int64'
    kmerList = np.empty(N*(L-K+1), dtype=specification)
    tempKmer = np.zeros(1, dtype=specification)
    
    folderName = parameterRobot.defaultFolder
    
    # list of K mers
    for indexN in range(N):
        for indexL in range(L- K+1):
            tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL+K]                
            tempKmer[0][1][0]  = indexN
            tempKmer[0][1][1] =  indexL
            tempKmer[0][1][2] =  indexN*(L-K+1)+indexL

            kmerList[indexN*(L-K+1)+indexL] = tempKmer[0]

    
    clusterList = []
    for index in range(N*(L-K+1)):
        clusterList.append(clusterElem(index))
   
    if typeOfOpt == 'fast':
        print "Fast Clustering"
        fastClusteringAlgo(N, L, K, kmerList, clusterList, noisyReads, threshold, clusterRounds, fingerPrint,clusterTreeSize, liid)
    elif typeOfOpt == 'pair':
        print "Pairwise Clustering"
        pairwiseCompareKmers(kmerList,clusterList, N, L, K, threshold, liid)

    fmapping = formatClusteringMap(clusterList)       
    
    returnfmapping = [] 
    
    checksum = 0
    for eachitem in fmapping : 
        #print "Group ", eachitem[0] , len(eachitem[1])
        checksum = checksum + len(eachitem[1])
        for eachsubitem in eachitem[1]:
            #print "kmer id, read num", eachsubitem.id, int( eachsubitem.id / (L-K+1) )
            returnfmapping.append([eachitem[0], int(eachsubitem.id/(L-K+1)), int(np.mod(eachsubitem.id, L-K+1))])
            
    logging.fmappingSave(returnfmapping, folderName)
    
    print "correctNum", N*(L-K+1)
    print "checksum" , checksum
    return returnfmapping              
# Example #4
# 0
def groupIndelNoisyKmers(noisyReadsDummy, parameterRobot, typeOfOpt="fast"):

    ### Setup
    returnfmapping = []
    N = parameterRobot.N
    L = parameterRobot.L

    G = parameterRobot.G
    threshold = parameterRobot.threshold
    liid = parameterRobot.liid

    folderName = parameterRobot.defaultFolder

    #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G)
    clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds, parameterRobot.fingerPrint, int(
        parameterRobot.clusterRatio * N * L / G)
    overhang = 5

    # Find fingerPrint
    K = 10
    ### Shared memory objects
    global noisyReads
    global noisyReads_base
    global kmerlinkGraph

    noisyReads_base = multiprocessing.Array(ctypes.c_double, N * 2 * L)
    noisyReads = np.ctypeslib.as_array(noisyReads_base.get_obj())
    noisyReads = noisyReads.reshape(N, 2 * L)

    assert noisyReads.base.base is noisyReads_base.get_obj()

    noisyReads[:] = noisyReadsDummy[:]

    ### End sharing

    print "clusterRounds, fingerPrint, clusterTreeSize", clusterRounds, fingerPrint, clusterTreeSize

    specification = str(K) + 'int8, 3int64'
    kmerList = np.zeros(N * (L - K + 1) * 2, dtype=specification)
    tempKmer = np.zeros(1, dtype=specification)

    endIndexArray = []
    print "len(noisyReads)", len(noisyReads)

    runningindex = 0
    for indexN in range(N):
        indexL = 0
        #print indexN, indexL, K
        #print noisyReads
        while (noisyReads[indexN][indexL + K - 1] != 0):
            tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL + K]
            tempKmer[0][1][0] = indexN
            tempKmer[0][1][1] = indexL
            tempKmer[0][1][2] = runningindex
            kmerList[runningindex] = tempKmer[0]

            runningindex += 1
            indexL = indexL + 1

        endIndexArray.append(runningindex - 1)

    activeKmerUbdd = runningindex - 1
    activeKmerList = kmerList[0:activeKmerUbdd + 1]

    ### Computing overlap using fingerprint
    toCompareList = [[] for i in range(N)]
    print "len(activeKmerList): ", activeKmerList[0]
    #activeKmerList = sorted(activeKmerList, cmp=multiplier(0,K))
    #activeKmerList = sorted(activeKmerList, key = itemgetterkk(range(0,K)))
    #activeKmerList = sorted(activeKmerList, key = itemgetter(slice(0)))
    mytime = time.time()
    activeKmerList.sort()
    print "len(activeKmerList): ", activeKmerList[0], time.time() - mytime

    toCompareList = fromCompareList(toCompareList, activeKmerList, liid,
                                    threshold)

    print "activeKmerList[0] : ", activeKmerList[0]
    print "toCompareList[0] : ", toCompareList[0]

    ### Alignment and establish K-mer link graphs
    K = parameterRobot.K
    runningindex = 0
    endIndexArray = []
    endOfEachReadArr = []
    for indexN in range(N):
        indexL = 0
        while (noisyReads[indexN][indexL + K - 1] != 0):
            runningindex += 1
            indexL = indexL + 1

        endOfEachReadArr.append(indexL + K - 1)
        endIndexArray.append(runningindex - 1)

    activeKmerUbdd = runningindex - 1

    kmerlinkGraph = [[] for i in range(activeKmerUbdd + 1)]

    print "Check the slow checking:"
    tkk = time.time()
    numProc = 4
    myParaObjList = []
    '''
    def __init__(self,index):
        self.index = index
    def loadParameters(self, toCompareListi,endIndexArray, parameterRobot,endOfEachReadArr):
        self.toCompareListi , self.endIndexArray, self.parameterRobot, self.endOfEachReadArr = toCompareListi , endIndexArray, parameterRobot,endOfEachReadArr
    '''

    for i in range(N):
        myNewObj = sharedMemoryWrapper(i)
        myNewObj.loadParameters(toCompareList[i], endIndexArray,
                                parameterRobot, endOfEachReadArr)
        myParaObjList.append(myNewObj)

    print "len(myParaObjList)", len(myParaObjList)

    pool = multiprocessing.Pool(processes=numProc)
    #pool.map(alignParallel,myParaObjList)
    r = pool.map_async(alignParallel, myParaObjList, callback=myCallBack)
    #r = pool.map_async(alignParallel2, range(N),  callback= myCallBack)

    r.wait()
    print kmerlinkGraph[0]
    print "taken: (sec)", time.time() - tkk
    ### Form clusters and form fmapping
    tempfmapping = formClusterMapping(kmerlinkGraph)

    ### Format Return
    returnfmapping = []
    for i in range(N):
        if i == 0:
            startIndex = 0
        else:
            startIndex = endIndexArray[i - 1] + 1

        for j in range(endIndexArray[i] - startIndex + 1):
            readNum = i
            offset = j
            clusterIndex = tempfmapping[j + startIndex][1]
            returnfmapping.append([clusterIndex, readNum, offset])

    logging.fmappingSave(returnfmapping, folderName)

    return returnfmapping
# Example #5
# 0
def groupNoisyKmers(noisyReads, parameterRobot, typeOfOpt='fast'):
    fmapping = []
    N = parameterRobot.N
    L = parameterRobot.L
    K = parameterRobot.K
    G = parameterRobot.G
    threshold = parameterRobot.threshold
    liid = parameterRobot.liid

    #clusterRounds, fingerPrint, clusterTreeSize = 3 , 6, int(N*L/G)
    clusterRounds, fingerPrint, clusterTreeSize = parameterRobot.clusterRounds, parameterRobot.fingerPrint, int(
        parameterRobot.clusterRatio * N * L / G)

    print "clusterRounds, fingerPrint, clusterTreeSize", clusterRounds, fingerPrint, clusterTreeSize

    specification = str(K) + 'int8, 3int64'
    kmerList = np.empty(N * (L - K + 1), dtype=specification)
    tempKmer = np.zeros(1, dtype=specification)

    folderName = parameterRobot.defaultFolder

    # list of K mers
    for indexN in range(N):
        for indexL in range(L - K + 1):
            tempKmer[0][0][:] = noisyReads[indexN][indexL:indexL + K]
            tempKmer[0][1][0] = indexN
            tempKmer[0][1][1] = indexL
            tempKmer[0][1][2] = indexN * (L - K + 1) + indexL

            kmerList[indexN * (L - K + 1) + indexL] = tempKmer[0]

    clusterList = []
    for index in range(N * (L - K + 1)):
        clusterList.append(clusterElem(index))

    if typeOfOpt == 'fast':
        print "Fast Clustering"
        fastClusteringAlgo(N, L, K, kmerList, clusterList, noisyReads,
                           threshold, clusterRounds, fingerPrint,
                           clusterTreeSize, liid)
    elif typeOfOpt == 'pair':
        print "Pairwise Clustering"
        pairwiseCompareKmers(kmerList, clusterList, N, L, K, threshold, liid)

    fmapping = formatClusteringMap(clusterList)

    returnfmapping = []

    checksum = 0
    for eachitem in fmapping:
        #print "Group ", eachitem[0] , len(eachitem[1])
        checksum = checksum + len(eachitem[1])
        for eachsubitem in eachitem[1]:
            #print "kmer id, read num", eachsubitem.id, int( eachsubitem.id / (L-K+1) )
            returnfmapping.append([
                eachitem[0],
                int(eachsubitem.id / (L - K + 1)),
                int(np.mod(eachsubitem.id, L - K + 1))
            ])

    logging.fmappingSave(returnfmapping, folderName)

    print "correctNum", N * (L - K + 1)
    print "checksum", checksum
    return returnfmapping