예제 #1
0
def iterativeEM(fm,
                b,
                alphabet,
                reads,
                starts,
                errors,
                maxIters,
                readLen=50,
                genomeLen=5000,
                prop=1):
    """Align reads to the FM index repeatedly, folding common edits into it.

    Each pass approximately matches every still-unmatched read with
    ``bwt.findApproximate``, tallies the edits reported for reads whose
    index is below ``prop * len(reads)``, and applies every edit whose
    tally reaches the pass threshold through ``bwt.insert``,
    ``bwt.substitute`` or ``bwt.delete``.  Passes stop once no read is
    unmatched, ``maxIters`` passes have run, or a pass matched 10% or fewer
    of the reads that were unmatched going into it.

    Args:
        fm, b, alphabet: Forwarded unchanged to the ``bwt`` helpers.
        reads: Sequence of reads; each read is joined with ``''.join``
            before matching.
        starts: List of expected match positions, one per read.  It is
            modified in place whenever an insertion or deletion is applied.
        errors: Forwarded to ``bwt.findApproximate``; also the +/- tolerance
            used when checking a match position against ``starts``.
        maxIters: Maximum number of passes.
        readLen: Read length used in the threshold formula.
        genomeLen: Genome length used in the threshold formula.
        prop: Fraction of the reads (by index) whose edits are tallied.

    Returns:
        Tuple ``(accuracy, size)``: the fraction of all reads that matched
        within ``errors`` of their expected start, and ``sys.getsizeof`` of
        the pickled edit tally from the first pass.  When no pass runs the
        size is 0, and when ``reads`` is empty the accuracy is 0.0.
    """
    numReads = len(reads)
    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0

    sizes = []
    correct = 0
    incorrect = 0
    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        # Minimum tally an edit needs before it is written into the index.
        threshold = 0.25 * prop * numUnmatched * readLen / genomeLen

        currIter += 1
        prevSize = numUnmatched

        # Edit tuple -> number of times it was reported during this pass.
        mutations = dict()

        for i in range(numReads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]),
                                    errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            if i < prop * numReads:
                for k, edits in m.items():
                    for v in edits:
                        # Edit offsets are added to the match key k.  Type 2
                        # edits (applied via bwt.delete) carry no third
                        # element.
                        if v[0] == 2:
                            vnew = (v[0], v[1] + k)
                        else:
                            vnew = (v[0], v[1] + k, v[2])
                        mutations[vnew] = mutations.get(vnew, 0) + 1

            # A read is correct if any match key lies within +/- errors of
            # its expected start.
            if any(starts[i] + j in m for j in range(-errors, errors + 1)):
                correct += 1
            else:
                incorrect += 1

        mutationsString = cPickle.dumps(mutations)
        sizes.append(sys.getsizeof(mutationsString))

        # NOTE(review): edits are applied in dict order and the positions of
        # later edits are not adjusted after an earlier insert/delete --
        # confirm this is intended.
        for k, v in mutations.items():
            if v >= threshold:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for idx, pos in enumerate(starts):
                        if pos >= k[1]:
                            starts[idx] = pos + 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for idx, pos in enumerate(starts):
                        if pos >= k[1]:
                            starts[idx] = pos - 1
                else:
                    print('Error: k[0] = ' + str(k[0]))

        numUnmatched = sum(unmatched)

    # Guard the two degenerate cases that used to raise: no reads at all
    # (division by zero) and no pass executed (empty ``sizes``).
    accuracy = float(correct) / numReads if numReads else 0.0
    firstSize = sizes[0] if sizes else 0
    return accuracy, firstSize
예제 #2
0
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    """Measure how often reads from a mutated genome map back correctly.

    A random reference over A/C/T/G terminated by '$' is indexed with
    ``bwt.constructFM``.  A copy is then mutated with random substitutions,
    insertions and deletions, reads are sliced from the mutated copy, and
    each read is matched against the *original* index with
    ``bwt.findApproximate``.

    Args:
        genomeLen: Number of bases in the reference (terminator excluded).
        numReads: Number of reads to sample.
        readLen: Length of each read slice.
        mutFreq: Mutations per reference base; ``round(mutFreq * genomeLen)``
            mutations are applied.
        errors: Forwarded to ``bwt.findApproximate``; also the +/- tolerance
            when comparing a match position with the read's original start.

    Returns:
        Fraction of reads with a match within ``errors`` of their original
        start position (0.0 when ``numReads`` is 0).  The same figure is
        printed.
    """
    bases = ['A', 'C', 'T', 'G']

    # The genome used to be built by prepending one base at a time
    # (quadratic).  Drawing in order and reversing yields the same sequence
    # for a given random seed.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index of the unmutated reference.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    # ``starts`` follows each read's position as the genome is mutated.
    starts = startsOrig[:]

    # Mutate a copy of the reference to obtain the sequenced genome.
    # NOTE(review): ``base`` can land on the trailing '$', so the terminator
    # itself may be substituted or deleted -- confirm whether that is wanted.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # Substitution: positions are unaffected.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before ``base``: everything at or after it moves up.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of ``base``: only positions strictly after it move
            # down.  A read starting exactly at ``base`` now starts at the
            # element that slid into that slot; using ``>=`` here would push
            # a start of 0 to -1 and corrupt the slice below.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the mutated genome.
    reads = [t2[s:s + readLen] for s in starts]

    # Match the reads against the index of the original reference.
    correct = 0
    incorrect = 0
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        if any(startsOrig[i] + j in m for j in range(-errors, errors + 1)):
            correct += 1
        else:
            incorrect += 1

    total = correct + incorrect
    accuracy = float(correct) / total if total else 0.0
    print('  Accuracy: ' + str(correct) + ' / ' + str(total) + ' = ' + str(accuracy))
    return accuracy
예제 #3
0
def iterativeUpdate(fm, b, alphabet, reads, starts, errors, maxIters, threshold=False, readLen=50, genomeLen=5000):
    """Iteratively match reads and write well-supported edits into the index.

    Every pass matches each still-unmatched read with
    ``bwt.findApproximate``, counts all edits reported by the matches, and
    applies those whose count reaches the pass threshold using
    ``bwt.insert``, ``bwt.substitute`` or ``bwt.delete``.  Passes stop once
    no read is unmatched, ``maxIters`` passes have run, or a pass matched
    10% or fewer of the reads that were unmatched going into it.  A progress
    line is printed after every pass.

    Args:
        fm, b, alphabet: Forwarded unchanged to the ``bwt`` helpers.
        reads: Sequence of reads; each is joined with ``''.join`` before
            matching.
        starts: List of expected match positions, one per read; modified in
            place when insertions or deletions are applied.
        errors: Forwarded to ``bwt.findApproximate``; also the +/- tolerance
            for the correctness check.
        maxIters: Maximum number of passes.
        threshold: Kept for interface compatibility.  The passed value is
            never read, because the threshold is recomputed from the
            unmatched-read count at the start of every pass.
        readLen: Read length used in the threshold formula.
        genomeLen: Genome length used in the threshold formula.

    Returns:
        Tuple ``(initialAcc, finalAcc)``: the fraction of all reads counted
        correct after the first pass and after the last pass.  Both are 0.0
        when ``reads`` is empty or no pass runs.
    """
    numReads = len(reads)
    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0

    initialAcc = 0.0
    correct = 0
    incorrect = 0
    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        # Deliberately rebinds the parameter, as the code always has.
        threshold = 0.5 * numUnmatched * readLen / genomeLen

        currIter += 1
        prevSize = numUnmatched

        # Edit tuple -> number of times it was reported during this pass.
        mutations = dict()

        for i in range(numReads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            for k, edits in m.items():
                for v in edits:
                    # Edit offsets are added to the match key k.  Type 2
                    # edits (applied via bwt.delete) have no third element.
                    if v[0] == 2:
                        vnew = (v[0], v[1] + k)
                    else:
                        vnew = (v[0], v[1] + k, v[2])
                    mutations[vnew] = mutations.get(vnew, 0) + 1

            if any(starts[i] + j in m for j in range(-errors, errors + 1)):
                correct += 1
            else:
                incorrect += 1

        # The first pass is the baseline accuracy before any index update.
        if currIter == 1:
            initialAcc = float(correct) / numReads

        # NOTE(review): edits are applied in dict order and later positions
        # are not adjusted after an earlier insert/delete -- confirm intended.
        for k, v in mutations.items():
            if v >= threshold:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for idx, pos in enumerate(starts):
                        if pos >= k[1]:
                            starts[idx] = pos + 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for idx, pos in enumerate(starts):
                        if pos >= k[1]:
                            starts[idx] = pos - 1
                else:
                    print('Error: k[0] = ' + str(k[0]))

        numUnmatched = sum(unmatched)
        print("    Iter " + str(currIter) + " - " + str(correct) + " correct, " + str(incorrect) + " incorrect, " + str(numReads - correct - incorrect) + ' unmatched')

    # An empty read set used to raise ZeroDivisionError here.
    finalAcc = float(correct) / numReads if numReads else 0.0
    return initialAcc, finalAcc
예제 #4
0
def iterativeEM(fm, b, alphabet, reads, starts, errors, maxIters, readLen=50, genomeLen=5000, prop=1):
    """Run the match / tally / update loop and report accuracy and tally size.

    In each pass every still-unmatched read is matched with
    ``bwt.findApproximate``.  Edits from reads whose index is below
    ``prop * len(reads)`` are tallied, and edits whose tally reaches the
    pass threshold are written into the index.  The loop ends when no read
    is unmatched, after ``maxIters`` passes, or when a pass matched 10% or
    fewer of the reads that were unmatched going into it.

    Args:
        fm, b, alphabet: Forwarded unchanged to the ``bwt`` helpers.
        reads: Sequence of reads; each is joined with ``"".join`` before
            matching.
        starts: List of expected match positions, one per read; modified in
            place when insertions or deletions are applied.
        errors: Forwarded to ``bwt.findApproximate``; also the +/- tolerance
            for the correctness check.
        maxIters: Maximum number of passes.
        readLen: Read length used in the threshold formula.
        genomeLen: Genome length used in the threshold formula.
        prop: Fraction of the reads (by index) whose edits are tallied.

    Returns:
        Tuple ``(accuracy, size)``: the fraction of all reads matched within
        ``errors`` of their expected start, and ``sys.getsizeof`` of the
        pickled tally from the first pass.  The size is 0 if no pass ran and
        the accuracy is 0.0 if ``reads`` is empty; both cases used to raise.
    """
    total = len(reads)
    pending = [1] * total
    numUnmatched = total
    prevSize = 2 * total
    currIter = 0

    sizes = []
    correct = 0
    incorrect = 0
    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        # Minimum tally an edit needs before it is written into the index.
        threshold = 0.25 * prop * numUnmatched * readLen / genomeLen

        currIter += 1
        prevSize = numUnmatched

        mutations = {}
        for i in range(total):
            if pending[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, "".join(reads[i]), errors)
            if len(m) == 0:
                continue
            pending[i] = 0

            if i < prop * total:
                _tallyEdits(m, mutations)

            if any(starts[i] + j in m for j in range(-errors, errors + 1)):
                correct += 1
            else:
                incorrect += 1

        sizes.append(sys.getsizeof(cPickle.dumps(mutations)))
        fm = _applyMutations(fm, b, alphabet, mutations, threshold, starts)
        numUnmatched = sum(pending)

    accuracy = float(correct) / total if total else 0.0
    firstSize = sizes[0] if sizes else 0
    return accuracy, firstSize


def _tallyEdits(m, mutations):
    """Add every edit in match table ``m`` to the ``mutations`` counts."""
    for k, edits in m.items():
        for v in edits:
            # Edit offsets are added to the match key k.  Type 2 edits
            # (applied via bwt.delete) have no third element.
            if v[0] == 2:
                key = (v[0], v[1] + k)
            else:
                key = (v[0], v[1] + k, v[2])
            mutations[key] = mutations.get(key, 0) + 1


def _shiftStarts(starts, pos, delta):
    """Add ``delta`` in place to every entry of ``starts`` that is >= ``pos``."""
    for idx, s in enumerate(starts):
        if s >= pos:
            starts[idx] = s + delta


def _applyMutations(fm, b, alphabet, mutations, threshold, starts):
    """Apply edits whose count reaches ``threshold``; return the new index."""
    # NOTE(review): edits are applied in dict order and later positions are
    # not adjusted after an earlier insert/delete -- confirm intended.
    for k, v in mutations.items():
        if v >= threshold:
            if k[0] == 1:
                fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                _shiftStarts(starts, k[1], 1)
            elif k[0] == 0:
                fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
            elif k[0] == 2:
                fm = bwt.delete(fm, b, alphabet, k[1])
                _shiftStarts(starts, k[1], -1)
            else:
                print("Error: k[0] = " + str(k[0]))
    return fm
예제 #5
0
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    """Simulate reads from a mutated genome and score their alignment.

    Builds a random '$'-terminated reference over A/C/T/G, indexes it with
    ``bwt.constructFM``, mutates a copy, slices reads out of the mutated
    copy and matches each one against the original index with
    ``bwt.findApproximate``.

    Args:
        genomeLen: Number of bases in the reference (terminator excluded).
        numReads: Number of reads to sample.
        readLen: Length of each read slice.
        mutFreq: Mutations per reference base; ``round(mutFreq * genomeLen)``
            mutations are applied.
        errors: Forwarded to ``bwt.findApproximate``; also the +/- tolerance
            when comparing a match position with the read's original start.

    Returns:
        Fraction of reads with a match within ``errors`` of their original
        start (0.0 when ``numReads`` is 0).  The same figure is printed.
    """
    nucleotides = ['A', 'C', 'T', 'G']

    # Draw in order, then reverse: the same genome the old prepend loop
    # produced for a given seed, without the quadratic list copying.
    drawn = [random.choice(nucleotides) for _ in range(genomeLen)]
    t = drawn[::-1] + ['$']

    # Construct the FM index of the unmutated reference.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]

    t2, starts = _mutateGenome(t, startsOrig,
                               int(round(mutFreq * genomeLen)))

    # Reads come from the mutated genome at the tracked positions.
    reads = [t2[s:s + readLen] for s in starts]

    # Each read is scored against its position in the original reference.
    correct = 0
    incorrect = 0
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        if any(startsOrig[i] + j in m for j in range(-errors, errors + 1)):
            correct += 1
        else:
            incorrect += 1

    total = correct + incorrect
    # Avoid ZeroDivisionError when no reads were requested.
    accuracy = float(correct) / total if total else 0.0
    print('  Accuracy: ' + str(correct) + ' / ' + str(total) + ' = ' + str(accuracy))
    return accuracy


def _mutateGenome(t, starts, numMutations):
    """Return a mutated copy of ``t`` and a correspondingly shifted ``starts``."""
    nucleotides = ['A', 'C', 'T', 'G']
    t2 = t[:]
    shifted = starts[:]
    # NOTE(review): ``base`` can land on the trailing '$', so the terminator
    # itself may be substituted or deleted -- confirm whether that is wanted.
    for _ in range(numMutations):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # Substitution: positions are unaffected.
            t2[base] = random.choice(nucleotides)
        elif mutType == 1:
            # Insertion before ``base``: everything at or after it moves up.
            t2.insert(base, random.choice(nucleotides))
            shifted = [s + 1 if s >= base else s for s in shifted]
        else:
            # Deletion of ``base``: only positions strictly after it move
            # down.  With ``>=`` a read starting at the deleted base drifted
            # left, and a start of 0 became -1.
            del t2[base]
            shifted = [s - 1 if s > base else s for s in shifted]
    return t2, shifted
예제 #6
0
def iterativeEMDist(fm, b, alphabet, reads, starts, errors, maxIters, readLen=50, genomeLen=5000, depth=1, chunkSize=1):
    """Coverage-capped variant of the iterative match / update loop.

    The genome is divided into ``ceil(genomeLen / chunkSize)`` chunks.
    Within a pass, ``coverage[c]`` accumulates how many read positions from
    accepted matches overlap chunk ``c``.  An edit is tallied only while its
    chunk's coverage is still below ``depth * chunkSize``.  Edits whose
    tally reaches ``0.25 * depth`` are applied with ``bwt.insert``,
    ``bwt.substitute`` or ``bwt.delete``.  Passes stop once no read is
    unmatched, ``maxIters`` passes have run, or a pass matched 10% or fewer
    of the reads that were unmatched going into it.

    Args:
        fm, b, alphabet: Forwarded unchanged to the ``bwt`` helpers.
        reads: Sequence of reads; each is joined with ``''.join`` before
            matching.
        starts: List of expected match positions, one per read; modified in
            place when insertions or deletions are applied.
        errors: Forwarded to ``bwt.findApproximate``; also the +/- tolerance
            for the correctness check.
        maxIters: Maximum number of passes.
        readLen: Read length used for the coverage bookkeeping.
        genomeLen: Genome length used to size the coverage table.
        depth: Per-position coverage cap; also scales the tally threshold.
        chunkSize: Number of positions per coverage chunk.

    Returns:
        Tuple ``(accuracy, size)``: the fraction of all reads matched within
        ``errors`` of their expected start, and the first pass's
        ``sys.getsizeof`` of the pickled tally plus ``sys.getsizeof`` of the
        coverage list.  The size is 0 if no pass ran and the accuracy is 0.0
        if ``reads`` is empty; both cases used to raise.
    """
    numReads = len(reads)
    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0

    numChunks = int(math.ceil(float(genomeLen) / chunkSize))
    # Both values are loop-invariant, so compute them once.
    threshold = 0.25 * depth
    chunkCap = depth * chunkSize

    sizes = []
    correct = 0
    incorrect = 0
    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        coverage = [0] * numChunks

        currIter += 1
        prevSize = numUnmatched

        # Edit tuple -> number of times it was reported during this pass.
        mutations = dict()

        for i in range(numReads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            for k, edits in m.items():
                # Tally edits that fall in chunks not yet covered to depth.
                for v in edits:
                    # Explicit floor division keeps the chunk index an int.
                    chunk = (v[1] + k) // chunkSize
                    if coverage[chunk] < chunkCap:
                        if v[0] == 2:
                            vnew = (v[0], v[1] + k)
                        else:
                            vnew = (v[0], v[1] + k, v[2])
                        mutations[vnew] = mutations.get(vnew, 0) + 1

                # Credit the span [k, k + readLen) to every chunk it
                # overlaps; ``chunk`` is 1-based in this section.
                readEnd = k + readLen
                chunk = k // chunkSize + 1
                while chunk * chunkSize < readEnd:
                    coverage[chunk - 1] += chunk * chunkSize - max((chunk - 1) * chunkSize, k)
                    chunk += 1
                coverage[chunk - 1] += min(chunk * chunkSize, readEnd) - max((chunk - 1) * chunkSize, k)

            # A read is correct if any match key lies within +/- errors of
            # its expected start.
            if any(starts[i] + j in m for j in range(-errors, errors + 1)):
                correct += 1
            else:
                incorrect += 1

        mutationsString = cPickle.dumps(mutations)
        sizes.append(sys.getsizeof(mutationsString) + sys.getsizeof(coverage))

        # NOTE(review): edits are applied in dict order and later positions
        # are not adjusted after an earlier insert/delete -- confirm intended.
        for k, v in mutations.items():
            if v >= threshold:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for idx, pos in enumerate(starts):
                        if pos >= k[1]:
                            starts[idx] = pos + 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for idx, pos in enumerate(starts):
                        if pos >= k[1]:
                            starts[idx] = pos - 1
                else:
                    print('Error: k[0] = ' + str(k[0]))

        numUnmatched = sum(unmatched)

    accuracy = float(correct) / numReads if numReads else 0.0
    firstSize = sizes[0] if sizes else 0
    return accuracy, firstSize