def iterativeEM(fm, b, alphabet, reads, starts, errors, maxIters, readLen=50, genomeLen=5000, prop=1):
    """Align reads to the FM index in passes, folding well-supported edits back in.

    On every pass each still-unmatched read is aligned with
    ``bwt.findApproximate``.  Edits reported for reads whose index is below
    ``prop * len(reads)`` are tallied, and every edit whose tally reaches the
    pass threshold is applied to the index.  Passes stop when no read is left
    unmatched, ``maxIters`` passes have run, or the last pass matched 10% or
    less of the reads that were unmatched before it.

    Args:
        fm, b, alphabet: passed through unchanged to the ``bwt`` helpers.
        reads: sequence of reads, each an iterable of one-character strings.
        starts: expected match position of each read.  Modified in place:
            entries at or after an applied insertion/deletion are shifted.
        errors: error budget handed to ``bwt.findApproximate``; also the +/-
            tolerance used when comparing a match with ``starts``.
        maxIters: maximum number of passes.
        readLen, genomeLen: only used to scale the support threshold.
        prop: fraction of the reads (taken from the front of ``reads``) that
            may contribute edits.

    Returns:
        ``(accuracy, firstSize)``: the fraction of all reads that matched
        within ``errors`` of their expected start, and ``sys.getsizeof`` of
        the pickled edit tally of the first pass.  ``firstSize`` is 0 when no
        pass ran, and ``(0.0, 0)`` is returned for an empty ``reads``.
    """
    numReads = len(reads)
    if numReads == 0:
        # Nothing to align; avoids dividing by zero below.
        return 0.0, 0

    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0
    sizes = []
    correct = 0
    incorrect = 0
    tolerance = range(-errors, errors + 1)
    contributors = prop * numReads

    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        threshold = 0.25 * prop * numUnmatched * readLen / genomeLen
        currIter += 1
        prevSize = numUnmatched

        # Match all unmatched reads to the genome and collect their edits.
        mutations = {}
        for i, read in enumerate(reads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, ''.join(read), errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            if i < contributors:
                for k, edits in m.items():
                    for v in edits:
                        # Re-key the edit by match key + edit offset.
                        # Type 2 (delete) carries no character.
                        if v[0] == 2:
                            vnew = (v[0], v[1] + k)
                        else:
                            vnew = (v[0], v[1] + k, v[2])
                        mutations[vnew] = mutations.get(vnew, 0) + 1

            # A matched read is correct if any hit lies within the tolerance
            # window around its expected start.
            if any(starts[i] + j in m for j in tolerance):
                correct += 1
            else:
                incorrect += 1

        sizes.append(sys.getsizeof(cPickle.dumps(mutations)))

        # Apply sufficiently supported edits to the FM index.
        # NOTE(review): edits are applied in dict iteration order and each
        # insert/delete shifts the index, but the positions of the remaining
        # edits are not re-adjusted -- confirm this is intended.
        for k, v in mutations.items():
            if v >= threshold:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] += 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] -= 1
                else:
                    print('Error: k[0] = ' + str(k[0]))

        numUnmatched = sum(unmatched)

    # sizes is empty when the loop never ran (e.g. maxIters <= 0).
    return float(correct) / numReads, (sizes[0] if sizes else 0)
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    """Measure how often reads from a mutated genome map back to their origin.

    Builds a random reference terminated by ``'$'``, indexes it with
    ``bwt.constructFM``, derives a second genome by applying
    ``round(mutFreq * genomeLen)`` random substitutions, insertions and
    deletions, cuts ``numReads`` reads of up to ``readLen`` characters from
    the second genome and aligns each against the reference index with
    ``bwt.findApproximate``.

    Args:
        genomeLen: number of bases in the reference (terminator excluded).
        numReads: number of reads to draw.
        readLen: length of each read slice.
        mutFreq: mutations per reference base.
        errors: error budget handed to ``bwt.findApproximate``; also the +/-
            tolerance used when comparing a match with the original start.

    Returns:
        Fraction of reads with a match within ``errors`` of the position the
        read was drawn from in the reference; 0.0 when ``numReads`` is 0.
        The same figure is also printed.
    """
    bases = ['A', 'C', 'T', 'G']

    # Initialize the reference genome.  Each draw used to be prepended one at
    # a time (quadratic); reversing afterwards yields the identical sequence
    # for a given random state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome, keeping the read
    # start positions aligned with it.
    # NOTE(review): base may be the index of the trailing '$', so the
    # terminator itself can be substituted or deleted in t2 -- confirm intended.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion; clamp at 0 so deleting base 0 cannot push a start to
            # -1, which would slice an empty read.
            del t2[base]
            starts = [max(s - 1, 0) if s >= base else s for s in starts]

    # Generate reads from the new genome.
    reads = [t2[s:s + readLen] for s in starts]

    # Match reads against the reference index.
    correct = 0
    incorrect = 0
    tolerance = range(-errors, errors + 1)
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        if any(startsOrig[i] + j in m for j in tolerance):
            correct += 1
        else:
            incorrect += 1

    total = correct + incorrect
    accuracy = float(correct) / total if total else 0.0
    print(' Accuracy: ' + str(correct) + ' / ' + str(total) + ' = ' + str(accuracy))
    return accuracy
def iterativeUpdate(fm, b, alphabet, reads, starts, errors, maxIters, threshold=False, readLen=50, genomeLen=5000):
    """Align reads to the FM index in passes and apply well-supported edits.

    On every pass each still-unmatched read is aligned with
    ``bwt.findApproximate``, the edits of all matched reads are tallied, and
    every edit whose tally reaches the pass threshold is applied to the
    index.  Passes stop when no read is left unmatched, ``maxIters`` passes
    have run, or the last pass matched 10% or less of the reads that were
    unmatched before it.  A progress line is printed after each pass.

    Args:
        fm, b, alphabet: passed through unchanged to the ``bwt`` helpers.
        reads: sequence of reads, each an iterable of one-character strings.
        starts: expected match position of each read.  Modified in place:
            entries at or after an applied insertion/deletion are shifted.
        errors: error budget handed to ``bwt.findApproximate``; also the +/-
            tolerance used when comparing a match with ``starts``.
        maxIters: maximum number of passes.
        threshold: accepted for compatibility but ignored; the support
            threshold is recomputed from the unmatched count on every pass.
        readLen, genomeLen: only used to scale the support threshold.

    Returns:
        ``(initialAcc, finalAcc)``: the fraction of all reads correctly
        placed after the first pass and after the last pass.  Both are 0.0
        when ``reads`` is empty or no pass ran.
    """
    numReads = len(reads)
    if numReads == 0:
        # Nothing to align; avoids dividing by zero below.
        return 0.0, 0.0

    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0
    initialAcc = 0.0
    correct = 0
    incorrect = 0
    tolerance = range(-errors, errors + 1)

    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        minSupport = 0.5 * numUnmatched * readLen / genomeLen
        currIter += 1
        prevSize = numUnmatched

        # Match all unmatched reads to the genome and collect their edits.
        mutations = {}
        for i, read in enumerate(reads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, ''.join(read), errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            for k, edits in m.items():
                for v in edits:
                    # Re-key the edit by match key + edit offset.
                    # Type 2 (delete) carries no character.
                    if v[0] == 2:
                        vnew = (v[0], v[1] + k)
                    else:
                        vnew = (v[0], v[1] + k, v[2])
                    mutations[vnew] = mutations.get(vnew, 0) + 1

            # A matched read is correct if any hit lies within the tolerance
            # window around its expected start.
            if any(starts[i] + j in m for j in tolerance):
                correct += 1
            else:
                incorrect += 1

        if currIter == 1:
            # Accuracy before any edit has been applied to the index.
            initialAcc = float(correct) / numReads

        # Apply sufficiently supported edits to the FM index.
        # NOTE(review): edits are applied in dict iteration order and each
        # insert/delete shifts the index, but the positions of the remaining
        # edits are not re-adjusted -- confirm this is intended.
        for k, v in mutations.items():
            if v >= minSupport:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] += 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] -= 1
                else:
                    print('Error: k[0] = ' + str(k[0]))

        numUnmatched = sum(unmatched)
        print(" Iter " + str(currIter) + " - " + str(correct) + " correct, " + str(incorrect) + " incorrect, " + str(numReads - correct - incorrect) + ' unmatched')

    return initialAcc, float(correct) / numReads
def iterativeEM(fm, b, alphabet, reads, starts, errors, maxIters, readLen=50, genomeLen=5000, prop=1):
    """Align reads to the FM index in passes, folding well-supported edits back in.

    On every pass each still-unmatched read is aligned with
    ``bwt.findApproximate``.  Edits reported for reads whose index is below
    ``prop * len(reads)`` are tallied, and every edit whose tally reaches the
    pass threshold is applied to the index.  Passes stop when no read is left
    unmatched, ``maxIters`` passes have run, or the last pass matched 10% or
    less of the reads that were unmatched before it.

    Args:
        fm, b, alphabet: passed through unchanged to the ``bwt`` helpers.
        reads: sequence of reads, each an iterable of one-character strings.
        starts: expected match position of each read.  Modified in place:
            entries at or after an applied insertion/deletion are shifted.
        errors: error budget handed to ``bwt.findApproximate``; also the +/-
            tolerance used when comparing a match with ``starts``.
        maxIters: maximum number of passes.
        readLen, genomeLen: only used to scale the support threshold.
        prop: fraction of the reads (taken from the front of ``reads``) that
            may contribute edits.

    Returns:
        ``(accuracy, firstSize)``: the fraction of all reads that matched
        within ``errors`` of their expected start, and ``sys.getsizeof`` of
        the pickled edit tally of the first pass.  ``firstSize`` is 0 when no
        pass ran, and ``(0.0, 0)`` is returned for an empty ``reads``.
    """
    numReads = len(reads)
    if numReads == 0:
        # Nothing to align; avoids dividing by zero below.
        return 0.0, 0

    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0
    sizes = []
    correct = 0
    incorrect = 0
    tolerance = range(-errors, errors + 1)
    contributors = prop * numReads

    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        threshold = 0.25 * prop * numUnmatched * readLen / genomeLen
        currIter += 1
        prevSize = numUnmatched

        # Match all unmatched reads to the genome and collect their edits.
        mutations = {}
        for i, read in enumerate(reads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, "".join(read), errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            if i < contributors:
                for k, edits in m.items():
                    for v in edits:
                        # Re-key the edit by match key + edit offset.
                        # Type 2 (delete) carries no character.
                        if v[0] == 2:
                            vnew = (v[0], v[1] + k)
                        else:
                            vnew = (v[0], v[1] + k, v[2])
                        mutations[vnew] = mutations.get(vnew, 0) + 1

            # A matched read is correct if any hit lies within the tolerance
            # window around its expected start.
            if any(starts[i] + j in m for j in tolerance):
                correct += 1
            else:
                incorrect += 1

        sizes.append(sys.getsizeof(cPickle.dumps(mutations)))

        # Apply sufficiently supported edits to the FM index.
        # NOTE(review): edits are applied in dict iteration order and each
        # insert/delete shifts the index, but the positions of the remaining
        # edits are not re-adjusted -- confirm this is intended.
        for k, v in mutations.items():
            if v >= threshold:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] += 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] -= 1
                else:
                    print("Error: k[0] = " + str(k[0]))

        numUnmatched = sum(unmatched)

    # sizes is empty when the loop never ran (e.g. maxIters <= 0).
    return float(correct) / numReads, (sizes[0] if sizes else 0)
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    """Measure how often reads from a mutated genome map back to their origin.

    Builds a random reference terminated by ``'$'``, indexes it with
    ``bwt.constructFM``, derives a second genome by applying
    ``round(mutFreq * genomeLen)`` random substitutions, insertions and
    deletions, cuts ``numReads`` reads of up to ``readLen`` characters from
    the second genome and aligns each against the reference index with
    ``bwt.findApproximate``.

    Args:
        genomeLen: number of bases in the reference (terminator excluded).
        numReads: number of reads to draw.
        readLen: length of each read slice.
        mutFreq: mutations per reference base.
        errors: error budget handed to ``bwt.findApproximate``; also the +/-
            tolerance used when comparing a match with the original start.

    Returns:
        Fraction of reads with a match within ``errors`` of the position the
        read was drawn from in the reference; 0.0 when ``numReads`` is 0.
        The same figure is also printed.
    """
    bases = ['A', 'C', 'T', 'G']

    # Initialize the reference genome.  Each draw used to be prepended one at
    # a time (quadratic); reversing afterwards yields the identical sequence
    # for a given random state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome, keeping the read
    # start positions aligned with it.
    # NOTE(review): base may be the index of the trailing '$', so the
    # terminator itself can be substituted or deleted in t2 -- confirm intended.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion; clamp at 0 so deleting base 0 cannot push a start to
            # -1, which would slice an empty read.
            del t2[base]
            starts = [max(s - 1, 0) if s >= base else s for s in starts]

    # Generate reads from the new genome.
    reads = [t2[s:s + readLen] for s in starts]

    # Match reads against the reference index.
    correct = 0
    incorrect = 0
    tolerance = range(-errors, errors + 1)
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        if any(startsOrig[i] + j in m for j in tolerance):
            correct += 1
        else:
            incorrect += 1

    total = correct + incorrect
    accuracy = float(correct) / total if total else 0.0
    print(' Accuracy: ' + str(correct) + ' / ' + str(total) + ' = ' + str(accuracy))
    return accuracy
def iterativeEMDist(fm, b, alphabet, reads, starts, errors, maxIters, readLen=50, genomeLen=5000, depth=1, chunkSize=1):
    """Iterative read alignment that caps how many edits each genome chunk contributes.

    Works in passes like ``iterativeEM``: each still-unmatched read is
    aligned with ``bwt.findApproximate``, edits are tallied, and edits whose
    tally reaches ``0.25 * depth`` are applied to the index.  The genome is
    split into chunks of ``chunkSize`` positions; an edit is only tallied
    while the coverage accumulated for its chunk in the current pass is below
    ``depth * chunkSize``.  Passes stop when no read is left unmatched,
    ``maxIters`` passes have run, or the last pass matched 10% or less of the
    reads that were unmatched before it.

    Args:
        fm, b, alphabet: passed through unchanged to the ``bwt`` helpers.
        reads: sequence of reads, each an iterable of one-character strings.
        starts: expected match position of each read.  Modified in place:
            entries at or after an applied insertion/deletion are shifted.
        errors: error budget handed to ``bwt.findApproximate``; also the +/-
            tolerance used when comparing a match with ``starts``.
        maxIters: maximum number of passes.
        readLen: number of positions each match adds to the coverage.
        genomeLen: determines the number of coverage chunks.
        depth: per-position coverage cap; also scales the support threshold.
        chunkSize: number of genome positions per coverage chunk.

    Returns:
        ``(accuracy, firstSize)``: the fraction of all reads that matched
        within ``errors`` of their expected start, and the first pass's
        ``sys.getsizeof`` of the pickled edit tally plus ``sys.getsizeof`` of
        the coverage list.  ``firstSize`` is 0 when no pass ran, and
        ``(0.0, 0)`` is returned for an empty ``reads``.
    """
    numReads = len(reads)
    if numReads == 0:
        # Nothing to align; avoids dividing by zero below.
        return 0.0, 0

    unmatched = [1] * numReads
    numUnmatched = numReads
    prevSize = 2 * numReads
    currIter = 0
    numChunks = int(math.ceil(float(genomeLen) / chunkSize))
    sizes = []
    correct = 0
    incorrect = 0
    tolerance = range(-errors, errors + 1)
    threshold = 0.25 * depth
    chunkCap = depth * chunkSize

    while (numUnmatched > 0 and currIter < maxIters
           and float(prevSize - numUnmatched) / prevSize > 0.1):
        # Coverage is tracked per pass, not across passes.
        coverage = [0] * numChunks
        currIter += 1
        prevSize = numUnmatched

        # Match all unmatched reads to the genome and collect their edits.
        # NOTE(review): coverage has ceil(genomeLen / chunkSize) entries; a
        # match or edit position at or beyond genomeLen would index past its
        # end -- confirm genomeLen bounds what bwt.findApproximate returns.
        mutations = {}
        for i, read in enumerate(reads):
            if unmatched[i] != 1:
                continue
            m = bwt.findApproximate(fm, b, alphabet, ''.join(read), errors)
            if len(m) == 0:
                continue
            unmatched[i] = 0

            for k, edits in m.items():
                # Tally edits that fall in chunks still below the cap.
                for v in edits:
                    chunk = (v[1] + k) // chunkSize
                    if coverage[chunk] < chunkCap:
                        # Type 2 (delete) carries no character.
                        if v[0] == 2:
                            vnew = (v[0], v[1] + k)
                        else:
                            vnew = (v[0], v[1] + k, v[2])
                        mutations[vnew] = mutations.get(vnew, 0) + 1

                # Add the overlap of [k, k + readLen) with each chunk it
                # touches; ``chunk`` is 1-based here, hence ``chunk - 1``.
                chunk = k // chunkSize + 1
                while chunk * chunkSize < k + readLen:
                    coverage[chunk - 1] += chunk * chunkSize - max((chunk - 1) * chunkSize, k)
                    chunk += 1
                coverage[chunk - 1] += min(chunk * chunkSize, k + readLen) - max((chunk - 1) * chunkSize, k)

            # A matched read is correct if any hit lies within the tolerance
            # window around its expected start.
            if any(starts[i] + j in m for j in tolerance):
                correct += 1
            else:
                incorrect += 1

        mutationsString = cPickle.dumps(mutations)
        sizes.append(sys.getsizeof(mutationsString) + sys.getsizeof(coverage))

        # Apply sufficiently supported edits to the FM index.
        # NOTE(review): edits are applied in dict iteration order and each
        # insert/delete shifts the index, but the positions of the remaining
        # edits are not re-adjusted -- confirm this is intended.
        for k, v in mutations.items():
            if v >= threshold:
                if k[0] == 1:
                    fm = bwt.insert(fm, b, alphabet, k[1], k[2])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] += 1
                elif k[0] == 0:
                    fm = bwt.substitute(fm, b, alphabet, k[1], k[2])
                elif k[0] == 2:
                    fm = bwt.delete(fm, b, alphabet, k[1])
                    for i in range(len(starts)):
                        if starts[i] >= k[1]:
                            starts[i] -= 1
                else:
                    print('Error: k[0] = ' + str(k[0]))

        numUnmatched = sum(unmatched)

    # sizes is empty when the loop never ran (e.g. maxIters <= 0).
    return float(correct) / numReads, (sizes[0] if sizes else 0)