Esempio n. 1
0
def testIter(genomeLen, numReads, readLen, mutFreq, errors, errorFreq, prop=1):
    """Simulate noisy reads from a mutated genome and run the iterative update.

    A random reference of ``genomeLen`` bases followed by '$' is indexed with
    ``bwt.constructFM``. A copy of the reference receives
    ``round(mutFreq * genomeLen)`` random edits (substitution, insertion or
    deletion, chosen uniformly). ``numReads`` reads of ``readLen`` bases are
    cut from the mutated copy, and each read base is redrawn at random with
    probability ``errorFreq`` (the redraw may pick the same base again).

    Returns whatever ``iu.iterativeUpdateError`` returns when given the index,
    the reads and the reads' start positions in the *original* reference;
    ``errors`` and ``prop`` are forwarded to it unchanged.
    """
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Start of each read in the reference; `starts` follows those positions
    # through the insertions and deletions applied below.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index
            # (the next base slides into it); shifting it as well could push
            # a start to -1.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome, with per-base sequencing errors.
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        for j in range(readLen):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    # NOTE(review): the meaning of the positional constants 5 and True is
    # defined by iu.iterativeUpdateError, which is not visible here.
    return iu.iterativeUpdateError(fm, b, alphabet, reads, startsOrig, errors,
                                   5, True, readLen, genomeLen, prop)
def testIter(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    """Simulate noisy reads from a mutated genome and run the iterative update.

    A random reference of ``genomeLen`` bases followed by '$' is indexed with
    ``bwt.constructFM``. A copy of the reference receives
    ``round(mutFreq * genomeLen)`` random edits (substitution, insertion or
    deletion, chosen uniformly), each at an index drawn from
    ``[2*readLen, len(t2) - 2*readLen]``. ``numReads`` reads of ``readLen``
    bases are cut from the mutated copy, and each read base is redrawn at
    random with probability ``errorFreq``.

    Returns whatever ``iu.iterativeUpdateError`` returns when given the index,
    the reads and the reads' start positions in the *original* reference.

    Raises ValueError (from ``random.randint``) when a mutation is requested
    and the mutation window is empty, i.e. the genome is shorter than about
    four read lengths.
    """
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Start of each read in the reference; `starts` follows those positions
    # through the insertions and deletions applied below.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        # Mutations are kept two read lengths away from both ends.
        base = random.randint(2 * readLen, len(t2) - 2 * readLen)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index,
            # because the next base slides into it.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome, with per-base sequencing errors.
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        for j in range(readLen):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    # NOTE(review): the meaning of the positional constants 5 and True is
    # defined by iu.iterativeUpdateError, which is not visible here.
    return iu.iterativeUpdateError(fm, b, alphabet, reads, startsOrig, errors,
                                   5, True, readLen, genomeLen)
Esempio n. 3
0
def iterEM(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    """Compare five iterative-EM configurations on one simulated data set.

    A random reference of ``genomeLen`` bases followed by '$' is indexed with
    ``bwt.constructFM``. A copy of the reference receives
    ``round(mutFreq * genomeLen)`` random edits (substitution, insertion or
    deletion, chosen uniformly). ``numReads`` reads of ``readLen`` bases are
    cut from the mutated copy, and each read base is redrawn at random with
    probability ``errorFreq``.

    The same reads are then processed by ``iterativeEM.iterativeEM`` (last
    argument 1 and 0.1) and by ``iterativeEMDist.iterativeEMDist`` (last two
    arguments (1, 1), (2, 1) and (2, 50)), each on its own deep copy of the
    index. The last argument of ``iterativeEM`` is presumably the proportion
    of reads allowed to contribute to mutations -- confirm against that
    module.

    Returns a pair of 5-tuples ``(accuracies, sizes)``, holding the first and
    second value returned by each run, in the order listed above.
    """
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Start of each read in the reference; `starts` follows those positions
    # through the insertions and deletions applied below.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index
            # (the next base slides into it); shifting it as well could push
            # a start to -1.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome, with per-base sequencing errors.
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        for j in range(readLen):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    def freshInputs():
        # Every run gets its own deep copy of the index and its own outer
        # lists. The individual reads are shared between runs (shallow copy).
        return copy.deepcopy(fm), reads[:], startsOrig[:]

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyOrig, sizeOrig = iterativeEM.iterativeEM(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyRed, sizeRed = iterativeEM.iterativeEM(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 0.1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyDist1, sizeDist1 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 1, 1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyDist2, sizeDist2 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 2, 1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyDistChunk2, sizeDistChunk2 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 2, 50)

    accuracies = (accuracyOrig, accuracyRed, accuracyDist1, accuracyDist2,
                  accuracyDistChunk2)
    sizes = (sizeOrig, sizeRed, sizeDist1, sizeDist2, sizeDistChunk2)
    return accuracies, sizes
Esempio n. 4
0
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    """Measure how often reads from a mutated genome map back correctly.

    A random reference of ``genomeLen`` bases followed by '$' is indexed with
    ``bwt.constructFM``. A copy of the reference receives
    ``round(mutFreq * genomeLen)`` random edits (substitution, insertion or
    deletion, chosen uniformly), and ``numReads`` reads of ``readLen`` bases
    are cut from the mutated copy. Each read is searched in the index of the
    *original* reference with ``bwt.findApproximate``; it counts as correct
    when one of the reported positions lies within ``errors`` of the read's
    original start.

    Prints the accuracy and returns it as a float in [0, 1].
    Raises ZeroDivisionError if ``numReads`` is 0.
    """
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Start of each read in the reference; `starts` follows those positions
    # through the insertions and deletions applied below.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index
            # (the next base slides into it); shifting it as well could push
            # a start to -1.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome.
    reads = [t2[start:start + readLen] for start in starts]

    # Match the reads against the index of the original reference.
    correct = 0
    incorrect = 0
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        positions = m.keys()
        # Indels before the read may have moved it by a few bases, so any hit
        # within `errors` of the original start is accepted.
        if any(startsOrig[i] + j in positions
               for j in range(-errors, errors + 1)):
            correct += 1
        else:
            incorrect += 1

    accuracy = float(correct) / (incorrect + correct)
    # Single parenthesised argument: prints the same under Python 2 and 3.
    print('  Accuracy: ' + str(correct) + ' / ' + str(correct + incorrect)
          + ' = ' + str(accuracy))
    return accuracy
Esempio n. 5
0
#!/usr/bin/env python3
import bwt
import random

# Timing script: build an FM index over a random 10000-base text and call
# bwt.insert (with timing=True) at increasingly deep positions.
alpha = ['A', 'C', 'G', 'T']
# NOTE(review): the '$' sentinel is placed at the front of the text here --
# confirm that this is what bwt.constructFM expects.
t = ['$'] + [random.choice(alpha) for _ in range(10000)]
alphabet = alpha + ['$']

b = 5
# Presumably the column header for the timing output of bwt.insert.
print('OldRow, First, Checkpt, NewRow, SA, Checkpt, Reord')

# Series 1: insert a random base at each position.
fm = bwt.constructFM(t, b, alphabet)
for pos in (1, 5, 10, 50, 100, 500, 1000, 5000):
    bwt.insert(fm, b, alphabet, pos, random.choice(alpha), timing=True)
print()

# Series 2: fresh index, always insert 'A', with one extra position (10000).
fm = bwt.constructFM(t, b, alphabet)
for pos in (1, 5, 10, 50, 100, 500, 1000, 5000, 10000):
    bwt.insert(fm, b, alphabet, pos, 'A', timing=True)
print()
Esempio n. 6
0
def iterEM(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    """Compare five iterative-EM configurations on one simulated data set.

    A random reference of ``genomeLen`` bases followed by '$' is indexed with
    ``bwt.constructFM``. A copy of the reference receives
    ``round(mutFreq * genomeLen)`` random edits (substitution, insertion or
    deletion, chosen uniformly). ``numReads`` reads of ``readLen`` bases are
    cut from the mutated copy, and each read base is redrawn at random with
    probability ``errorFreq``.

    The same reads are then processed by ``iterativeEM.iterativeEM`` (last
    argument 1 and 0.1) and by ``iterativeEMDist.iterativeEMDist`` (last two
    arguments (1, 1), (2, 1) and (2, 50)), each on its own deep copy of the
    index. The last argument of ``iterativeEM`` is presumably the proportion
    of reads allowed to contribute to mutations -- confirm against that
    module.

    Returns a pair of 5-tuples ``(accuracies, sizes)``, holding the first and
    second value returned by each run, in the order listed above.
    """
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Start of each read in the reference; `starts` follows those positions
    # through the insertions and deletions applied below.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index
            # (the next base slides into it); shifting it as well could push
            # a start to -1.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome, with per-base sequencing errors.
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        for j in range(readLen):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    def freshInputs():
        # Every run gets its own deep copy of the index and its own outer
        # lists. The individual reads are shared between runs (shallow copy).
        return copy.deepcopy(fm), reads[:], startsOrig[:]

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyOrig, sizeOrig = iterativeEM.iterativeEM(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyRed, sizeRed = iterativeEM.iterativeEM(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 0.1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyDist1, sizeDist1 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 1, 1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyDist2, sizeDist2 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 2, 1)

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyDistChunk2, sizeDistChunk2 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen,
        genomeLen, 2, 50)

    accuracies = (accuracyOrig, accuracyRed, accuracyDist1, accuracyDist2,
                  accuracyDistChunk2)
    sizes = (sizeOrig, sizeRed, sizeDist1, sizeDist2, sizeDistChunk2)
    return accuracies, sizes
Esempio n. 7
0
def iterEM(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    """Compare four iterative-EM configurations on unevenly covered reads.

    A random reference of ``genomeLen`` bases followed by "$" is indexed with
    ``bwt.constructFM``. Read starts are drawn so that the first half of the
    genome is sampled about twice as often as the second half. A copy of the
    reference receives ``round(mutFreq * genomeLen)`` random edits
    (substitution, insertion or deletion, chosen uniformly). ``numReads``
    reads of ``readLen`` bases are cut from the mutated copy, and each read
    base is redrawn at random with probability ``errorFreq``.

    The same reads are then processed by ``iterativeEM.iterativeEM`` (last
    argument 1 and 0.1) and by ``iterativeEMDist.iterativeEMDist`` (last two
    arguments (2, 50) and (2, 100)), each on its own deep copy of the index.
    The last argument of ``iterativeEM`` is presumably the proportion of
    reads allowed to contribute to mutations -- confirm against that module.

    Returns a pair of 4-tuples ``(accuracies, sizes)``, holding the first and
    second value returned by each run, in the order listed above.
    """
    bases = ["A", "C", "T", "G"]

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append("$")

    # Construct the FM index.
    alphabet = ["$", "A", "C", "G", "T"]
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Read starts are drawn from a range 1.5 times as long as the valid one
    # and anything beyond the midpoint is folded back by half a genome, so
    # the first half is hit from two sub-ranges and the second from one.
    # Floor division keeps the arithmetic integral on Python 2 and 3 alike.
    half = genomeLen // 2
    upper = int(round(1.5 * (genomeLen - readLen - 1)))
    startsOrig = []
    for _ in range(numReads):
        start = random.randint(0, upper)
        if start > half:
            start -= half
        startsOrig.append(start)
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index
            # (the next base slides into it); shifting it as well could push
            # a start to -1.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome, with per-base sequencing errors.
    reads = []
    for start in starts:
        read = t2[start : start + readLen]
        for j in range(readLen):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    def freshInputs():
        # Every run gets its own deep copy of the index and its own outer
        # lists. The individual reads are shared between runs (shallow copy).
        return copy.deepcopy(fm), reads[:], startsOrig[:]

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyOrig, sizeOrig = iterativeEM.iterativeEM(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen, genomeLen, 1
    )

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyRed, sizeRed = iterativeEM.iterativeEM(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen, genomeLen, 0.1
    )

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyChunk50, sizeChunk50 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen, genomeLen, 2, 50
    )

    tempFM, tempReads, tempStarts = freshInputs()
    accuracyChunk100, sizeChunk100 = iterativeEMDist.iterativeEMDist(
        tempFM, b, alphabet, tempReads, tempStarts, errors, 5, readLen, genomeLen, 2, 100
    )

    return (
        (accuracyOrig, accuracyRed, accuracyChunk50, accuracyChunk100),
        (sizeOrig, sizeRed, sizeChunk50, sizeChunk100),
    )
Esempio n. 8
0
    length = lengths[i]
    numRuns = runLens[i]
    for n in xrange(numRuns):
    
        # Generate a long random string of random length
        t = ['$']
        for i in range(length):
            t = [random.choice(['A', 'C', 'T', 'G'])] + t

        alphabet = ['$', 'A', 'C', 'G', 'T']
        b = 50

        # Construct the fm index
        startBuild = time.time()
        fm = bwt.constructFM(t, b, alphabet)
        buildTime += time.time() - startBuild

        letters = set(t)
        letters.remove('$')
    
        # Substitution of a character
        subId = random.randint(0,length-1)
        newChar = random.choice(list(letters))
        t2 = t[:subId] + [newChar] + t[subId+1:]

        startUpdate = time.time()
        fm_new = bwt.substitute(fm, b, alphabet, subId, newChar)
        timeUpdate = time.time() - startUpdate

        updateTimes[0] += timeUpdate
Esempio n. 9
0
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    """Measure how often reads from a mutated genome map back correctly.

    A random reference of ``genomeLen`` bases followed by '$' is indexed with
    ``bwt.constructFM``. A copy of the reference receives
    ``round(mutFreq * genomeLen)`` random edits (substitution, insertion or
    deletion, chosen uniformly), and ``numReads`` reads of ``readLen`` bases
    are cut from the mutated copy. Each read is searched in the index of the
    *original* reference with ``bwt.findApproximate``; it counts as correct
    when one of the reported positions lies within ``errors`` of the read's
    original start.

    Prints the accuracy and returns it as a float in [0, 1].
    Raises ZeroDivisionError if ``numReads`` is 0.
    """
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. The earlier version prepended one base per iteration
    # (quadratic copying); drawing once and reversing gives the same sequence
    # for the same RNG state.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index.
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Start of each read in the reference; `starts` follows those positions
    # through the insertions and deletions applied below.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome.
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)

        if mutType == 0:
            # Substitution: no coordinates move.
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # Insertion before `base`: everything at or after it moves right.
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # Deletion of `base`: only positions strictly after it move left.
            # A read starting exactly at the deleted base keeps its index
            # (the next base slides into it); shifting it as well could push
            # a start to -1.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome.
    reads = [t2[start:start + readLen] for start in starts]

    # Match the reads against the index of the original reference.
    correct = 0
    incorrect = 0
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        positions = m.keys()
        # Indels before the read may have moved it by a few bases, so any hit
        # within `errors` of the original start is accepted.
        if any(startsOrig[i] + j in positions
               for j in range(-errors, errors + 1)):
            correct += 1
        else:
            incorrect += 1

    accuracy = float(correct) / (incorrect + correct)
    # Single parenthesised argument: prints the same under Python 2 and 3.
    print('  Accuracy: ' + str(correct) + ' / ' + str(correct + incorrect)
          + ' = ' + str(accuracy))
    return accuracy