Example #1
0
def index_match(p, t, n):
    """Find approximate occurrences of p in t with at most n mismatches.

    The pattern is cut into n + 1 consecutive segments. Each segment is
    searched for exactly with ``boyer_moore``; every exact hit is then
    extended to the full pattern and the characters outside the segment are
    compared against the text.

    Returns a pair: the list of distinct text offsets where the whole
    pattern fits with no more than n mismatches, and whatever
    ``boyer_moore`` returned for the last segment that was searched.
    """
    pattern_len = len(p)
    segment_length = int(pattern_len / (n + 1))
    all_matches = set()
    for segment_no in range(n + 1):
        start = segment_no * segment_length
        end = min(start + segment_length, pattern_len)
        segment = p[start:end]
        p_bm = BoyerMoore(segment, alphabet='ACGT')
        matches = boyer_moore(segment, p_bm, t)
        # Pattern positions not covered by the segment: prefix, then suffix.
        flank = list(range(start)) + list(range(end, pattern_len))

        for hit in matches:
            origin = hit - start
            # Skip hits where the full pattern would overhang either end of t.
            if origin < 0 or origin + pattern_len > len(t):
                continue

            # Count down the allowed mismatches; stop as soon as it is overdrawn.
            budget = n
            for j in flank:
                if p[j] != t[origin + j]:
                    budget -= 1
                    if budget < 0:
                        break

            if budget >= 0:
                all_matches.add(origin)

    return list(all_matches), matches
Example #2
0
def approximate_match(p, t, n):
    """Pigeonhole approximate matching of p against t with up to n mismatches.

    p is divided into n + 1 segments; each one is located exactly with
    ``boyer_moore`` and every hit is verified by comparing the rest of p
    against t.

    Returns ``(offsets, num_index_hits)``: the distinct offsets in t where p
    occurs with at most n mismatches, and the number of exact segment hits
    summed over all segments.
    """
    from bm_preproc import BoyerMoore

    segment_length = int(round(len(p) / (n + 1)))
    verified = set()
    num_index_hits = 0
    for seg in range(n + 1):
        start = seg * segment_length
        end = min(start + segment_length, len(p))
        piece = p[start:end]
        hits = boyer_moore(piece, BoyerMoore(piece), t)
        num_index_hits += len(hits)
        # Positions of p lying outside the exactly-matched segment.
        outside = list(range(start)) + list(range(end, len(p)))
        for hit in hits:
            offset = hit - start
            if offset < 0 or offset + len(p) > len(t):
                continue  # the full pattern would overhang an end of t

            mismatches = 0
            for j in outside:
                if p[j] != t[offset + j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            else:
                # The loop finished without exceeding the mismatch limit.
                verified.add(offset)

    return list(verified), num_index_hits
def approximate_match(p, t, n):
    """Return approximate matches of p in t plus the exact-hit count.

    The pattern is split into n + 1 segments. Each segment is searched for
    exactly; every hit is extended to the full pattern length and accepted
    when the characters before and after the segment differ from the text
    in at most n places.

    Returns ``(offsets, all_hits)`` where ``offsets`` lists the distinct
    accepted start positions and ``all_hits`` is the total number of exact
    segment hits.
    """
    pattern_len = len(p)
    segment_length = int(round(pattern_len / (n + 1)))
    all_matches = set()
    all_hits = 0
    for part in range(n + 1):
        start = part * segment_length
        end = min(pattern_len, start + segment_length)
        chunk = p[start:end]
        p_bm = BoyerMoore(chunk, alphabet='ACGT')
        # NOTE(review): the text is passed before the preprocessed object here;
        # confirm this matches the signature of the boyer_moore in scope.
        matches = boyer_moore(chunk, t, p_bm)
        all_hits += len(matches)
        for m in matches:
            shift = m - start
            if shift < 0 or shift + pattern_len > len(t):
                continue
            mismatches = 0
            # Verify the prefix range first, then the suffix range.
            for lo, hi in ((0, start), (end, pattern_len)):
                for j in range(lo, hi):
                    if p[j] != t[shift + j]:
                        mismatches += 1
                        if mismatches > n:
                            break

            if mismatches <= n:
                all_matches.add(shift)
    return list(all_matches), all_hits
def approximate_match(p, t, n):  # n: max distance
    """Return the start offsets in t where p occurs with at most n mismatches.

    p is cut into n + 1 partitions. Each partition is matched exactly with
    ``boyer_moore`` and every hit is verified against the remainder of p.
    """
    segment_length = int(round(len(p) / (n + 1)))
    # A set removes duplicates: when the whole pattern matches exactly, every
    # partition matches and each one reports the same pattern start.
    starts = set()
    for idx in range(n + 1):
        lo = idx * segment_length
        hi = min(lo + segment_length, len(p))
        partition = p[lo:hi]
        # The BoyerMoore object preprocesses the partition for the good suffix
        # and bad character rules.
        p_bm = BoyerMoore(partition, alphabet='ACGT')
        # Pattern positions before the partition, then those after it.
        to_verify = list(range(lo)) + list(range(hi, len(p)))
        for hit in boyer_moore(partition, p_bm, t):
            begin = hit - lo
            # Discard hits where the pattern would run out of bounds.
            if begin + len(p) > len(t) or begin < 0:
                continue
            budget = n
            for j in to_verify:
                if p[j] != t[begin + j]:
                    budget -= 1
                    if budget < 0:
                        break
            if budget >= 0:
                starts.add(begin)
    return list(starts)
def example_2_2():
    """Check ``boyer_moore_with_counts`` on the 'needle' example text."""
    from bm_preproc import BoyerMoore

    pattern = 'needle'
    text = 'needle need noodle needle'
    alphabet = 'abcdefghijklmnopqrstuvwxyz '
    preprocessed = BoyerMoore(pattern, alphabet)
    result = boyer_moore_with_counts(pattern, preprocessed, text)
    assert result == ([0, 19], 18, 5)
def example_2_1():
    """Check ``boyer_moore_with_counts`` on the 'word' example text."""
    from bm_preproc import BoyerMoore

    pattern = 'word'
    text = 'there would have been a time for such a word'
    alphabet = 'abcdefghijklmnopqrstuvwxyz '
    preprocessed = BoyerMoore(pattern, alphabet)
    result = boyer_moore_with_counts(pattern, preprocessed, text)
    assert result == ([40], 15, 12)
Example #7
0
def main():
    """Run the matching experiments on the chromosome 1 excerpt and print results.

    Prints, in order: elements 2 and 1 of the ``naive`` result, element 2 of
    the ``boyer_moore`` result (all for the long pattern), then for the short
    pattern the number of approximate matches, element 1 of the
    ``approximate_match`` result and element 1 of the
    ``approximate_match_subseq`` result.
    """
    chr1 = readGenome('chr1.GRCh38.excerpt.fasta')
    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
    p_bm = BoyerMoore(p)

    # Each search scans the whole genome, so run it once and reuse the result
    # instead of repeating the call for every printed element.
    naive_result = naive(p, chr1)
    print(naive_result[2])
    print(naive_result[1])
    print(boyer_moore(p, p_bm, chr1)[2])

    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    approx_result = approximate_match(p, chr1, 2)
    print(len(approx_result[0]))
    print(approx_result[1])
    print(approximate_match_subseq(p, chr1, 2, 3)[1])
Example #8
0
def main():
    """Answer questions 1-6 on the chromosome 1 excerpt and print each result.

    Questions 1-3 use the long pattern (elements 1 and 2 of the ``naive``
    result, element 1 of the ``boyer_moore`` result). Questions 4-6 use the
    short pattern with up to 2 mismatches (match count and element 1 of the
    ``approximate_match`` result, element 1 of the
    ``approximate_match_subseq`` result).
    """
    # Named `genome` rather than `chr` so the builtin chr() is not shadowed.
    genome = readGenome("chr1.GRCh38.excerpt.fasta")
    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
    p_bm = BoyerMoore(p)

    # Each search scans the whole genome; run it once and reuse the result.
    naive_result = naive(p, genome)
    print("Question 1: %d" % naive_result[1])
    print("Question 2: %d" % naive_result[2])
    print("Question 3: %d" % boyer_moore(p, p_bm, genome)[1])

    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    approx_result = approximate_match(p, genome, 2)
    print("Question 4: %d" % len(approx_result[0]))
    print("Question 5: %d" % approx_result[1])
    print("Question 6: %d" % approximate_match_subseq(p, genome, 2, 3)[1])
def question3():
    """Print the total number of alignments Boyer-Moore tries for the Alu pattern.

    Question being answered: how many alignments does Boyer-Moore try when
    matching the string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG
    (derived from human Alu sequences) to the excerpt of human chromosome 1?
    (Reverse complements are not considered.)
    """
    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
    # NOTE(review): a .fasta excerpt is read with readFastq here — confirm the
    # reader/file pairing is intended.
    reads, qualities = readFastq('chr1.GRCh38.excerpt.fasta')
    assert len(reads) == len(qualities)

    from bm_preproc import BoyerMoore
    p_bm = BoyerMoore(p)

    total_align_comp = 0
    for t in reads:
        # Only the third element of the result (used as the number of
        # alignments tried) is needed; the other two are discarded.
        _, _, num_alignments_tried = boyer_moore_with_counts(p, p_bm, t)
        total_align_comp += num_alignments_tried

    # Single-argument print call: valid on Python 2 and 3, and reproduces the
    # output of the former print statement (label, two spaces, count).
    print('Question3:  {}'.format(total_align_comp))
Example #10
0
def approximate_match(p, t, n):
    segment_length = int(round(len(p) / (n + 1)))
    all_matches = set(
    )  #create a set to fill all the indices where matches were found
    for i in range(
            n + 1
    ):  #calculate the bounds of p for the segment we are searching for
        start = i * segment_length
        #p might not be a perfect multiple of n+1 so we resize the last segment accordingly
        end = min((i + 1) * segment_length,
                  len(p))  #take minimum to not run past end of p
        p_bm = BoyerMoore(p[start:end],
                          alphabet='ACGT')  #preprocessing of the pattern
        matches = boyer_moore(p[start:end], p_bm, t)

        for m in matches:
            if m < start or m - start + len(p) > len(
                    t):  #test if location does not let p run off
                continue
            mismatches = 0
            #test part of pattern p before the segment that we just compared (and that was an exact match)
            for j in range(0, start):  #compare number of mismatches
                if not p[j] == t[m - start + j]:
                    mismatches += 1
                    if mismatches > n:
                        break
        #compare the suffix after the segment
            for j in range(end, len(p)):
                if not p[j] == t[m - start + j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            if mismatches <= n:  #double check if the number of mismatches is no more than n
                all_matches.add(
                    m - start
                )  #want the beginning of the pattern (not the beginning of the subpattern)

    return list(all_matches)
Example #11
0
    genome = ''
    with open(
            chromosome, 'r'
    ) as f:  #indicates that we are opening a file for reading, 'w' indicates writing
        for line in f:
            if not line[0] == '>':
                genome += line.rstrip()


"""
Pattern matching with Boyer Moore (pre-processing the pattern)
"""

from bm_preproc import BoyerMoore

# Build the Boyer-Moore preprocessing object for `pattern`.
# NOTE(review): no module-level assignment to `pattern` is visible above this
# line — confirm it is defined earlier, otherwise this raises NameError.
p_bm = BoyerMoore(pattern)


def boyer_moore(pattern, p_bm, genome):
    """Find all exact occurrences of pattern in genome using Boyer-Moore.

    Args:
        pattern: the string to search for.
        p_bm: preprocessed pattern object providing ``bad_character_rule(j, c)``,
            ``good_suffix_rule(j)`` and ``match_skip()``.
        genome: the text to search in.

    Returns:
        A tuple ``(occurrences, num_alignments, num_character_comparisons)``:
        the list of start offsets of every occurrence, the number of
        alignments tried and the number of character comparisons performed.
    """
    i = 0
    occurrences = []
    num_alignments = 0
    num_character_comparisons = 0
    last_start = len(genome) - len(pattern)

    while i <= last_start:  # every position in genome where pattern can start
        num_alignments += 1
        shift = 1  # always advance by at least one position
        mismatched = False
        for j in range(len(pattern) - 1, -1, -1):  # compare right to left
            num_character_comparisons += 1
            if pattern[j] != genome[i + j]:
                skip_bc = p_bm.bad_character_rule(j, genome[i + j])
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()
            shift = max(shift, skip_gs)
        i += shift
    return occurrences, num_alignments, num_character_comparisons


# Test the program using excerpt of human chromosome 1.
# Pattern to search for.
pattern = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
# Load the text to search from the FASTA file.
chromosome1_text = readGenome('chr1GRCh38.fa')
# Candidate alphabets; only uppercase_alphabet is passed to BoyerMoore below.
lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz'
uppercase_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Preprocess the pattern over the uppercase alphabet.
p_bm = BoyerMoore(pattern, uppercase_alphabet)
# Run the counting variant of the search and print its three results.
occurrences, num_alignments, num_character_comparisons = \
    boyer_moore_with_counts(pattern, p_bm, chromosome1_text)
print(occurrences, num_alignments, num_character_comparisons)
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()
            shift = max(shift, skip_gs)
        i += shift
    return(occurrences, num_alignments, num_character_comparisons)


def readGenome(filename):
    """Read a FASTA-style file and return its sequence as one string.

    Lines starting with '>' (header lines) are skipped. Trailing whitespace,
    including the newline, is stripped from every other line before the
    lines are concatenated in file order.

    Args:
        filename: path of the file to read.

    Returns:
        The concatenated sequence text.
    """
    with open(filename, 'r') as f:
        # Join once instead of growing a string with += on every line.
        return ''.join(
            line.rstrip() for line in f if not line.startswith('>')
        )


"""Script to run boyer_moore_algorithm on human chromosome and given pattern"""

# Pattern to search for.
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
# Text to search: the sequence read from the chromosome 1 excerpt file.
t = readGenome('chr1.GRCh38.excerpt.fasta')
# Preprocess the pattern over the four-letter alphabet 'ATGC'.
p_bm = BoyerMoore(p, 'ATGC')
# Run the counting variant of the search and print its three results.
matches, alignments, char_comparisons = boyer_moore_with_counts(p, p_bm, t)
print(matches, alignments, char_comparisons)

        0)[1]
    patterns = findPatternV2("GGCGCGGTGGCTCACGCCTGTAAT", filename)

    #Q2: How many character comparisons does the naive exact matching algorithm
    #try when matching the string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG
    #(derived from human Alu sequences) to the excerpt of human chromosome 1?
    #(Don't consider reverse complements.)
    print "Q2: The characters comparisons for naive match algorithm is %d\n" % patterns.naiveMatch(
        0)[2]

    #How many alignments does Boyer-Moore try when matching the string
    #GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG
    #(derived from human Alu sequences) to the excerpt of human chromosome 1?
    #(Don't consider reverse complements.)
    print "Q3: The alignments for Boyer-Moore algorithm is %d\n"%patterns.boyerMoore(0, \
          BoyerMoore(pattern, "ACGT"))[1]

    #Q4: How many times does the string GGCGCGGTGGCTCACGCCTGTAAT,
    #which is derived from a human Alu sequence, occur with up to 2
    #substitutions in the excerpt of human chromosome 1?
    #(Don't consider reverse complements here.)
    k_mer = 8
    pattern = "GGCGCGGTGGCTCACGCCTGTAAT"
    genome = patterns.readGenome()
    index = Index(genome, k_mer)
    occurances, numberOfOccurs, numberOfhits = patterns.matchedIndex(index, \
                                               k_mer, pattern, isSubseqIndex = False)
    print "Q4: Within 2 mismatchs, the string occurs %d times\n" % numberOfOccurs

    #Q5:Using the instructions given in Question 4, how many total index hits
    #are there when searching for occurrences of GGCGCGGTGGCTCACGCCTGTAAT with
from naive_with_counts import naive_with_counts
from bm_with_counts import boyer_moore_with_counts
from utils import readGenome
from bm_preproc import BoyerMoore

# Load the text to search from the chromosome 1 excerpt file.
f = "chr1.GRCh38.excerpt.fasta"
t = readGenome(f)

# Q1/Q2: run the naive matcher and print the second and third elements of its
# result (the first element is discarded).
p1 = "GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG"
_, n_align_naive, n_char_naive = naive_with_counts(p1, t)

print(f'Q1: {n_align_naive}')
print(f'Q2: {n_char_naive}')

# Q3: preprocess the same pattern over 'ACGT' and print the second element of
# the Boyer-Moore result.
p_bm = BoyerMoore(p1, 'ACGT')

_, n_align_bm, _ = boyer_moore_with_counts(p1, p_bm, t)

print(f'Q3: {n_align_bm}')

def boyer_moore(p, p_bm, t):
    """Return the start offsets of every exact occurrence of p in t.

    Args:
        p: the pattern to search for.
        p_bm: preprocessed pattern object providing ``bad_character_rule(j, c)``,
            ``good_suffix_rule(j)`` and ``match_skip()``.
        t: the text to search in.

    Returns:
        A list of offsets in t, in increasing order, where p occurs.
    """
    i = 0
    occurrences = []
    while i < len(t) - len(p) + 1:
        shift = 1  # minimum number of positions to advance
        mismatched = False
        for j in range(len(p) - 1, -1, -1):  # compare right to left
            if p[j] != t[i + j]:
                skip_bc = p_bm.bad_character_rule(j, t[i + j])
                skip_gs = p_bm.good_suffix_rule(j)
                # Keep `shift` in the max so the scan always advances, even if
                # both rules report a skip of zero.
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()  # how far to move after a full match
            shift = max(shift, skip_gs)
        i += shift
    return occurrences


# Small demonstration: search a short text for a four-character pattern and
# print the list returned by boyer_moore.
t = 'GCTAGCTCTACGAGTCTA'
p = 'TCTA'
# Preprocess the pattern over the alphabet 'ACGT'.
p_bm = BoyerMoore(p, alphabet='ACGT')
print(boyer_moore(p, p_bm, t))