def index_match(p, t, n):
    """Find approximate occurrences of p in t via the pigeonhole principle.

    p is cut into n + 1 consecutive pieces of length
    int(len(p) / (n + 1)).  Each piece is searched for exactly with
    boyer_moore; every exact hit is then extended to the full pattern
    and the characters outside the piece are compared one by one.

    Args:
        p: pattern to search for.
        t: text to search in.
        n: maximum number of mismatches allowed.

    Returns:
        A tuple (offsets, last_hits).  offsets is a duplicate-free list of
        positions in t where p aligns with at most n mismatches.
        last_hits is whatever boyer_moore returned for the final piece.
    """
    # NOTE(review): only the hits of the *last* piece are returned as the
    # second value -- confirm that callers really want that and not a total.
    pattern_len = len(p)
    piece_len = int(pattern_len / (n + 1))
    found = set()
    for piece_idx in range(n + 1):
        lo = piece_idx * piece_len
        hi = min(lo + piece_len, pattern_len)
        piece = p[lo:hi]
        piece_bm = BoyerMoore(piece, alphabet='ACGT')
        hits = boyer_moore(piece, piece_bm, t)
        # Pattern positions not covered by the piece that matched exactly.
        outside = list(range(lo)) + list(range(hi, pattern_len))
        for hit in hits:
            offset = hit - lo
            # Skip alignments where the full pattern would not fit in t.
            if offset < 0 or offset + pattern_len > len(t):
                continue
            errors = 0
            for j in outside:
                if p[j] != t[offset + j]:
                    errors += 1
                    if errors > n:
                        break
            if errors <= n:
                found.add(offset)
    return list(found), hits
def approximate_match(p, t, n):
    """Pigeonhole approximate matching of p against t.

    The pattern is divided into n + 1 segments of length
    int(round(len(p) / (n + 1))).  Each segment is located exactly with
    boyer_moore and each hit is verified against the rest of the pattern.

    Args:
        p: pattern to search for.
        t: text to search in.
        n: maximum number of mismatches allowed.

    Returns:
        A tuple (offsets, num_hits).  offsets is a duplicate-free list of
        positions in t where p aligns with at most n mismatches.  num_hits
        is the summed length of the hit lists returned by boyer_moore over
        all segments, before verification.
    """
    from bm_preproc import BoyerMoore

    pat_len, text_len = len(p), len(t)
    chunk = int(round(pat_len / (n + 1)))
    hits_total = 0
    starts = set()
    for k in range(n + 1):
        left = k * chunk
        right = min(left + chunk, pat_len)
        seed = p[left:right]
        seed_bm = BoyerMoore(seed)
        seed_hits = boyer_moore(seed, seed_bm, t)
        hits_total += len(seed_hits)
        for hit in seed_hits:
            origin = hit - left
            # The whole pattern has to lie inside the text.
            if origin < 0 or origin + pat_len > text_len:
                continue
            bad = 0
            # Verify the part of the pattern before the seed ...
            j = 0
            while j < left and bad <= n:
                if p[j] != t[origin + j]:
                    bad += 1
                j += 1
            # ... and the part after it, stopping once the budget is blown.
            j = right
            while j < pat_len and bad <= n:
                if p[j] != t[origin + j]:
                    bad += 1
                j += 1
            if bad <= n:
                starts.add(origin)
    return list(starts), hits_total
def approximate_match(p, t, n):
    """Return approximate occurrences of p in t plus the number of seed hits.

    Uses the pigeonhole principle: p is split into n + 1 segments of
    length int(round(len(p) / (n + 1))); if p occurs with at most n
    mismatches, at least one segment must occur exactly.  Each segment is
    searched with boyer_moore and every hit is verified against the
    remainder of the pattern.

    Args:
        p: pattern to search for.
        t: text to search in.
        n: maximum number of mismatches allowed.

    Returns:
        A tuple (offsets, all_hits).  offsets is a duplicate-free list of
        positions in t where p aligns with at most n mismatches.  all_hits
        is the total number of exact segment hits reported by boyer_moore
        before verification.
    """
    segment_length = int(round(len(p) / (n + 1)))
    all_matches = set()
    all_hits = 0
    for i in range(n + 1):
        start = i * segment_length
        end = min(len(p), (i + 1) * segment_length)
        p_bm = BoyerMoore(p[start:end], alphabet='ACGT')
        # boyer_moore takes (pattern, preprocessing object, text); the
        # preprocessing object and the text were previously swapped.
        matches = boyer_moore(p[start:end], p_bm, t)
        all_hits += len(matches)
        for m in matches:
            # Skip hits where the full pattern would run off either end of t.
            if m < start or m - start + len(p) > len(t):
                continue
            mismatches = 0
            # Compare the part of p that precedes the matched segment.
            for j in range(0, start):
                if p[j] != t[m - start + j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            # Compare the part of p that follows the matched segment.
            for j in range(end, len(p)):
                if p[j] != t[m - start + j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            if mismatches <= n:
                # Record where the whole pattern starts, not the segment.
                all_matches.add(m - start)
    return list(all_matches), all_hits
def approximate_match(p, t, n):
    """Return the offsets in t where p occurs with at most n mismatches.

    p is split into n + 1 partitions of length
    int(round(len(p) / (n + 1))).  Each partition is searched exactly with
    boyer_moore, and each hit is verified by comparing the rest of the
    pattern against the text.

    Args:
        p: pattern to search for.
        t: text to search in.
        n: maximum distance (number of mismatches) allowed.

    Returns:
        A duplicate-free list of start offsets of p in t.
    """
    segment_length = int(round(len(p) / (n + 1)))
    # A set is used because an exact occurrence of the whole pattern is
    # found once per partition, and each of those would report the same
    # pattern start.
    offsets = set()
    for part in range(n + 1):
        start = part * segment_length
        end = min(start + segment_length, len(p))
        partition = p[start:end]
        # Preprocess the partition for the bad character and good suffix rules.
        p_bm = BoyerMoore(partition, alphabet='ACGT')
        # Stretches of the pattern that still need checking after an exact
        # partition hit: everything before it and everything after it.
        unverified = ((0, start), (end, len(p)))
        for hit in boyer_moore(partition, p_bm, t):
            offset = hit - start
            # The full pattern must not run off either end of the text.
            if offset < 0 or offset + len(p) > len(t):
                continue
            budget = n  # mismatches we can still afford
            for lo, hi in unverified:
                for j in range(lo, hi):
                    if p[j] != t[offset + j]:
                        budget -= 1
                        if budget < 0:
                            break
                if budget < 0:
                    break
            if budget >= 0:
                offsets.add(offset)
    return list(offsets)
def example_2_2():
    """Check boyer_moore_with_counts on the 'needle' example.

    Searches for 'needle' in 'needle need noodle needle' using a
    lowercase-plus-space alphabet and asserts that the returned triple
    equals ([0, 19], 18, 5).
    """
    from bm_preproc import BoyerMoore

    lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz '
    pattern = 'needle'
    text = 'needle need noodle needle'
    expected = ([0, 19], 18, 5)

    preprocessed = BoyerMoore(pattern, lowercase_alphabet)
    actual = boyer_moore_with_counts(pattern, preprocessed, text)
    assert actual == expected
def example_2_1():
    """Check boyer_moore_with_counts on the 'word' example.

    Looks for 'word' in 'there would have been a time for such a word'
    with a lowercase-plus-space alphabet; the result must be the triple
    ([40], 15, 12).
    """
    from bm_preproc import BoyerMoore

    needle = 'word'
    haystack = 'there would have been a time for such a word'
    alphabet = 'abcdefghijklmnopqrstuvwxyz '
    needle_bm = BoyerMoore(needle, alphabet)
    outcome = boyer_moore_with_counts(needle, needle_bm, haystack)
    assert outcome == ([40], 15, 12)
def main():
    """Run the matching experiments on the chromosome 1 excerpt and print them.

    Reads 'chr1.GRCh38.excerpt.fasta', then prints, in order:
    elements [2] and [1] of the naive matcher's result and element [2] of
    the Boyer-Moore result for the long pattern, followed by the number of
    approximate matches, element [1] of the approximate_match result and
    element [1] of the approximate_match_subseq result for the short
    pattern (both with up to 2 mismatches).
    """
    chr1 = readGenome('chr1.GRCh38.excerpt.fasta')

    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
    p_bm = BoyerMoore(p)
    # Scan the genome once and reuse the result instead of re-running the
    # naive matcher for every value that is printed.
    naive_result = naive(p, chr1)
    print(naive_result[2])
    print(naive_result[1])
    print(boyer_moore(p, p_bm, chr1)[2])

    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    # Same here: one approximate search yields both printed values.
    approx_result = approximate_match(p, chr1, 2)
    print(len(approx_result[0]))
    print(approx_result[1])
    print(approximate_match_subseq(p, chr1, 2, 3)[1])
def main():
    """Print the answers to questions 1-6 for the chromosome 1 excerpt.

    Questions 1-3 use the long pattern: elements [1] and [2] of the naive
    matcher's result and element [1] of the Boyer-Moore result.
    Questions 4-6 use the short pattern with up to 2 mismatches: the number
    of approximate matches, element [1] of the approximate_match result and
    element [1] of the approximate_match_subseq result.
    """
    # Named `genome` rather than `chr` so the builtin chr() is not shadowed.
    genome = readGenome("chr1.GRCh38.excerpt.fasta")

    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
    p_bm = BoyerMoore(p)
    # One naive scan of the genome serves both question 1 and question 2.
    naive_result = naive(p, genome)
    print("Question 1: %d" % naive_result[1])
    print("Question 2: %d" % naive_result[2])
    print("Question 3: %d" % boyer_moore(p, p_bm, genome)[1])

    p = 'GGCGCGGTGGCTCACGCCTGTAAT'
    # One approximate search serves both question 4 and question 5.
    approx_result = approximate_match(p, genome, 2)
    print("Question 4: %d" % len(approx_result[0]))
    print("Question 5: %d" % approx_result[1])
    print("Question 6: %d" % approximate_match_subseq(p, genome, 2, 3)[1])
def question3():
    """Print the total number of alignments Boyer-Moore tries for question 3.

    Question: how many alignments does Boyer-Moore try when matching the
    string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG (derived from
    human Alu sequences) to the excerpt of human chromosome 1?  (Don't
    consider reverse complements.)

    The pattern is preprocessed once, boyer_moore_with_counts is run on
    every read returned by readFastq, and the third element of each result
    is summed and printed.
    """
    from bm_preproc import BoyerMoore

    p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
    # NOTE(review): a .fasta file is handed to readFastq -- confirm that
    # this parser is the intended one for this input.
    reads, qualities = readFastq('chr1.GRCh38.excerpt.fasta')
    assert len(reads) == len(qualities)

    p_bm = BoyerMoore(p)
    total_align_comp = 0
    for t in reads:
        # Only the alignment count (third element) is needed here.
        _, _, num_alignments_tried = boyer_moore_with_counts(p, p_bm, t)
        total_align_comp += num_alignments_tried

    # Single-argument print() is valid on both Python 2 and Python 3.  The
    # two spaces reproduce the output of the former statement
    # `print 'Question3: ', total_align_comp`.
    print('Question3:  %s' % total_align_comp)
def approximate_match(p, t, n):
    """Return the offsets in t where p occurs with at most n mismatches.

    p is split into n + 1 segments of length
    int(round(len(p) / (n + 1))).  Every segment is searched for exactly
    with boyer_moore; each hit is then checked by comparing the rest of p,
    before and after the segment, with the text.

    Args:
        p: pattern to search for.
        t: text to search in.
        n: maximum number of mismatches allowed.

    Returns:
        A duplicate-free list of start offsets of the whole pattern in t.
    """
    pattern_len = len(p)
    segment_length = int(round(pattern_len / (n + 1)))

    def within_budget(offset, seg_start, seg_end):
        """Tell whether p aligned at offset has at most n mismatches
        outside the exactly matched segment p[seg_start:seg_end]."""
        mismatches = 0
        # Prefix of the pattern, before the segment that matched exactly.
        for j in range(seg_start):
            if p[j] != t[offset + j]:
                mismatches += 1
                if mismatches > n:
                    return False
        # Suffix of the pattern, after the segment.
        for j in range(seg_end, pattern_len):
            if p[j] != t[offset + j]:
                mismatches += 1
                if mismatches > n:
                    return False
        return True

    # Set of pattern start positions at which a match was confirmed.
    confirmed = set()
    for seg_no in range(n + 1):
        seg_start = seg_no * segment_length
        # len(p) need not be a multiple of n + 1, so clamp the last segment
        # to the end of the pattern.
        seg_end = min(seg_start + segment_length, pattern_len)
        segment = p[seg_start:seg_end]
        segment_bm = BoyerMoore(segment, alphabet='ACGT')  # preprocessing
        for hit in boyer_moore(segment, segment_bm, t):
            # Start of the whole pattern, not of the sub-pattern.
            offset = hit - seg_start
            # Discard locations where p would run off either end of t.
            if offset < 0 or offset + pattern_len > len(t):
                continue
            if within_budget(offset, seg_start, seg_end):
                confirmed.add(offset)
    return list(confirmed)
genome = '' with open( chromosome, 'r' ) as f: #indicates that we are opening a file for reading, 'w' indicates writing for line in f: if not line[0] == '>': genome += line.rstrip() """ Pattern matching with Boyer Moore (pre-processing the pattern) """ from bm_preproc import BoyerMoore p_bm = BoyerMoore(pattern) def boyer_moore(pattern, p_bm, genome): i = 0 occurrences = [] character_match = 0 character_mismatch = 0 num_alignments = 0 while i < len(genome) - len( pattern ) + 1: #loop through all the positions in t where p can start shift = 1 #how much we can move after character comparison mismatched = False num_alignments += 1
num_character_comparisons = 0 while i < len(t) - len(p) + 1: # loop through pos in t where p can start num_alignments += 1 # increment alignment count shift = 1 mismatched = False for j in range(len(p) - 1, -1, -1): # loop through pattern p num_character_comparisons += 1 # increment character comparisons if p[j] != t[i + j]: skip_bc = p_bm.bad_character_rule(j, t[i + j]) skip_gs = p_bm.good_suffix_rule(j) shift = max(shift, skip_bc, skip_gs) mismatched = True break if not mismatched: occurrences.append(i) skip_gs = p_bm.match_skip() shift = max(shift, skip_gs) i += shift return occurrences, num_alignments, num_character_comparisons # Test the program using excerpt of human chromosome 1. pattern = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG' chromosome1_text = readGenome('chr1GRCh38.fa') lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz' uppercase_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" p_bm = BoyerMoore(pattern, uppercase_alphabet) occurrences, num_alignments, num_character_comparisons = \ boyer_moore_with_counts(pattern, p_bm, chromosome1_text) print(occurrences, num_alignments, num_character_comparisons)
skip_gs = p_bm.good_suffix_rule(j) shift = max(shift, skip_bc, skip_gs) mismatched = True break if not mismatched: occurrences.append(i) skip_gs = p_bm.match_skip() shift = max(shift, skip_gs) i += shift return(occurrences, num_alignments, num_character_comparisons) def readGenome(filename): genome = '' with open(filename, 'r') as f: for line in f: # ignore header line with genome information if not line[0] == '>': genome += line.rstrip() return genome """Script to run boyer_moore_algorithm on human chromosome and given pattern""" p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG' t = readGenome('chr1.GRCh38.excerpt.fasta') p_bm = BoyerMoore(p, 'ATGC') matches, alignments, char_comparisons = boyer_moore_with_counts(p, p_bm, t) print(matches, alignments, char_comparisons)
0)[1] patterns = findPatternV2("GGCGCGGTGGCTCACGCCTGTAAT", filename) #Q2: How many character comparisons does the naive exact matching algorithm #try when matching the string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG #(derived from human Alu sequences) to the excerpt of human chromosome 1? #(Don't consider reverse complements.) print "Q2: The characters comparisons for naive match algorithm is %d\n" % patterns.naiveMatch( 0)[2] #How many alignments does Boyer-Moore try when matching the string #GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG #(derived from human Alu sequences) to the excerpt of human chromosome 1? #(Don't consider reverse complements.) print "Q3: The alignments for Boyer-Moore algorithm is %d\n"%patterns.boyerMoore(0, \ BoyerMoore(pattern, "ACGT"))[1] #Q4: How many times does the string GGCGCGGTGGCTCACGCCTGTAAT, #which is derived from a human Alu sequence, occur with up to 2 #substitutions in the excerpt of human chromosome 1? #(Don't consider reverse complements here.) k_mer = 8 pattern = "GGCGCGGTGGCTCACGCCTGTAAT" genome = patterns.readGenome() index = Index(genome, k_mer) occurances, numberOfOccurs, numberOfhits = patterns.matchedIndex(index, \ k_mer, pattern, isSubseqIndex = False) print "Q4: Within 2 mismatchs, the string occurs %d times\n" % numberOfOccurs #Q5:Using the instructions given in Question 4, how many total index hits #are there when searching for occurrences of GGCGCGGTGGCTCACGCCTGTAAT with
from naive_with_counts import naive_with_counts
from bm_with_counts import boyer_moore_with_counts
from utils import readGenome
from bm_preproc import BoyerMoore

# Script: print the answers to Q1-Q3 for one pattern searched in the
# chromosome 1 excerpt.

# Load the text to search from the FASTA excerpt.
f = "chr1.GRCh38.excerpt.fasta"
t = readGenome(f)

# Pattern used for all three questions.
p1 = "GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG"

# Q1 / Q2: second and third elements of the naive matcher's result; the
# first element is discarded.
_, n_align_naive, n_char_naive = naive_with_counts(p1, t)
print(f'Q1: {n_align_naive}')
print(f'Q2: {n_char_naive}')

# Q3: second element of the Boyer-Moore result, with the pattern
# preprocessed over the alphabet 'ACGT'.
p_bm = BoyerMoore(p1, 'ACGT')
_, n_align_bm, _ = boyer_moore_with_counts(p1, p_bm, t)
print(f'Q3: {n_align_bm}')
def boyer_moore(p, p_bm, t):
    """Return every offset in t at which p occurs exactly.

    Each alignment is compared right to left.  On a mismatch the pattern
    is shifted by the larger of the bad character and good suffix skips;
    after a full match it is shifted by p_bm.match_skip().  The shift is
    never smaller than 1, so the scan always advances.

    Args:
        p: pattern to search for.
        p_bm: preprocessing object for p; must provide
            bad_character_rule(j, c), good_suffix_rule(j) and match_skip().
        t: text to search in.

    Returns:
        List of start offsets of p in t, in increasing order.
    """
    i = 0
    occurrences = []
    while i < len(t) - len(p) + 1:
        shift = 1  # minimum number of characters to advance
        mismatched = False
        for j in range(len(p) - 1, -1, -1):  # compare last to first
            if p[j] != t[i + j]:
                skip_bc = p_bm.bad_character_rule(j, t[i + j])
                skip_gs = p_bm.good_suffix_rule(j)
                # Keep `shift` in the max so the loop still advances even
                # if both rules report a skip of 0.
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()  # skip allowed after a full match
            shift = max(shift, skip_gs)
        i += shift
    return occurrences


# Demo: search a short DNA text for 'TCTA' and print the match offsets.
t = 'GCTAGCTCTACGAGTCTA'
p = 'TCTA'
p_bm = BoyerMoore(p, alphabet='ACGT')
print(boyer_moore(p, p_bm, t))