def DistanceBetweenPatternAndStrings(pattern, text):
    """Sum, over every string in ``text``, the best window distance to ``pattern``.

    For each string, ``hamming_distance`` (defined elsewhere in this file) is
    evaluated between ``pattern`` and every window of ``len(pattern)``
    characters; the smallest value found is added to the running total.

    Args:
        pattern: The pattern to compare against.
        text: An iterable of strings to scan.

    Returns:
        The sum of the per-string minimum distances.  A string that is
        shorter than ``pattern`` has no window at all and contributes
        ``float("inf")``, which makes the whole result infinite.
    """
    k = len(pattern)
    total = 0
    for sequence in text:
        # Each window is scored exactly once.
        window_distances = [
            hamming_distance(pattern, sequence[start:start + k])
            for start in range(len(sequence) - k + 1)
        ]
        # No window fits: keep the established "infinitely far" contribution.
        total += min(window_distances) if window_distances else float("inf")
    return total
def MotifScore(Motifs):
    """Score a motif collection against its column-wise consensus string.

    A consensus is built by taking, for every column index of the first
    motif, a symbol with the highest occurrence count in that column.  The
    score is the sum of ``hamming_distance(consensus, motif)`` over all
    motifs (``hamming_distance`` is defined elsewhere in this file).

    Args:
        Motifs: A non-empty sequence of strings; every entry is indexed up to
            the length of the first one.

    Returns:
        The summed distance between the consensus and each motif.

    Raises:
        IndexError: If ``Motifs`` is empty, or an entry is shorter than the
            first entry.
    """
    width = len(Motifs[0])
    consensus_symbols = []
    for position in range(width):
        column = [row[position] for row in Motifs]
        # Key by occurrence count; when several symbols share a count, the
        # one appearing last in the column is the one that is kept.
        symbol_by_count = {}
        for symbol in column:
            symbol_by_count[column.count(symbol)] = symbol
        consensus_symbols.append(symbol_by_count[max(symbol_by_count)])
    consensus = ''.join(consensus_symbols)
    return sum(hamming_distance(consensus, row) for row in Motifs)
def approx_pattern_matching(pattern, text, d):
    """Return the start indices where ``pattern`` occurs in ``text`` within ``d``.

    A start index is reported when ``hamming_distance`` (defined elsewhere in
    this file) between ``pattern`` and the window of ``len(pattern)``
    characters beginning there is at most ``d``.

    Args:
        pattern: The pattern to look for.
        text: The string to scan.
        d: Largest distance that still counts as a match.

    Returns:
        A list of matching start indices in increasing order; empty when
        ``text`` is shorter than ``pattern``.
    """
    width = len(pattern)
    # The range stops at the last start whose window is full length, so no
    # truncated window (and no truncated pattern) ever needs handling.
    return [
        start
        for start in range(len(text) - width + 1)
        if hamming_distance(pattern, text[start:start + width]) <= d
    ]
def string_scores(pattern, strings):
    """Sum the closest k-mer distance to ``pattern`` across ``strings``.

    For every string, each k-mer of ``len(pattern)`` characters is compared
    with ``pattern`` via ``hamming_distance`` (defined elsewhere in this
    file) and the smallest distance is added to the score.

    Args:
        pattern: The pattern to compare against.
        strings: An iterable of strings to scan.

    Returns:
        The accumulated score.  A string in which no k-mer fits contributes
        ``float("inf")``.
    """
    k = len(pattern)
    score = 0
    for s in strings:
        # Valid k-mer starts only.  The min() keeps an empty pattern from
        # being tested at the end-of-string position, as before.
        start_count = min(len(s), len(s) - k + 1)
        # Each k-mer is scored exactly once.
        distances = [
            hamming_distance(s[start:start + k], pattern)
            for start in range(start_count)
        ]
        score += min(distances) if distances else float("inf")
    return score
def Neighbors(Pattern, d, nucleotides=frozenset({'A', 'C', 'G', 'T'})):
    """Build the neighborhood of ``Pattern`` recursively from its suffix.

    The neighborhood of the suffix ``Pattern[1:]`` is computed first.  A
    suffix neighbor whose ``hamming_distance`` (defined elsewhere in this
    file) to the suffix is below ``d`` may be prefixed with any symbol of
    ``nucleotides``; otherwise it is prefixed with ``Pattern[0]`` only.

    Args:
        Pattern: The string whose neighborhood is wanted.
        d: The distance budget.
        nucleotides: Iterable of alphabet symbols.  It is never mutated or
            returned directly, and is forwarded to the recursive calls.

    Returns:
        A list of strings (always a list, including for ``d == 0``).  The
        order follows the iteration order of ``nucleotides`` and is
        therefore unspecified for set-like alphabets.
    """
    # Nothing may change, or there is nothing left to change: the pattern is
    # its own only neighbor.  The emptiness check also stops the recursion
    # for an empty pattern, which used to recurse without end.
    if d == 0 or not Pattern:
        return [Pattern]
    if len(Pattern) == 1:
        # Copy so callers can never alter the alphabet object itself.
        return list(nucleotides)

    suffix = Pattern[1:]
    neighborhood = []
    for text in Neighbors(suffix, d, nucleotides):
        if hamming_distance(suffix, text) < d:
            # Budget left over: the first symbol is free to vary.
            neighborhood.extend(symbol + text for symbol in nucleotides)
        else:
            neighborhood.append(Pattern[0] + text)
    return neighborhood
def neighbors(pattern, d):
    """Return the set of strings within distance ``d`` of ``pattern``.

    Every string over ``A``/``C``/``G``/``T`` with the same length as
    ``pattern`` is enumerated and kept when ``hamming_distance`` (defined
    elsewhere in this file) to ``pattern`` is at most ``d``.  ``pattern``
    itself is always part of the result.

    Args:
        pattern: The string whose neighborhood is wanted.
        d: Largest distance that still counts as a neighbor.

    Returns:
        A set of strings (always a set, including for ``d == 0`` and for
        single-character patterns).
    """
    if d == 0:
        return {pattern}
    if len(pattern) == 1:
        return {'A', 'C', 'G', 'T'}

    neighborhood = {pattern}
    for letters in product('ACGT', repeat=len(pattern)):
        candidate = ''.join(letters)
        # "<=" rather than "==": strings closer than d belong to the
        # neighborhood as well, not only those exactly d away.
        if hamming_distance(pattern, candidate) <= d:
            neighborhood.add(candidate)
    return neighborhood
def approx_pattern_count(pattern, text, d):
    """Count, and print, the approximate occurrences of ``pattern`` in ``text``.

    A start index counts when the window of ``len(pattern)`` characters
    beginning there equals ``pattern`` exactly, or when ``hamming_distance``
    (defined elsewhere in this file) between ``pattern`` and that window is
    at most ``d``.

    Side effects:
        Prints the list of matching start indices, then their count.

    Args:
        pattern: The pattern to look for.
        text: The string to scan.
        d: Largest distance that still counts as a match.

    Returns:
        The number of matching start indices.
    """
    width = len(pattern)
    positions = []
    # Only starts that leave room for a full window.  The min() keeps an
    # empty pattern from being tested at the end-of-text position.
    for start in range(min(len(text), len(text) - width + 1)):
        window = text[start:start + width]
        # The exact comparison comes first so identical windows are counted
        # without consulting hamming_distance at all.
        if window == pattern or hamming_distance(pattern, window) <= d:
            positions.append(start)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(positions)
    print(len(positions))
    return len(positions)
def Neighbors(pattern, d):
    """Build the neighborhood of ``pattern`` recursively from its suffix.

    The neighborhood of ``pattern[1:]`` is computed first.  A suffix neighbor
    whose ``hamming_distance`` to the suffix is below ``d`` is prefixed with
    every element of ``Nucleotides``; otherwise it is prefixed with
    ``pattern[0]`` only.  Both ``hamming_distance`` and ``Nucleotides`` are
    module-level names defined elsewhere in this file (``Nucleotides`` is
    presumably the DNA alphabet -- confirm against its definition).

    Args:
        pattern: The string whose neighborhood is wanted.
        d: The distance budget.

    Returns:
        A set of strings.
    """
    # Nothing may change, or there is nothing left to change: the pattern is
    # its own only neighbor.  The emptiness check also stops the recursion
    # for an empty pattern, which used to recurse without end.
    if d == 0 or not pattern:
        return {pattern}
    if len(pattern) == 1:
        return {'A', 'C', 'G', 'T'}

    suffix = pattern[1:]
    neighborhood = set()
    for suffix_neighbor in Neighbors(suffix, d):
        if hamming_distance(suffix_neighbor, suffix) < d:
            # Budget left over: the first symbol is free to vary.
            for base in Nucleotides:
                neighborhood.add(base + suffix_neighbor)
        else:
            neighborhood.add(pattern[0] + suffix_neighbor)
    return neighborhood
def distance_between_pattern_and_strings(pattern, dna):
    """Sum the best window distance to ``pattern`` over the strings in ``dna``.

    Each string is stripped of surrounding whitespace, then every window of
    ``len(pattern)`` characters is compared with ``pattern`` through
    ``hamming_distance`` (defined elsewhere in this file).  The smallest
    distance per string is added to the total.

    Args:
        pattern: The pattern to compare against.
        dna: An iterable of strings to scan.

    Returns:
        The accumulated distance.  Strings too short to hold a single window
        are skipped and add nothing.
    """
    width = len(pattern)
    total = 0
    for raw in dna:
        sequence = raw.strip()
        window_distances = [
            hamming_distance(sequence[start:start + width], pattern)
            for start in range(len(sequence) - width + 1)
        ]
        # No window fits in this string: leave the total untouched.
        if window_distances:
            total += min(window_distances)
    return total
def approximate_pattern(pattern, text, n):
    """List the start indices where ``pattern`` matches ``text`` within ``n``.

    A start index qualifies when ``hamming_distance`` (defined elsewhere in
    this file) between ``pattern`` and the window of ``len(pattern)``
    characters beginning there does not exceed ``n``.

    Args:
        pattern: The pattern to look for.
        text: The string to scan.
        n: Largest distance that still counts as a match.

    Returns:
        The qualifying start indices in increasing order.
    """
    width = len(pattern)
    window_count = len(text) - width + 1
    return [
        start
        for start in range(window_count)
        if hamming_distance(pattern, text[start:start + width]) <= n
    ]