def neighbors(kmer, distance):
    """
    Return a set of all the neighbors of a k-mer.

    A neighbor is a string of the same length as ``kmer`` whose Hamming
    distance to ``kmer`` is at most ``distance``. The neighborhood is built
    recursively from the neighborhood of the suffix ``kmer[1:]``.

    :param kmer: the k-mer
    :param distance: the maximum distance between two k-mers for them to be
        neighbors
    :return: the set of all neighbors (an empty set for an empty k-mer)
    """
    k = len(kmer)
    if k == 0:
        # ``{}`` is an empty dict literal; the contract is to return a set.
        return set()
    if distance == 0:
        return {kmer}
    if k == 1:
        return {'A', 'T', 'C', 'G'}

    first_symbol = kmer[0]
    suffix = kmer[1:]
    result = set()
    for suffix_neighbor in neighbors(suffix, distance):
        if hamming_distance(suffix_neighbor, suffix) == distance:
            # The suffix already uses the whole mismatch budget, so the
            # first symbol has to be kept unchanged.
            result.add(first_symbol + suffix_neighbor)
        else:
            # One mismatch is still available: any first symbol is allowed.
            for base in 'ATCG':
                result.add(base + suffix_neighbor)
    return result
def neighbors(pattern: str, d: int) -> list:
    """
    This generates the d-neighborhood Neighbors(Pattern, d).

    The neighborhood is every string of the same length as ``pattern`` that
    differs from it in at most ``d`` positions. It is built recursively from
    the neighborhood of the suffix ``pattern[1:]``.

    Param:
        pattern: the pattern whose neighborhood is generated
        d: maximum number of mismatches allowed

    Returns:
        list: the neighbors of ``pattern`` (always a list, also when
        ``d == 0`` or ``pattern`` is empty)
    """
    # With no mismatch budget, or nothing to mutate, the only neighbor is the
    # pattern itself. Wrap it in a list so the return type is consistent.
    if d == 0 or not pattern:
        return [pattern]
    if len(pattern) == 1:
        return ["A", "C", "G", "T"]

    neighborhood: list = []
    first_symbol = pattern[0]
    suffix = pattern[1:]
    for text in neighbors(suffix, d):
        if hamming_distance(text, suffix) < d:
            # A mismatch is still available: prepend every base to the SAME
            # suffix neighbor (do not accumulate the prefixes).
            neighborhood.extend(base + text for base in "ACGT")
        else:
            neighborhood.append(first_symbol + text)
    return neighborhood
def neighbors(pattern, d):
    '''
    Finds d-neighborhood, or set of strings with <= d mismatches, of a pattern.

    Does so by recursively building the d-neighborhood of the pattern's
    suffix and prepending nucleotides to each suffix neighbor.

    Parameters:
        pattern (str): Pattern from which d-neighborhood is being generated
        d (int): Maximum Hamming Distance from pattern and accepted neighbors

    Returns:
        neighbors (set of str): Set of all d-neighbors of pattern
    '''
    # No mismatch budget, or an empty pattern: the pattern is its only
    # neighbor. The empty-pattern guard also stops the recursion, which
    # otherwise never reaches the length-1 base case.
    if d == 0 or not pattern:
        return {pattern}
    if len(pattern) == 1:
        return {'A', 'T', 'C', 'G'}

    first_symbol = pattern[:1]
    suffix = pattern[1:]
    neighborhood = set()
    for neighbor in neighbors(suffix, d):
        if hamming_distance(suffix, neighbor) < d:
            # Budget left over: the first symbol may be any nucleotide.
            for nuc in 'ATCG':
                neighborhood.add(nuc + neighbor)
        else:
            neighborhood.add(first_symbol + neighbor)
    return neighborhood
def approximate_pattern_count(text, pattern, d):
    """
    Count the windows of ``text`` that match ``pattern`` approximately.

    Every window of ``len(pattern)`` characters is compared with ``pattern``
    through ``hamming_distance.hamming_distance``; windows whose distance is
    at most ``d`` are counted.

    :param text: the text to scan
    :param pattern: the pattern to look for
    :param d: the maximum distance accepted for a match
    :return: the number of matching windows
    """
    width = len(pattern)
    last_start = len(text) - width
    return sum(
        1
        for start in range(last_start + 1)
        if hamming_distance.hamming_distance(text[start:start + width], pattern) <= d
    )
def find_approximate_substring(text, substring, max_mismatches):
    """
    Locate the approximate occurrences of ``substring`` in ``text``.

    :param text: the text to scan
    :param substring: the string to look for
    :param max_mismatches: the maximum value of
        ``hamming_distance.hamming_distance`` accepted for a match
    :return: the list of start indexes of matching windows, in ascending
        order
    """
    width = len(substring)
    return [
        start
        for start in range(len(text) - width + 1)
        if hamming_distance.hamming_distance(text[start:start + width], substring) <= max_mismatches
    ]
def approximate_pattern_count(text, pattern, d):
    """
    Count how many windows of ``text`` are within ``d`` of ``pattern``.

    :param text: the text to scan
    :param pattern: the pattern to look for
    :param d: the maximum ``hamming_distance`` accepted for a match
    :return: the number of windows of length ``len(pattern)`` that match
    """
    size = len(pattern)
    # Lazily enumerate every full-length window, left to right.
    windows = (text[start:start + size] for start in range(len(text) - size + 1))
    return sum(1 for window in windows if hamming_distance(pattern, window) <= d)
def pattern_matching(pattern, genome, d):
    """
    Find where ``pattern`` occurs in ``genome`` with a bounded distance.

    :param pattern: the pattern to look for
    :param genome: the genome to scan
    :param d: the maximum ``hamming_distance`` accepted for a match
    :return: the list of start indexes of matching windows, in ascending
        order
    """
    size = len(pattern)
    return [
        start
        for start in range(len(genome) - size + 1)
        if hamming_distance(pattern, genome[start:start + size]) <= d
    ]
def test_hamming_distance_list(self):
    """
    Test the case when the second parameter is a list
    """
    sequences = [
        'TTACCTTAAC',
        'GATATCTGTC',
        'ACGGCGTTCG',
        'CCCTAAAGAG',
        'CGTCAGAGGT',
    ]
    observed = hamming_distance("AAA", sequences)
    self.assertEqual(observed, 5)
def test_hamming_distance(self):
    """
    Test the case when the strings are the same size and different
    :return:
    """
    first = "TGACCCGTTATGCTCGAGTTCGGTCAGAGCGTCATTGCGAGTAGTCGTTTGCTTTCTCAAACTCC"
    second = "GAGCGATTAAGCGTGACAGCCCCAGGGAACCCACAAAACGTGATCGCAGTCCATCCGATCATACA"
    observed = hamming_distance(first, second)
    self.assertEqual(observed, 50)
def test_hamming_equal(self):
    """
    Test the case when the strings are the same
    :return:
    """
    sequence = "TGACCCGTTATGCTCGAGTTCGGTCAGAGCGTCATTGCGAGTAGTCGTTTGCTTTCTCAAACTCC"
    observed = hamming_distance(sequence, sequence)
    self.assertEqual(observed, 0)
def score(motifs):
    """
    Score a list of motifs. The lower the score the better.

    The consensus string of the motifs is computed with ``consensus`` and
    then compared against the motifs themselves with ``hamming_distance``.

    :param motifs: the motifs to score
    :return: the value of ``hamming_distance(consensus(motifs), motifs)``;
        per the original notes a score in [0; +inf[, the lower being the
        better
    """
    return hamming_distance(consensus(motifs), motifs)
def general_pattern_matching(genome, sequence, distance):
    """
    Return the starting positions where sequence is found in genome,
    allowing a bounded number of differences.

    :param genome: the genome
    :param sequence: the sequence
    :param distance: the maximum ``hamming_distance`` accepted for a match
    :return: an ascending sorted list of positions
    """
    width = len(sequence)
    last_start = len(genome) - width
    # Positions are visited in increasing order and each at most once, so
    # the resulting list is already sorted and free of duplicates.
    return [
        start
        for start in range(last_start + 1)
        if hamming_distance(genome[start:start + width], sequence) <= distance
    ]
def median_string(k, dna):
    """
    Find a k-mer x that minimizes HammingDistance(x, dna).

    Every candidate produced by ``all_kmers(k)`` is scored with
    ``hamming_distance(candidate, dna)``; the first candidate reaching the
    smallest score wins.

    :param k: the size of the k-mer to find
    :param dna: a list of DNA sequences
    :return: a k-mer that minimizes the distance between itself and the list
        of sequences, or None when no candidate scores below infinity.
        If multiple k-mers are found, return only a single one.

    Efficiency (as noted by the original author): O(4^k * kns)
    """
    best_distance = float("inf")
    best_pattern = None
    for candidate in all_kmers(k):
        candidate_distance = hamming_distance(candidate, dna)
        # Strict comparison keeps the earliest of several equally good k-mers.
        if candidate_distance < best_distance:
            best_distance = candidate_distance
            best_pattern = candidate
    return best_pattern
def median_strings(k, dna):
    """
    Find all k-mers x that minimize HammingDistance(x, dna).

    :param k: the size of the k-mer to find
    :param dna: a list of DNA sequences
    :return: the list of all k-mers from ``all_kmers(k)`` whose
        ``hamming_distance`` to ``dna`` equals the smallest one observed

    Efficiency (as noted by the original author): O(4^k * kns)
    """
    distances = {
        pattern: hamming_distance(pattern, dna)
        for pattern in all_kmers(k)
    }
    smallest = min(distances.values())
    return [
        pattern
        for pattern, value in distances.items()
        if value == smallest
    ]
def count_hamming(genome, kmer, distance):
    """
    Return the number of occurrences of a given sequence and its similar
    sequences in a genome.

    :param genome: the genome
    :param kmer: the sequence
    :param distance: the maximum hamming distance for 2 sequences to be
        similar
    :return: the number of occurrences (0 when the genome is shorter than
        the sequence)
    """
    k = len(kmer)
    if len(genome) < k:
        return 0
    # Every full-length window of the genome, left to right.
    words = (genome[start:start + k] for start in range(len(genome) - k + 1))
    return sum(1 for word in words if hamming_distance(kmer, word) <= distance)
def approximate_pattern_count(text, pattern, d):
    '''
    Finds the number of times a pattern appears in a text with at most d
    mismatches. Uses sliding window method: every window of len(pattern)
    characters, including the one ending at the last character of the text,
    is compared with the pattern.

    Parameters:
        text (str): Text in which the pattern is being searched for
        pattern (str): Pattern that is being searched for in the text
        d (int): Max number of mismatches between text and pattern.

    Returns:
        count (int): Number of times pattern appears in text with <= d
            mismatches
    '''
    width = len(pattern)
    count = 0
    # There are len(text) - width + 1 windows; without the "+ 1" the final
    # window of the text is never examined.
    for i in range(len(text) - width + 1):
        approx = text[i:i + width]
        if hamming_distance(pattern, approx) <= d:
            count += 1
    return count
def motif_enumeration(dna, k, d):
    '''
    Finds all k-mers that appear in every DNA sequence with no more than d
    mismatches. Does so by generating all k-mer neighbors for the first
    sequence, and checking if they occur in all sequences with a hamming
    distance <= d.

    Parameters:
        dna (str): dna sequences for which motifs are being found, separated
            by newlines (one trailing empty line is ignored)
        k (int): size of motif
        d (int): maximum Hamming Distance between sequence and pattern

    Returns:
        patterns (set): motifs found in all dna strands (empty when no
            sequence is given)
    '''
    seqs = dna.split('\n')
    if seqs[-1] == '':
        seqs.pop()
    if not seqs:
        # Nothing to search: avoid indexing seqs[0] on an empty list.
        return set()

    def occurs_in(candidate, seq):
        # True as soon as one window of seq is within d of the candidate.
        return any(
            hamming_distance(candidate, seq[start:start + k]) <= d
            for start in range(len(seq) + 1 - k)
        )

    patterns = set()
    first_seq = seqs[0]
    for i in range(len(first_seq) + 1 - k):
        pattern = first_seq[i:i + k]
        for neighbor in neighbors(pattern, d):
            if neighbor in patterns:
                # Already accepted through an earlier window.
                continue
            # Stops at the first sequence that lacks the candidate.
            if all(occurs_in(neighbor, seq) for seq in seqs):
                patterns.add(neighbor)
    return patterns
def approximate_pattern_matching(genome: str, pattern: str, n: int) -> list:
    """
    Find the starting points where the pattern is present as a substring of
    the genome with at most n mismatches.

    Param:
        genome: The genome
        pattern: The pattern to find in the genome
        n: maximum number of mismatches allowed

    Returns:
        list: the starting positions of the matches in the genome, in
        ascending order, each converted to a string
    """
    k = len(pattern)
    # Only starts whose window is a full k characters are examined. The cap
    # at len(genome) keeps an empty pattern from adding one extra start.
    start_count = min(len(genome), len(genome) - k + 1)
    return [
        str(start)
        for start in range(start_count)
        if hamming_distance(pattern, genome[start:start + k]) <= n
    ]
# Compare adjacent pixels. difference = [] for row in xrange(hash_size): for col in xrange(hash_size): pixel_left = image.getpixel((col, row)) pixel_right = image.getpixel((col + 1, row)) difference.append(pixel_left > pixel_right) # Convert the binary array to a hexadecimal string. decimal_value = 0 hex_string = [] for index, value in enumerate(difference): if value: decimal_value += 2**(index % 8) if (index % 8) == 7: hex_string.append(hex(decimal_value)[2:].rjust(2, '0')) decimal_value = 0 return ''.join(hex_string) if __name__ == "__main__": image1 = Image.open("../images/hans1.jpg") image2 = Image.open("../images/hans2.jpg") hash1 = dhash(image1, 4) hash2 = dhash(image2, 4) print hash1 print hash2 print hamming_distance(hash1, hash2)
#feat_q = feat_q / LA.norm(feat_q, 2.0) #feat_ref = feat_ref / LA.norm(feat_ref, 2.0) if FEATURE_JITTER == 10: feat_q = np.reshape(feat_q, (1,feat_q.shape[0]*feat_q.shape[1])) feat_ref = np.reshape(feat_ref,(1,feat_ref.shape[0]*feat_ref.shape[1])) else: feat_q = np.reshape(feat_q, (1,feat_q.shape[0])) feat_ref = np.reshape(feat_ref,(1,feat_ref.shape[0])) bins = np.array([0],dtype=np.uint8) dig_feat_q = np.digitize(feat_q,bins,right=True) dig_feat_ref = np.digitize(feat_ref,bins,right=True) feat_q = np.uint64(np.packbits(np.uint8(dig_feat_q),axis=1)) feat_ref = np.uint64(np.packbits(np.uint8(dig_feat_ref),axis=1)) import pdb; pdb.set_trace() for i in range (0,3): q_shift = feat_q << 8*(2**i) ref_shift = feat_ref << 8*(2**i) feat_q = q_shift[:,0::2] + feat_q[:,1::2] feat_ref = ref_shift[:,0::2] + feat_ref[:,1::2] distance = h_dist.hamming_distance(feat_q,feat_ref) #distance = LA.norm(feat_q - feat_ref, 2.0) #distance = np.sum(np.sqrt(np.power(feat_q - feat_ref, 2.0))) print('%d %s %s %d %d' %(n, query_filename, ref_filename, distance, label)) log.write('%d\t%d\n' %(label, distance)) except: print 'ERROR: query_filename: ', fname[0] log.close()
def test_hamming_distance_1(self):
    """Two equal all-ones vectors are expected to be at distance 0."""
    v = [1] * 7
    w = list(v)
    observed = hamming_distance(v, w)
    self.assertEqual(0, observed)
def test_hamming_distance_3(self):
    """Two vectors differing in all 7 positions are expected at distance 7."""
    v = [0, 1, 0, 1, 0, 0, 1]
    # w is the element-wise complement of v: [1, 0, 1, 0, 1, 1, 0]
    w = [1 - bit for bit in v]
    observed = hamming_distance(v, w)
    self.assertEqual(7, observed)
def test_hamming_distance(self):
    """The integer arguments 1 and 4 are expected to yield a distance of 2."""
    observed = hamming_distance(1, 4)
    self.assertEqual(2, observed)
def test_hamming_distance_bigger(self):
    """
    Test the case when the second string is bigger than the first one
    """
    shorter = "GATTCTCA"
    longer = "GCAAAGACGCTGACCAA"
    observed = hamming_distance(shorter, longer)
    self.assertEqual(observed, 3)
# Timing script for the hamming_distance module: each section prints the
# computed value together with the wall-clock time it took.

# --- 1. popcnt on the XOR of two random integers drawn from [0, 100000) ---
print('Test for hamming_distance.popcnt')
query = np.random.randint(0, 100000)
ref = np.random.randint(0,100000)
start = time.time()
diff = np.bitwise_xor(query,ref)
dist = hamming_distance.popcnt(diff)
elapsed = time.time() - start
print('%d in %.6f' % (dist, elapsed))
# NOTE(review): this drops into the interactive debugger on every run;
# looks like a leftover breakpoint -- confirm before removing.
import pdb; pdb.set_trace();

# --- 2. hamming_distance between one all-zeros row and one all-ones row ---
dimension = 320
query = np.zeros((1,dimension), dtype=np.uint64)
ref = np.ones((1,dimension), dtype=np.uint64)
print('Test for hamming_distance.hamming_distance for %d dim.' % dimension)
start = time.time()
distance = hamming_distance.hamming_distance(query, ref)
elapsed = time.time() - start
print('%d in %.6f' % (distance, elapsed))

# --- 3. set-up for the many-references benchmark ---
num_ref = 1000000
print('Test for hamming_distance.hamming_distance_ref for %d samples' % num_ref)
start_mem_load = time.time()
# num_ref x dimension uint64 values: 1000000 * 320 * 8 bytes, about 2.56 GB.
ref = np.ones((num_ref,dimension), dtype=np.uint64)
# One uint32 slot per reference row; presumably filled by the call being
# timed after this excerpt -- TODO confirm.
dist= np.zeros((num_ref), dtype=np.uint32)
elapsed_mem_load = time.time() - start_mem_load
print('Memory alloc. for %d samples in %f' % (num_ref, elapsed_mem_load))
# Zero a few entries in the last two rows so those rows differ from the
# otherwise all-ones reference rows.
ref[num_ref-1,dimension-1] = 0
ref[num_ref-1,dimension-2] = 0
ref[num_ref-1,dimension-3] = 0
ref[num_ref-2,dimension-3] = 0
# Start of the timed section; the timed call itself is not part of this
# excerpt.
start = time.time()
import sys

from hamming_distance import hamming_distance

# Print every word of ./words.txt that has the same length as the word given
# as first command-line argument and whose hamming_distance to it is 1.

# The context manager closes the file even if reading fails, and the names
# no longer shadow the builtins ``file`` and ``list``.
with open('./words.txt', 'r') as words_file:
    words = words_file.readlines()

testword = sys.argv[1]

for word in words:
    # Drop the trailing newline (and any other trailing whitespace).
    word = word.rstrip()
    if len(word) == len(testword) and hamming_distance(testword, word) == 1:
        # Parenthesised single argument: same output under the Python 2
        # print statement and the Python 3 print function.
        print(word + '\n')