def neighbors(kmer, distance):
    """
    Return the set of all neighbors of a k-mer.

    The neighborhood is built recursively from the neighborhood of the
    k-mer's suffix (everything after the first character).

    :param kmer: the k-mer
    :param distance: the maximum distance between two k-mers for them to be neighbors
    :return: the set of all neighbors; an empty set when the k-mer is empty
    """
    if not kmer:
        # ``{}`` would be an empty dict; callers are promised a set.
        return set()
    if distance == 0:
        return {kmer}
    if len(kmer) == 1:
        return {'A', 'T', 'C', 'G'}
    first, suffix = kmer[0], kmer[1:]
    result = set()
    for suffix_neighbor in neighbors(suffix, distance):
        if hamming_distance(suffix_neighbor, suffix) == distance:
            # The suffix already uses up the whole mismatch budget, so the
            # first character has to stay as it is.
            result.add(first + suffix_neighbor)
        else:
            # Budget left over: any nucleotide may go in front.
            result.update(base + suffix_neighbor for base in 'ATCG')
    return result
Beispiel #2
0
def neighbors(pattern: str, d: int) -> list:
    """
    Generate the d-neighborhood Neighbors(pattern, d).

    The neighborhood is built recursively from the neighborhood of the
    pattern's suffix (everything after the first symbol).

    Args:
        pattern: the pattern whose neighborhood is generated
        d: the maximum number of mismatches allowed

    Returns:
        A list of neighbor strings. With ``d == 0`` or an empty pattern the
        list contains only ``pattern`` itself.
    """
    if d == 0 or not pattern:
        # Always return a list: a bare string would be iterated character by
        # character by the recursive caller. An empty pattern has no suffix
        # to recurse on, so it is its own only neighbor.
        return [pattern]
    if len(pattern) == 1:
        return ["A", "C", "G", "T"]

    neighborhood: list = []
    first_symbol = pattern[0]
    suffix = pattern[1:]
    for text in neighbors(suffix, d):
        if hamming_distance(text, suffix) < d:
            # Mismatch budget left: every base may be placed in front. Each
            # candidate is built from the unmodified suffix neighbor.
            neighborhood.extend(base + text for base in "ACGT")
        else:
            # Budget exhausted: the first symbol must be kept.
            neighborhood.append(first_symbol + text)
    return neighborhood
def neighbors(pattern, d):
    '''
    Finds d-neighborhood, or set of strings with <= d mismatches, of a pattern. Does so by
    recursively building the d-neighborhood of the pattern's suffix and prepending
    nucleotides to each suffix neighbor.

    Parameters:
    pattern (str): Pattern from which d-neighborhood is being generated
    d (int): Maximum Hamming Distance from pattern and accepted neighbors

    Returns:
    neighbors (set(str)): Set of all d-neighbors of pattern. For an empty pattern
    the set contains only the empty string.
    '''
    nucs = ('A', 'T', 'C', 'G')
    if d == 0 or not pattern:
        # An empty pattern has no suffix to recurse on; without this guard the
        # recursion below would never terminate.
        return {pattern}
    if len(pattern) == 1:
        return set(nucs)
    suffix = pattern[1:]
    neighborhood = set()
    for neighbor in neighbors(suffix, d):
        if hamming_distance(suffix, neighbor) < d:
            # Mismatch budget left: any nucleotide may go in front.
            neighborhood.update(nuc + neighbor for nuc in nucs)
        else:
            # Budget exhausted: keep the pattern's own first nucleotide.
            neighborhood.add(pattern[0] + neighbor)
    return neighborhood
def approximate_pattern_count(text, pattern, d):
    """Count the windows of ``text`` whose distance to ``pattern`` is at most ``d``.

    Every window has the same length as ``pattern``; the distance is computed
    by ``hamming_distance.hamming_distance(window, pattern)``.
    """
    width = len(pattern)
    last_start = len(text) - width
    return sum(
        1
        for start in range(last_start + 1)
        if hamming_distance.hamming_distance(text[start:start + width], pattern) <= d
    )
def find_approximate_substring(text, substring, max_mismatches):
    """Return the start indices of approximate occurrences of ``substring``.

    A window of ``text`` (same length as ``substring``) counts as an
    occurrence when ``hamming_distance.hamming_distance(window, substring)``
    is at most ``max_mismatches``. Indices are returned in ascending order.
    """
    width = len(substring)
    return [
        start
        for start in range(len(text) - width + 1)
        if hamming_distance.hamming_distance(text[start:start + width], substring)
        <= max_mismatches
    ]
Beispiel #6
0
def approximate_pattern_count(text, pattern, d):
    """Count the pattern-length windows of ``text`` within distance ``d`` of ``pattern``."""
    width = len(pattern)
    # Lazily slice out every window of the pattern's length, left to right.
    windows = (text[start:start + width] for start in range(len(text) - width + 1))
    return sum(1 for window in windows if hamming_distance(pattern, window) <= d)
def pattern_matching(pattern, genome, d):
    """Return the start positions in ``genome`` where ``pattern`` occurs with at most ``d`` mismatches.

    Positions are listed in ascending order.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(genome) - width + 1)
        if hamming_distance(pattern, genome[start:start + width]) <= d
    ]
Beispiel #8
0
 def test_hamming_distance_list(self):
     """
     Check the distance when the second argument is a list of sequences
     """
     sequences = [
         'TTACCTTAAC',
         'GATATCTGTC',
         'ACGGCGTTCG',
         'CCCTAAAGAG',
         'CGTCAGAGGT',
     ]
     self.assertEqual(hamming_distance("AAA", sequences), 5)
Beispiel #9
0
 def test_hamming_distance(self):
     """
     Check the distance between two different strings of the same size
     :return:
     """
     first = "TGACCCGTTATGCTCGAGTTCGGTCAGAGCGTCATTGCGAGTAGTCGTTTGCTTTCTCAAACTCC"
     second = "GAGCGATTAAGCGTGACAGCCCCAGGGAACCCACAAAACGTGATCGCAGTCCATCCGATCATACA"
     self.assertEqual(hamming_distance(first, second), 50)
Beispiel #10
0
 def test_hamming_equal(self):
     """
     Check that two identical strings are at distance 0
     :return:
     """
     first = "TGACCCGTTATGCTCGAGTTCGGTCAGAGCGTCATTGCGAGTAGTCGTTTGCTTTCTCAAACTCC"
     second = "TGACCCGTTATGCTCGAGTTCGGTCAGAGCGTCATTGCGAGTAGTCGTTTGCTTTCTCAAACTCC"
     self.assertEqual(hamming_distance(first, second), 0)
Beispiel #11
0
def score(motifs):
    """
    Score a list of motifs. The lower the score the better.
    The score is the hamming distance between the consensus string of the
    motifs and the motifs themselves.
    :param motifs: the motifs to score
    :return: the score, the lower being the better
    """
    return hamming_distance(consensus(motifs), motifs)
def general_pattern_matching(genome, sequence, distance):
    """
    Return the starting positions where sequence is found in genome, allowing mismatches
    :param genome: the genome
    :param sequence: the sequence
    :param distance: the maximum hamming distance for a window to count as a match
    :return: an ascending sorted list of positions
    """
    window = len(sequence)
    last_start = len(genome) - window
    hits = {
        start
        for start in range(last_start + 1)
        if hamming_distance(genome[start:start + window], sequence) <= distance
    }
    return sorted(hits)
def median_string(k, dna):
    """
    Find a k-mer x that minimizes HammingDistance(x, dna)
    :param k: the size of the k-mer to find
    :param dna: a list of DNA sequences
    :return: a k-mer that minimizes the distance between itself and the list of sequences. If multiple k-mers are
    tied, the first one produced by all_kmers(k) is returned. None when no candidate has a finite distance.

    Every k-mer produced by all_kmers(k) is tried.
    """
    best_pattern = None
    best_distance = float("inf")
    for candidate in all_kmers(k):
        candidate_distance = hamming_distance(candidate, dna)
        # Strict comparison: on ties the earliest candidate is kept.
        if candidate_distance < best_distance:
            best_pattern, best_distance = candidate, candidate_distance
    return best_pattern
def median_strings(k, dna):
    """
    Find all k-mers x that minimize HammingDistance(x, dna)
    :param k: the size of the k-mer to find
    :param dna: a list of DNA sequences
    :return: a list of all k-mers that minimize the distance between themselves and the list of sequences

    Every k-mer produced by all_kmers(k) is tried.
    """
    distances = {
        pattern: hamming_distance(pattern, dna) for pattern in all_kmers(k)
    }
    lowest = min(distances.values())
    return [pattern for pattern, value in distances.items() if value == lowest]
def count_hamming(genome, kmer, distance):
    """
    Count the occurrences of a sequence, and of sequences similar to it, in a genome
    :param genome: the genome
    :param kmer: the sequence
    :param distance: the maximum hamming distance for 2 sequences to be similar
    :return: the number of occurrences
    """
    k = len(kmer)
    window_count = len(genome) - k + 1
    if window_count <= 0:
        # The k-mer is longer than the genome: nothing to compare.
        return 0
    return sum(
        1
        for start in range(window_count)
        if hamming_distance(kmer, genome[start:start + k]) <= distance
    )
Beispiel #16
0
def approximate_pattern_count(text, pattern, d):
    '''
    Finds the number of times a pattern appears in a text with at most d mismatches. Uses sliding
    window method: every window of len(pattern) characters is compared against the pattern.

    Parameters:
    text (str): Text in which the pattern is being searched for
    pattern (str): Pattern that is being searched for in the text
    d (int): Max number of mismatches between text and pattern.

    Returns:
    count (int): Number of times pattern appears in text with <= d mismatches
    '''
    width = len(pattern)
    count = 0
    # The last valid window starts at len(text) - width, so the range bound
    # needs the "+ 1"; without it the final window is never examined.
    for i in range(len(text) - width + 1):
        approx = text[i:i + width]
        if hamming_distance(pattern, approx) <= d:
            count += 1
    return count
def motif_enumeration(dna, k, d):
    '''
    Finds all k-mers that appear in every DNA sequence with no more than d mismatches. Does so
    by generating all k-mer neighbors for the windows of the first sequence, and checking if
    they occur in every sequence with a hamming distance <= d.

    Parameters:
    dna (str): dna sequences for which motifs are being found, one per line
    k (int): size of motif
    d (int): maximum Hamming Distance between sequence and pattern

    Returns:
    patterns (set): motifs found in all dna strands; empty when no sequences are given
    '''
    seqs = dna.split('\n')
    if seqs[-1] == '':
        # Ignore the empty entry produced by a trailing newline.
        seqs.pop()
    if not seqs:
        # No sequences at all: there is no first sequence to take candidates from.
        return set()
    patterns = set()
    first = seqs[0]
    for i in range(len(first) + 1 - k):
        for neighbor in neighbors(first[i:i + k], d):
            if neighbor in patterns:
                # Already accepted via an earlier window; no need to re-check.
                continue
            # The candidate must have at least one close-enough window in
            # every sequence; any()/all() stop scanning as soon as the
            # answer is known.
            if all(
                any(
                    hamming_distance(neighbor, seq[start:start + k]) <= d
                    for start in range(len(seq) + 1 - k)
                )
                for seq in seqs
            ):
                patterns.add(neighbor)
    return patterns
def approximate_pattern_matching(genome: str, pattern: str, n: int) -> list:
    """
    Find the starting positions where the pattern occurs in the genome with at most n mismatches.

    Param:
    genome: The genome
    pattern: The pattern to find in the genome
    n: maximum number of mismatches allowed
    Returns:
    list: The starting positions of the matches, in ascending order, each converted to a string
    """
    k = len(pattern)
    genome_length = len(genome)
    matches: list = []
    for start in range(genome_length):
        # Stop once fewer than k characters remain: no full window fits.
        if genome_length - start < k:
            break
        if hamming_distance(pattern, genome[start:start + k]) <= n:
            matches.append(str(start))
    return matches
Beispiel #19
0
    # Compare adjacent pixels.
    difference = []
    for row in xrange(hash_size):
        for col in xrange(hash_size):
            pixel_left = image.getpixel((col, row))
            pixel_right = image.getpixel((col + 1, row))
            difference.append(pixel_left > pixel_right)

    # Convert the binary array to a hexadecimal string.
    decimal_value = 0
    hex_string = []
    for index, value in enumerate(difference):
        if value:
            decimal_value += 2**(index % 8)
        if (index % 8) == 7:
            hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
            decimal_value = 0

    return ''.join(hex_string)


if __name__ == "__main__":
    # Hash two sample images with dhash (hash size 4), then print both hashes
    # and the hamming distance between them.
    image1 = Image.open("../images/hans1.jpg")
    image2 = Image.open("../images/hans2.jpg")
    hash1 = dhash(image1, 4)
    hash2 = dhash(image2, 4)
    # Single-argument print(...) parses on both Python 2 and Python 3; the
    # bare print statement is a SyntaxError on Python 3.
    print(hash1)
    print(hash2)
    print(hamming_distance(hash1, hash2))
      #feat_q = feat_q / LA.norm(feat_q, 2.0)
      #feat_ref = feat_ref / LA.norm(feat_ref, 2.0)
      if FEATURE_JITTER == 10:
        feat_q = np.reshape(feat_q, (1,feat_q.shape[0]*feat_q.shape[1]))
        feat_ref = np.reshape(feat_ref,(1,feat_ref.shape[0]*feat_ref.shape[1]))
      else:
        feat_q = np.reshape(feat_q, (1,feat_q.shape[0]))
        feat_ref = np.reshape(feat_ref,(1,feat_ref.shape[0]))
      bins = np.array([0],dtype=np.uint8)
      dig_feat_q = np.digitize(feat_q,bins,right=True)
      dig_feat_ref = np.digitize(feat_ref,bins,right=True)
      feat_q = np.uint64(np.packbits(np.uint8(dig_feat_q),axis=1))
      feat_ref = np.uint64(np.packbits(np.uint8(dig_feat_ref),axis=1))
      import pdb; pdb.set_trace()
      for i in range (0,3):
        q_shift = feat_q << 8*(2**i) 
        ref_shift = feat_ref << 8*(2**i)
        feat_q = q_shift[:,0::2] + feat_q[:,1::2]
        feat_ref = ref_shift[:,0::2] + feat_ref[:,1::2]
  
      distance = h_dist.hamming_distance(feat_q,feat_ref)
      #distance = LA.norm(feat_q - feat_ref, 2.0)
      #distance = np.sum(np.sqrt(np.power(feat_q - feat_ref, 2.0)))
      print('%d %s %s %d %d' %(n, query_filename, ref_filename, distance, label))
      log.write('%d\t%d\n' %(label, distance))
    except:
      print 'ERROR: query_filename: ', fname[0]


log.close()
    def test_hamming_distance_1(self):
        """Two identical vectors are expected to be at distance 0."""
        ones = [1] * 7
        same = [1] * 7
        self.assertEqual(0, hamming_distance(ones, same))
    def test_hamming_distance_3(self):
        """Two 7-element vectors that differ at every position are expected to be at distance 7."""
        bits = [0, 1, 0, 1, 0, 0, 1]
        flipped = [1, 0, 1, 0, 1, 1, 0]
        expected = 7
        self.assertEqual(expected, hamming_distance(bits, flipped))
 def test_hamming_distance(self):
     """The integers 1 and 4 are expected to be at distance 2."""
     expected = 2
     self.assertEqual(expected, hamming_distance(1, 4))
Beispiel #24
0
 def test_hamming_distance_bigger(self):
     """
     Check the case where the second string is longer than the first one
     """
     shorter = "GATTCTCA"
     longer = "GCAAAGACGCTGACCAA"
     self.assertEqual(hamming_distance(shorter, longer), 3)
Beispiel #25
0
# Benchmark 1: time a single XOR followed by hamming_distance.popcnt on two
# random integers.
print('Test for hamming_distance.popcnt')
query = np.random.randint(0, 100000)
ref = np.random.randint(0,100000)
start = time.time()
diff = np.bitwise_xor(query,ref)
dist = hamming_distance.popcnt(diff)
elapsed = time.time() - start
print('%d in %.6f' % (dist, elapsed))

# NOTE(review): leftover debugger breakpoint -- the script pauses here until
# the debugger is resumed, so it cannot run unattended.
import pdb; pdb.set_trace();
# Benchmark 2: time one hamming_distance.hamming_distance call on a single
# all-zeros row against a single all-ones row of `dimension` uint64 values.
dimension = 320
query = np.zeros((1,dimension), dtype=np.uint64)
ref = np.ones((1,dimension), dtype=np.uint64)
print('Test for hamming_distance.hamming_distance for %d dim.' % dimension)
start = time.time()
distance = hamming_distance.hamming_distance(query, ref)
elapsed = time.time() - start
print('%d in %.6f' % (distance, elapsed))

# Benchmark 3 setup: allocate num_ref all-ones reference rows plus a uint32
# buffer with one slot per row, and report how long the allocation took.
num_ref = 1000000
print('Test for hamming_distance.hamming_distance_ref for %d samples' % num_ref)
start_mem_load = time.time()
ref = np.ones((num_ref,dimension), dtype=np.uint64)
dist= np.zeros((num_ref), dtype=np.uint32)
elapsed_mem_load = time.time() - start_mem_load
print('Memory alloc. for %d samples in %f' % (num_ref, elapsed_mem_load))
# Zero a few entries in the last two rows so they differ from the other rows.
ref[num_ref-1,dimension-1] = 0
ref[num_ref-1,dimension-2] = 0
ref[num_ref-1,dimension-3] = 0
ref[num_ref-2,dimension-3] = 0
# Start the timer for the next measurement.
start = time.time()
from hamming_distance import hamming_distance
import sys

# Load the word list, one candidate per line, stripping trailing whitespace.
# The context manager closes the file even if reading fails, and the local
# names no longer shadow the builtins `file` and `list`.
with open('./words.txt', 'r') as words_file:
    words = [line.rstrip() for line in words_file]
testword = sys.argv[1]

# Print every word of the same length as the target whose hamming distance
# to it is exactly 1. Each word is followed by an extra blank line.
for word in words:
    if len(word) == len(testword) and hamming_distance(testword, word) == 1:
        # Single-argument print(...) parses on both Python 2 and Python 3.
        print(word + '\n')