Beispiel #1
0
def main():
    """Dispatch the misspelling lookup to the distance routine chosen by the caller.

    The mapping returned by ``get_args()`` supplies the vocabulary path
    (key ``'vocabulary'``) and the distance name (key ``'distance'``).
    The vocabulary is built with ``create_dictionary`` and each backend
    module is imported only when its branch is selected.

    Raises:
        Exception: if the requested distance name is not recognised.
    """
    args = get_args()

    # Build the vocabulary from the text file named in the arguments.
    vocabulary = create_dictionary(args['vocabulary'])

    chosen = args['distance']

    if chosen == 'levenshtein':
        from levenshtein import levenshtein_distance
        # NOTE(review): this branch passes the literal 'lexique.json'
        # instead of `vocabulary`, unlike the other branches - confirm
        # that this is intended.
        levenshtein_distance(get_misspelling(), 'lexique.json')
        return

    if chosen == 'levenshtein2':
        # Levenshtein distance backed by a different library.
        from levenshtein_v2 import levenshtein_distance2
        levenshtein_distance2(get_misspelling(), vocabulary)
        return

    if chosen == 'hamming':
        from hamming import hamming_distance
        hamming_distance(get_misspelling(), vocabulary)
        return

    if chosen == 'jarowinkler':
        from jarowinkler import jarowinkler_distance
        jarowinkler_distance(get_misspelling(), vocabulary)
        return

    raise Exception("Unknown distance function : {}".format(chosen))
Beispiel #2
0
    def test_repulsion(self):
        """repulsion() leaves its inputs unchanged and does not reduce the distance to p1."""
        first = [1, 2, 3, 4, 5, 6, 7, 8]
        second = [1, 5, 2, 8, 7, 4, 3, 6]
        distance_before = hamming_distance(first, second)

        outcome = repulsion(first, second)

        # Neither argument may have been modified by the call.
        self.assertEqual(first, [1, 2, 3, 4, 5, 6, 7, 8])
        self.assertEqual(second, [1, 5, 2, 8, 7, 4, 3, 6])

        # The result must be at least as far from `first` as `second` was.
        distance_after = hamming_distance(first, outcome)
        self.assertTrue(distance_after >= distance_before)
Beispiel #3
0
def motifEnumerate(dna,k,d):
    """Return the k-mers that occur with at most ``d`` mismatches in every string of ``dna``.

    Candidate patterns are produced by calling ``neighbors(kmer, d)`` for
    each distinct k-mer of the first string.  A candidate is kept when each
    of the remaining strings has at least one k-length window whose
    ``hamming_distance`` to the candidate is ``<= d``.

    Args:
        dna: sequence of strings to search.
        k: length of the patterns.
        d: maximum number of mismatches allowed per occurrence.

    Returns:
        A list of the distinct matching patterns, in no particular order.
        Empty when ``dna`` is empty or its first string is shorter than
        ``k``.  When ``dna`` holds a single string, every candidate is
        returned because there is no other string left to reject it.
    """
    if not dna:
        return []

    first = dna[0]
    seeds = {first[i:i + k] for i in range(len(first) - k + 1)}

    # Distinct candidates generated from all k-mers of the first string.
    candidates = set()
    for seed in seeds:
        candidates.update(neighbors(seed, d))

    def occurs_in(candidate, seq):
        # Window count comes from this sequence's own length, so strings
        # of different lengths are scanned completely and never sliced
        # short.
        return any(
            hamming_distance(candidate, seq[i:i + k]) <= d
            for i in range(len(seq) - k + 1)
        )

    # all() stops at the first sequence without a match, so later
    # sequences are not scanned for a candidate that already failed.
    return [
        candidate for candidate in candidates
        if all(occurs_in(candidate, seq) for seq in dna[1:])
    ]
Beispiel #4
0
    def test_hamming_dist_timing(self):
        """Time three Hamming-distance variants on all ordered pairs of 10 generated strings.

        The CPU time of each variant is printed, and the three result
        lists are required to be equal.
        """
        strings = [string_generator() for _ in range(10)]
        print(strings)

        # Variant 1: naive_hamming_distance on the raw strings.
        naive_t0 = time.process_time_ns()
        naive_distances = [
            naive_hamming_distance(a, b) for a in strings for b in strings
        ]
        naive_t1 = time.process_time_ns()
        print(f'no of mismatches = {naive_distances}')
        print("string hamming: {:,}".format(naive_t1 - naive_t0))

        # Variant 2: hamming_distance on the raw strings.
        binary_t0 = time.process_time_ns()
        binary_distances = [
            hamming_distance(a, b) for a in strings for b in strings
        ]
        binary_t1 = time.process_time_ns()
        print("binary hamming: {:,}".format(binary_t1 - binary_t0))

        # Variant 3: convert every string once, outside the timed region,
        # then time only binary_hamming_dist_calc.
        encoded = [string_to_hamming_binary(s) for s in strings]

        pre_t0 = time.process_time_ns()
        pre_distances = [
            binary_hamming_dist_calc(a, b) for a in encoded for b in encoded
        ]
        pre_t1 = time.process_time_ns()
        print("prepro hamming: {:,}".format(pre_t1 - pre_t0))

        self.assertEqual(naive_distances, binary_distances)
        self.assertEqual(pre_distances, binary_distances)
Beispiel #5
0
 def test_hamming_dist_all_different(self):
     """Both distance functions must report 3 for 'CAT' vs 'GGG'."""
     left, right = 'CAT', 'GGG'
     expected = 3
     self.assertEqual(expected, naive_hamming_distance(left, right))
     self.assertEqual(expected, hamming_distance(left, right))
Beispiel #6
0
 def test_hamming_dist_same(self):
     """Both distance functions must report 0 for two identical strings."""
     left, right = 'CAT', 'CAT'
     expected = 0
     self.assertEqual(expected, naive_hamming_distance(left, right))
     self.assertEqual(expected, hamming_distance(left, right))
Beispiel #7
0
 def test_haystack_generator(self):
     """A string from string_mutator(haystack, 3) must measure distance 3 from the haystack."""
     length, mutations = 6, 3
     haystack = string_generator(length)
     needle = string_mutator(haystack, mutations)
     # Both implementations are checked against the requested mutation count.
     self.assertEqual(mutations, naive_hamming_distance(needle, haystack))
     self.assertEqual(mutations, hamming_distance(needle, haystack))
def neighbors(pattern, d):
    """Build the list of candidate strings derived from ``pattern`` for mismatch budget ``d``.

    The list is built recursively from the neighbors of
    ``suffix(pattern)``: a suffix neighbor that is still below the budget
    (``hamming_distance(...) < d``) is prefixed with each of A, C, G, T,
    otherwise only with ``first_symbol(pattern)``.
    (Presumably ``suffix`` drops the first character and ``first_symbol``
    returns it - confirm against their definitions.)

    Args:
        pattern: the string to expand.
        d: the mismatch budget.

    Returns:
        A list of strings.  For ``d == 0`` this is ``[pattern]``; for a
        one-character pattern with ``d > 0`` it is the four nucleotides.
    """
    if d == 0:
        # Return a one-element list, not the bare string: callers iterate
        # over (or extend with) the result and would otherwise receive the
        # pattern's individual characters.
        return [pattern]
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    tail = suffix(pattern)
    head = first_symbol(pattern)
    neighborhood = []
    for candidate in neighbors(tail, d):
        if hamming_distance(tail, candidate) < d:
            # Budget left over: the first position may be any nucleotide.
            neighborhood.extend(nuc + candidate for nuc in 'ACGT')
        else:
            # Budget used up by the suffix: the first symbol must match.
            neighborhood.append(head + candidate)
    return neighborhood
def aprxPattern(text,pattern,d=0):
    """Count the approximate occurrences of ``pattern`` in ``text``.

    Args:
        text: nucleotide sequence to scan.
        pattern: pattern to be matched.
        d: maximum Hamming distance for a window to count as a match
            (default 0).

    Returns:
        The number of start positions ``i`` for which
        ``hamming_distance(pattern, text[i:i + len(pattern)]) <= d``.
        This is a count, not the list of positions.  It is 0 when
        ``text`` is shorter than ``pattern``.
    """
    k = len(pattern)
    return sum(
        1
        for i in range(len(text) - k + 1)
        if hamming_distance(pattern, text[i:i + k]) <= d
    )
def _imagediff_fail(*parts):
    """Write the space-joined ``parts`` to stderr and exit with status -1."""
    # sys.stderr.write works on both Python 2 and 3 and reproduces the
    # spacing of the former ``print >> sys.stderr, a, b`` statements.
    sys.stderr.write(" ".join(str(part) for part in parts) + "\n")
    sys.exit(-1)


def _imagediff_read(file_name):
    """Return the whole content of ``file_name``, or exit if it cannot be read."""
    try:
        # The context manager closes the handle even if read() fails, and
        # nothing is closed when open() itself raised.
        with open(file_name, "r") as handle:
            return handle.read()
    except IOError:
        _imagediff_fail("ERROR: Unable to open ", file_name)


def imagediff(method, file_name1, file_name2):
    """Return a normalised difference between two files.

    Args:
        method: ``"file size"``, ``"levenshtein"`` or ``"hamming"``.
        file_name1: path of the first file.
        file_name2: path of the second file.

    Returns:
        * ``"file size"``: absolute size difference divided by the larger
          size.
        * ``"levenshtein"``: ``levenshtein_distance`` of the two contents
          divided by the longer length.
        * ``"hamming"``: ``hamming_distance`` of the two contents divided
          by the shorter length.

        Whenever the divisor is zero the result is 1.

    Raises:
        SystemExit: with status -1, after a message on stderr, when a file
            cannot be accessed or ``method`` is not recognised.
    """
    if method == "file size":
        sizes = []
        for name in (file_name1, file_name2):
            try:
                sizes.append(os.path.getsize(name))
            except os.error:
                # Exit for either file; continuing would fail later on an
                # undefined size.
                _imagediff_fail("ERROR: Unable to access ", name)
        size1, size2 = sizes

        largest = max(size1, size2)
        if largest == 0:
            # Two empty files: same zero-divisor result as the other methods.
            return 1
        return float(abs(size1 - size2)) / largest

    # Both files are read before the method is validated, so an unreadable
    # file is reported ahead of an invalid method name.
    string1 = _imagediff_read(file_name1)
    string2 = _imagediff_read(file_name2)

    if method == "levenshtein":
        try:
            return float(levenshtein_distance(string1, string2)) / max(len(string1), len(string2))
        except ZeroDivisionError:
            return 1
    elif method == "hamming":
        try:
            return float(hamming_distance(string1, string2)) / min(len(string1), len(string2))
        except ZeroDivisionError:
            return 1
    else:
        _imagediff_fail("ERROR: Invalid method.")
def d(pattern, dna):
    """Return the sum, over all strings in ``dna``, of the smallest distance to ``pattern``.

    For each region, ``pattern`` is compared (via
    ``hamming.hamming_distance``) with every window of ``len(pattern)``
    characters and the lowest distance is kept; these minima are then
    added up over all regions.

    Args:
        pattern: the pattern to score.
        dna: iterable of strings to score the pattern against.

    Returns:
        The total of the per-region minimum distances.  A region that is
        shorter than ``pattern`` has no window to compare and contributes
        ``len(pattern) + 1``.
    """
    k = len(pattern)
    total_hd = 0
    for region in dna:
        window_distances = [
            hamming.hamming_distance(pattern, region[i:i + k])
            for i in range(len(region) - k + 1)
        ]
        # k + 1 is the starting upper bound: it is the value used when the
        # region has no full-length window.
        total_hd += min([k + 1] + window_distances)
    return total_hd
def h_dis(pattern, seq):
    """Return the k-mers of ``seq`` that are closest to ``pattern``.

    Every window of ``len(pattern)`` characters in ``seq`` is scored with
    ``hamming_distance(pattern, window)``; the windows sharing the lowest
    score are returned.

    Args:
        pattern: the reference pattern.
        seq: the sequence whose k-mers are scored.

    Returns:
        A list of ``(kmer, distance)`` tuples, one per distinct k-mer at
        the minimum distance.  The list is empty when ``seq`` is shorter
        than ``pattern``.
    """
    k = len(pattern)

    # Distance of each distinct k-mer in seq to the pattern.
    kmer_distance = {}
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        kmer_distance[kmer] = hamming_distance(pattern, kmer)

    if not kmer_distance:
        # No full-length window exists, so there is nothing to rank.
        return []

    smallest_hd = min(kmer_distance.values())
    return [
        (kmer, hd) for kmer, hd in kmer_distance.items()
        if hd == smallest_hd
    ]
Beispiel #13
0
def binary_inc_proccessing_time():
    """Benchmark ``hamming_distance`` on generated strings and print the fastest run.

    The setup code builds two lists of 10 strings with
    ``string_generator`` (imported from ``__main__``).  The timed
    statement picks one string from each list at random and passes the
    pair to ``hamming_distance``.  The statement is executed 10000 times
    per run, for 3 runs, and the minimum run time is printed.
    """
    # Code executed once per run, before timing starts.
    setup_code = (
        " \n"
        "from hamming import hamming_distance \n"
        "from random import choice\n"
        "from __main__ import string_generator\n"
        "list_of_strings1 = [string_generator() for i in range(10)]\n"
        "list_of_strings2 = [string_generator() for i in range(10)]"
    )

    # Code whose execution time is measured.
    test_code = (
        " \n"
        "s1 = choice(list_of_strings1)\n"
        "s2 = choice(list_of_strings2)\n"
        "hamming_distance(s1, s2)\n"
        "    "
    )

    timings = timeit.repeat(stmt=test_code,
                            setup=setup_code,
                            repeat=3,
                            number=10000)

    # Only the minimum of the runs is reported.
    fastest = min(timings)
    print('Binary hamming string search time (including preprocessing): {}'.
          format(fastest))
def print_possible_key_sizes(encoded_content):
    """Print each candidate key size with its normalised distance score, lowest first.

    For every key size in ``range(MIN_KEY_LENGTH, MAX_KEY_LENGTH)`` the
    content is cut into consecutive pairs of ``key_size``-long blocks.
    The ``hamming.hamming_distance`` of each pair is summed, and the sum
    is divided by the number of pairs times the key size.

    Args:
        encoded_content: the data to analyse; must be at least
            ``MAX_KEY_LENGTH * 4`` items long (checked with ``assert``).
    """
    assert len(encoded_content) >= MAX_KEY_LENGTH * 4

    scores = {}
    content_length = len(encoded_content)

    for key_size in range(MIN_KEY_LENGTH, MAX_KEY_LENGTH):
        # Use as many adjacent block pairs as the content allows.
        pair_count = int(content_length / (2 * key_size))

        dist = 0
        for pair in range(pair_count):
            start = pair * 2 * key_size
            middle = start + key_size
            end = middle + key_size
            dist += hamming.hamming_distance(encoded_content[start:middle],
                                             encoded_content[middle:end])

        # Average over the pairs and normalise by the key length.
        scores[key_size] = dist / float(pair_count * key_size)

    ranked = sorted(scores.items(), key=lambda entry: entry[1])
    for key_size, score in ranked:
        print("%d: %.02f" % (key_size, score))
Beispiel #15
0
        cur_ctext = line
print("Key = %s" % hex(cur_key))
print("Score = %f" % largest)
print("Ciphertext = %s" % cur_ctext)
print("Plaintext = %s" % cur_ptext)

#Challenge 5
print("\n\nChallenge 5: Repeating Key XOR Encryption")
ptext = "Burning 'em, if you ain't quick and nimble\nI go crazy when I hear a cymbal"
key = "ICE"
print("Encrypting %s with key %s" % (ptext, key))
print(xor_encrypt(ptext, key))

#Challenge 6
print("\n\nChallenge 6: Break Repeating Key XOR Encrpytion")
# Self-check of hamming_distance on a pair whose expected distance is 37,
# before it is used on the ciphertext.
distance = hamming_distance("this is a test", "wokka wokka!!!")

if distance != 37:
    print("Distance = %d" % distance)
    raise ValueError(
        "hamming_distance self-check failed: expected 37, got %d" % distance)

# Read the base64 text through a context manager so the file handle is
# closed as soon as the content has been decoded.
with open("./6.txt", "r") as ciphertext_file:
    ciphertext = base64.b64decode(ciphertext_file.read())

smallest_distance = 1000
cur_keysize = 0
for keysize in range(2, 41):
    block1 = ciphertext[0:keysize]
    block2 = ciphertext[keysize:keysize * 2]
    block3 = ciphertext[keysize * 2:keysize * 3]
    block4 = ciphertext[keysize * 3:keysize * 4]