def test_compute_alignment_matrix(self):
     """
     Check compute_alignment_matrix against hand-computed tables.

     Each case aligns a short sequence against 'cab' over the alphabet
     {a, b, c} with scores diag=10, off-diag=5, dash=-1, once with
     global_flag=True and once with global_flag=False.
     """
     # (seq_x, expected table for global_flag=True, expected for False)
     cases = [
         ('a',
          [[0, -1, -2, -3], [-1, 5, 9, 8]],
          [[0, 0, 0, 0], [0, 5, 10, 9]]),
         ('cc',
          [[0, -1, -2, -3], [-1, 10, 9, 8], [-2, 9, 15, 14]],
          [[0, 0, 0, 0], [0, 10, 9, 8], [0, 10, 15, 14]]),
     ]
     for seq_x, expected_global, expected_local in cases:
         # A fresh scoring matrix is built for every case.
         scoring = project4.build_scoring_matrix(
             set(['a', 'b', 'c']), 10, 5, -1)
         global_table = project4.compute_alignment_matrix(
             seq_x, 'cab', scoring, global_flag=True)
         local_table = project4.compute_alignment_matrix(
             seq_x, 'cab', scoring, global_flag=False)
         self.assertEqual(global_table, expected_global)
         self.assertEqual(local_table, expected_local)
 def test_build_scoring_matrix(self):
     """
     Check the keys and a few entries of build_scoring_matrix.

     The same expectations are applied to an alphabet without the dash
     character and to one that already contains it: in both cases the
     resulting keys must be exactly '-', 'a', 'b', 'c'.
     """
     alphabets = (['a', 'b', 'c'], ['a', 'b', 'c', '-'])
     for letters in alphabets:
         scores = project4.build_scoring_matrix(set(letters), 10, 4, -1)
         self.assertEqual(sorted(scores.keys()), ['-', 'a', 'b', 'c'])
         # Matching letters, differing letters, letter against dash.
         self.assertEqual(scores['a']['a'], 10)
         self.assertEqual(scores['a']['b'], 4)
         self.assertEqual(scores['a']['-'], -1)
Beispiel #3
0
def edit_dist(xs, ys):
    """
    Return len(xs) + len(ys) minus the global alignment score of xs and ys.

    The alignment uses scores diag=2, off-diag=1, dash=0.
    """
    # NOTE(review): ascii_lowercase is presumably string.ascii_lowercase
    # imported by name -- confirm against the file's imports.
    scoring = build_scoring_matrix(ascii_lowercase, 2, 1, 0)
    # The final True requests a global (not local) alignment matrix.
    global_table = compute_alignment_matrix(xs, ys, scoring, True)
    best_score, _, _ = compute_global_alignment(xs, ys, scoring,
                                                global_table)
    combined_length = len(xs) + len(ys)
    return combined_length - best_score
Beispiel #4
0
def find_scoring_matrix(x, y, med, dim):
    """
    Search for scoring parameters under which the global alignment score
    satisfies the minimum-edit-distance identity
    med = |x| + |y| - score(x, y).

    Inputs:
        x, y: english strings
        med: minimum edit distance between x, y
        dim: number of values tried for each of diag_score, off_score and
            dash_score; diag and off range over 0..dim-1, dash over
            0..-(dim-1)

    Returns:
        2-D array with one row (diag_score, off_score, dash_score) per
        parameter combination whose score equals |x| + |y| - med.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')

    # Example: med(kitten, sitting) = 3, so the target score is 6 + 7 - 3 = 10
    target_score = len(x) + len(y) - med
    scores = np.zeros((dim, dim, dim))
    # ndindex walks the grid in the same order as three nested range loops.
    for diag, off, dash in np.ndindex(dim, dim, dim):
        scoring = seq.build_scoring_matrix(alphabet, diag, off, -dash)
        table = seq.compute_alignment_matrix(x, y, scoring)
        alignment = seq.compute_global_alignment(x, y, scoring, table)
        scores[diag, off, dash] = alignment[0]

    matches = np.argwhere(scores == target_score)
    # The third axis indexes the magnitude of the (non-positive) dash score.
    matches[:, 2] *= -1
    return matches
def edit_dist(xs, ys):
    '''
    Helper function for Question 8.

    Returns len(xs) + len(ys) minus the global alignment score of xs and
    ys, using scores diag=2, off-diag=1, dash=0.
    '''
    scoring = project4.build_scoring_matrix(
        'abcdefghijklmnopqrstuvwxyz', 2, 1, 0)
    # True selects the global variant of the alignment matrix.
    table = project4.compute_alignment_matrix(xs, ys, scoring, True)
    # Only the score is needed; the other two returned values are ignored.
    best_score, _, _ = project4.compute_global_alignment(
        xs, ys, scoring, table)
    combined_length = len(xs) + len(ys)
    return combined_length - best_score
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words in word_list whose edit distance to
    checked_word is at most dist.

    The edit distance is computed as |x| + |y| - score(x, y), where score
    is the global alignment score under diag_score = 2,
    off_diag_score = 1, dash_score = 0.
    """
    scoring = project4.build_scoring_matrix(
        set("abcdefghijklmnopqrstuvwxyz"), 2, 1, 0)

    def _distance(candidate):
        # Edit distance between checked_word and one dictionary word.
        table = project4.compute_alignment_matrix(
            checked_word, candidate, scoring, True)
        best_score, _, _ = project4.compute_global_alignment(
            checked_word, candidate, scoring, table)
        return len(checked_word) + len(candidate) - best_score

    return set(word for word in word_list if _distance(word) <= dist)
Beispiel #7
0
def check_spelling(checked_word, dist, word_list):
    """
    Function for Question 8: find words close to checked_word.

    If checked_word itself appears in word_list it is returned unchanged
    (as a string, not a list). Otherwise returns the list of words in
    word_list for which len(checked_word) + len(word) - score <= dist,
    where score is the global alignment score under diag=2, off-diag=1,
    dash=0. The number of alignments actually computed is printed.

    Two cheap filters avoid computing alignments that cannot succeed:
    the word's length must lie within dist of len(checked_word), and the
    word may contain at most dist distinct letters that checked_word
    lacks.
    """
    # A word already in the list is reported as-is, before any scoring.
    if checked_word in word_list:
        return checked_word

    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)
    checked_word_chars = set(checked_word)
    min_len = len(checked_word) - dist
    max_len = len(checked_word) + dist

    words = []
    num_checks = 0
    for each_word in word_list:
        if not min_len <= len(each_word) <= max_len:
            continue
        # Every distinct letter missing from checked_word needs at least
        # one insertion or substitution, so more than dist of them rules
        # the word out. (A fixed cutoff of 2 here would wrongly drop
        # valid words whenever dist > 2.)
        num_diffs = len(set(each_word) - checked_word_chars)
        if num_diffs > dist:
            continue

        align_matrix = project4.compute_alignment_matrix(
            checked_word, each_word, score_matrix, True)
        result = project4.compute_global_alignment(
            checked_word, each_word, score_matrix, align_matrix)
        if len(checked_word) + len(each_word) - result[0] <= dist:
            words.append(each_word)
        num_checks += 1
    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(num_checks)
    return words
Beispiel #8
0
def check_spelling(checked_word, dist, word_list):
    """
    Returns the set of words from word_list that are within edit distance
    dist of checked_word.

    The edit distance is taken as len(checked_word) + len(word) - score,
    where score is the global alignment score under diag=2, off-diag=1,
    dash=0.
    """
    # Nothing to compare against: no scoring matrix is needed at all.
    if not word_list:
        return set([])

    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    # The scoring matrix does not depend on the word being tested, so it
    # is built once instead of once per word.
    smtrx = seq.build_scoring_matrix(alphabet, 2, 1, 0)
    candidates = set([])

    for word in word_list:
        # NOTE(review): no global flag is passed, so this relies on the
        # default of seq.compute_alignment_matrix -- confirm it is global.
        amtrx = seq.compute_alignment_matrix(checked_word, word, smtrx)
        score = seq.compute_global_alignment(
            checked_word, word, smtrx, amtrx)[0]
        if len(checked_word) + len(word) - score <= dist:
            candidates.add(word)

    return candidates
def question7():
    """
    Question 7

    Globally aligns 'abcde' with 'xycdefg' under scores diag=2,
    off-diag=1, dash=0 and prints both strings, the alignment result, and
    len(test1) + len(test2) - score.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)
    test1 = 'abcde'
    test2 = 'xycdefg'
    align_matrix = project4.compute_alignment_matrix(
        test1, test2, score_matrix, True)
    result = project4.compute_global_alignment(
        test1, test2, score_matrix, align_matrix)
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid Python 3.
    print(test1)
    print(test2)
    print(result)
    print(len(test1) + len(test2) - result[0])
Beispiel #10
0
def calculate_edit_distance(xseq, yseq):
    '''
    Return the edit distance of xseq and yseq, computed as
    len(xseq) + len(yseq) minus their global alignment score under
    scores diag=2, off-diag=1, dash=0.
    http://en.wikipedia.org/wiki/Edit_distance
    '''
    # Lowercase letters plus the dash character.
    symbols = set('abcdefghijklmnopqrstuvwxyz-')
    scoring = project4.build_scoring_matrix(symbols, 2, 1, 0)
    table = project4.compute_alignment_matrix(xseq, yseq, scoring, True)
    alignment = project4.compute_global_alignment(
        xseq, yseq, scoring, table)
    # alignment[0] is the score of the alignment.
    return len(xseq) + len(yseq) - alignment[0]
def check_spelling(checked_word, dist, word_list):
    """
    Function for Question 8: find words close to checked_word.

    If checked_word itself appears in word_list it is returned unchanged
    (as a string, not a list). Otherwise returns the list of words in
    word_list for which len(checked_word) + len(word) - score <= dist,
    where score is the global alignment score under diag=2, off-diag=1,
    dash=0. The number of alignments actually computed is printed.

    Before aligning, words are skipped when their length differs from
    len(checked_word) by more than dist, or when they contain more than
    dist distinct letters that checked_word lacks.
    """
    # An exact hit short-circuits everything else.
    if checked_word in word_list:
        return checked_word

    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)
    checked_word_chars = set(checked_word)
    checked_len = len(checked_word)

    words = []
    num_checks = 0
    for each_word in word_list:
        if abs(len(each_word) - checked_len) > dist:
            continue
        # Each distinct letter absent from checked_word costs at least one
        # edit, so the cutoff must follow dist; a hard-coded cutoff of 2
        # rejects valid matches when dist > 2.
        num_diffs = len(set(each_word) - checked_word_chars)
        if num_diffs > dist:
            continue

        align_matrix = project4.compute_alignment_matrix(
            checked_word, each_word, score_matrix, True)
        result = project4.compute_global_alignment(
            checked_word, each_word, score_matrix, align_matrix)
        if checked_len + len(each_word) - result[0] <= dist:
            words.append(each_word)
        num_checks += 1
    # print(...) with one argument works on both Python 2 and Python 3.
    print(num_checks)
    return words
def check_spelling(check_word, dist, word_list):
    """
    Check the spelling of check_word against a dictionary.
    :param check_word: word to check
    :param dist: maximum edit distance allowed
    :param word_list: list of words (dictionary)
    :return: list of words from word_list whose edit distance to
             check_word is at most 'dist', in word_list order
    """
    letters = list(string.ascii_lowercase)
    scoring = student.build_scoring_matrix(letters, 2, 1, 0)

    def _distance(candidate):
        # len(x) + len(y) - global score, with the dictionary word first.
        table = student.compute_alignment_matrix(
            candidate, check_word, scoring, True)
        alignment = student.compute_global_alignment(
            candidate, check_word, scoring, table)
        return len(candidate) + len(check_word) - alignment[0]

    return [candidate for candidate in word_list
            if _distance(candidate) <= dist]
def check_spelling(check_word, dist, word_list):
    """
    Collect the dictionary words that are close to check_word.
    :param check_word: word to check
    :param dist: largest edit distance still accepted
    :param word_list: list of words (dictionary)
    :return: list (not a set) of the words from word_list whose edit
             distance to check_word does not exceed 'dist'
    """
    matches = []
    scoring = student.build_scoring_matrix(
        list(string.ascii_lowercase), 2, 1, 0)
    target_len = len(check_word)

    for candidate in word_list:
        table = student.compute_alignment_matrix(
            candidate, check_word, scoring, True)
        best_score = student.compute_global_alignment(
            candidate, check_word, scoring, table)[0]
        # Skip words whose edit distance exceeds the allowed maximum.
        if len(candidate) + target_len - best_score > dist:
            continue
        matches.append(candidate)

    return matches
 def check_spelling(checked_word, dist, word_list):
     """
     Return the set of all words in word_list that are within edit
     distance dist of the string checked_word.

     The edit distance is len(checked_word) + len(word) minus the global
     alignment score under scores diag=2, off-diag=1, dash=0.
     """
     scoring = project4.build_scoring_matrix(
         'abcdefghijklmnopqrstuvwxyz', 2, 1, 0)
     base_length = len(checked_word)

     def _is_close(candidate):
         # True when candidate lies within dist edits of checked_word.
         table = project4.compute_alignment_matrix(
             checked_word, candidate, scoring, True)
         alignment = project4.compute_global_alignment(
             checked_word, candidate, scoring, table)
         return base_length + len(candidate) - alignment[0] <= dist

     return set(word for word in word_list if _is_close(word))
Beispiel #15
0
def question7():
    """
    Question 7

    Computes the global alignment of 'abcde' and 'xycdefg' with scores
    diag=2, off-diag=1, dash=0, then prints the two strings, the
    alignment result, and the sum of their lengths minus the score.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)
    test1 = 'abcde'
    test2 = 'xycdefg'
    align_matrix = project4.compute_alignment_matrix(test1, test2,
                                                     score_matrix, True)
    result = project4.compute_global_alignment(test1, test2, score_matrix,
                                               align_matrix)
    # Parenthesised single-argument prints give identical output under
    # Python 2 and are valid syntax under Python 3.
    print(test1)
    print(test2)
    print(result)
    print(len(test1) + len(test2) - result[0])
def check_spelling(checked_word, dist, word_list):
    """
    Collect every word of word_list that lies within edit distance dist
    of checked_word.

    The edit distance is len(checked_word) + len(word) minus the global
    alignment score under scores diag=2, off-diag=1, dash=0.

    Parameters
    ----------
    checked_word: str
    the word to be checked

    dist: int
    the largest edit distance accepted

    word_list: list
    a list of words

    Returns
    -------
    result: list
    the words of word_list, in their original order, whose edit
    distance to checked_word is at most dist.
    """
    score_mat = build_scoring_matrix("abcdefghijklmnopqrstuvwxyz", 2, 1, 0)
    checked_len = len(checked_word)

    def _distance(candidate):
        # Edit distance between checked_word and a single candidate word.
        table = compute_alignment_matrix(checked_word, candidate,
                                         score_mat, True)
        alignment = compute_global_alignment(checked_word, candidate,
                                             score_mat, table)
        return checked_len + len(candidate) - alignment[0]

    return [word for word in word_list if _distance(word) <= dist]
Beispiel #17
0
def edit_distance(seq_x, seq_y):
    """
    Return len(seq_x) + len(seq_y) minus the global alignment score of
    the two sequences, using scores diag=2, off-diag=1, dash=0.
    """
    scoring = project4.build_scoring_matrix(
        string.ascii_lowercase, 2, 1, 0)
    # True selects the global alignment matrix.
    table = project4.compute_alignment_matrix(seq_x, seq_y, scoring, True)
    alignment = project4.compute_global_alignment(
        seq_x, seq_y, scoring, table)
    combined_length = len(seq_x) + len(seq_y)
    # The first element of the alignment result is its score.
    return combined_length - alignment[0]