Example #1
def test_dl(self):
    self.assertEqual(dam_lev('', ''), 0)
    self.assertEqual(dam_lev('', 'a'), 1)
    self.assertEqual(dam_lev('a', ''), 1)
    self.assertEqual(dam_lev('a', 'b'), 1)
    self.assertEqual(dam_lev('a', 'ab'), 1)
    self.assertEqual(dam_lev('ab', 'ba'), 1)
    self.assertEqual(dam_lev('ab', 'bca'), 2)
    self.assertEqual(dam_lev('bca', 'ab'), 2)
    self.assertEqual(dam_lev('ab', 'bdca'), 3)
    self.assertEqual(dam_lev('bdca', 'ab'), 3)
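Unlike plain Levenshtein distance, dam_lev counts an adjacent transposition as a single edit, which is why 'ab' -> 'ba' costs 1 above. A quick standalone check (a minimal sketch, assuming the weighted_levenshtein package these examples use):

from weighted_levenshtein import dam_lev, lev

print(lev('ab', 'ba'))      # 2.0: plain Levenshtein needs two substitutions
print(dam_lev('ab', 'ba'))  # 1.0: one adjacent transposition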
Example #3
import numpy as np
from weighted_levenshtein import dam_lev


def data_print(answer, recWord):
    # All-ones substitution costs are the same as the library default.
    substitute_costs = np.ones((128, 128), dtype=np.float64)
    for i in answer:
        print("The word may be: {}; the edit distance is: {} characters".format(
            i, int(dam_lev(recWord, i, substitute_costs=substitute_costs))))
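The cost matrices in weighted_levenshtein are 128x128 float64 arrays indexed by ASCII code, so individual character pairs can be re-weighted. A small sketch of a non-uniform matrix (the 'o' -> '0' discount is an invented illustration):

import numpy as np
from weighted_levenshtein import dam_lev

substitute_costs = np.ones((128, 128), dtype=np.float64)
substitute_costs[ord('o'), ord('0')] = 0.1  # make 'o' -> '0' substitutions cheap
print(dam_lev('cool', 'c00l', substitute_costs=substitute_costs))  # 0.2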
Example #4

def weighted_ed_rel(self, cand, token, del_costs, ins_costs, sub_costs,
                    trans_costs):
    # Weighted edit distance, normalised by the length of the token.
    w_editdist = dam_lev(token,
                         cand,
                         delete_costs=del_costs,
                         insert_costs=ins_costs,
                         substitute_costs=sub_costs,
                         transpose_costs=trans_costs)
    rel_w_editdist = w_editdist / len(token)
    return rel_w_editdist
Example #5
def __score(self, wrong_word, candidate):
    # Weighted Damerau-Levenshtein distance, normalised by the longer string.
    dl_dist = dam_lev(wrong_word,
                      candidate,
                      insert_costs=self.insert_costs,
                      substitute_costs=self.substitute_costs,
                      delete_costs=self.delete_costs,
                      transpose_costs=self.transpose_costs) / \
        max(len(wrong_word), len(candidate))
    # Unseen candidates get a smoothed prior of k/N.
    log_prior = (self.priors[candidate] if candidate in self.priors
                 else math.log(float(self.k) / self.N))
    return -dl_dist + self.lamda * log_prior
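Example #5 scores candidates noisy-channel style: the negated, length-normalised edit distance plus a weighted log-prior. A minimal standalone sketch with invented priors and constants (priors, k, N, and lamda stand in for the instance attributes above):

import math
from weighted_levenshtein import dam_lev

priors = {'hello': math.log(0.01)}  # log-priors of known words (made up)
k, N, lamda = 1, 1000000, 0.5       # smoothing constants and mixing weight

def score(wrong_word, candidate):
    dl_dist = dam_lev(wrong_word, candidate) / max(len(wrong_word),
                                                   len(candidate))
    log_prior = priors.get(candidate, math.log(float(k) / N))
    return -dl_dist + lamda * log_prior

print(score('helo', 'hello'))  # the less negative, the better the candidate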
Example #6

def edit_distance(edp: EditDistanceParams, a: str, b: str,
                  error: bool) -> np.float64:
    """Damerau-Levenshtein edit distance between two pixel strings."""
    res = weighted_levenshtein.dam_lev(
        a,
        b,
        insert_costs=edp.insert_costs,
        delete_costs=edp.delete_costs,
        substitute_costs=(edp.error_substitute_costs
                          if error else edp.substitute_costs),
    )

    # Make sure result can fit in a uint16
    assert (0 <= res < 2**16), res
    return res
Example #7

count = 0

# read wiki misspell file line by line
# for line in f_mis:
while count < 4453:
    string = f_mis_array[count].strip()
    bestv = 10000000
    bests = ""
    smallest_list = []
    # compare with dict lines
    for num in range(len(string) - 2, len(string) + 3):
        for word in f_len_dict_array[str(num)]:
            # for entry in my_dict:
            # string is lines in target doc
            # entry is lines in dict
            thisv = dam_lev(string, word.strip(), delete_costs=higher_costs_1)
            #  thisv = editdistance.eval(string, word.strip())
            if (thisv > 0 and thisv < bestv):
                bestv = thisv
                bests = word.strip()
                smallest_list = [bests]
            elif (thisv == bestv):
                smallest_list.append(word.strip())

    print({
        "result_s": smallest_list,
        "distance": bestv,
        "match": f_cor_array[count].strip() in smallest_list,
        "original": f_cor_array[count].strip()
    })
    count += 1  # advance to the next misspelled word
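The ±2 window in range(len(string) - 2, len(string) + 3) only works because f_len_dict_array buckets the dictionary words by length under string keys. A hypothetical sketch of building such an index (the file name dictionary.txt is a placeholder; a defaultdict keeps missing lengths from raising KeyError):

from collections import defaultdict

f_len_dict_array = defaultdict(list)
with open('dictionary.txt') as f_dict:
    for word in f_dict:
        # Key each word by its length so lookups only scan
        # candidates of a similar length.
        f_len_dict_array[str(len(word.strip()))].append(word)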
Example #8
# RG3 to RG4
# RG3 to RG5
# RG4 to RG5

import xml.etree.ElementTree as ET
import numpy as np
from weighted_levenshtein import dam_lev

Root = ET.Element("xml")

f = open('spineData-ascii.txt')
f.readline()  # read and ignore the first line
for line in f:  # iterate over the remaining lines
    v = line.split('\t')
    app = v[0]
    rg1_2 = v[1] + '::' + v[3]
    if '#fMS' in rg1_2:
        print('#fMS is here')
        # Transpositions cost 0.25, four times cheaper than other edits.
        transpose_costs = np.full((128, 128), 0.25, dtype=np.float64)
        dist1_2 = dam_lev(v[2], v[4], transpose_costs=transpose_costs)
    else:
        dist1_2 = dam_lev(v[2], v[4])  # default uniform costs

    rg1_3 = v[1] + '::' + v[5]
    if '#fMS' in rg1_3:
        print('#fMS is here')
        transpose_costs = np.full((128, 128), 0.25, dtype=np.float64)
        dist1_3 = dam_lev(v[2], v[6], transpose_costs=transpose_costs)
    else:
        dist1_3 = dam_lev(v[2], v[6])  # default uniform costs

    rg1_4 = v[1] + '::' + v[7]
    if '#fMS' in rg1_4:
        print('#fMS is here')
        transpose_costs = np.full((128, 128), 0.25, dtype=np.float64)
        dist1_4 = dam_lev(v[2], v[8], transpose_costs=transpose_costs)
    else:
        dist1_4 = dam_lev(v[2], v[8])  # default uniform costs
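Setting every transposition cost to 0.25, as Example #8 does for the '#fMS' rows, makes a swap four times cheaper than an insertion, deletion, or substitution. A quick check of the effect:

import numpy as np
from weighted_levenshtein import dam_lev

transpose_costs = np.full((128, 128), 0.25, dtype=np.float64)
print(dam_lev('ab', 'ba'))                                   # 1.0, uniform costs
print(dam_lev('ab', 'ba', transpose_costs=transpose_costs))  # 0.25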
Example #9

count = 0

# read wiki misspell file line by line
# for line in f_mis:
while count < 4453:
    string = f_mis_array[count].strip()
    bestv = 10000000
    bests = ""
    smallest_list = []
    # compare with dict lines
    for num in range(len(string) - 2, len(string) + 3):
        for word in f_len_dict_array[str(num)]:
            # for entry in my_dict:
            # string is lines in target doc
            # entry is lines in dict
            thisv = dam_lev(string, word.strip(), insert_costs=higher_costs_1)
            #  thisv = editdistance.eval(string, word.strip())
            if (thisv > 0 and thisv < bestv):
                bestv = thisv
                bests = word.strip()
                smallest_list = [bests]
            elif (thisv == bestv):
                smallest_list.append(word.strip())

    print({
        "result_s": smallest_list,
        "distance": bestv,
        "match": f_cor_array[count].strip() in smallest_list,
        "original": f_cor_array[count].strip()
    })
    count += 1  # advance to the next misspelled word
Example #10
def _dl(self, x, y):
    # Positional cost arguments: insert, delete, substitute, transpose.
    return dam_lev(x, y, self.iw, self.dw, self.sw, self.tw)
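In weighted_levenshtein, insert and delete costs are 1-D float64 arrays of length 128 (one entry per ASCII code), while substitute and transpose costs are 128x128 matrices. A sketch of building weights for a wrapper like the one above (the half-price 'e' insertion is an invented illustration):

import numpy as np
from weighted_levenshtein import dam_lev

iw = np.ones(128, dtype=np.float64)         # insert costs, per character
dw = np.ones(128, dtype=np.float64)         # delete costs, per character
sw = np.ones((128, 128), dtype=np.float64)  # substitute costs
tw = np.ones((128, 128), dtype=np.float64)  # transpose costs

iw[ord('e')] = 0.5  # inserting an 'e' is half price
print(dam_lev('hllo', 'hello', iw, dw, sw, tw))  # 0.5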
Example #11
pickle.dump(
    str_build_orders,
    open("data/build_orders/TvZ_build_orders_{}_{}.p".format(n, l), "wb"))
pickle.dump(mmrs, open("data/mmr/TvZ_mmr_{}.p".format(n), "wb"))
pickle.dump(player_ids,
            open("data/player_ids/TvZ_player_ids_{}.p".format(n), "wb"))

i = 0
for y in range(n):
    print(i, "/", n)
    for x in range(n):
        if y == x:
            distance = 0
        else:
            distance = dam_lev(str1=str_build_orders[x],
                               str2=str_build_orders[y],
                               insert_costs=insert_costs,
                               delete_costs=delete_costs,
                               substitute_costs=substitute_costs,
                               transpose_costs=transpose_costs)
        distance = int(distance)
        D[y][x] = distance
        D[x][y] = distance
        #print(str_build_orders[x])
        #print(str_build_orders[y])
        #print(distance)
    i += 1

print("Storing distance matrix")
pickle.dump(D, open("data/distance_matrix/TvZ_{}_{}.p".format(n, l), "wb"))
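The double loop above visits every (x, y) pair twice and writes both D[y][x] and D[x][y] each time, which already assumes the distance is symmetric. Under that same assumption (equal insert and delete costs, symmetric substitute and transpose matrices, D initialised to zeros), a sketch that computes each pair once:

for y in range(n):
    for x in range(y):  # strictly below the diagonal; the diagonal stays 0
        d = int(dam_lev(str1=str_build_orders[x],
                        str2=str_build_orders[y],
                        insert_costs=insert_costs,
                        delete_costs=delete_costs,
                        substitute_costs=substitute_costs,
                        transpose_costs=transpose_costs))
        D[y][x] = d
        D[x][y] = d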
Example #12

count = 2779

# read wiki misspell file line by line
# for line in f_mis:
while count < 4453:
    string = f_mis_array[count].strip()
    bestv = 10000000
    bests = ""
    smallest_list = []
    # compare with dict lines
    for num in range(len(string)-2, len(string)+3):
        for word in f_len_dict_array[str(num)]:
            # for entry in my_dict:
            # string is lines in target doc
            # entry is lines in dict
            thisv = dam_lev(string, word.strip(),
                            transpose_costs=higher_costs_2,
                            insert_costs=higher_costs_1)
            #  thisv = editdistance.eval(string, word.strip())
            if (thisv > 0 and thisv < bestv):
                bestv = thisv
                bests = word.strip()
                smallest_list = [bests]
            elif (thisv == bestv):
                smallest_list.append(word.strip())

    result = {
        "result_s": smallest_list,
        "distance": bestv,
        "match": f_cor_array[count].strip() in smallest_list,
        "original": f_cor_array[count].strip()
    }
    print(result)

    # save to res dict
    res_dict[string] = result
    count += 1
    print("count", count)