Example #1
0
 def test_distances(self):
     """Check `distance` against hand-computed values for Cyrillic inputs.

     Covers identical strings, one substitution, one insertion, one
     deletion, fully different strings, an empty operand and two empty
     operands.  `assertEqual` is used so that a failure reports the
     actual distance instead of just "False is not true".
     """
     self.assertEqual(distance("слово", "слово"), 0)
     self.assertEqual(distance("слова", "слово"), 1)
     self.assertEqual(distance("удаление", "удаленние"), 1)
     self.assertEqual(distance("вставка", "вствка"), 1)
     self.assertEqual(distance("замена", "запена"), 1)
     self.assertEqual(distance("ячсмит", "йцукен"), 6)
     self.assertEqual(distance("", "кверти"), 6)
     self.assertEqual(distance("вместе", "местее"), 2)
     self.assertEqual(distance("", ""), 0)
Example #2
0
def HierarchicalCluster(datafile):
    """Cluster the labelled texts in *datafile* and show a dendrogram.

    Each line of the UTF-8 file is split on three consecutive spaces;
    column 0 is used as the label and column 1 as the text.  A full
    pairwise matrix of ``distance(text_i, text_j)`` is built, passed to
    ``ward`` and the resulting linkage is plotted with ``dendrogram``.

    Intermediate data (rows, labels, texts, every pairwise value, the
    matrix and the linkage) are printed to stdout.

    Raises:
        IndexError: if a line does not contain the three-space separator.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original left the codecs.open() handle open.
    with codecs.open(datafile, "r", "utf-8") as handle:
        data_list = [line.rstrip('\n').split('   ') for line in handle]
    print(data_list)
    label_list = [row[0] for row in data_list]
    text_list = [row[1] for row in data_list]
    print(label_list)
    print(text_list)

    sim_matrix = []
    for i, pivot in enumerate(text_list):
        sim_row = []
        for j, other in enumerate(text_list):
            sim = distance(pivot, other)  # calculate similarity (distance)
            print('n{}, n{} : {}'.format(i, j, sim))
            sim_row.append(sim)
        sim_matrix.append(sim_row)

    print('-------------------------')
    print('matrix: {}'.format(sim_matrix))
    # NOTE(review): if `ward` is scipy.cluster.hierarchy.ward, a square 2-D
    # input is interpreted as observation vectors, not as a distance
    # matrix -- confirm this is the intended use.
    linkage_matrix = ward(sim_matrix)
    print('--------------------------')
    print(linkage_matrix)
    dendrogram(linkage_matrix, labels=label_list)
    plt.show()
def resemblance(username, password):
    """Return the edit distance between *password* and the stored password
    of *username*, divided by the length of the longer of the two.

    The stored password is obtained from ``get_password(username)`` and
    the edit distance from ``Lev.distance``.  A result of 0 means the two
    strings are identical; larger values mean less resemblance.
    """
    truepass = get_password(username)
    longest = max(len(password), len(truepass))
    if longest == 0:
        # Both strings are empty: they are identical, and dividing by the
        # length would raise ZeroDivisionError.
        return 0.0
    return Lev.distance(password, truepass) / longest
Example #4
0
def getSimilarDevices(device):
    """Return CCDB slot names similar to *device*, closest first.

    *device* must be a ``"<slot>:<name>"`` string containing exactly one
    colon.  All slot names are fetched from the CCDB ``slots/`` endpoint
    (production or test prefix depending on ``glob.production``); the
    names starting with the slot part of *device* are kept and ranked by
    ``levenshtein.distance(device, name)``.

    Returns:
        A list of ``(distance, name)`` tuples sorted in ascending order.

    Raises:
        AssertionError: if *device* is not a ``str`` or does not contain
            exactly one ``":"``.
    """
    assert isinstance(device, str)
    assert device.count(":") == 1, "bad formatting of device name"

    slot, _ = device.split(":")

    prefix = PREFIX_CCDB if glob.production else PREFIX_CCDB_TEST
    url = prefix + "slots/"

    # NOTE(security): certificate verification is disabled here; the
    # original comment attributes this to the server's unsigned
    # certificate.  Re-enable it once the server has a trusted certificate.
    request = requests.get(url, verify=False)
    slots = json.loads(request.text)["slot"]

    # Names of all devices in CCDB, converted to str, restricted to those
    # that start with the requested slot.
    candidates = [
        name
        for name in (str(entry["name"]) for entry in slots)
        if name.startswith(slot)
    ]

    # sorted() works on any iterable; the original called .sort() on the
    # result of map(), which is an iterator (no .sort) on Python 3.
    return sorted(
        (levenshtein.distance(device, name), name) for name in candidates)
def main():
    """Compare every file in ``set/`` against every file in ``115/``.

    For each pair of files the text is lower-cased and split into
    sentences on ``'.'``.  Sentences of the first file (at least 16
    characters long) that occur verbatim in the second file are printed
    as exact matches.  Sentence pairs whose word-level ``distance`` is
    below 75% of the shorter word count (and whose shorter sentence has
    at least 16 characters) are counted as similar.  When the percentage
    of similar pairs relative to the number of sentences in the first
    file is non-zero, it is appended to ``result.txt`` and reported on
    stdout.
    """
    path1 = 'set'
    path2 = '115'
    for name1 in os.listdir(path1):
        # Test the entry inside its own directory (the original tested the
        # bare name against the current working directory) and skip
        # sub-directories instead of falling through with a stale path.
        if os.path.isdir(os.path.join(path1, name1)):
            continue
        paths = "set/" + name1
        # read the file
        with open(paths, 'r', encoding='ISO-8859-1') as f:
            all_text1 = f.read().lower()
        sentences1 = [x.strip() for x in all_text1.split('.')]

        for name2 in os.listdir(path2):
            if os.path.isdir(os.path.join(path2, name2)):
                continue
            pathss = "115/" + name2
            with open(pathss, 'r', encoding='ISO-8859-1') as f:
                all_text2 = f.read().lower()
            sentences2 = [x.strip() for x in all_text2.split('.')]
            # Split once per file pair instead of once per sentence pair.
            words2 = [sj.split() for sj in sentences2]

            # search matching sentences
            for s1 in sentences1:
                if len(s1) < 16:
                    continue
                if rabin_karp(all_text2, s1) != -1:
                    print('match:', s1)

            # search similar sentences
            similar = 0
            for si in sentences1:
                wordsi = si.split()
                for sj, wordsj in zip(sentences2, words2):
                    # Cheap length filter first, so the distance is only
                    # computed for pairs that can qualify.
                    if min(len(si), len(sj)) < 16:
                        continue
                    d = distance(wordsi, wordsj)
                    if d < min(len(wordsi), len(wordsj)) * .75:
                        similar += 1
                        print('similar:\nsi: {}\nsj: {}'.format(si, sj))

            # sentences1 always has at least one element (str.split never
            # returns an empty list), so this division is safe.
            percentage = 100 * similar / len(sentences1)
            if percentage != 0:
                with open('result.txt', 'a') as file_object:
                    file_object.write(str(percentage) + '\t')
                    file_object.write(paths + '-----' + pathss + '\n')
                print('Similar phrases: {similar} ({percentage:.2f}%)'.format(
                    similar=similar, percentage=percentage))
                print(' "{file1}" found "{file2}"'.format(file1=paths,
                                                          file2=pathss))
                print('\n---------------------------------------------')
def score_of_words(w1, w2):
    """Return 1 when *w1* and *w2* are close in edit distance, else 0.

    Two non-empty words score 1 when their ``levenshtein.distance`` is at
    most 4 and at most 40% of the longer word's length.  If either word
    is empty the score is 0.
    """
    edit_distance = levenshtein.distance(w1, w2)
    shortest = min(len(w1), len(w2))
    longest = max(len(w1), len(w2))
    if shortest == 0:
        return 0
    is_close = (edit_distance <= 4
                and float(edit_distance) / float(longest) <= 0.4)
    return 1 if is_close else 0
    def viterbi(self, observations, max_error_rate):
        """Find the most probable state path for *observations*.

        A trellis is built with one layer per observation (plus an initial
        layer holding only ``self.start_state``).  Each layer maps a state
        to ``(best path probability, predecessor state)``.  A transition is
        only considered when the Levenshtein distance between the candidate
        state and the observation does not exceed
        ``int(len(observation) * max_error_rate) + 1``; its emission
        probability is ``poisson.poisson_distribution(0.1, distance)``.

        Returns:
            The result of ``self.trellis_to_states`` for the best final
            state, or ``None`` when no final state has a probability
            greater than -1 (for example when the last layer is empty).
        """
        # Layer 0: the start state, probability 1, no predecessor.
        trellis = [{self.start_state: (1., None)}]

        # Forward pass: one new layer per observation.
        for observation in observations:
            previous_layer = trellis[-1]
            layer = {}
            trellis.append(layer)

            # Maximum number of edits tolerated for this observation.
            error_budget = int(len(observation) * max_error_rate) + 1

            for prev_state, prev_entry in previous_layer.items():
                prev_path_prob = prev_entry[0]
                successors = self.get_future_states_and_transition_probs(
                    prev_state)

                for future_state, transition_prob in successors:
                    edits = levenshtein.distance(future_state, observation)

                    # Too many edits: drop this path.
                    if edits > error_budget:
                        continue

                    emission_prob = poisson.poisson_distribution(0.1, edits)
                    path_prob = (prev_path_prob * transition_prob
                                 * emission_prob)

                    # Keep the existing entry when it is at least as likely.
                    if (future_state in layer
                            and layer[future_state][0] >= path_prob):
                        continue

                    layer[future_state] = (path_prob, prev_state)

        # Termination: pick the final state with the highest probability
        # after applying the final transition.
        best_prob = -1
        best_state = None
        for state, entry in trellis[-1].items():
            candidate = entry[0] * self.get_final_transition_prob(state)
            if candidate > best_prob:
                best_prob = candidate
                best_state = state

        if best_state is None:
            return None

        return self.trellis_to_states(trellis, best_state, len(trellis) - 1)
def is_similar_levenshtein(s, t):
  """Return True when *s* and *t* differ only by small per-word edits.

  Both strings are split on whitespace.  They are similar when they are
  not equal, have the same number of words, and the summed
  ``levenshtein.distance`` of the words at matching positions is greater
  than 0 and at most twice the word count.
  """
  words_s = s.split()
  words_t = t.split()

  if s == t: return False
  if len(words_s) != len(words_t): return False
  similarity = sum(
      levenshtein.distance(ws, wt) for ws, wt in zip(words_s, words_t))
  # The original compared the builtin `sum` instead of `similarity`,
  # which raises TypeError on Python 3.
  return 0 < similarity <= len(words_s) * 2
Example #9
0
def is_similar_levenshtein(s, t):
    """Tell whether *s* and *t* are near-duplicates at word level.

    The strings are split on whitespace.  The result is True only when the
    strings are not equal, contain the same number of words, and the total
    ``levenshtein.distance`` over position-aligned word pairs is non-zero
    and no larger than two edits per word.
    """
    if s == t:
        return False

    words_s = s.split()
    words_t = t.split()
    if len(words_s) != len(words_t):
        return False

    similarity = sum(
        levenshtein.distance(word_s, word_t)
        for word_s, word_t in zip(words_s, words_t))
    # Compare the computed total; the original tested the builtin `sum`
    # function here, which is a TypeError on Python 3.
    if similarity == 0 or similarity > len(words_s) * 2:
        return False
    return True
Example #10
0
def main():
    """Time four title-matching strategies on a fixed set of queries.

    For each query (printed as given, then lower-cased) the best match
    among ``read_titles()`` is looked up with Levenshtein distance (lowest
    wins), Smith-Waterman score (highest wins), ``fasta.find_best_match``
    and ``jaccard.find_best_match``.  Every result is reported through
    ``print_result`` together with the time at which that search started.
    """
    titles = read_titles()
    queries = (
        "Hidden Treasures Of The Infinite",
        "Hidden_Treasures_Of_The_Infinite",
        "Hiden Tresure_Of_de_Infinity",
        "Hidden The Infinite",
    )
    for raw_query in queries:
        print(raw_query)
        query = raw_query.lower()

        # --- Levenshtein: smaller distance is better ---
        started = time.time()
        winner, winner_score = "", math.inf
        for candidate in (entry.lower() for entry in titles):
            candidate_score = levenshtein.distance(query, candidate)
            if candidate_score < winner_score:
                winner, winner_score = candidate, candidate_score
        print_result("LEVENSHTEIN", winner, started)

        # --- Smith-Waterman: larger score is better ---
        started = time.time()
        winner, winner_score = "", -math.inf
        for candidate in (entry.lower() for entry in titles):
            candidate_score = smithwaterman.distance(query, candidate)
            if candidate_score > winner_score:
                winner, winner_score = candidate, candidate_score
        print_result("SMITH-WATERMAN", winner, started)

        # --- Home-made FASTA (receives the titles as read) ---
        started = time.time()
        winner = fasta.find_best_match(query, titles)
        print_result("HOME MADE FASTA + SMITH-WATERMAN", winner, started)

        # --- Jaccard (receives the titles as read) ---
        started = time.time()
        winner = jaccard.find_best_match(query, titles)
        print_result("JACCARD + SMITH-WATERMAN", winner, started)

        print("")
Example #11
0
 def field_exists_check(search_tokens, fields):
     """Try to repair search tokens whose tag is ``None``.

     For every ``(tag, _)`` token paired with its field text, an untagged
     token is compared (stripped, lower-cased) against every label in
     ``header_labels_dict``.  When the closest label is within 3 edits,
     the token's tag is replaced by that label's key (for every key tied
     at the minimum, in dict order) and a note is appended to
     ``_alert_string``; otherwise one message is appended to
     ``_error_string``.  Tokens are modified in place; nothing is returned.
     """
     for j, (token, token_field) in enumerate(zip(search_tokens, fields)):
         tag, _ = token
         if tag is not None:
             continue

         needle = token_field.strip().lower()
         lev_dict = {}
         for k, v in header_labels_dict.items():
             lev_dict[k] = distance(needle, v.lower())

         min_dist = min(lev_dict.values())
         closest_keys = [k for k, d in lev_dict.items() if d == min_dist]

         if min_dist <= 3:
             for k in closest_keys:
                 search_tokens[j][0] = k
                 _alert_string.append(
                     f'Corrected "{token_field.strip()}" to '
                     f'"{header_labels_dict[k]}".')
         elif closest_keys:
             # Too far from every known label: report once.
             _error_string.append(
                 f'"{token_field.strip()}" is not a field. '
                 f'It might be misspelled.')
 def test_empty(self):
     """The distance between two empty strings is expected to be 0."""
     result = distance("", "")
     self.assertEqual(0, result)
 def test_distance(self):
     """Check `distance` for a trailing insertion, a leading insertion and
     two substitutions against "zippy"."""
     cases = (
         (1, "zipp", "zippy"),
         # flagged "oops..." by the original author; reason not recorded
         (1, "ippy", "zippy"),
         (2, "sipry", "zippy"),
     )
     for expected, left, right in cases:
         self.assertEqual(expected, distance(left, right))
 def test_equal(self):
     """Identical strings are expected to be at distance 0."""
     word = "zippy"
     self.assertEqual(0, distance(word, "zippy"))
 def test_unbalanced(self):
     """Against an empty string, the expected distance is the length of the
     other string."""
     for expected, word in ((6, "python"), (4, "ruby")):
         self.assertEqual(expected, distance("", word))
Example #16
0
def distance(a,b):
    """Return the case-insensitive Levenshtein distance of *a* and *b*
    divided by the length of the longer string.

    Returns 0 when both inputs are empty.
    """
    longest = max(len(a), len(b))
    if longest < 1:
        # Both inputs are empty; nothing to compare.
        return 0

    edits = levenshtein.distance(a.lower(), b.lower())
    return float(edits) / longest
Example #17
0
# Basic input...
# Flat Python 2 script (note the `print` statements below).  Most lines
# are bare expression statements whose results are discarded.
# NOTE(review): this looks like an input fixture for a parser/interpreter
# test -- confirm before changing any statement.

# Equality of int, float and string literals in each quoting style.
1 == 1
1.0 == 1.0
"1" == "1"
'1' == '1'
r"1" == r"1"
r'1' == r'1'
# Concatenation of differently-quoted strings.
"1" + '1' == "11"
# Explicit dunder calls, unbound and bound.
int.__add__(1, 0)
(0).__add__(1)
# Chained comparisons.
0 < 1 < 3 > 2
"a" < "c" > "b"
# The chain is False (11.0 < 10.0 fails); False is then compared with 0.
(9.0 < 11.0 < 10.0) == 0
# Import placed mid-file, followed by a call into the imported module.
import levenshtein
# This comment should not be necessary :S :S :S FIXME
levenshtein.distance("1", "b")
# if/else on a constant truthy condition.
if 1:
    print 1
else:
    print 0

# Two statements on one line, separated by a semicolon.
x = 5; a = ""
# Runs once: the body sets x to 0, which ends the loop.
while x:
    x = 0

# Never runs: x is already 0 here.
while x:
    x = 5

print "\n"

x == 0
 def test2(self):
     """'Petooshock' vs 'Toornickman' is expected to be 8 edits apart."""
     self.assertEqual(levenshtein.distance('Petooshock', 'Toornickman'), 8)
 def test1(self):
     """'Levenshtien' vs 'Frankenstein' is expected to be 7 edits apart."""
     self.assertEqual(levenshtein.distance('Levenshtien', 'Frankenstein'), 7)
 def test3(self):
     """'Assassin' vs 'Killer' is expected to be 8 edits apart."""
     self.assertEqual(levenshtein.distance('Assassin', 'Killer'), 8)
Example #21
0
def distance(a, b):
    """Normalised, case-insensitive Levenshtein distance.

    The raw ``levenshtein.distance`` of the lower-cased inputs is divided
    by the length of the longer input.  Two empty inputs yield 0.
    """
    len_a, len_b = len(a), len(b)
    if len_a + len_b >= 1:
        raw = levenshtein.distance(a.lower(), b.lower())
        return float(raw) / max(len_a, len_b)
    return 0
Example #22
0
def test_distance():
    """Check ``levenshtein.distance`` for equal strings, one insertion, an
    empty operand and a two-edit rearrangement."""
    expectations = (
        ('roule', 'roule', 0),
        ('roule', 'roules', 1),
        ('roule', '', 5),
        ('roule', 'raoul', 2),
    )
    for left, right, expected in expectations:
        assert levenshtein.distance(left, right) == expected
def _top_k_descending(values, k):
    """Return the *k* largest of *values* in descending order, zero-padded
    on the right to exactly *k* entries."""
    ranked = np.sort(np.asarray(list(values), dtype=float),
                     axis=None)[::-1][:k]
    return np.concatenate((ranked, np.zeros(k - ranked.size)))


def _affiliation_distance(own, affiliations, other_id, missing_default):
    """Return the Levenshtein distance between *own* and the affiliation of
    *other_id*; when *other_id* has none, fall back to *missing_default*
    if *own* is empty, else to the distance from *own* to ``""``."""
    if other_id in affiliations:
        return levenshtein.distance(own, affiliations[other_id])
    if len(own) == 0:
        return missing_default
    return levenshtein.distance(own, "")


def vec(paperID,
        authorID,
        aKw,
        kWp,
        aP,
        jKw,
        PJ,
        A,
        T,
        aff_Author,
        aff_PaperAuthor,
        nb_flag=0):
    """Build the feature vector for one (paper, author) pair.

    Layout of the returned 1-D float array (entries in brackets are only
    present when ``nb_flag == 1``):

    * 5 top ``T.calculate`` scores of keywords shared by author and paper
      [+ 5 for the ``_neg`` variant]
    * 3 top scores of keywords shared by author and the paper's journal
      [+ 8 for the ``_neg`` variant]
    * 4 top ``matching_papers`` values against the paper's other authors
      [+ 4 for the ``_neg`` variant]
    * ``noKeywords_Author`` and ``noKeywords_Paper``
    * Levenshtein distance between the author's two affiliation strings
    * max and min affiliation distances to the paper's authors, for the
      ``aff_Author`` and ``aff_PaperAuthor`` tables (max, max, min, min)

    Each "top" group is sorted in descending order and padded with zeros.
    The total length is 19, or 36 when ``nb_flag == 1``.

    NOTE(review): the journal ``_neg`` group keeps 8 values while its
    positive counterpart keeps 3; this is preserved from the original so
    the vector length does not change -- confirm it is intentional.
    """

    def keyword_scores(keywords):
        # Score every non-empty keyword for this author.  A list of any
        # size is fine: the original wrote into a fixed 50000-slot buffer
        # and raised IndexError beyond that.
        return [
            T.calculate(authorID, keyword) for keyword in keywords
            if len(keyword) != 0
        ]

    def coauthor_scores(match):
        # Apply `match` to every other author of the paper.
        return [
            match(aP, other, authorID)
            for other in vecfeature.get_authors(A, paperID)
            if other != authorID
        ]

    parts = []

    # Feature 1: author/paper keywords
    ret = vecfeature.matching_keywords_pa(aKw, kWp, paperID, authorID)
    parts.append(_top_k_descending(keyword_scores(ret), 5))
    if nb_flag == 1:
        ret = vecfeature.matching_keywords_pa_neg(aKw, kWp, paperID, authorID)
        parts.append(_top_k_descending(keyword_scores(ret), 5))

    journalID = vecfeature.get_journal(PJ, paperID)

    # Feature 2: author/journal keywords
    ret = vecfeature.matching_keywords_aj(aKw, jKw, journalID, authorID)
    parts.append(_top_k_descending(keyword_scores(ret), 3))
    if nb_flag == 1:
        ret = vecfeature.matching_keywords_aj_neg(aKw, jKw, journalID,
                                                  authorID)
        parts.append(_top_k_descending(keyword_scores(ret), 8))

    # Feature 3: papers shared with co-authors
    parts.append(
        _top_k_descending(coauthor_scores(vecfeature.matching_papers), 4))
    if nb_flag == 1:
        parts.append(
            _top_k_descending(
                coauthor_scores(vecfeature.matching_papers_neg), 4))

    vector = np.concatenate(parts)

    # Feature 4 and 5: keyword counts
    vector = np.append(vector, vecfeature.noKeywords_Author(aKw, authorID))
    vector = np.append(vector, vecfeature.noKeywords_Paper(kWp, paperID))

    # Affiliation features
    word1 = aff_Author[authorID] if authorID in aff_Author else ""
    word2 = aff_PaperAuthor[authorID] if authorID in aff_PaperAuthor else ""
    vector = np.append(vector, levenshtein.distance(word1, word2))

    max2 = 0
    max3 = 0
    min4 = 500
    min5 = 500
    if paperID in A:
        for other in A[paperID]:
            # 18.976 / 18.65 are the fallback distances used when neither
            # side has an affiliation.
            # NOTE(review): presumably dataset averages -- confirm.
            d_author = _affiliation_distance(word1, aff_Author, other,
                                             18.976)
            d_paper = _affiliation_distance(word2, aff_PaperAuthor, other,
                                            18.65)
            max2 = max(max2, d_author)
            min4 = min(min4, d_author)
            max3 = max(max3, d_paper)
            min5 = min(min5, d_paper)

    vector = np.append(vector, max2)
    vector = np.append(vector, max3)
    vector = np.append(vector, min4)
    vector = np.append(vector, min5)
    return vector
Example #24
0
# Basic input...
# Flat Python 2 script (note the `print` statements below).  Most lines
# are bare expression statements whose results are discarded.
# NOTE(review): this looks like an input fixture for a parser/interpreter
# test -- confirm before changing any statement.

# Equality of int, float and string literals in each quoting style.
1 == 1
1.0 == 1.0
"1" == "1"
'1' == '1'
r"1" == r"1"
r'1' == r'1'
# Concatenation of differently-quoted strings.
"1" + '1' == "11"
# Explicit dunder calls, unbound and bound.
int.__add__(1, 0)
(0).__add__(1)
# Chained comparisons.
0 < 1 < 3 > 2
"a" < "c" > "b"
# The chain is False (11.0 < 10.0 fails); False is then compared with 0.
(9.0 < 11.0 < 10.0) == 0
# Import placed mid-file, followed by a call into the imported module.
import levenshtein
# This comment should not be necessary :S :S :S FIXME
levenshtein.distance("1", "b")
# if/else on a constant truthy condition.
if 1:
    print 1
else:
    print 0

x = 5
a = ""
# Runs once: the body sets x to 0, which ends the loop.
while x:
    x = 0

# Never runs: x is already 0 here.
while x:
    x = 5

print "\n"