def test_distances(self):
    self.assertTrue(distance("слово", "слово") == 0)
    self.assertTrue(distance("слова", "слово") == 1)
    self.assertTrue(distance("удаление", "удаленние") == 1)
    self.assertTrue(distance("вставка", "вствка") == 1)
    self.assertTrue(distance("замена", "запена") == 1)
    self.assertTrue(distance("ячсмит", "йцукен") == 6)
    self.assertTrue(distance("", "кверти") == 6)
    self.assertTrue(distance("вместе", "местее") == 2)
    self.assertTrue(distance("", "") == 0)
def HierarchicalCluster(datafile):
    sim_list = []
    sim_matrix = []
    data_list = [line.rstrip('\n') for line in codecs.open(datafile, "r", "utf-8")]
    data_list = [line.split(' ') for line in data_list]
    print(data_list)
    len_list = len(data_list)
    label_list = [data_list[l][0] for l in range(0, len(data_list))]
    text_list = [data_list[l][1] for l in range(0, len(data_list))]
    print(label_list)
    print(text_list)
    for i in range(0, len_list):
        pivot = text_list[i]
        for j in range(0, len_list):
            sim = distance(pivot, text_list[j])  # calculate similarity (distance)
            print('n{}, n{} : {}'.format(i, j, sim))
            sim_list.append(sim)
            if j == len_list - 1:
                sim_matrix.append(sim_list)
                sim_list = []
    print('-------------------------')
    print('matrix: {}'.format(sim_matrix))
    linkage_matrix = ward(sim_matrix)
    print('--------------------------')
    print(linkage_matrix)
    dendrogram(linkage_matrix, labels=label_list)
    plt.show()
def resemblance(username, password):
    # edit distance between the candidate and the stored password,
    # normalized by the length of the longer string
    truepass = get_password(username)
    l = Lev.distance(password, truepass)
    if len(password) >= len(truepass):
        return l / len(password)
    else:
        return l / len(truepass)
def getSimilarDevices(device):
    assert isinstance(device, str)
    assert device.count(":") == 1, "bad formatting of device name"
    (slot, deviceName) = device.split(":")
    if glob.production:
        prefix = PREFIX_CCDB
    else:
        prefix = PREFIX_CCDB_TEST
    url = prefix + "slots/"
    # verify=False because the SSL certificate is unsigned/unverified:
    request = requests.get(url, verify=False)
    tmpList = json.loads(request.text)["slot"]
    # get all devices in CCDB
    allDevices = map(lambda x: x["name"], tmpList)
    # convert unicode to String
    allDevices = map(lambda x: str(x), allDevices)
    # keep only devices in the requested slot
    candidates = filter(lambda x: x.startswith(slot), allDevices)
    # compute Levenshtein distances and return them sorted, closest first
    distances = sorted(
        map(lambda x: (levenshtein.distance(device, x), x), candidates))
    return distances
def main():
    path1 = 'set'
    path2 = '115'
    files = os.listdir(path1)
    for file in files:
        if not os.path.isdir(file):
            f_w = os.path.basename(file)
            paths = "set/" + f_w
            # read the file
            with open(paths, 'r', encoding='ISO-8859-1') as f:
                all_text1 = (''.join(f.readlines())).lower()
                sentences1 = [x.strip().lower() for x in all_text1.split('.')]
            files2 = os.listdir(path2)
            for file in files2:
                if not os.path.isdir(file):
                    f_f = os.path.basename(file)
                    pathss = "115/" + f_f
                    with open(pathss, 'r', encoding='ISO-8859-1') as f:
                        all_text2 = (''.join(f.readlines())).lower()
                        sentences2 = [x.strip().lower() for x in all_text2.split('.')]
                    # search matching sentences
                    count = 0
                    # print('Evaluating exact phrase matches…')
                    for i, s1 in enumerate(sentences1):
                        if len(s1) < 16:
                            continue
                        if rabin_karp(all_text2, s1) != -1:
                            count += 1
                            i += 1
                            print('match:', s1)
                    # search similar sentences
                    similar = 0
                    # print('Evaluating similarly worded phrases…')
                    for i, si in enumerate(sentences1):
                        wordsi = si.split()
                        for j, sj in enumerate(sentences2):
                            wordsj = sj.split()
                            d = distance(wordsi, wordsj)
                            if d < min(len(wordsi), len(wordsj)) * .75 and min(
                                    len(si), len(sj)) >= 16:
                                similar += 1
                                print('similar:\nsi: {}\nsj: {}'.format(si, sj))
                    percentage = 100 * similar / len(sentences1)
                    if percentage != 0:
                        with open('result.txt', 'a') as file_object:
                            file_object.write(str(percentage) + '\t')
                            file_object.write(paths + '-----' + pathss + '\n')
                    print('Similar phrases: {similar} ({percentage:.2f}%)'.format(
                        similar=similar, percentage=percentage))
                    print(' "{file1}" found "{file2}"'.format(file1=paths, file2=pathss))
                    print('\n---------------------------------------------')
def score_of_words(w1, w2):
    dist = levenshtein.distance(w1, w2)
    len1 = len(w1)
    len2 = len(w2)
    if (len1 == 0 or len2 == 0):
        return 0
    if dist <= 4 and float(dist) / float(max(len1, len2)) <= 0.4:
        return 1
    return 0
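# Hedged usage sketch (not from the snippet above): score_of_words returns 1
# only when the edit distance is both small in absolute terms (<= 4) and small
# relative to the longer word (<= 40%). Assuming score_of_words and its
# `levenshtein` backend are in scope, calls would behave like this:
print(score_of_words("organization", "organisation"))  # distance 1, 1/12 <= 0.4 -> 1
print(score_of_words("cat", "dog"))                     # distance 3, 3/3 > 0.4  -> 0
print(score_of_words("", "anything"))                   # empty input            -> 0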
def viterbi(self, observations, max_error_rate):
    trellis = [{self.start_state: (1., None)}]
    # fill out trellis
    for obs_idx, observation in enumerate(observations):
        # print('progress: ' + str(obs_idx + 1) + "/" + str(len(observations)), end='\r', flush=True)
        current_states = {}
        trellis.append(current_states)
        # set the maximum amount of errors that are allowed
        max_errors = int(len(observation) * max_error_rate) + 1
        for prev_state in trellis[obs_idx].keys():
            prev_path_prob = trellis[obs_idx][prev_state][0]
            for future_state, transition_prob in self.get_future_states_and_transition_probs(prev_state):
                distance = levenshtein.distance(future_state, observation)
                # if the levenshtein distance exceeds the maximum,
                # we disregard the path
                if distance > max_errors:
                    continue
                emission_prob = poisson.poisson_distribution(0.1, distance)
                path_prob = prev_path_prob * transition_prob * emission_prob
                # only keep path with max probability
                if future_state in current_states:
                    if current_states[future_state][0] >= path_prob:
                        continue
                current_states[future_state] = (path_prob, prev_state)
    # termination step
    max_final_path_prob = -1
    max_final_state = None
    for state in trellis[-1].keys():
        final_path_prob = trellis[-1][state][0] * self.get_final_transition_prob(state)
        if final_path_prob > max_final_path_prob:
            max_final_path_prob = final_path_prob
            max_final_state = state
    if max_final_state is None:
        return None
    return self.trellis_to_states(trellis, max_final_state, len(trellis) - 1)
def is_similar_levenshtein(s, t):
    words_s = s.split()
    words_t = t.split()
    len_s = len(words_s)
    len_t = len(words_t)
    if s == t:
        return False
    if len_s != len_t:
        return False
    similarity = sum([
        levenshtein.distance(words_s[i], words_t[i]) for i in range(len_s)
    ])
    if similarity == 0 or similarity > len_s*2:
        return False
    return True
def is_similar_levenshtein(s, t):
    words_s = s.split()
    words_t = t.split()
    len_s = len(words_s)
    len_t = len(words_t)
    if s == t:
        return False
    if len_s != len_t:
        return False
    similarity = sum(
        [levenshtein.distance(words_s[i], words_t[i]) for i in range(len_s)])
    if similarity == 0 or similarity > len_s * 2:
        return False
    return True
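# Hedged usage sketch (not from the snippets above): is_similar_levenshtein
# treats two sentences as similar when they are not identical, have the same
# number of words, and the summed per-word edit distances stay within 2 edits
# per word. Assuming the function is in scope:
print(is_similar_levenshtein("the quick fox", "the quikc fox"))  # 2 edits total -> True
print(is_similar_levenshtein("hello world", "hello world"))      # identical     -> False
print(is_similar_levenshtein("aaa bbb", "xyz qrs"))              # 6 > 2 * 2     -> False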
def main():
    titles = read_titles()
    for P in ["Hidden Treasures Of The Infinite",
              "Hidden_Treasures_Of_The_Infinite",
              "Hiden Tresure_Of_de_Infinity",
              "Hidden The Infinite"]:
        print(P)
        P = P.lower()
        ### LEVENSHTEIN ###
        start_time = time.time()
        bestmatch = ""
        bestscore = math.inf
        for title in titles:
            title = title.lower()
            score = levenshtein.distance(P, title)
            if score < bestscore:
                bestmatch = title
                bestscore = score
        print_result("LEVENSHTEIN", bestmatch, start_time)
        ### SMITH-WATERMAN ###
        start_time = time.time()
        bestmatch = ""
        bestscore = -math.inf
        for title in titles:
            title = title.lower()
            score = smithwaterman.distance(P, title)
            if score > bestscore:
                bestmatch = title
                bestscore = score
        print_result("SMITH-WATERMAN", bestmatch, start_time)
        ### HOME MADE FASTA ###
        start_time = time.time()
        bestmatch = fasta.find_best_match(P, titles)
        print_result("HOME MADE FASTA + SMITH-WATERMAN", bestmatch, start_time)
        ### JACCARD ###
        start_time = time.time()
        bestmatch = jaccard.find_best_match(P, titles)
        print_result("JACCARD + SMITH-WATERMAN", bestmatch, start_time)
        print("")
def field_exists_check(search_tokens, fields):
    for j, ((tag, _), token_field) in enumerate(zip(search_tokens, fields)):
        if tag is None:
            lev_dict = {
                k: distance(token_field.strip().lower(), v.lower())
                for k, v in header_labels_dict.items()
            }
            min_dist = min(lev_dict.values())
            for k, d in lev_dict.items():
                if d == min_dist:
                    if min_dist <= 3:
                        search_tokens[j][0] = k
                        _alert_string.append(
                            f'Corrected "{token_field.strip()}" to '
                            f'"{header_labels_dict[k]}".')
                    else:
                        _error_string.append(
                            f'"{token_field.strip()}" is not a field. '
                            f'It might be misspelled.')
                    break
def test_empty(self):
    self.assertEqual(0, distance("", ""))

def test_distance(self):
    self.assertEqual(1, distance("zipp", "zippy"))
    self.assertEqual(1, distance("ippy", "zippy"))
    # oops...
    self.assertEqual(2, distance("sipry", "zippy"))

def test_equal(self):
    self.assertEqual(0, distance("zippy", "zippy"))

def test_unbalanced(self):
    self.assertEqual(6, distance("", "python"))
    self.assertEqual(4, distance("", "ruby"))
def distance(a,b):
    # edit distance normalized by the longer string's length, in [0.0, 1.0]
    if len(a) + len(b) < 1:
        return 0
    return float(levenshtein.distance(a.lower(), b.lower())) / max(len(a),len(b))
# Basic input...
1 == 1
1.0 == 1.0
"1" == "1"
'1' == '1'
r"1" == r"1"
r'1' == r'1'
"1" + '1' == "11"
int.__add__(1, 0)
(0).__add__(1)
0 < 1 < 3 > 2
"a" < "c" > "b"
(9.0 < 11.0 < 10.0) == 0
import levenshtein  # This comment should not be necessary :S :S :S FIXME
levenshtein.distance("1", "b")
if 1:
    print 1
else:
    print 0
x = 5; a = ""
while x:
    x = 0
while x:
    x = 5
print "\n"
x == 0
def test2(self):
    a = 'Petooshock'
    b = 'Toornickman'
    res = levenshtein.distance(a, b)
    exp = 8
    self.assertEqual(res, exp)

def test1(self):
    a = 'Levenshtien'
    b = 'Frankenstein'
    res = levenshtein.distance(a, b)
    exp = 7
    self.assertEqual(res, exp)

def test3(self):
    a = 'Assassin'
    b = 'Killer'
    res = levenshtein.distance(a, b)
    exp = 8
    self.assertEqual(res, exp)
def distance(a, b):
    if len(a) + len(b) < 1:
        return 0
    return float(levenshtein.distance(a.lower(), b.lower())) / max(
        len(a), len(b))
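# Hedged usage sketch (not from the snippets above): this wrapper rescales the
# raw edit distance into a ratio in [0.0, 1.0], where 0.0 means identical
# (ignoring case) and 1.0 means completely different. Assuming the wrapper and
# a `levenshtein` module with a plain distance(a, b) are in scope:
print(distance("kitten", "sitting"))  # 3 edits / max(6, 7) chars ~= 0.43
print(distance("Paris", "paris"))     # lowercased first -> 0.0
print(distance("", ""))               # guarded: empty inputs -> 0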
def test_distance():
    assert levenshtein.distance('roule', 'roule') == 0
    assert levenshtein.distance('roule', 'roules') == 1
    assert levenshtein.distance('roule', '') == 5
    assert levenshtein.distance('roule', 'raoul') == 2
def vec(paperID, authorID, aKw, kWp, aP, jKw, PJ, A, T, aff_Author,
        aff_PaperAuthor, nb_flag=0):
    vector = np.empty(0)
    # Feature-1
    ret = vecfeature.matching_keywords_pa(aKw, kWp, paperID, authorID)
    # print(ret)
    # print("Ret Done")
    temp = np.zeros(50000)
    cnt = 0
    for keyword in ret:
        if len(keyword) == 0:
            continue
        temp[cnt] = T.calculate(authorID, keyword)
        cnt += 1
        # if (cnt % 1000) == 0: print("Feature-1 Going on..")
    temp = np.resize(temp, cnt)
    temp = np.sort(temp, axis=None)
    temp[:] = temp[::-1]
    j = 0
    for i in temp:
        vector = np.append(vector, i)
        j += 1
        if (j >= 5):
            break
    for k in range(j, 5):
        vector = np.append(vector, 0)
    if (nb_flag == 1):
        # Feature-1_neg
        ret = vecfeature.matching_keywords_pa_neg(aKw, kWp, paperID, authorID)
        temp = np.zeros(50000)
        cnt = 0
        for keyword in ret:
            if len(keyword) == 0:
                continue
            temp[cnt] = T.calculate(authorID, keyword)
            cnt += 1
        temp = np.resize(temp, cnt)
        temp = np.sort(temp, axis=None)
        temp[:] = temp[::-1]
        j = 0
        for i in temp:
            vector = np.append(vector, i)
            j += 1
            if (j >= 5):
                break
        for k in range(j, 5):
            vector = np.append(vector, 0)
    journalID = vecfeature.get_journal(PJ, paperID)
    # print("Feature-1 Done")
    # Feature-2
    ret = vecfeature.matching_keywords_aj(aKw, jKw, journalID, authorID)
    # print(ret)
    temp = np.zeros(50000)
    cnt = 0
    for keyword in ret:
        if len(keyword) == 0:
            continue
        temp[cnt] = T.calculate(authorID, keyword)
        cnt += 1
    temp = np.resize(temp, cnt)
    temp = np.sort(temp, axis=None)
    temp[:] = temp[::-1]
    j = 0
    for i in temp:
        vector = np.append(vector, i)
        j += 1
        if (j >= 3):
            break
    for k in range(j, 3):
        vector = np.append(vector, 0)
    # print("Feature-2 Done")
    if (nb_flag == 1):
        # Feature-2_neg
        ret = vecfeature.matching_keywords_aj_neg(aKw, jKw, journalID, authorID)
        temp = np.zeros(50000)
        cnt = 0
        for keyword in ret:
            if len(keyword) == 0:
                continue
            temp[cnt] = T.calculate(authorID, keyword)
            cnt += 1
        temp = np.resize(temp, cnt)
        temp = np.sort(temp, axis=None)
        temp[:] = temp[::-1]
        j = 0
        for i in temp:
            vector = np.append(vector, i)
            j += 1
            if (j >= 8):
                break
        for k in range(j, 8):
            vector = np.append(vector, 0)
    # Feature-3
    ret = vecfeature.get_authors(A, paperID)
    temp = np.zeros(50000)
    cnt = 0
    for i in ret:
        if i == authorID:
            continue
        temp[cnt] = vecfeature.matching_papers(aP, i, authorID)
        # if (cnt % 1000) == 0: print("Feature-3 Going on..")
        cnt += 1
    temp = np.resize(temp, cnt)
    temp = np.sort(temp, axis=None)
    temp[:] = temp[::-1]
    j = 0
    for i in temp:
        vector = np.append(vector, i)
        j += 1
        if (j >= 4):
            break
    for k in range(j, 4):
        vector = np.append(vector, 0)
    if (nb_flag == 1):
        # Feature-3_neg
        ret = vecfeature.get_authors(A, paperID)
        temp = np.zeros(50000)
        cnt = 0
        for i in ret:
            if i == authorID:
                continue
            temp[cnt] = vecfeature.matching_papers_neg(aP, i, authorID)
            cnt += 1
        temp = np.resize(temp, cnt)
        temp = np.sort(temp, axis=None)
        temp[:] = temp[::-1]
        j = 0
        for i in temp:
            vector = np.append(vector, i)
            j += 1
            if (j >= 4):
                break
        for k in range(j, 4):
            vector = np.append(vector, 0)
    # print("Feature-3 Done")
    # Feature 4
    vector = np.append(vector, vecfeature.noKeywords_Author(aKw, authorID))
    # print("Feature-4 Done")
    # Feature 5
    vector = np.append(vector, vecfeature.noKeywords_Paper(kWp, paperID))
    # print("Feature-5 Done")
    word1 = ""
    word2 = ""
    max2 = 0
    max3 = 0
    min4 = 500
    min5 = 500
    if authorID in aff_Author:
        word1 = aff_Author[authorID]
    if authorID in aff_PaperAuthor:
        word2 = aff_PaperAuthor[authorID]
    vector = np.append(vector, levenshtein.distance(word1, word2))
    if paperID in A:
        for i in A[paperID]:
            word3 = ""
            word4 = ""
            if i not in aff_Author and len(word1) == 0:
                max2 = max(max2, 18.976)
                min4 = min(min4, 18.976)
            if i not in aff_PaperAuthor and len(word2) == 0:
                max3 = max(max3, 18.65)
                min5 = min(min5, 18.65)
            if i in aff_Author:
                word3 = aff_Author[i]
                max2 = max(max2, levenshtein.distance(word1, word3))
                min4 = min(min4, levenshtein.distance(word1, word3))
            if i in aff_PaperAuthor:
                word4 = aff_PaperAuthor[i]
                max3 = max(max3, levenshtein.distance(word2, word4))
                min5 = min(min5, levenshtein.distance(word2, word4))
            if i not in aff_Author and len(word1) != 0:
                max2 = max(max2, levenshtein.distance(word1, word3))
                min4 = min(min4, levenshtein.distance(word1, word3))
            if i not in aff_PaperAuthor and len(word2) != 0:
                max3 = max(max3, levenshtein.distance(word2, word4))
                min5 = min(min5, levenshtein.distance(word2, word4))
    vector = np.append(vector, max2)
    vector = np.append(vector, max3)
    vector = np.append(vector, min4)
    vector = np.append(vector, min5)
    return vector
# Basic input...
1 == 1
1.0 == 1.0
"1" == "1"
'1' == '1'
r"1" == r"1"
r'1' == r'1'
"1" + '1' == "11"
int.__add__(1, 0)
(0).__add__(1)
0 < 1 < 3 > 2
"a" < "c" > "b"
(9.0 < 11.0 < 10.0) == 0
import levenshtein  # This comment should not be necessary :S :S :S FIXME
levenshtein.distance("1", "b")
if 1:
    print 1
else:
    print 0
x = 5
a = ""
while x:
    x = 0
while x:
    x = 5
print "\n"