def get_similarities(Features, url_input):
    """
    Similarity metrics include: Levenshtein, Jaro-Winkler, Damerau-Levenshtein,
    normalized Damerau-Levenshtein, and Hamming distance.

    :param Features: input dictionary to add the metrics to
    :param url_input: URL string to compare against the known names
    :return: Features, after adding all similarity metrics
    """
    for n in itertools.chain(product_domain_names, brand_names):
        Features['url_levenshtein_distance_' + n] = Levenshtein.distance(
            url_input, n)
        Features['fqdn_levenshtein_distance_' + n] = Levenshtein.distance(
            Features['fqdn'], n)
        Features['url_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            url_input, n)
        Features['fqdn_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            Features['fqdn'], n)
        Features['url_damerau_levenshtein_distance_' + n] = \
            dl.damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_distance_' + n] = \
            dl.damerau_levenshtein_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_normalized_distance_' + n] = \
            dl.normalized_damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_normalized_distance_' + n] = \
            dl.normalized_damerau_levenshtein_distance(Features['fqdn'], n)
        if len(n) == len(url_input):
            Features['url_length_equals_' + n] = 1
            Features['url_hamming_distance_' + n] = hamming(url_input, n)
            # Note: this call assumes hamming() tolerates unequal lengths,
            # since len(Features['fqdn']) need not equal len(n) here.
            Features['fqdn_hamming_distance_' + n] = hamming(
                Features['fqdn'], n)
        else:
            Features['url_length_equals_' + n] = 0
    return Features
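
# A minimal sketch of the module-level setup that get_similarities() above
# assumes. The import aliases and name lists are assumptions, not taken from
# the original project: `jw` is pyjarowinkler's distance module, `dl` the
# pyxdameraulevenshtein package, and hamming() a small local helper.
import itertools

import Levenshtein
import pyxdameraulevenshtein as dl
from pyjarowinkler import distance as jw

product_domain_names = ['paypal.com', 'google.com']  # hypothetical examples
brand_names = ['paypal', 'google']                   # hypothetical examples


def hamming(s1, s2):
    # Number of positions at which the characters differ; only fully
    # meaningful when both strings have the same length.
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))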
def jwsim(word, otherword):
    # Called "distance" in the library, but the value is actually a similarity.
    sim = distance.get_jaro_distance(word, otherword)
    # Average with the similarity of the unidecoded (accent-folded) forms.
    uword = unidecode.unidecode(word)
    uotherword = unidecode.unidecode(otherword)
    usim = distance.get_jaro_distance(uword, uotherword)
    return (sim + usim) / 2
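
# A small usage sketch for jwsim(): averaging the raw and accent-folded
# similarities makes "café"/"cafe" score higher than the raw comparison
# alone. Assumes `from pyjarowinkler import distance` and `import unidecode`.
print(jwsim('café', 'cafe'))  # the accent-folded half contributes a perfect 1.0
print(jwsim('café', 'thé'))   # unrelated words stay low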
def check(artist1, song1, artist2, song2):
    # Jaro-Winkler similarity of the artist names (note: the library spells
    # the keyword "winkler_ajustment").
    artistScore = JW.get_jaro_distance(artist1, artist2, winkler=True,
                                       winkler_ajustment=True, scaling=0.1)
    # Jaro-Winkler similarity of the song names.
    songScore = JW.get_jaro_distance(song1, song2, winkler=True,
                                     winkler_ajustment=True, scaling=0.1)
    # It is a match only if both artist and song exceed 75% similarity.
    return artistScore > 0.75 and songScore > 0.75
def sim_join(input_array, k, group_size=1):
    make_hash(input_array)
    data, centers = get_centers(input_array)
    results = []
    print("making %d *sqrt(n)-sized groups..." % group_size)
    groups = make_groups(data, centers, k, group_size * len(centers), results)
    print("nested loop %d groups" % len(groups))
    for i, group in enumerate(groups):
        if len(group) <= 1:
            continue
        for current, elem_i in enumerate(group.elems):
            j = 1
            # Lower bound on the distance from elem_i to each group:
            # similarity to the group's center minus the group's radius.
            dist_to_groups = [
                distance.get_jaro_distance(g.center, elem_i) - g.r
                for g in groups
            ]
            closest_group = dist_to_groups.index(min(dist_to_groups))
            if groups[closest_group].id == group.id:
                closest_group = np.argpartition(np.array(dist_to_groups), j)[j]
                j += 1
            target = group.all_but(current) + groups[closest_group].all()
            # Pull in next-closest groups until there are more than k
            # candidate neighbours.
            while len(target) <= k:
                closest_group = np.argpartition(np.array(dist_to_groups), j)[j]
                j += 1
                if groups[closest_group].id == group.id:
                    continue
                target = target + groups[closest_group].all()
            distances = np.array(
                [distance.get_jaro_distance(x, elem_i) for x in target])
            knn = np.argpartition(distances, k)[:k].tolist()
            # `hash` is a module-level dict built by make_hash(), shadowing
            # the builtin of the same name.
            results.append((hash[elem_i], [hash[target[x]] for x in knn]))
    return results
def dictionaryMatches(word):
    j = 2
    prefixDistCheck = False
    suffixDistCheck = False
    while j <= len(word) - 2:
        prefix = word[0:j]
        suffix = word[j:]
        dictprefixList = [i for i in dictList if i.startswith(prefix)]
        dictsuffixList = [i for i in dictList if i.endswith(suffix)]
        if not prefixDistCheck:
            for entry in dictprefixList:
                if distance.get_jaro_distance(word, entry, winkler=True,
                                              scaling=0.1) > avgPreDist:
                    prefixDistCheck = True
                    break
        if not suffixDistCheck:
            for entry in dictsuffixList:
                if distance.get_jaro_distance(word, entry, winkler=True,
                                              scaling=0.1) > avgSufDist:
                    suffixDistCheck = True
                    break
        if prefixDistCheck and suffixDistCheck:
            break
        j = j + 1
    # Returns the strings "True"/"False" (not booleans), as the caller expects.
    if prefixDistCheck and suffixDistCheck:
        return "True"
    else:
        return "False"
def statJW():
    d = []
    with open('data/blends.txt') as f:
        for line in f:
            t, tt, ttt = line.split()
            # Third positional argument is `winkler`, so these are plain
            # Jaro similarities without the Winkler prefix bonus.
            jw1 = distance.get_jaro_distance(t, tt, False)
            jw2 = distance.get_jaro_distance(t, ttt, False)
            d.extend([jw1, jw2])
    stat('JW', d, 'similarity value', 'frequency')
def run_program(result):
    program_path = ""
    response = {
        "tts": "",
        "file": "",
        "save": False,
    }

    def find_program(value, root):
        # Return the first file under `root` whose name contains `value`
        # and scores above 0.8 on Jaro-Winkler similarity.
        for subdir, dirs, files in os.walk(root):
            for file in files:
                if (value in file and jarowinkler.get_jaro_distance(
                        value, file, winkler=True) > 0.8):
                    return subdir + "/" + file
        return ""

    if result['entities']:
        for entity in result['entities']:
            if entity["entity"] == "program":
                # Try the primary programs directory first, then the fallback.
                program_path = (
                    find_program(entity["value"], settings.PROGRAMS_DIR1)
                    or find_program(entity["value"], settings.PROGRAMS_DIR2))
                if program_path:
                    break

    if not program_path:
        response["tts"] = ("I was unable to find the program you wanted. "
                           "It may not be in the start programs directory.")
        response["file"] = "program_not_found.mp3"
        response["save"] = True
        return response

    response["tts"] = "Ok"
    response["file"] = "ok.mp3"
    response["save"] = True
    print(f"Opening path {program_path}")
    os.startfile(program_path)
    return response
def normalize_legal_entity_type(txt) -> (str, str, float):
    knowns = find_known_legal_entity_type(txt.strip())
    if len(knowns) > 0:
        if len(knowns) == 1:
            k = knowns[0]
            return k[0], k[1], distance.get_jaro_distance(
                k[0], txt, winkler=True, scaling=0.1)
        else:
            # Several candidates: keep the one with the highest similarity.
            finding = '', '', 0
            for k in knowns:
                d = distance.get_jaro_distance(k[0], txt, winkler=True,
                                               scaling=0.1)
                if d > finding[2]:
                    finding = k[0], k[1], d
            return finding
    else:
        return txt, '', 0.5
def get_synmat4title(title, maxlen):
    splitTitle = preprocess_line_syn(title, exclude)
    # Pairwise Jaro similarities of the title's tokens with themselves,
    # zero-padded (or truncated) to a maxlen x maxlen matrix.
    Matrix = np.zeros((maxlen, maxlen), dtype=float)  # np.float was removed in NumPy 1.24
    n = min(len(splitTitle), maxlen)
    for i in range(n):
        for j in range(n):
            Matrix[i, j] = distance.get_jaro_distance(splitTitle[i],
                                                      splitTitle[j])
    return Matrix
def pareamentoself(dataframebase, colunas, highest_only=False,
                   valor_match=0.89):
    # Concatenate the chosen columns into a single match key per row.
    dataframebase['KEY'] = reduce(
        lambda a, b: a + b, [dataframebase[coluna] for coluna in colunas])
    size = len(dataframebase)
    perc = 0
    matches = {}
    id_key = {}
    for i, line in dataframebase.iterrows():
        id_key[line['KEY']] = line['ID']
        highest_match = 0
        highest_match_name = None
        for key in matches.keys():
            jaro_value = distance.get_jaro_distance(line['KEY'], key)
            if jaro_value > valor_match:  # it's a match
                if not highest_only:
                    matches[key].append(line['KEY'])
                elif jaro_value > highest_match:
                    highest_match = jaro_value
                    highest_match_name = key
        if highest_match_name is not None:
            matches[highest_match_name].append(line['KEY'])
        else:
            matches[line['KEY']] = []
        # Simple progress indicator, printed once per percentage point.
        if i / size * 100 > perc:
            print(perc, '%')
            perc += 1
    return matches, id_key
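
# A minimal usage sketch for pareamentoself() (roughly "self-pairing"):
# cluster near-duplicate DataFrame rows by the Jaro similarity of a
# concatenated key. Assumes `from functools import reduce` and pyjarowinkler's
# `distance` are imported by the surrounding module; the data is made up.
import pandas as pd

df = pd.DataFrame({
    'ID': [1, 2, 3],
    'nome': ['maria silva', 'maria silvaa', 'joao souza'],
})
matches, id_key = pareamentoself(df, ['nome'])
print(matches)  # near-identical keys are grouped under the first one seen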
def get_JW_matrix(NAMELIST):
    """
    input: NAMELIST: unique name list (of length n)
    output: upper triangular n x n numpy matrix of Jaro-Winkler similarities.
    """
    print("computing Jaro-Winkler matrix")
    namelist = list(NAMELIST)
    n = len(namelist)
    # Normalize separators before comparing.
    for i in range(n):
        for sep in ["`", "'", ".", "-", ',']:
            namelist[i] = namelist[i].replace(sep, " ")
    matrix = np.zeros([n, n])
    max_iter = int(0.5 * (n - 1) * n)
    count = 0  # pair counter for the progress display
    for i in range(n):
        # Only the upper triangle is filled; the diagonal (similarity 1 by
        # definition) and the lower triangle are left at 0.
        for j in range(i + 1, n):
            count += 1
            progress = count / max_iter * 100
            sys.stdout.write("\riter {0}({1},{2}) out of {3}({4}%) ".format(
                int(count), int(i), int(j), int(max_iter), int(progress)))
            sys.stdout.flush()
            matrix[i, j] = distance.get_jaro_distance(namelist[i],
                                                      namelist[j],
                                                      winkler=True,
                                                      scaling=0.1)
    print("\ndone")
    return matrix
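
# A usage sketch for get_JW_matrix(), assuming numpy, sys, and pyjarowinkler's
# distance module are imported as the function above requires. The names are
# made-up examples.
names = ["John Smith", "Jon Smith", "Mary Jones"]
m = get_JW_matrix(names)
print(m[0, 1])  # high: "John Smith" vs "Jon Smith"
print(m[0, 2])  # low: "John Smith" vs "Mary Jones"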
def fuzzy_match_facet(text, facet):
    score = distance.get_jaro_distance(text.lower(), facet.lower(),
                                       winkler=True, scaling=0.1)
    print(facet, score)
    return score
def filter_entry(source, raw_table):
    # Count is the primary criterion.
    best_count = 0
    bests = []
    for target in raw_table[source]:
        count = raw_table[source][target]
        if count > best_count:
            best_count = count
            bests = [target]
        elif count == best_count:
            bests.append(target)
    # Jaro-Winkler is the secondary criterion.
    if len(bests) > 1:
        # Alphabetic order is the third criterion (not meaningful in itself,
        # just there to make ties deterministic).
        bests.sort()
        best_jw = -1
        best = None
        for target in bests:
            jw = distance.get_jaro_distance(source, target)
            if jw > best_jw:
                best_jw = jw
                best = target
        # Not accounting for ties -- we just take the first one as best.
        return best
    else:
        return bests[0]
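
# A small usage sketch for filter_entry(), illustrating the expected shape of
# raw_table: a nested dict mapping each source word to candidate targets and
# their counts. The words and counts here are made up.
raw_table = {
    'colour': {'color': 7, 'colors': 7, 'collar': 2},
}
# 'color' and 'colors' tie on count, so Jaro-Winkler breaks the tie.
print(filter_entry('colour', raw_table))  # 'color'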
def jwOnSortedFunction(s1, s2,
                       collator=icu.Collator.createInstance(
                           icu.Locale('de_DE.UTF-8'))):
    # Sort the characters of both strings with a German-locale collator
    # before comparing, making the similarity order-insensitive.
    s1_s = ''.join(sorted(list(s1), key=collator.getSortKey))
    s2_s = ''.join(sorted(list(s2), key=collator.getSortKey))
    return jw_distance.get_jaro_distance(s1_s, s2_s, winkler=True)
def phase_3_validation(self, interpretation, min_confidence):
    result = Result()
    last_underscore_index = self.profile.name.rfind(STRUCTURED_KEY_SEPARATOR)
    if last_underscore_index >= 0:
        # Note: slicing from the separator's index keeps the separator
        # itself in normalized_name; use `+ 1` to drop it instead.
        normalized_name = self.profile.name[last_underscore_index:]
    else:
        normalized_name = self.profile.name
    max_confidence = 0.0
    for matching_name in interpretation['iMatchingNames']:
        if len(matching_name) > 0:
            jaroDistance = get_jaro_distance(normalized_name.lower(),
                                             matching_name.lower())
            if jaroDistance > max_confidence:
                max_confidence = jaroDistance
    interpretation['iConfidence'] = max_confidence
    if max_confidence <= min_confidence:
        message = ("Matching confidence not high enough for field " +
                   normalized_name + " with interpretation " +
                   interpretation['iName'] + ".")
        logging.debug(message)
        result.message = message
        return result
    logging.debug("Highest matching confidence for " + normalized_name +
                  " was " + str(max_confidence) + ".")
    result.result = True
    return result
def knn(inputs, dataset, labels, k):
    '''Main function for doing kNN'''
    numsamples = len(dataset)
    Distance = []  # Jaro-Winkler similarity to each sample
    Weight = []
    for i in range(numsamples):
        dist = distance.get_jaro_distance(inputs, dataset[i], winkler=True,
                                          scaling=0.1)
        Distance.append(dist)
        if Metric_mode == 'wt':
            wt = Gaussian(dist, h)
            Weight.append(wt)
    # Sort in descending order (highest similarity/weight first).
    if Metric_mode == 'dist':
        sorted_ = -1 * np.sort(-1 * np.array(Distance))
        sorted_idx = np.argsort(np.array(Distance))[::-1]
    else:
        sorted_ = -1 * np.sort(-1 * np.array(Weight))
        sorted_idx = np.argsort(np.array(Weight))[::-1]
    if CV_mode == 0:
        result = voting(sorted_, sorted_idx, labels, k)
    else:
        # Cross-validation mode: k is a list of candidate neighbour counts.
        result = []
        for i in range(len(k)):
            maxindex = voting(sorted_, sorted_idx, labels, k[i])
            result.append(maxindex)
        result = np.array(result).reshape(1, -1)
    return result
def get_min_hun_distance(
        words1: List[str],
        words2: List[str]) -> Tuple[float, List[Tuple[int, int, float]]]:
    """Find the minimum-cost pairing of the two word lists (Hungarian
    algorithm) and return the normalized total distance plus the pairing."""
    # Pairwise cost matrix: 1 - Jaro-Winkler similarity, i.e. a distance.
    values = []
    for i in range(len(words1)):
        w1 = words1[i]
        row = []
        for j in range(len(words2)):
            w2 = words2[j]
            row.append(
                1 - distance.get_jaro_distance(w1, w2, winkler=True,
                                               scaling=0.1))
        values.append(row)
    # The best pairing, minimizing the total distance.
    row_ids, col_ids = linear_sum_assignment(values)
    row_ids = list(row_ids)
    col_ids = list(col_ids)
    hits = []
    valsum = 0
    for i in range(len(row_ids)):
        row_id = row_ids[i]
        col_id = col_ids[i]
        hits.append((row_id, col_id, values[row_id][col_id]))
        valsum += values[row_id][col_id]
    min_dist = valsum / (len(words1) + len(words2))
    return min_dist, hits
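
# A usage sketch for get_min_hun_distance(), assuming the typing imports and
# `from scipy.optimize import linear_sum_assignment` are present in the
# surrounding module. The word lists are made-up examples.
dist, pairs = get_min_hun_distance(['acme', 'corp'], ['corp', 'acme', 'inc'])
print(dist)   # 0.0 here: every word on the left finds an exact partner
print(pairs)  # [(row, col, cost), ...] for the optimal assignment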
def match_org_by_score(author_name, author_list):
    score_list = []
    name = clean_name(author_name)
    name_split = name.split()
    for o in author_list:
        if "name" in o and o["name"] != "":
            author = clean_name(o["name"])
            # Jaro-Winkler similarity of the full names...
            score = distance.get_jaro_distance(name, author, winkler=True,
                                               scaling=0.1)
            # ...plus the Jaccard overlap of their word sets.
            author_split = author.split()
            inter = set(name_split) & set(author_split)
            alls = set(name_split) | set(author_split)
            score += round(len(inter) / len(alls), 6)
            score_list.append(score)
    # Return the author with the highest combined score. Note: this assumes
    # every entry in author_list has a non-empty "name"; otherwise the
    # score_list and author_list indices drift apart.
    rank = np.argsort(-np.array(score_list))
    return_list = [author_list[i] for i in rank]
    return return_list[0]
def similarity(p):
    x, y = p
    # Only one triangle of the pair grid is computed; mirrored pairs return 0.
    if x < y:
        return 0
    return ds.get_jaro_distance(text[x], text[y], winkler=True, scaling=0.1)
def jaroWinklerDistance(text_a, text_b):
    """
    Calculate the Jaro-Winkler score.
    Despite the library's naming, get_jaro_distance() returns a similarity:
    1.0 for identical strings, 0.0 for completely dissimilar ones.
    :param text_a: Text a
    :param text_b: Text b
    :return: Jaro-Winkler similarity value
    """
    return distance.get_jaro_distance(text_a, text_b, winkler=True,
                                      scaling=0.1)
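
# Example scores for jaroWinklerDistance(), matching the library's own test
# suite further below: similar strings score close to 1.
print(jaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"))  # 0.9
print(jaroWinklerDistance("fly", "ant"))                     # 0.0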
def score_matches(series_1, series_2):
    """
    Inputs:
        series_1, series_2 : pd.Series
            Series that each contain a single record of census data.
            Labels are the columns in the read_data function above.
            Data from series_1 must be 10 years earlier than data
            from series_2.
    Outputs:
        score : float
            Score rating the match between the two inputs. Higher is closer.
    """
    if not pd.isnull(series_1.NAMELAST) and not pd.isnull(series_2.NAMELAST):
        dist_NAMELAST = distance.get_jaro_distance(series_1.NAMELAST,
                                                   series_2.NAMELAST,
                                                   winkler=True, scaling=0.1)
    else:
        dist_NAMELAST = 0
    if not pd.isnull(series_1.NAMEFRST) and not pd.isnull(series_2.NAMEFRST):
        dist_NAMEFRST = distance.get_jaro_distance(series_1.NAMEFRST,
                                                   series_2.NAMEFRST,
                                                   winkler=True, scaling=0.1)
    else:
        dist_NAMEFRST = 0
    dist_BPL = int(series_1.BPL == series_2.BPL)
    dist_SEX = int(series_1.SEX == series_2.SEX)
    # The second census is 10 years later, so allow ages 9-11 years apart.
    dist_AGE = int(series_1.AGE == series_2.AGE - 10 or
                   series_1.AGE == series_2.AGE - 11 or
                   series_1.AGE == series_2.AGE - 9)
    dist_SERIAL = int(series_1.SERIAL == series_2.SERIAL)
    # Weight columns, where important columns get higher weights.
    weight_NAMELAST = 16
    weight_NAMEFRST = 15
    weight_BPL = 4
    weight_SEX = 3
    weight_AGE = 2
    weight_SERIAL = 1
    # Add scores weighted by importance.
    score = weight_NAMELAST * dist_NAMELAST + \
        weight_NAMEFRST * dist_NAMEFRST + \
        weight_BPL * dist_BPL + \
        weight_SEX * dist_SEX + \
        weight_AGE * dist_AGE + \
        weight_SERIAL * dist_SERIAL
    return score
def most_similar_word(sentence: str, word: str) -> str:
    msw = ''
    min_dist = 10
    for token in word_tokenize(sentence):
        # Convert the Jaro-Winkler similarity to a distance.
        word_dist = 1 - distance.get_jaro_distance(word, token, winkler=True)
        if word_dist < min_dist:
            min_dist = word_dist
            msw = token
    return msw
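
# A usage sketch for most_similar_word(), assuming NLTK's word_tokenize is
# imported and its 'punkt' tokenizer data has been downloaded.
print(most_similar_word('The quick brown fox jumps', 'quickly'))  # 'quick'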
def usernameSimilarityScore(uname1, uname2):
    """Compare usernames using Jaro distance.

    Returns a score between 0 and 1, where 1 means exact match.
    """
    if uname1 == uname2:
        return 1  # matched exactly
    else:
        return distance.get_jaro_distance(uname1, uname2, winkler=False)
def jaro_winkler_duplicate_processing(string1, string2):
    similarity = distance.get_jaro_distance(string1, string2, winkler=True,
                                            scaling=0.1)
    # Treat the pair as a duplicate above a 0.9 similarity threshold.
    return similarity >= 0.9
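
# The threshold in action, using pairs from the test suite below: the first
# scores 0.91 with the Winkler adjustment, the second 0.0.
print(jaro_winkler_duplicate_processing("ABC Corporation", "ABC Corp"))  # True
print(jaro_winkler_duplicate_processing("fly", "ant"))                   # False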
def locationSimilarityScore(loc1, loc2):
    """Compare location texts using Jaro distance.

    Returns a score between 0 and 1, where 1 means exact match.
    """
    if loc1 == loc2:
        return 1  # matched exactly
    else:
        return distance.get_jaro_distance(loc1, loc2, winkler=False)
def calculate(self, dataX, dataY):
    """
    Calculates the Jaro distance.

    Args:
        dataX: 1st string
        dataY: 2nd string

    Returns:
        float of the Jaro distance
    """
    return distance.get_jaro_distance(dataX, dataY)
def compare_masked_strings(a, b, masked_substrings):
    # Strip substrings that appear in both inputs before comparing, so
    # shared boilerplate does not inflate the similarity.
    a1 = a
    b1 = b
    for masked in masked_substrings:
        if a1.find(masked) >= 0 and b1.find(masked) >= 0:
            a1 = a1.replace(masked, '')
            b1 = b1.replace(masked, '')
    # `scaling` has no effect here since winkler=False.
    return jaro.get_jaro_distance(a1, b1, winkler=False, scaling=0.1)
def findequivalent(names_list, valor):
    # Return the name in names_list most similar to `valor`
    # ("bairro" is Portuguese for neighborhood).
    higherbairro = 0
    nome_bairro = ''
    for bairro in names_list:
        x = distance.get_jaro_distance(valor, bairro)
        if x > higherbairro:
            higherbairro = x
            nome_bairro = bairro
    return nome_bairro
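
# A usage sketch for findequivalent(): pick the closest canonical name for a
# misspelled input. The neighborhood names are made-up examples.
bairros = ['Copacabana', 'Ipanema', 'Leblon']
print(findequivalent(bairros, 'Copcabana'))  # 'Copacabana'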
def pyjarowinkler_jaro_winkler_distance(candidates, inp, min_score, winkler):
    # Keep every candidate whose similarity to `inp` reaches min_score.
    res = []
    for candidate in candidates:
        score = pyjarowinkler_distance.get_jaro_distance(candidate, inp,
                                                         winkler=winkler)
        if score >= min_score:
            res.append((candidate, score))
    return res
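
# A usage sketch, assuming `from pyjarowinkler import distance as
# pyjarowinkler_distance`. Returns (candidate, score) pairs above the cutoff.
matches = pyjarowinkler_jaro_winkler_distance(
    ['hello', 'haloa', 'world'], 'hello', min_score=0.7, winkler=True)
print(matches)  # 'hello' scores 1.0, 'haloa' about 0.76; 'world' is filtered out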
def matched_tokens(self, words):
    tokens = []
    for word in words:
        for token in self.tokens:
            # Despite the name, this is a similarity in [0, 1].
            distance = jarowinkler_distance.get_jaro_distance(
                token['word'], word, winkler=True, scaling=0.1)
            if distance > 0.90:
                tokens.append(token)
    return tokens
def test_get_jaro_distance(self):
    self.assertEqual(0.0, distance.get_jaro_distance("fly", "ant"))
    self.assertEqual(0.44, distance.get_jaro_distance("elephant", "hippo"))
    self.assertEqual(0.91, distance.get_jaro_distance("ABC Corporation",
                                                      "ABC Corp"))
    self.assertEqual(0.9, distance.get_jaro_distance("PENNSYLVANIA",
                                                     "PENNCISYLVNIA"))
    self.assertEqual(0.93, distance.get_jaro_distance(
        "D N H Enterprises Inc", "D & H Enterprises, Inc."))
    self.assertEqual(0.94, distance.get_jaro_distance(
        "My Gym Children's Fitness Center", "My Gym. Childrens Fitness"))
def __init__(self, mid, title, original_file, year, size, fps, guid, count,
             jacket, library_path):
    self.mid = mid
    self.original_file = original_file
    self.filepath = os.path.dirname(original_file)
    self.basename = os.path.basename(original_file)
    self.filename, self.file_ext = os.path.splitext(self.basename)
    self.title = title
    self.correct_title = self._clean_filename()
    # How close the stored title is to the title derived from the filename.
    self.title_distance = distance.get_jaro_distance(self.title,
                                                     self.correct_title)
    self.year = year
    self.size = size
    self.fps = fps
    self.exist = os.path.exists(original_file)
    self.matched = not guid.startswith('local://')
    self.count = count
    self.library_path = library_path
    if self.matched:
        h = hashlib.sha1(guid.encode('utf-8')).hexdigest()  # hashlib needs bytes
        self.relative_jacket_path = os.path.join(
            self._jacket_path.format(h[0], h[1:], jacket[11:]))
__author__ = 'Jean-Bernard Ratte - [email protected]'

from pyjarowinkler import distance

if __name__ == "__main__":
    first = "hello"
    second = "haloa"
    print("The words '{0}' and '{1}' have a similarity of {2}".format(
        first, second, distance.get_jaro_distance(first, second)))
def test_get_jaro_without_winkler(self):
    # `winkler_ajustment` is the library's (misspelled) keyword for
    # disabling the Winkler prefix bonus.
    self.assertEqual(distance.get_jaro_distance(
        "ZDVSXA", "ZWEIUHFSAD", winkler_ajustment=False), 0.5111111111111111)
    self.assertEqual(distance.get_jaro_distance(
        "frog", "fog", winkler_ajustment=False), 0.9166666666666666)
    self.assertEqual(distance.get_jaro_distance(
        "fly", "ant", winkler_ajustment=False), 0.0)
    self.assertEqual(distance.get_jaro_distance(
        "elephant", "hippo", winkler_ajustment=False), 0.44166666666666665)
    self.assertEqual(distance.get_jaro_distance(
        "hippo", "elephant", winkler_ajustment=False), 0.44166666666666665)
    self.assertEqual(distance.get_jaro_distance(
        "hippo", "zzzzzzzz", winkler_ajustment=False), 0.0)
    self.assertEqual(distance.get_jaro_distance(
        "hello", "hallo", winkler_ajustment=False), 0.8666666666666667)
    self.assertEqual(distance.get_jaro_distance(
        "ABC Corporation", "ABC Corp", winkler_ajustment=False),
        0.8444444444444444)
    self.assertEqual(distance.get_jaro_distance(
        "PENNSYLVANIA", "PENNCISYLVNIA", winkler_ajustment=False),
        0.8300310800310801)
    self.assertEqual(distance.get_jaro_distance(
        "My Gym Children's Fitness Center", "My Gym. Childrens Fitness",
        winkler_ajustment=False), 0.9033333333333333)
    self.assertEqual(distance.get_jaro_distance(
        "D N H Enterprises Inc", "D & H Enterprises, Inc.",
        winkler_ajustment=False), 0.9073153899240856)