def jaro_sim(self):
    """Cluster all pairs of group entries whose Jaro similarity meets the threshold.

    Stores and returns a list of two-element lists ``[a, b]`` where
    ``jf.jaro_similarity(a, b) >= self.threshold``.
    """
    self.cluster = []
    count = len(self.group)
    for left in range(count):
        for right in range(left + 1, count):
            first, second = str(self.group[left]), str(self.group[right])
            if self.threshold <= jf.jaro_similarity(first, second):
                self.cluster.append([first, second])
    return self.cluster
def simple_example():
    """Demo: print every jellyfish comparison metric and phonetic encoding."""
    # String comparison metrics between two sample strings.
    str1, str2 = u'jellyfish', u'smellyfish'
    comparison_metrics = (
        'levenshtein_distance',
        'damerau_levenshtein_distance',
        'hamming_distance',
        'jaro_distance',
        'jaro_similarity',
        'jaro_winkler',
        'jaro_winkler_similarity',
        'match_rating_comparison',
    )
    for metric_name in comparison_metrics:
        metric = getattr(jellyfish, metric_name)
        print("jellyfish.{}({}, {}) = {}.".format(
            metric_name, str1, str2, metric(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    for encoder_name in ('metaphone', 'soundex', 'nysiis', 'match_rating_codex'):
        encoder = getattr(jellyfish, encoder_name)
        print("jellyfish.{}({}) = {}.".format(encoder_name, ss, encoder(ss)))
def getClosestPlayerName(playerName: str) -> str:
    """Return the saved player name closest to *playerName* by edit distance.

    The closest candidate is accepted only when its Jaro similarity to the
    query exceeds 0.7; otherwise the literal string "No Match" is returned.
    """
    # min() returns the first minimal element, exactly like sorted(...)[0].
    closest = min(
        playerNames,
        key=lambda knownName: jellyfish.levenshtein_distance(knownName, playerName))
    if jellyfish.jaro_similarity(closest, playerName) > 0.7:
        return closest
    return "No Match"
def jaro_apply(x):
    """Jaro similarity of the pair ``x = (a, b)``.

    When the computation fails *and* either element is null (per
    ``pandas.isnull``), the failure is treated as "missing value" and NaN is
    returned; any other failure propagates unchanged.
    """
    try:
        return jaro_similarity(x[0], x[1])
    except Exception:
        # Re-raise unless a missing value explains the failure.
        if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
            raise
        return np.nan
def correct_state(df):
    """Normalize the 'state' column of *df* against known Australian state codes.

    Rows whose value is not in the known list are nulled; values that are
    close (0.62 < Jaro < 1) to a known code are replaced by that code.
    Returns the (mutated) DataFrame.

    NOTE(review): `df.state[index] = ...` is chained assignment — pandas may
    warn (SettingWithCopy) and the write is not guaranteed; `df.at[index,
    'state']` would be the safe form. Confirm before relying on this.
    """
    state_list = ['sa','wa','nsw','qld','tas','vic','act','jbt','nt','cx','cc','hm']
    for index, row in df.iterrows():
        # Compare the row's state value against every known code.
        for index1, row2 in enumerate(state_list):
            value = row['state']
            score = jellyfish.jaro_similarity(str(value),str(row2))
            # Unknown non-null value: null it out (re-executed for each code,
            # which is redundant but harmless).
            if(value != None and value not in (state_list)):
                df.state[index] = None
            # sa and wa are too close to other codes, so they are not compared.
            elif(score > 0.62 and score < 1 and value != None and value != 'sa' and value != 'wa'):
                df.state[index] = row2
    return df
async def on_raw_message_edit(self, payload: discord.RawMessageUpdateEvent):
    """Flag suspicious message edits.

    Posts an embed describing the edit when a non-bot message is changed so
    that the new content has low Jaro similarity (<= 0.45) to the original,
    or the new text is at most a fifth of the original's length.
    """
    # Raw events only carry a cached message when it was in the message
    # cache; the old code crashed with AttributeError otherwise.
    if payload.cached_message is None:
        return
    if payload.cached_message.author.bot:
        return
    # Edits that only touch embeds/flags carry no 'content' key in the raw
    # payload; the old code raised KeyError on those.
    if 'content' not in payload.data:
        return
    before = payload.cached_message.content
    after = payload.data['content']
    similarity = jellyfish.jaro_similarity(before, after)
    # Trigger on low textual similarity or a drastic shortening.
    if similarity <= 0.45 or len(after) * 5 <= len(before):
        embed = await self.createEmbed(payload.cached_message, 'Edit')
        embed.description = "edit '{0}' to '{1}'".format(before, after)
        await payload.cached_message.channel.send(embed=embed)
def measure_distance(word1, word2, distance_type):
    """Distance between two words under the metric named by *distance_type*.

    Supported types: 'lv' (Levenshtein), 'dlv' (Damerau-Levenshtein),
    'jw' (Jaro-Winkler), 'j' (Jaro), 'hm' (Hamming). Similarity metrics
    ('jw', 'j') are negated so that smaller always means closer.

    Raises:
        ValueError: if *distance_type* is not one of the supported codes
            (previously this crashed with UnboundLocalError).
    """
    if distance_type == 'lv':
        return Levenshtein.eval(word1, word2)
    if distance_type == 'dlv':
        return jellyfish.damerau_levenshtein_distance(word1, word2)
    if distance_type == 'jw':
        # Jaro-Winkler indicates similarity; take the inverse for a distance.
        return -jellyfish.jaro_winkler_similarity(word1, word2)
    if distance_type == 'j':
        return -jellyfish.jaro_similarity(word1, word2)
    if distance_type == 'hm':
        return jellyfish.hamming_distance(word1, word2)
    raise ValueError("unknown distance_type: {!r}".format(distance_type))
def find_similar_file(origin_file_name, files):
    """Find the JSON file whose stem best matches *origin_file_name*.

    Only files with a 'json' extension are considered, and only matches of
    at least 70% Jaro similarity qualify. Returns a tuple of the best file
    name (with extension; '' when nothing qualifies) and its score 0-100.
    """
    best_name = ''
    best_score = 0
    for candidate in files:
        stem, extension = os.path.splitext(candidate)
        extension = extension[1:]
        if extension != 'json':
            continue
        score = jellyfish.jaro_similarity(origin_file_name, stem) * 100
        # '>=' keeps the last of equally-scored candidates, as before.
        if score >= 70 and score >= best_score:
            best_score = score
            best_name = stem + '.' + extension
    return best_name, best_score
def comparacion_pares(self, texto1, texto2, tipo="levenshtein", norm=None):
    """Compare two input texts under a chosen distance/similarity criterion.

    :param texto1: First text to compare.
    :type texto1: str
    :param texto2: Second text to compare.
    :type texto2: str
    :param tipo: Comparison criterion to apply. Defaults to
        ``'levenshtein'``.
    :type tipo: {'damerau_levenshtein', 'levenshtein', 'hamming',
        'jaro_winkler', 'jaro'}, optional
    :param norm: Normalize the result by text length: ``1`` uses the
        shorter text, ``2`` the longer one. Ignored for Jaro-based
        measures.
    :type norm: {1, 2}, optional
    :return: (float) Result of comparing ``texto1`` with ``texto2``, or
        ``None`` when *tipo* is not recognized.
    """
    tipo = tipo.lower()
    # Substring matching keeps e.g. 'jaro_winkler' from hitting 'jaro';
    # order of the checks matters.
    if "damerau" in tipo:
        salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
    elif "levenshtein" in tipo:
        salida = jellyfish.levenshtein_distance(texto1, texto2)
    elif "hamming" in tipo:
        salida = jellyfish.hamming_distance(texto1, texto2)
    elif "winkler" in tipo:
        salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
    elif "jaro" in tipo:
        salida = jellyfish.jaro_similarity(texto1, texto2)
    else:
        print(
            (
                "Por favor seleccione un criterio válido "
                "para comparar los strings."
            )
        )
        return None
    # Length normalization does not apply to the (already 0-1) Jaro family.
    if norm in [1, 2] and "jaro" not in tipo:
        lengths = (len(texto1), len(texto2))
        salida /= min(lengths) if norm == 1 else max(lengths)
    return salida
def extract_closest_match(search_key, target_list, score_cutoff=0):
    """Return the string from *target_list* most similar to *search_key*.

    Similarity is Jaro (0 to 1). Candidates must score at least
    *score_cutoff*; when nothing qualifies, None is returned. Ties keep
    the last-seen candidate.

    search_key (str): A string used to search for closest match.
    target_list (list): A list of strings for comparison.
    score_cutoff (float): A score cutoff (between 0 and 1) to be met.
    """
    best_key = None
    best_score = score_cutoff
    for candidate in target_list:
        candidate_score = jellyfish.jaro_similarity(search_key, candidate)
        if candidate_score >= best_score:
            best_score, best_key = candidate_score, candidate
    return best_key
def word_deduplication(self, threshold=0.5):
    """Drop candidate keywords that are near-duplicates of a kept keyword.

    The first candidate seeds the kept list; every candidate whose Jaro
    similarity to some already-kept keyword exceeds *threshold* is
    discarded. The surviving keywords replace ``self.candidateKeywords``.
    """
    keywords = []
    # Seed with the first candidate so there is something to compare against.
    for index, key in enumerate(self.candidateKeywords):
        if index > 0:
            break
        keywords.append(key)
    for candidate in self.candidateKeywords:
        # BUG FIX: compare against the *kept* keywords, not against every
        # candidate. The old code compared each candidate to the full
        # candidate list (including itself), so the similarity was always
        # 1.0 > 0.3 and nothing beyond the seed was ever kept. It also
        # ignored the `threshold` parameter and appended a `(word, word)`
        # tuple instead of the word itself.
        skip = False
        for kept in keywords:
            if jaro_similarity(candidate, kept) > threshold:
                skip = True
                break
        if not skip:
            keywords.append(candidate)
    self.candidateKeywords = keywords
def find_double(self, with_fusion=True):
    """Detect (and optionally merge) duplicate films in ``self.pows``.

    Two films are suspected duplicates when their lower-cased titles have
    Jaro similarity > 0.97, their years match, and their ids differ. With
    *with_fusion*, the lower-quality one is merged into the better one.
    Returns the number of successful fusions.

    NOTE(review): each unordered pair is visited twice (p1,p2 and p2,p1),
    and `self.fusion` presumably mutates `self.pows` while it is being
    iterated — confirm that is safe / intended.
    """
    log("Recherche des doublons sur les films")
    rc = 0
    for p1 in self.pows:
        for p2 in self.pows:
            d = jellyfish.jaro_similarity(p1.title.lower(), p2.title.lower())
            # id check prevents comparing a film with itself.
            if d > 0.97 and p1.year == p2.year and p1.id != p2.id:
                log("Suspission de doublon entre " + str(p1) + " et " + str(p2))
                if with_fusion:
                    # Merge the lower-quality film into the higher-quality one.
                    if p1.quality_score() > p2.quality_score():
                        b = self.fusion(p2, p1)
                    else:
                        b = self.fusion(p1, p2)
                    if b:
                        log("Fusion réalisée")
                        rc = rc + 1
    return rc
def word_deduplication(self, threshold=.8):
    """Return candidate keywords, sorted, with near-duplicates removed.

    Candidates whose (case-insensitive) Jaro similarity to an already-kept
    keyword exceeds *threshold* are dropped. NOTE: the seeding loop below
    keeps the first TWO candidates (the break fires after the second
    append) — preserved exactly as in the original.
    """
    kept = []
    for position, entry in enumerate(self.candidateKeywords):
        kept.append(entry[0])
        if position > 0:
            break
    for candidate in self.candidateKeywords:
        text = candidate[0]
        # any() short-circuits exactly like the original flag-and-break.
        is_duplicate = any(
            jaro_similarity(existing.lower(), text.lower()) > threshold
            for existing in kept
        )
        if not is_duplicate:
            kept.append(text)
    return sorted(x[0] for x in self.candidateKeywords if x[0] in kept)
def comparacion_pares(self, texto1, texto2, tipo='levenshtein', norm=None):
    """Compare two input texts under a chosen distance/similarity criterion.

    :param texto1: (str) First text to compare.
    :param texto2: (str) Second text to compare.
    :param tipo: (str) {'damerau_levenshtein', 'levenshtein', 'hamming',
        'jaro_winkler', 'jaro'} Default: 'levenshtein'. Comparison
        criterion to apply.
    :param norm: (int) {1, 2} Default: None. Normalize the result by text
        length: 1 uses the shorter text, 2 the longer one. Ignored for
        Jaro-based measures.
    :return: (float or int) Result of the comparison, or None when *tipo*
        is not recognized.
    """
    tipo = tipo.lower()
    # Ordered substring dispatch: 'winkler' must be tested before 'jaro'.
    dispatch = (
        ('damerau', jellyfish.damerau_levenshtein_distance),
        ('levenshtein', jellyfish.levenshtein_distance),
        ('hamming', jellyfish.hamming_distance),
        ('winkler', jellyfish.jaro_winkler_similarity),
        ('jaro', jellyfish.jaro_similarity),
    )
    for marker, comparator in dispatch:
        if marker in tipo:
            salida = comparator(texto1, texto2)
            break
    else:
        print(
            'Por favor seleccione un criterio válido para comparar los strings.'
        )
        return None
    # Length normalization does not apply to the (already 0-1) Jaro family.
    if norm in [1, 2] and 'jaro' not in tipo:
        lengths = (len(texto1), len(texto2))
        salida /= min(lengths) if norm == 1 else max(lengths)
    return salida
def jaro_distance(A, B):
    """Jaro distance between A and B: one minus their Jaro similarity."""
    similarity = jaro_similarity(A, B)
    return 1 - similarity
def diff(user, music_filter, spotify, output, min_threshold, max_threshold):
    """Report local tracks' coverage of a Spotify library.

    Builds slugs for both track sets, outputs the Spotify tracks missing
    locally when everything matched, and otherwise lists the closest local
    match (by Jaro similarity, within the given thresholds) for each
    missing Spotify track, plus summary statistics.
    """
    spotify_tracks = spotify.tracks()
    # Slug = "artist-title", lower-cased/cleaned by slugify.
    spotify_tracks_by_slug = {
        slugify(f"""{t['track']['artists'][0]['name']}-{t['track']['name']}""",
                stopwords=STOPWORDS, replacements=REPLACEMENTS):  # type: ignore
        t for t in spotify_tracks
    }
    local_tracks = user.do_filter(music_filter)
    local_tracks_by_slug = {
        slugify(f"""{t['artist']}-{t['title']}""",
                stopwords=STOPWORDS, replacements=REPLACEMENTS):  # type: ignore
        t for t in local_tracks
    }
    # Spotify slugs with no exact local counterpart.
    spotify_differences = set(spotify_tracks_by_slug.keys()).difference(
        set(local_tracks_by_slug.keys()))
    spotify_slug_tracks = collections.OrderedDict(
        (d, spotify_tracks_by_slug[d]) for d in sorted(spotify_differences))
    local_tracks_found = len(spotify_tracks_by_slug) - len(spotify_differences)
    # Everything matched exactly: just output the leftover Spotify tracks.
    if len(local_tracks) == local_tracks_found:
        return output_tracks(output, spotify_slug_tracks.values())
    distances_tracks = []
    for spotify_slug, spotify_track in spotify_slug_tracks.items():
        # Jaro similarity of this Spotify slug against every local slug.
        distances = {
            local_slug: jellyfish.jaro_similarity(spotify_slug, local_slug)
            for local_slug in local_tracks_by_slug
        }
        if not distances:
            continue
        closest_local_track = max(distances.items(), key=operator.itemgetter(1))
        closest_local_slug = closest_local_track[0]
        closest_distance = closest_local_track[1]
        if min_threshold <= closest_distance <= max_threshold:
            # Tracks already flagged as Spotify errors are skipped.
            if 'spotify-error' in local_tracks_by_slug[closest_local_slug][
                    'keywords']:
                continue
            distances_tracks.append({
                'local_track': local_tracks_by_slug[closest_local_slug],
                'local_slug': closest_local_slug,
                'spotify_track': spotify_track,
                'spotify_slug': spotify_slug,
                'distance': closest_distance,
            })
    print_distances(distances_tracks)
    print(f"min threshold : {min_threshold}")
    print(f"max threshold : {max_threshold}")
    print(f"spotify tracks : {len(spotify_tracks)}")
    print(f"spotify slugs: {len(spotify_tracks_by_slug)}")
    print(f"local tracks : {len(local_tracks)}")
    print(f"local tracks slugs : {len(local_tracks_by_slug)}")
    # NOTE(review): this f-string was severed across lines in the extracted
    # source; reconstructed as a single literal — confirm against the original.
    print(f"found in local : {local_tracks_found}")
    print(f"not found in local : {len(spotify_differences)}")
import pandas as pd
import jellyfish

# Script: score name similarity of HMRC importer company names against the
# first name in a pre-matched sample. Paths are hard-coded to a local
# Windows checkout.
importer_list = pd.read_csv(
    r'C:\Users\S\PycharmProjects\CompanyNames\HMRC\importsNames.csv')
importer_names = importer_list[['NAME']].drop_duplicates()
# One-off code that produced matched.csv, kept for provenance:
# sample_df = pd.read_csv(r'C:\Users\S\PycharmProjects\CompanyNames\data\raw\company_names.csv')
#
#
# x= pd.merge(sample_df,importer_names,how='inner',left_on = ['CompanyName'],right_on=['NAME'])
# x=x[['NAME']].sample(100)
# x.to_csv('matched.csv',index=None )
x = pd.read_csv(r'./HMRC/matched.csv')
# Reference name: first row of the matched sample.
y = x['NAME'][0]
# Jaro similarity of every other name against the reference.
z = [[i, jellyfish.jaro_similarity(i, y)] for i in x['NAME'] if y != i]
# Match-rating-approach comparison (boolean) for the same pairs.
z3 = [[i, jellyfish.match_rating_comparison(i, y)] for i in x['NAME'] if y != i]
z2 = pd.DataFrame(z)
def jaro_similarity(s1, s2):
    """None-safe wrapper around ``J.jaro_similarity``.

    Returns None when either argument is None; otherwise the Jaro
    similarity of the two strings.
    """
    # `is None` rather than `== None` (PEP 8): identity check, immune to
    # operand types overriding __eq__.
    if s1 is None or s2 is None:
        return None
    return J.jaro_similarity(s1, s2)
def getClosestTeamName(teamName: str) -> str:
    """Return the saved team name with the highest Jaro similarity to *teamName*."""
    # max() returns the first maximal element, matching the original
    # stable sorted(..., reverse=True)[0].
    return max(
        teamNames,
        key=lambda knownName: jellyfish.jaro_similarity(knownName, teamName))
    # NOTE(review): this `return` is the tail of a function (apparently the
    # cosine-similarity helper called below) whose `def` line lies before
    # this chunk — its indentation here is reconstructed.
    return float(numerator) / denominator


def textToVector(text):
    """Bag-of-words vector: a Counter of the WORD-regex matches in *text*."""
    words = WORD.findall(text)
    return Counter(words)


# Script: compare two reinterpreted files under several string metrics and
# dump the scores to results.txt.
# NOTE(review): the two file handles are never closed — consider `with`.
second = open("reinterpreted_file_1.txt", "r")
first = open("reinterpreted_file_2.txt", "r")
text1 = second.read()
text2 = first.read()
vector1 = textToVector(text1)
vector2 = textToVector(text2)
cosine = calculateCosineSimilarity(vector1, vector2)
data = {
    'cosine': cosine,
    'jaro_similarity': jellyfish.jaro_similarity(text1, text2),
    'jaro_winkler_similarity': jellyfish.jaro_winkler_similarity(text1, text2),
    'levenshtein_distance': jellyfish.levenshtein_distance(text1, text2),
    'damerau_levenshtein_distance': jellyfish.damerau_levenshtein_distance(text1, text2),
    'hamming_distance': jellyfish.hamming_distance(text1, text2)
}
with open('results.txt', 'w') as outfile:
    json.dump(data, outfile)
def jaro_similarity(self, a, b):
    """Jaro similarity of strings *a* and *b* (delegates to jellyfish)."""
    return jellyfish.jaro_similarity(a, b)