def closest_matches(value, ref, num=5, thresh=1):
    """Return the best match for ``value`` in ``ref``, or the nearest few.

    Candidates are ranked by their Damerau-Levenshtein distance to ``value``.

    Args:
        value: String to look up.
        ref: Iterable of candidate strings.
        num: Maximum number of candidates returned when none is close enough.
        thresh: Largest distance at which the nearest candidate is accepted
            as the single answer.

    Returns:
        The nearest candidate itself when its distance is at most ``thresh``;
        otherwise a list of up to ``num`` nearest candidates, closest first.
        An empty list is returned when there is nothing to rank.
    """
    def distance(candidate):
        return jellyfish.damerau_levenshtein_distance(value, candidate)

    nearest = heapq.nsmallest(num, ref, key=distance)
    # With no candidates (empty ``ref`` or ``num`` <= 0) there is no
    # ``nearest[0]`` to inspect; indexing it would raise IndexError.
    if not nearest:
        return nearest
    if distance(nearest[0]) <= thresh:
        return nearest[0]
    return nearest
def correct_sentence(self, sentence, window=2, topn=10):
    """Correct mistakes in a single sentence

    Every token missing from ``self.model.wv.vocab`` is compared with the
    words that ``predict_output_word`` proposes for its surrounding window.
    The proposal with the smallest Damerau-Levenshtein distance replaces
    the token when that distance is below 3.

    Args:
        sentence (:obj:`list` of :obj:`str`): list of tokens in the sentence
        window (int): word window used to predict center word from context
        topn (int): number of most probable candidates to choose from

    Returns:
        :obj:`list` of :obj:`str`: a corrected deep copy of ``sentence``;
        the list passed in is left untouched.
    """
    sentence = copy.deepcopy(sentence)
    for i, token in enumerate(sentence):
        if token in self.model.wv.vocab:
            continue
        context = sentence[max(0, i - window):min(len(sentence), i + window + 1)]
        candidates = predict_output_word(self.model, context, topn=topn)
        # Nothing to choose from (None or an empty list): keep the token.
        # min() over an empty sequence would raise ValueError.
        if not candidates:
            continue
        # Score every candidate once; ties keep the earliest candidate.
        distance, best_candidate = min(
            ((damerau_levenshtein_distance(candidate[0], token), candidate[0])
             for candidate in candidates),
            key=lambda scored: scored[0])
        if distance < 3:
            sentence[i] = best_candidate
    return sentence
def extract(self, x, y):
    """Compare ``x`` and ``y`` by Damerau-Levenshtein distance.

    Args:
        x: First value; converted with ``unicode`` before comparison.
        y: Second value; converted with ``unicode`` before comparison.

    Returns:
        0 when either value is None. Otherwise, when ``self.similarity`` is
        truthy, the similarity ``1 - distance / longest_length`` (1.0 for
        two empty strings); when it is falsy, the raw distance.
    """
    if x is None or y is None:
        return 0
    # Convert once and measure the converted text, so the normalising length
    # always refers to the strings that were actually compared.
    left, right = unicode(x), unicode(y)
    distance = damerau_levenshtein_distance(left, right)
    if not self.similarity:
        return distance
    longest = max(len(left), len(right))
    if longest == 0:
        # Two empty strings are identical; avoid dividing by zero.
        return 1.0
    return 1 - float(distance) / longest
def measure_distance(log_data, simulation_data):
    """Pair every simulated trace with its closest unused log trace.

    For each simulated trace the log trace whose 'profile' has the smallest
    Damerau-Levenshtein distance is selected (the first one on ties),
    scored, and removed from the pool so it cannot be matched again.

    Args:
        log_data (list): dicts with 'profile', 'tbtw' and 'tbtw_list' keys.
        simulation_data (list): dicts with 'caseid', 'profile', 'tbtw' and
            'tbtw_list' keys.

    Returns:
        list: one dict per matched simulated trace with the keys caseid,
        sim_order, log_order, sim_tbtw, log_tbtw, sim_score_t, sim_score and
        abs_err. Matching stops once every log trace has been used.
    """
    similarity = list()
    temp_log_data = log_data.copy()
    for sim_instance in simulation_data:
        # Each log trace is consumed by one match; with none left there is
        # nothing to compare against (indexing would raise IndexError).
        if not temp_log_data:
            break
        sim_profile = sim_instance['profile']
        distances = [
            jf.damerau_levenshtein_distance(sim_profile, log_instance['profile'])
            for log_instance in temp_log_data]
        min_dist = min(distances)
        # list.index returns the first occurrence, i.e. the earliest trace.
        min_index = distances.index(min_dist)
        closest = temp_log_data[min_index]
        abs_err = abs(closest['tbtw'] - sim_instance['tbtw'])
        # NOTE(review): this four-argument damerau_levenshtein_distance is not
        # the jellyfish function; presumably a time-aware variant defined
        # elsewhere in the project - confirm.
        dl_t = damerau_levenshtein_distance(sim_profile,
                                            closest['profile'],
                                            sim_instance['tbtw_list'],
                                            closest['tbtw_list'])
        length = np.max([len(sim_profile), len(closest['profile'])])
        similarity.append(dict(caseid=sim_instance['caseid'],
                               sim_order=sim_profile,
                               log_order=closest['profile'],
                               sim_tbtw=sim_instance['tbtw_list'],
                               log_tbtw=closest['tbtw_list'],
                               sim_score_t=(1 - (dl_t / length)),
                               sim_score=(1 - (min_dist / length)),
                               abs_err=abs_err))
        del temp_log_data[min_index]
    return similarity
def impute_qty(instance, ingredients):
    """Estimate a quantity for ``instance`` from similar ingredients.

    ``get_similar`` is given a distance that adds the Damerau-Levenshtein
    distance between the "base" fields to the one between the "unit" fields.

    Returns:
        The mean of the ``qty`` values, converted with ``float``, of whatever
        ``get_similar`` returns.
    """
    def base_and_unit_distance(x, y):
        base_gap = jellyfish.damerau_levenshtein_distance(x["base"], y["base"])
        unit_gap = jellyfish.damerau_levenshtein_distance(x["unit"], y["unit"])
        return base_gap + unit_gap

    neighbours = get_similar(instance, ingredients, base_and_unit_distance)
    quantities = neighbours.qty.apply(float)
    return quantities.mean()
def test_corpus(self): twoWords = re.compile('([A-Za-z]+)\s+([A-Za-z]+)') with open(os.path.dirname(os.path.realpath(__file__))+os.path.sep+"birkbeck_spelling_error_corpus/ABODAT.643") as f: pairs = f.read().split(',') for p in pairs: if twoWords.search(p): fst,snd = twoWords.search(p).groups() print fst.lower(),snd.lower() print jellyfish.damerau_levenshtein_distance(unicode(fst.lower()),unicode(snd.lower())),dl_dist(fst.lower(),snd.lower())
def gen(metric, serie1, serie2, oracle, r):
    """Build the pairwise distance table between two series of traces.

    Args:
        metric (str): one of 'tsd', 'dl', 'mae' or 'dl_mae'.
        serie1 (list): trace dicts forming the rows of the comparison.
        serie2 (list): trace dicts forming the columns of the comparison.
        oracle: passed through to ``tsd_alpha``; only used for 'tsd'.
        r (list): two dicts whose 'min' values are added to the positions in
            ``serie1`` and ``serie2`` to produce the 'i' and 'j' columns.

    Returns:
        pandas.DataFrame: one row per (serie1, serie2) pair carrying the
        inputs and the distance column(s) for ``metric``. ``None`` is
        returned, after printing the traceback, when anything fails -
        including an unknown ``metric``.
    """
    try:
        df_matrix = list()
        for i, s1_ele in enumerate(serie1):
            for j, s2_ele in enumerate(serie2):
                element = {'i': r[0]['min'] + i, 'j': r[1]['min'] + j}
                if metric in ['tsd', 'dl', 'dl_mae']:
                    element['s_1'] = s1_ele['profile']
                    element['s_2'] = s2_ele['profile']
                    element['length'] = max(len(s1_ele['profile']),
                                            len(s2_ele['profile']))
                if metric == 'tsd':
                    element['p_1'] = s1_ele['proc_act_norm']
                    element['p_2'] = s2_ele['proc_act_norm']
                    element['w_1'] = s1_ele['wait_act_norm']
                    element['w_2'] = s2_ele['wait_act_norm']
                if metric in ['mae', 'dl_mae']:
                    element['et_1'] = s1_ele['end_time']
                    element['et_2'] = s2_ele['end_time']
                    element['st_1'] = s1_ele['start_time']
                    element['st_2'] = s2_ele['start_time']
                df_matrix.append(element)
        df_matrix = pd.DataFrame(df_matrix)
        if metric == 'tsd':
            df_matrix['distance'] = df_matrix.apply(
                lambda x: tsd_alpha(x.s_1, x.s_2, x.p_1, x.p_2,
                                    x.w_1, x.w_2, oracle) / x.length,
                axis=1)
        # Equality, not ``in 'dl'``: the latter is a substring test that
        # also accepts 'd', 'l' and the empty string.
        elif metric == 'dl':
            df_matrix['distance'] = df_matrix.apply(
                lambda x: jf.damerau_levenshtein_distance(
                    ''.join(x.s_1), ''.join(x.s_2)) / x.length,
                axis=1)
        elif metric == 'mae':
            df_matrix['distance'] = df_matrix.apply(
                lambda x: ae_distance(x.et_1, x.et_2, x.st_1, x.st_2),
                axis=1)
        elif metric == 'dl_mae':
            df_matrix['dl_distance'] = df_matrix.apply(
                lambda x: jf.damerau_levenshtein_distance(
                    ''.join(x.s_1), ''.join(x.s_2)) / x.length,
                axis=1)
            df_matrix['mae_distance'] = df_matrix.apply(
                lambda x: ae_distance(x.et_1, x.et_2, x.st_1, x.st_2),
                axis=1)
        else:
            raise ValueError(metric)
        return df_matrix
    except Exception:
        traceback.print_exc()
def get_closest_damerau_levenshtein(needle, haystack):
    """Return the element of ``haystack`` closest to ``needle``.

    Closeness is the Damerau-Levenshtein distance computed by jellyfish.
    When several elements share the smallest distance the first one wins.

    Args:
        needle: String to compare against.
        haystack: Iterable of candidate strings.

    Returns:
        The closest candidate, or ``None`` when ``haystack`` is empty.
    """
    closest = None
    closest_distance = None
    for candidate in haystack:
        distance = jellyfish.damerau_levenshtein_distance(needle, candidate)
        # Strict ``<`` keeps the earliest candidate on ties.
        if closest_distance is None or distance < closest_distance:
            closest, closest_distance = candidate, distance
    return closest
def score(df):
    """Score every pair of rows of ``df`` by name distance.

    All 2-combinations of the index labels are formed, the first names and
    the last names of each pair are compared, and the two distances are
    added.

    Args:
        df: DataFrame with ``first_name`` and ``last_name`` columns.

    Returns:
        DataFrame with columns ``left``, ``right`` and ``score``, restricted
        to the pairs whose score is below 4.
    """
    group_combinations = list(itertools.combinations(df.index.tolist(), 2))
    xs = [left for left, _ in group_combinations]
    ys = [right for _, right in group_combinations]
    xdf = df.loc[xs]
    ydf = df.loc[ys]
    # NOTE(review): damerau_levenshtein_distance is called on whole columns
    # here, so it is presumably a vectorised variant - confirm.
    first_name_cmp = damerau_levenshtein_distance(xdf.first_name, ydf.first_name)
    # Last names are compared with last names; the right-hand side
    # previously read ``ydf.first_name``.
    last_name_cmp = damerau_levenshtein_distance(xdf.last_name, ydf.last_name)
    total = first_name_cmp + last_name_cmp
    result = DataFrame(dict(left=xs, right=ys, score=total))
    return result[result.score < 4]
def get_closest_damerau_levenshtein(needle, haystack):
    """Return the element of ``haystack`` closest to ``needle``.

    Closeness is the Damerau-Levenshtein distance computed by jellyfish.
    When several elements share the smallest distance the first one wins.

    Args:
        needle: String to compare against.
        haystack: Iterable of candidate strings.

    Returns:
        The closest candidate, or ``None`` when ``haystack`` is empty.
    """
    closest = None
    closest_distance = None
    for candidate in haystack:
        distance = jellyfish.damerau_levenshtein_distance(needle, candidate)
        # Strict ``<`` keeps the earliest candidate on ties.
        if closest_distance is None or distance < closest_distance:
            closest, closest_distance = candidate, distance
    return closest
def comparator(element):
    """ Extract similarity features

    Reads ``element['record_a']`` and ``element['record_b']`` and returns a
    dict holding both donor ids plus the Jaro-Winkler score and the
    Damerau-Levenshtein distance of the two names and of the two addresses.
    """
    features = {}
    features['donor_id1'] = element['record_a']['donor_id']
    features['donor_id2'] = element['record_b']['donor_id']

    name_a = element['record_a']['name']
    name_b = element['record_b']['name']
    features['jaro_name'] = jf.jaro_winkler(name_a, name_b)
    features['damerau_name'] = jf.damerau_levenshtein_distance(name_a, name_b)

    address_a = element['record_a']['address']
    address_b = element['record_b']['address']
    features['jaro_address'] = jf.jaro_winkler(address_a, address_b)
    features['damerau_address'] = jf.damerau_levenshtein_distance(address_a, address_b)
    return features
def gen(metric: Metric, serie1, serie2, oracle, r):
    """Compute the pairwise distance table between two series of traces.

    One row is produced per (serie1, serie2) pair. ``r[0]['min']`` and
    ``r[1]['min']`` are added to the positions to form the 'i' and 'j'
    columns. The distance column(s) depend on ``metric``. Any exception,
    including an unsupported metric, is printed and ``None`` is returned.
    """
    def tsd_ratio(row):
        return tsd_alpha(row.s_1, row.s_2, row.p_1, row.p_2,
                         row.w_1, row.w_2, oracle) / row.length

    def control_flow_ratio(row):
        return jf.damerau_levenshtein_distance(
            ''.join(row.s_1), ''.join(row.s_2)) / row.length

    def timing_error(row):
        return ae_distance(row.et_1, row.et_2, row.st_1, row.st_2)

    try:
        rows = []
        for pos1, first in enumerate(serie1):
            for pos2, second in enumerate(serie2):
                row = {'i': r[0]['min'] + pos1, 'j': r[1]['min'] + pos2}
                if metric in [Metric.TSD, Metric.DL, Metric.DL_MAE]:
                    row['s_1'] = first['profile']
                    row['s_2'] = second['profile']
                    row['length'] = max(len(first['profile']),
                                        len(second['profile']))
                if metric is Metric.TSD:
                    row['p_1'] = first['proc_act_norm']
                    row['p_2'] = second['proc_act_norm']
                    row['w_1'] = first['wait_act_norm']
                    row['w_2'] = second['wait_act_norm']
                if metric in [Metric.MAE, Metric.DL_MAE]:
                    row['et_1'] = first['end_time']
                    row['et_2'] = second['end_time']
                    row['st_1'] = first['start_time']
                    row['st_2'] = second['start_time']
                rows.append(row)
        table = pd.DataFrame(rows)
        if metric is Metric.TSD:
            table['distance'] = table.apply(tsd_ratio, axis=1)
        elif metric is Metric.DL:
            table['distance'] = table.apply(control_flow_ratio, axis=1)
        elif metric is Metric.MAE:
            table['distance'] = table.apply(timing_error, axis=1)
        elif metric is Metric.DL_MAE:
            table['dl_distance'] = table.apply(control_flow_ratio, axis=1)
            table['mae_distance'] = table.apply(timing_error, axis=1)
        else:
            raise ValueError(metric)
        return table
    except Exception:
        traceback.print_exc()
def TMDB_get(self, response_search, film_number):
    """
    Sub-method. Confirm that a TMDB search result features the individual.

    Walks the films in ``response_search["results"]``. For each film that
    has a release date within the three years up to and including
    ``self.data["year"][film_number]``, the credits are fetched from the
    TMDB API and searched for a name within Damerau-Levenshtein distance 1
    of ``self.data["name"][film_number]``. The crew is searched when the
    category is "Directing", the cast otherwise. On the first hit the
    person's id is appended to ``self.list_id_indiv``.

    Returns True if the individual was found in the credits of one of the
    films, False otherwise. Exits the program (SystemExit) when the HTTP
    request fails.
    """
    url_MDB_credit = "http://api.tmdb.org/3/movie/{}/credits?api_key={}"
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration instead.
    API_KEY_MDB = "a68690ebf69567801e68c26ee82d7787"
    target_year = self.data["year"][film_number]
    accepted_years = (target_year, target_year - 1,
                      target_year - 2, target_year - 3)
    target_name = self.data["name"][film_number]
    # Directors are listed under "crew", everyone else under "cast".
    if self.data["category"][film_number] == "Directing":
        credit_key = "crew"
    else:
        credit_key = "cast"

    try:
        for film in response_search["results"]:
            release_date = film.get("release_date")
            if release_date in (None, 0, ''):
                continue  # no release date to check the year against
            if int(release_date[0:4]) not in accepted_years:
                continue
            response_credit = json.loads(
                requests.get(url_MDB_credit.format(film["id"], API_KEY_MDB)).text)
            # The API answers with "success": false when it has no credits.
            if response_credit.get("success") == False:  # noqa: E712
                continue
            for person in response_credit[credit_key]:
                if jellyfish.damerau_levenshtein_distance(person["name"], target_name) < 2:
                    self.list_id_indiv.append(person["id"])
                    # Stop at the first hit so the same individual is not
                    # recorded several times.
                    return True
    # Error handling
    except requests.exceptions.RequestException as e:
        print("There was an error while requesting the TMDB API. Please retry or check your connection or the status of the website. See next the error message: ", e)
        raise SystemExit(e)
    # No movie-individual pair was found
    return False
def jelly():
    """Print a few similarity scores for two sample hospital names.

    Prints, in order, the jellyfish Levenshtein distance, Jaro distance and
    Damerau-Levenshtein distance, then the fuzzywuzzy ratio.
    """
    import jellyfish
    longer = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    shorter = u'Korle Bu Teaching Hospital'
    print(jellyfish.levenshtein_distance(longer, shorter))
    print(jellyfish.jaro_distance(longer, shorter))
    print(jellyfish.damerau_levenshtein_distance(longer, shorter))
    # fuzzywuzzy is imported only after the jellyfish scores are printed.
    from fuzzywuzzy import fuzz
    print(fuzz.ratio(longer, shorter))
def damerau_sim(self):
    """Collect every pair of ``self.group`` that is similar enough.

    Similarity is ``1 - distance / longest_length`` on the ``str`` form of
    both items, using the Damerau-Levenshtein distance. Pairs whose
    similarity is at least ``self.threshold`` are stored.

    Returns:
        list: ``self.cluster``, a list of ``[item_i, item_j]`` pairs, i < j.
    """
    self.cluster = []
    for i in range(0, len(self.group)):
        first = str(self.group[i])
        for j in range(i + 1, len(self.group)):
            # Convert each item separately; ``str(a, b)`` would treat the
            # second item as an encoding and fail.
            second = str(self.group[j])
            longest = max(len(first), len(second))
            if longest == 0:
                # Two empty strings are identical; avoid dividing by zero.
                similarity = 1.0
            else:
                distance = jf.damerau_levenshtein_distance(first, second)
                similarity = 1 - float(distance) / longest
            if self.threshold <= similarity:
                self.cluster.append([self.group[i], self.group[j]])
    return self.cluster
def dl_preprocess_words(words1, words2):
    """Replace each long word of ``words1`` by its closest word in ``words2``.

    Words shorter than five characters are kept unchanged and never used as
    replacements. A word of ``words2`` qualifies when its Damerau-Levenshtein
    distance is below 40% of the shorter of the two lengths; among the
    qualifying words the closest one wins (the first one on ties).

    Every accepted improvement increments the module-level ``DL_COUNT`` and
    is logged at debug level.

    Args:
        words1: Words to normalise.
        words2: Words to use as replacements.

    Returns:
        list: the words of ``words1``, in order, each either unchanged or
        replaced.
    """
    global DL_COUNT
    unreachable = 100  # distance used when a comparison fails
    min_threshold = 0.4
    new_words = []
    for word1 in words1:
        l1 = len(word1)
        if l1 < 5:
            new_words.append(word1)
            continue
        closest_word = word1
        # Start the search afresh for every word; a best distance carried
        # over from an earlier word would wrongly reject valid matches.
        min_distance = unreachable
        for word2 in words2:
            l2 = len(word2)
            if l2 < 5:
                continue
            try:
                d = damerau_levenshtein_distance(word1, word2)
            except Exception:
                # Best effort: an incomparable pair is treated as far apart.
                d = unreachable
            if d < min_threshold * min(l1, l2) and d < min_distance:
                min_distance = d
                closest_word = word2
                DL_COUNT += 1
                logging.debug('count: %d, word1: %s, word2: %s, distance: %d',
                              DL_COUNT, word1, word2, d)
        new_words.append(closest_word)
    return new_words
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of operations (consisting of insertions,
    deletions or substitutions of a single character, or transposition of
    two adjacent characters) required to change one sequence into the other.

    Arguments may be byte strings (decoded as UTF-8) or text strings.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein(u'abcd', u'bacde')
    2
    >>> dameraulevenshtein(u'number e', u'number \u03c0')
    1
    """
    # ``bytes`` is ``str`` on Python 2, so this decodes exactly what the
    # former ``unicode(seq, 'utf-8')`` call did, and it also runs on
    # Python 3, where ``unicode`` does not exist.
    if isinstance(seq1, bytes):
        seq1 = seq1.decode('utf-8')
    if isinstance(seq2, bytes):
        seq2 = seq2.decode('utf-8')

    # Fall back onto Python implementation for code points unsupported by the C
    # implementation.
    # https://github.com/jamesturk/jellyfish/issues/55#issuecomment-312509263
    try:
        return jellyfish.damerau_levenshtein_distance(seq1, seq2)
    except ValueError:
        return py_jellyfish.damerau_levenshtein_distance(seq1, seq2)
def get_damerau_levenshtein_avg(row1, row2):
    """Average the normalised similarity of columns 1 to 14 of two rows.

    Each column contributes ``1 - distance / longest_length``, where the
    distance is the jellyfish Damerau-Levenshtein distance. Two empty
    fields are identical and contribute 1.

    Args:
        row1: Indexable of strings with at least 15 entries.
        row2: Indexable of strings with at least 15 entries.

    Returns:
        float: the mean similarity over the 14 compared columns.
    """
    total = 0  # not named ``sum``, which would shadow the builtin
    for column_index in range(1, 15):
        a = row1[column_index]
        b = row2[column_index]
        longest = max(len(a), len(b))
        if longest == 0:
            # Both fields empty: identical, and nothing to divide by.
            total += 1
            continue
        total += 1 - jellyfish.damerau_levenshtein_distance(a, b) / float(longest)
    return total / 14.0
def impute_unit(instance, ingredients):
    """Pick a unit for ``instance`` from similar ingredients.

    ``get_similar`` is given the Damerau-Levenshtein distance between the
    "base" fields as its distance.

    Returns:
        The result of ``.unit.mode()`` on whatever ``get_similar`` returns.
    """
    def base_distance(x, y):
        return jellyfish.damerau_levenshtein_distance(x["base"], y["base"])

    neighbours = get_similar(instance, ingredients, base_distance)
    return neighbours.unit.mode()
def dist_calc(author_pair):
    """Append the distance between two author names when they are close.

    The Damerau-Levenshtein distance between ``author_pair[0]`` and
    ``author_pair[1]`` is computed. When it is at most 3 it is appended to
    ``author_pair`` in place and the same list is returned; otherwise
    ``False`` is returned and the list is left unchanged.
    """
    first, second = author_pair[0], author_pair[1]
    dist = jellyfish.damerau_levenshtein_distance(first, second)
    if dist > 3:
        return False
    author_pair.append(dist)
    return author_pair
def dameraulevenshtein(seq1, seq2):
    """Return the Damerau-Levenshtein distance between two strings.

    The distance counts the additions, deletions, substitutions and
    transpositions of *consecutive* characters needed to turn the first
    sequence into the second.

    Both arguments are encoded to UTF-8 and the comparison is delegated to
    ``jellyfish.damerau_levenshtein_distance``, so they must be strings.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein('abcd', 'bacde')
    3

    Note: the real answer is 2: abcd->bacd->bacde
          but this algorithm is apparently doing abcd->acd->bacd->bacde

    NOTE(review): the last example and its note appear to describe a
    hand-written algorithm rather than the jellyfish call below - confirm
    the value jellyfish actually returns.
    """
    encoded_first = seq1.encode('utf-8')
    encoded_second = seq2.encode('utf-8')
    return jellyfish.damerau_levenshtein_distance(encoded_first, encoded_second)
def dameraulevenshtein(seq1, seq2):
    """Return the Damerau-Levenshtein distance between two sequences.

    The distance counts the additions, deletions, substitutions and
    transpositions of *consecutive* characters needed to turn the first
    sequence into the second.

    Both arguments are encoded to UTF-8 and the comparison is delegated to
    ``jellyfish.damerau_levenshtein_distance``.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2

    NOTE(review): the list example relies on ``seq2.encode``, which lists do
    not provide, so it appears to describe a hand-written algorithm rather
    than the jellyfish call below - confirm.
    """
    encoded_first = seq1.encode('utf-8')
    encoded_second = seq2.encode('utf-8')
    return jellyfish.damerau_levenshtein_distance(encoded_first, encoded_second)
def check_last_name(last_names, officer_name):
    """Collect the ``last_names`` entries that match an officer name exactly.

    An entry matches when the Damerau-Levenshtein distance between its first
    item and one of the strings in ``officer_name`` is 0. An entry is
    repeated once per matching officer name.
    """
    return [
        entry
        for entry in last_names
        for officer in officer_name
        if js.damerau_levenshtein_distance(entry[0], officer) == 0
    ]
def check_last_name(last_names, officer_name):
    """Collect the ``last_names`` entries that match an officer name exactly.

    An entry matches when the Damerau-Levenshtein distance between its first
    item and one of the strings in ``officer_name`` is 0. An entry is
    repeated once per matching officer name.
    """
    return [
        entry
        for entry in last_names
        for officer in officer_name
        if js.damerau_levenshtein_distance(entry[0], officer) == 0
    ]
def spell_suggest(word, possibilities):
    r"""Return a ordered list of spelling suggestions for `word`.

    Suggestions are drawn from `possibilities`. If the word is too
    dissimilar, or there are no possibilities, the list will be empty.

    >>> possible='''title subtitle client project author recipients version date
    ... tnc toc toc-depth title subtitle client project author recipients
    ... confidential tnc toc toc-depth'''.split()
    >>> spell_suggest('recipient', possible)
    ['recipients']
    >>> spell_suggest('t&c', possible)
    ['tnc']
    >>> spell_suggest('foobar', possible)
    []
    >>> spell_suggest('foobar', [])
    []
    """
    # pylint: disable=E1101
    # min() raises ValueError on an empty sequence.
    if not possibilities:
        return []
    target = deunicode(word)
    # (distance, index) tuples: ties go to the earliest possibility.
    dist, best_i = min(
        (jellyfish.damerau_levenshtein_distance(target, poss), index)
        for index, poss in enumerate(possibilities))
    if dist <= min(3, len(possibilities[best_i]) / 2):
        return [possibilities[best_i]]
    return []
def code_generator(digits, max_value, min_distance):
    """Generate distant enough numeric codes (Damerau-Levenshtein distance).

    Parameters
    ----------
    digits : int
        Number of digits the numeric code is made of. If needed the string
        will be padded with zeroes.
    max_value : int
        Maximal numeric value of the code.
    min_distance : int
        Minimal Damerau-Levenshtein distance between generated strings.

    Yields
    ------
    str
        A code is a string made of `digits` characters.
    """
    lexicode = []
    candidates = list(range(largest_int_with_less_digits(max_value) + 1,
                            max_value + 1))
    shuffle(candidates)
    for number in candidates:
        code = str(number).zfill(digits)
        # all() stops at the first accepted code that is too close, and is
        # vacuously true while no code has been accepted yet.
        if all(damerau_levenshtein_distance(code, accepted) >= min_distance
               for accepted in lexicode):
            lexicode.append(code)
            yield code
def calculate_distances(self, serie1, serie2, id1, id2):
    """
    Compare one trace of each series by control flow and by duration.

    Parameters
    ----------
    serie1 : list
    serie2 : list
    id1 : index of the list 1
    id2 : index of the list 2

    Returns
    -------
    dl : Damerau-Levenshtein distance between the joined profiles, divided
        by the length of the longer profile
    ae : absolute difference, in seconds, between the two trace durations
        (end_time - start_time)
    """
    first, second = serie1[id1], serie2[id2]
    longest = np.max([len(first['profile']), len(second['profile'])])
    joined_first = ''.join(first['profile'])
    joined_second = ''.join(second['profile'])
    d_l = jf.damerau_levenshtein_distance(joined_first, joined_second) / longest

    duration_first = (first['end_time'] - first['start_time']).total_seconds()
    duration_second = (second['end_time'] - second['start_time']).total_seconds()
    ae = np.abs(duration_first - duration_second)
    return d_l, ae
def alldist(filex, filey): xread = open(filex, 'r').read() yread = open(filey, 'r').read() lvd = jellyfish.levenshtein_distance(xread,yread) dlvd= jellyfish.damerau_levenshtein_distance(xread,yread) spsum = spamsum.match(xread,yread) spsum = 100 - spsum spsum = float(spsum/100.00) # print lvd res = float( lvd / 100.00 ) dres= float(dlvd / 100.00 ) # print res # print "Levenshtein Distance=",res jaro = jellyfish.jaro_distance(xread,yread) ## Added jaro-winkler distance by fahim 20111011 jarowink = jellyfish.jaro_winkler(xread,yread) jaro = 1.0 - jaro jarowink = 1.0 - jarowink # print "Jaro Distance = ",jaro ham = jellyfish.hamming_distance(xread,yread) ham = float ( ham / 100.00) print "Hamming Distance = ", ham # print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2)) # print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1)) # print "Spamsum Match score: ", spsum kl = kldiv(tokenize(xread), tokenize(yread)) return res, dres , jaro, jarowink, ham, kl, spsum
def measure_distance(not_conformant, conformant):
    """Find, for each non-conformant trace, the closest conformant trace.

    Closeness is the Damerau-Levenshtein distance between the 'profile'
    entries; the first conformant trace wins on ties. Conformant traces may
    be matched more than once.

    Returns:
        list: one dict per non-conformant trace with keys ``caseid``,
        ``sim_caseid`` and ``sim_score`` (1 - distance / longer length).
    """
    candidates = conformant.copy()
    results = list()
    for trace in not_conformant:
        profile = trace['profile']
        best_index = 0
        best_dist = jf.damerau_levenshtein_distance(profile,
                                                    candidates[0]['profile'])
        for index, candidate in enumerate(candidates):
            current = jf.damerau_levenshtein_distance(profile,
                                                      candidate['profile'])
            if current < best_dist:
                best_dist, best_index = current, index
        winner = candidates[best_index]
        length = np.max([len(profile), len(winner['profile'])])
        results.append(dict(caseid=trace['caseid'],
                            sim_caseid=winner['caseid'],
                            sim_score=(1 - (best_dist / length))))
    return results
def dameraulevenshtein(seq1, seq2):
    """Return the Damerau-Levenshtein distance between two strings.

    The distance counts the additions, deletions, substitutions and
    transpositions of *consecutive* characters needed to turn the first
    sequence into the second.

    Both arguments are encoded to UTF-8 and the comparison is delegated to
    ``jellyfish.damerau_levenshtein_distance``, so they must be strings.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein('abcd', 'bacde')
    3

    Note: the real answer is 2: abcd->bacd->bacde
          but this algorithm is apparently doing abcd->acd->bacd->bacde

    NOTE(review): the last example and its note appear to describe a
    hand-written algorithm rather than the jellyfish call below - confirm
    the value jellyfish actually returns.
    """
    encoded_first = seq1.encode('utf-8')
    encoded_second = seq2.encode('utf-8')
    return jellyfish.damerau_levenshtein_distance(encoded_first, encoded_second)
def _name_distance_indicator(pkg_name_1, pkg_name_2): if pkg_name_1 == pkg_name_2: return float( "inf" ) # We don't want the scan to report that, for instance, numpy is a name very close to that of the popular package numpy... return (2 * jellyfish.damerau_levenshtein_distance( pkg_name_1, pkg_name_2) / (len(pkg_name_1) + len(pkg_name_2)))
def get_closest_matches(s, candidates, top_n=1):
    """Return the ``top_n`` candidates most similar to ``s``.

    Args:
        s: Query string.
        candidates: Sequence of 2-item entries whose first item is the
            string compared with ``s``.
        top_n: Number of entries to return.

    Returns:
        list: ``(score, candidate)`` tuples, best first, where the score is
        ``1 - distance / len(s)`` using the Damerau-Levenshtein distance.
        Empty when ``top_n`` is not positive.
    """
    # A slice of ``[-0:]`` selects everything, so handle "no results" here.
    if top_n <= 0:
        return []
    # An empty query has length 0; normalise by 1 instead of dividing by 0.
    scale = float(max(len(s), 1))
    scores = np.array([
        1 - jellyfish.damerau_levenshtein_distance(s, c) / scale
        for c, _ in candidates
    ])
    best_first = np.argsort(scores)[-top_n:][::-1]
    return [(scores[i], candidates[i]) for i in best_first]
def simple_example():
    """Print the result of each jellyfish comparison and phonetic encoder.

    The comparisons run on u'jellyfish' / u'smellyfish', the phonetic
    encoders on u'Jellyfish'. Each result is printed on its own line.
    """
    def show(template, func, *args):
        # Evaluate the function, then append its result to the shown args.
        print(template.format(*(args + (func(*args),))))

    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    show("jellyfish.levenshtein_distance({}, {}) = {}.",
         jellyfish.levenshtein_distance, str1, str2)
    show("jellyfish.damerau_levenshtein_distance({}, {}) = {}.",
         jellyfish.damerau_levenshtein_distance, str1, str2)
    show("jellyfish.hamming_distance({}, {}) = {}.",
         jellyfish.hamming_distance, str1, str2)
    show("jellyfish.jaro_distance({}, {}) = {}.",
         jellyfish.jaro_distance, str1, str2)
    show("jellyfish.jaro_similarity({}, {}) = {}.",
         jellyfish.jaro_similarity, str1, str2)
    show("jellyfish.jaro_winkler({}, {}) = {}.",
         jellyfish.jaro_winkler, str1, str2)
    show("jellyfish.jaro_winkler_similarity({}, {}) = {}.",
         jellyfish.jaro_winkler_similarity, str1, str2)
    show("jellyfish.match_rating_comparison({}, {}) = {}.",
         jellyfish.match_rating_comparison, str1, str2)

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    show("jellyfish.metaphone({}) = {}.", jellyfish.metaphone, ss)
    show("jellyfish.soundex({}) = {}.", jellyfish.soundex, ss)
    show("jellyfish.nysiis({}) = {}.", jellyfish.nysiis, ss)
    show("jellyfish.match_rating_codex({}) = {}.",
         jellyfish.match_rating_codex, ss)
def subtract(filename):
    """Split the peptides of ``filename`` by closeness to the control set.

    A peptide is "close" when its Damerau-Levenshtein distance to at least
    one line of the ``CONTROL`` file is below the module-level ``cutoff``.

    Side effects: writes ``<filename>_.controlsubtracted`` (peptides without
    a control hit) and ``<filename>_.hitscontrol`` (peptides with a hit);
    each file is only written when its list is non-empty.

    Args:
        filename: Path of the peptide file, one sequence per line.

    Returns:
        tuple: ``(different, close)``. ``close`` holds each hit once, in
        order of first appearance; ``different`` holds every other line of
        the file, in order.
    """
    # import control file as list
    with open(CONTROL, "r") as control:
        control_list = [line.strip() for line in control]

    # import other files to subtract from
    with open(filename, "r") as peptides:
        peptides_list = [line.strip() for line in peptides]

    close = []
    # Set twin of ``close``: constant-time membership tests keep the scan
    # from becoming quadratic in the number of hits.
    close_seen = set()
    for peptide in peptides_list:
        if peptide in close_seen:
            continue  # only unique entries are recorded
        # any() stops at the first control sequence within the cutoff.
        if any(jellyfish.damerau_levenshtein_distance(str(peptide), str(ctrl)) < cutoff
               for ctrl in control_list):
            close_seen.add(peptide)
            close.append(peptide)

    # screens out any peptides that had a hit in the control
    different = [peptide for peptide in peptides_list
                 if peptide not in close_seen]

    # save the filtered lists and the removed hits as text files if the lists have content
    if len(different) > 0:
        np.savetxt(filename + "_.controlsubtracted", different, fmt="%s", delimiter="\n")
    if len(close) > 0:
        np.savetxt(filename + "_.hitscontrol", close, fmt="%s", delimiter="\n")

    return different, close
def find_approx(cmd_input: str, cmd_map: Optional[Iterable[str]]) -> Iterable[str]:
    """Finds the closest command to the passed cmd, this is used in case we
    cannot find an exact match for the cmd

    We will use two methods, unique prefix match and levenshtein distance match

    Args:
        cmd_input: What the user typed; compared in lower case.
        cmd_map: Known command names, or None when there are none.

    Returns:
        The commands starting with the input, sorted by name, when there are
        any; otherwise the commands of more than one character within
        Damerau-Levenshtein distance 2, sorted by distance and then by name.
        Empty when ``cmd_map`` is None.
    """
    # The annotation allows None; iterating it would raise TypeError.
    if cmd_map is None:
        return []
    # Normalise the input once instead of on every iteration.
    needle = str(cmd_input).lower()
    prefix_suggestions = set()
    levenshtein_suggestions = {}
    for another_command in cmd_map:
        if str(another_command).startswith(needle):
            prefix_suggestions.add(another_command)
        # removing single letter levenshtein suggestions
        # such as `?`, `q` etc
        elif len(another_command) > 1:
            distance = jellyfish.damerau_levenshtein_distance(
                needle, another_command)
            if distance <= 2:
                levenshtein_suggestions[another_command] = distance

    if prefix_suggestions:
        return sorted(prefix_suggestions)
    # sort suggestions by levenshtein distance and then by name
    return [
        k for k, _ in sorted(levenshtein_suggestions.items(),
                             key=lambda i: (i[1], i[0]))
    ]
def alldist(filex, filey):
    """Compute edit distances between the contents of two files.

    Args:
        filex: Path of the first file.
        filey: Path of the second file.

    Returns:
        tuple: ``(res, dres, jaro, jarowink, ham, kl)``. ``res`` and ``dres``
        are the Levenshtein and Damerau-Levenshtein distances divided by
        100. The other four metrics are currently disabled and are returned
        as ``None`` so the tuple keeps its six-element shape.
    """
    # Context managers close both files even when reading fails.
    with open(filex, "r") as handle:
        xread = handle.read()
    with open(filey, "r") as handle:
        yread = handle.read()

    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)
    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)

    # The Jaro, Jaro-Winkler, Hamming and KL-divergence computations are
    # switched off. Their names were still returned without ever being
    # assigned, which raised NameError; explicit placeholders avoid that.
    # NOTE(review): re-enable the computations if callers need real values.
    jaro = jarowink = ham = kl = None
    return res, dres, jaro, jarowink, ham, kl
def similarity(self, a, b):
    """Returns string similarity in range 0 - 100%.

    The Damerau-Levenshtein distance is divided by the length of the longer
    string (at least 1) and turned into an integer percentage. 50 is
    returned when the strings are too long to compare (MemoryError).
    """
    try:
        distance = damerau_levenshtein_distance(a, b)
    except MemoryError:
        # Too long string, mark them as not much similar
        return 50
    # float() keeps the ratio fractional under Python 2 as well, where
    # int / int would truncate it to 0 or 1.
    ratio = float(distance) / max(len(a), len(b), 1)
    return int(100 * (1.0 - ratio))
def similarity(self, a, b):
    """Returns string similarity in range 0 - 100%.

    The Damerau-Levenshtein distance is divided by the length of the longer
    string (at least 1) and turned into an integer percentage. 50 is
    returned when the strings are too long to compare (MemoryError).
    """
    try:
        distance = damerau_levenshtein_distance(a, b)
    except MemoryError:
        # Too long string, mark them as not much similar
        return 50
    # float() keeps the ratio fractional under Python 2 as well, where
    # int / int would truncate it to 0 or 1.
    ratio = float(distance) / max(len(a), len(b), 1)
    return int(100 * (1.0 - ratio))
def package_conflicts(packages, max_similarity_ratio=1/3, max_distance=2):
    """Yield pairs of package names that are suspiciously similar.

    Every ordered pair is considered, but only those whose first name sorts
    strictly after the second are compared. A pair is yielded when its
    Damerau-Levenshtein distance is at most ``max_distance`` and, relative
    to the shorter name, at most ``max_similarity_ratio``.
    """
    for package_x, package_y in product(packages, repeat=2):
        if package_x > package_y:
            distance = jellyfish.damerau_levenshtein_distance(package_x, package_y)
            shorter = min(len(package_x), len(package_y))
            within_ratio = distance / shorter <= max_similarity_ratio
            if within_ratio and distance <= max_distance:
                yield package_x, package_y
def mapperSimilarity(self, _, line):
    """Yield the similarity of the first two space-separated words of a line.

    The Damerau-Levenshtein distance is normalised by
    ``self.normalizeDistanceIndex``. ``(first, [second, sim])`` is yielded
    when the result exceeds the threshold of -1.0.
    """
    SIMILARITY_THRESHOLD = -1.0
    words = line.split(' ')
    first, second = words[0], words[1]
    distance = damerau_levenshtein_distance(first, second)
    sim = self.normalizeDistanceIndex(len(first), len(second), distance)
    if sim > SIMILARITY_THRESHOLD:
        yield (first, [second, sim])
def damerau_levenshtein_apply(x):
    """Return the normalised similarity of ``x[0]`` and ``x[1]``.

    The similarity is one minus the Damerau-Levenshtein distance divided by
    the longer length. When the computation fails and either value is
    null, NaN is returned; any other failure is re-raised.
    """
    try:
        left, right = x[0], x[1]
        distance = jellyfish.damerau_levenshtein_distance(left, right)
        longest = np.max([len(x[0]), len(x[1])])
        return 1 - distance / longest
    except Exception:
        either_missing = pandas.isnull(x[0]) or pandas.isnull(x[1])
        if not either_missing:
            raise
        return np.nan
def check_other_names(officer_name, poss_names):
    """Count exact name matches per candidate.

    For every entry of ``poss_names`` each string in ``entry[0]`` is
    compared with each string in ``officer_name``. Every pair at
    Damerau-Levenshtein distance 0 adds one to the count stored under
    ``entry[1]``.

    Returns:
        Counter: match counts keyed by ``entry[1]``; entries without a match
        do not appear.
    """
    tally = Counter()
    for entry in poss_names:
        hits = sum(
            1
            for alias in entry[0]
            for officer in officer_name
            if js.damerau_levenshtein_distance(alias, officer) == 0
        )
        if hits:
            tally[entry[1]] += hits
    return tally
def get_matching_mov_title(seq, gt_rows, col_name):
    """Return the ground-truth title closest to ``seq``.

    Both sides are reduced to their alphanumeric characters and
    upper-cased before the Damerau-Levenshtein distance is taken. The first
    row wins on ties. The returned value is the original, unnormalised
    ``row[col_name]``.
    """
    def normalise(text):
        return ''.join(ch for ch in text if ch.isalnum()).upper()

    query = normalise(seq)
    remaining = list(gt_rows)
    first_row = remaining.pop(0)
    best_match = first_row[col_name]
    best_score = jellyfish.damerau_levenshtein_distance(query, normalise(best_match))
    for row in remaining:
        candidate = normalise(row[col_name])
        cur_score = jellyfish.damerau_levenshtein_distance(query, candidate)
        if cur_score < best_score:
            best_score, best_match = cur_score, row[col_name]
    return best_match
def test_damerau_levenshtein_distance(self):
    """Check jellyfish's Damerau-Levenshtein distance on known pairs."""
    cases = [
        ("", "", 0),
        ("abc", "", 3),
        ("bc", "abc", 1),
        ("abc", "acb", 1),
    ]
    for first, second, expected in cases:
        actual = jellyfish.damerau_levenshtein_distance(first, second)
        self.assertEqual(actual, expected)
def similarity(self, first, second):
    """Returns string similarity in range 0 - 100%.

    The distance comes from ``damerau_levenshtein_distance``, falling back
    to ``py_damerau_levenshtein_distance`` on ValueError. A MemoryError from
    either yields 50.
    """
    def edit_distance():
        # The C version (default) fails on unicode chars
        # see https://github.com/jamesturk/jellyfish/issues/55
        try:
            return damerau_levenshtein_distance(first, second)
        except ValueError:
            return py_damerau_levenshtein_distance(first, second)

    try:
        distance = edit_distance()
    except MemoryError:
        # Too long string, mark them as not much similar
        return 50

    longest = max(len(first), len(second), 1)
    ratio = float(distance) / longest
    return int(100 * (1.0 - ratio))
def distance(string_1, string_2):
    """Compute the edit distance between two strings.

    Returns the ``jsonify`` response of a dict holding the result of each
    metric for the two strings.
    """
    pair = (string_1, string_2)
    payload = {}
    payload["levenshtein"] = jellyfish.levenshtein_distance(*pair)
    payload["damerau-levenshtein"] = jellyfish.damerau_levenshtein_distance(*pair)
    payload["jaro"] = jellyfish.jaro_distance(*pair)
    payload["jaro-winkler"] = jellyfish.jaro_winkler(*pair)
    # This key holds the result of match_rating_comparison.
    payload["match_rating_codex"] = jellyfish.match_rating_comparison(*pair)
    payload["sift3"] = pymailcheck.sift3_distance(*pair)
    return jsonify(payload)
def stringDistance(str1, str2):
    """
    Return distance between two strings
    String distance : jaro + levenshtein + damerau

    The result is ``0.5 * jaro + 0.25 * (1 - levenshtein / norm)
    + 0.25 * (1 - damerau / norm)`` where ``norm`` is the length of the
    longer string. 0 is returned when either string is empty.

    Byte strings are decoded as UTF-8 first; text strings are used as is.
    """
    distance = 0
    if len(str1) > 0 and len(str2) > 0:
        # ``bytes`` is ``str`` on Python 2, so byte strings are decoded as
        # before, while text strings no longer go through ``.decode``.
        if isinstance(str1, bytes):
            str1 = str1.decode('utf-8')
        if isinstance(str2, bytes):
            str2 = str2.decode('utf-8')
        jaro = jellyfish.jaro_distance(str1, str2)
        leven = jellyfish.levenshtein_distance(str1, str2)
        damerau = jellyfish.damerau_levenshtein_distance(str1, str2)
        # A float norm keeps the ratios fractional; int / int truncates to
        # 0 or 1 under Python 2.
        norm = float(max(len(str1), len(str2)))
        distance = 0.5 * jaro + 0.25 * (1 - leven / norm) \
            + 0.25 * (1 - damerau / norm)
    return distance
def find_distances(file1, file2):
    """Compute all pairwise distances between the lines of two files.

    Every stripped line of ``file1`` is compared with every stripped line
    of ``file2`` using the Damerau-Levenshtein distance.

    Returns:
        tuple: ``(distances, mean, stdv)`` - the list of distances in
        file1-major order, their ``np.mean`` and their ``np.std``.
    """
    def read_stripped(path):
        with open(path, "r") as handle:
            return [line.strip() for line in handle.readlines()]

    first_lines = read_stripped(file1)
    second_lines = read_stripped(file2)

    distances = [
        jellyfish.damerau_levenshtein_distance(left, right)
        for left in first_lines
        for right in second_lines
    ]
    return distances, np.mean(distances), np.std(distances)
def calc_distances(product, company):
    """Match each token of the company name to its closest product token.

    Both names are split on single spaces. For every company token the
    product token with the smallest Damerau-Levenshtein distance is kept
    (the first one on ties).

    Returns:
        list: one dict per company token with keys ``company_token``,
        ``product_token`` and ``lowest_distance``.
    """
    product_tokens = product['name'].split(' ')
    company_tokens = company['name'].split(' ')

    matches = []
    for company_token in company_tokens:
        scored = [
            (jellyfish.damerau_levenshtein_distance(product_token, company_token),
             product_token)
            for product_token in product_tokens
        ]
        if scored:
            # min() returns the first of several equally small entries.
            lowest_distance, matched_token = min(scored, key=lambda pair: pair[0])
        else:
            lowest_distance, matched_token = None, None
        matches.append({
            'company_token': company_token,
            'product_token': matched_token,
            'lowest_distance': lowest_distance
        })
    return matches
def get_matching_seq(target_seq, gt_seqs_dict):
    """Return the key of the ground-truth sequence closest to ``target_seq``.

    Both sides are reduced to their alphanumeric characters and
    upper-cased before the Damerau-Levenshtein distance is taken. The first
    key in iteration order wins on ties.

    Args:
        target_seq: Sequence to look up.
        gt_seqs_dict: Mapping from key to ground-truth sequence.

    Returns:
        The key of the closest sequence, or 0 when the mapping is empty.
    """
    def normalise(text):
        return ''.join(c for c in text if c.isalnum()).upper()

    target_seq = normalise(target_seq)
    # Infinity instead of a magic 10000, so very long, very different
    # sequences can still win.
    best_score = float('inf')
    best_matching_index = 0
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` only on 2.
    for key, gt_seq in gt_seqs_dict.items():
        curr_score = jellyfish.damerau_levenshtein_distance(
            target_seq, normalise(gt_seq))
        if curr_score < best_score:
            best_score = curr_score
            best_matching_index = key
    return best_matching_index
def measure_string_distance(s1, s2, method):
    '''
    Compare two strings with the method selected by ``method``.

    Two methods measure string similarity, three compare phonetic encodings.
    Method code to method name:
    1. jaro-winkler distance
    2. damerau-levenshtein distance
    3. Metaphone
    4. NYSIIS
    5. match_rating_codex

    note:
    for methods 3, 4 and 5, the result is 1 (match) or 0 (not match)
    for method 2, the result is 1 - distance / longer length, or 0 when
    the computation fails
    for method 1, the result is whatever jellyfish.jaro_winkler returns

    0 is returned when either string is empty or the method code is not
    one of the five above.
    '''
    result = 0
    if s1 == '' or s2 == '':
        return result

    if method == 1:
        result = jellyfish.jaro_winkler(s1, s2)
    elif method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # float() keeps the ratio fractional under Python 2 as well.
            result = 1 - (float(diff) / max(len(s1), len(s2)))
        except Exception:
            # Best effort: strings that cannot be compared score 0.
            result = 0
    elif method == 3:
        result = 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    elif method == 4:
        result = 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    elif method == 5:
        result = 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0

    return result
def populate_topics_from_phantom_forms(cls): all_forms = phantom_on_the_capitol.retrieve_form_elements([x.bioguide_id for x in Legislator.query.all()]) all_topics = {} for legislator, req in all_forms.iteritems(): for key, val in req.iteritems(): for step in val: if step['value'] == '$TOPIC': if type(step['options_hash']) is dict: keys = step['options_hash'].keys() else: keys = step['options_hash'] for k in keys: k = k.strip() if all_topics.has_key(k): all_topics[k] += 1 else: all_topics[k] = 1 failed_topics = [] for topic, count in all_topics.iteritems(): result = select_solver.choose('test', [topic.lower()]) if result is None: failed_topics.append(topic.lower()) elif result: db_first_or_create(Topic, name=topic.lower()) all_topics = Topic.query.filter_by(wikipedia_parent=None) for f_topic in failed_topics: try: lowest = (None, None) for topic in all_topics: print topic.name, f_topic d = jellyfish.damerau_levenshtein_distance(unicode(str(topic.name)), unicode(str(f_topic))) if lowest[0] is None or lowest[1] > d: lowest = (topic, d) print 'Adding ' + f_topic + ' with parent ' + lowest[0].name db_first_or_create(Topic, name=f_topic, wikipedia_parent=lowest[0].id) except: continue
def find(self, name_alias_id, fuzzy=False):
    """
    Look up a security by name, alias or identifier.

    :param name_alias_id: text to search for
    :param fuzzy: when False, match by name/alias substring or by exact
        isin_id/yahoo_id; when True, pick the security whose name or alias
        has the smallest Damerau-Levenshtein distance (case-insensitive)
    :return: exact mode: the first matching security, or None.
        Fuzzy mode: a ``(security, distance)`` tuple; only distances below
        2.5 are accepted, otherwise ``(None, 2.5)`` is returned.
    """
    if not fuzzy:
        matches = (
            Security.objects.filter(name__contains=name_alias_id)
            | Security.objects.filter(aliases__contains=name_alias_id)
            | Security.objects.filter(isin_id=name_alias_id)
            | Security.objects.filter(yahoo_id=name_alias_id)
        )
        if matches:
            return matches[0]
        return None

    # Damerau-Levenshtein counts a transposition as a single edit,
    # e.g. 'jellyfish' vs 'jellyfihs' -> 1.
    best_distance = 2.5
    best_security = None
    for security in Security.objects.all():
        if isinstance(security.aliases, list):
            names = security.aliases + [security.name]
        else:
            names = [security.name]
        for candidate in names:
            distance = jellyfish.damerau_levenshtein_distance(
                name_alias_id.lower(), candidate.lower())
            # Strict comparison: on ties the first candidate seen is kept.
            if distance < best_distance:
                best_distance = distance
                best_security = security
    return best_security, best_distance
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between two strings.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first string into the
    second.  Transpositions are exchanges of *consecutive* characters; all
    other operations are self-explanatory.

    The work is delegated to ``jellyfish.damerau_levenshtein_distance``,
    so only strings are supported (not arbitrary sequences).  Text is
    passed through unchanged so the distance is counted in characters;
    byte strings are decoded as UTF-8 first, because comparing raw UTF-8
    bytes would count a single non-ASCII character as several edits.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    """
    def _as_text(value):
        # On Python 2 ``bytes`` is ``str``, so plain str literals are decoded too.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    return jellyfish.damerau_levenshtein_distance(_as_text(seq1), _as_text(seq2))
def similarityMeasures(row1, row2):
    """Compare two rows column by column and return the scores as CSV text.

    Columns 1..14 of both rows are compared (index 0, the id column, is
    skipped).  The returned string holds, comma separated and formatted
    with six decimals:

      * the average Jaro score,
      * the average Jaro-Winkler score,
      * the average normalised Levenshtein similarity,
      * the average normalised Damerau-Levenshtein similarity,
      * followed by the 14 per-column Jaro scores.

    The normalised similarities are ``1 - distance / max(len(a), len(b))``.
    When both values of a column are empty that ratio is undefined; such a
    pair is counted as identical (similarity 1.0) instead of raising
    ZeroDivisionError.
    """
    num_columns = 14.0
    jaro_scores = []
    jaro_winkler_sum = 0
    levenshtein_sum = 0
    damerau_levenshtein_sum = 0

    for columnIndex in range(1, 15):  # skips id column
        a = row1[columnIndex]
        b = row2[columnIndex]
        # Keep the per-column Jaro score; it is reported again at the end.
        jaro_scores.append(jellyfish.jaro_distance(a, b))
        jaro_winkler_sum += jellyfish.jaro_winkler(a, b)

        longest = float(max(len(a), len(b)))
        if longest:
            levenshtein_sum += 1 - jellyfish.levenshtein_distance(a, b) / longest
            damerau_levenshtein_sum += (
                1 - jellyfish.damerau_levenshtein_distance(a, b) / longest)
        else:
            # Two empty strings: zero edits apart, treat as a perfect match.
            levenshtein_sum += 1
            damerau_levenshtein_sum += 1

    averages = [
        sum(jaro_scores) / num_columns,
        jaro_winkler_sum / num_columns,
        levenshtein_sum / num_columns,
        damerau_levenshtein_sum / num_columns,
    ]
    return ",".join("%.6f" % value for value in averages + jaro_scores)
# Levenshtein Distance # Damerau-Levenshtein Distance # Jaro Distance # Jaro-Winkler Distance # Match Rating Approach Comparison # Hamming Distance # Phonetic encoding: # American Soundex # Metaphone # NYSIIS (New York State Identification and Intelligence System) # Match Rating Codex import jellyfish print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish')) # 2; 编辑距离 print(jellyfish.jaro_distance('jellyfish', 'smellyfish')) # 0.89629629629629637 print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; 编辑距离, 带翻转的 print(jellyfish.metaphone('Jellyfish')) # 'JLFX' print(jellyfish.soundex('Jellyfish')) # 'J412' print(jellyfish.nysiis('Jellyfish')) # 'JALYF' print(jellyfish.match_rating_codex('Jellyfish')) # 'JLLFSH' ################################################################## ## Lenvenshtein import Levenshtein print(Levenshtein.hamming('hello', 'helol')) # 2; 计算汉明距离; 要求 str1 和 str2 必须长度一致; 是描述两个等长字串之间对应位置上不同字符的个数 print(Levenshtein.distance('hello', 'helol')) # 2; 计算编辑距离(也成 Levenshtein 距离); 是描述由一个字串转化成另一个字串最少的操作次数, 在其中的操作包括插入 & 删除 & 替换 print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5 print(Levenshtein.ratio('hello', 'helol')) # 0.8; 计算莱文斯坦比; 计算公式 r = (sum - ldist) / sum, 其中 sum 是指 str1 和 str2 字串的长度总和, ldist 是类编辑距离 # 注意: 这里的类编辑距离不是 2 中所说的编辑距离, 2 中三种操作中每个操作+1, 而在此处, 删除、插入依然+1, 但是替换+2 # 这样设计的目的: ratio('a', 'c'), sum=2, 按 2 中计算为(2-1)/2 = 0.5,' a','c'没有重合, 显然不合算, 但是替换操作+2, 就可以解决这个问题 print(Levenshtein.jaro('hello', 'helol')) # 0.9333333333333332; 计算 jaro 距离; 用于健康普查
damerau_levenshtein_avg = 0 levenshtein_avg = 0 for columnIndex in xrange(1,15): a = table[index1][columnIndex][1:] b = table[index2][columnIndex][1:] if a=="" or b=="": numColumns -= 1 else: jaro_tmp = jellyfish.jaro_distance(a, b) jaro[columnIndex] = jaro_tmp jaro_avg += jaro_tmp jaro_winkler_avg += jellyfish.jaro_winkler(a, b) damerau_levenshtein_avg += 1 - jellyfish.damerau_levenshtein_distance(a, b) / float(max(len(a), len(b))) levenshtein_avg += 1 - jellyfish.levenshtein_distance(a, b) / float(max(len(a), len(b))) jaro_avg /= numColumns jaro_winkler_avg /= numColumns damerau_levenshtein_avg /= numColumns # apply the learned rules from the trained model: #if jaro_winkler_avg >= 0.844955 or ((damerau_levenshtein_avg >= 0.650227) and (jaro_winkler_avg >= 0.833977)): # results_file.write(table[index1][0] + "\t" + table[index2][0] + "\n") ''' #duplicate = [int(table[index1][0]), int(table[index2][0])] isDuplicateInReal = duplicate in trueDuplicates'''