def jaro_winkler(s1, s2):
    """
    This function computes the Jaro Winkler measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Jaro Winkler measure if both the strings are not missing
        (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>>
        >>> em.jaro_winkler('MARTHA', None)
        nan
    """
    # Missing values propagate: any None/NaN input yields NaN.
    # float('nan') replaces pd.np.NaN — the pd.np alias was deprecated in
    # pandas 1.0 and removed in 2.0; behavior is identical.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Normalize both inputs to unicode strings before scoring.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def jaro_winkler(s1, s2):
    """
    This function computes the Jaro Winkler measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Jaro Winkler measure if both the strings are not missing
        (i.e NaN), else returns NaN.
    """
    # Missing values propagate: any None/NaN input yields NaN.
    # float('nan') replaces pd.np.NaN — the pd.np alias was deprecated in
    # pandas 1.0 and removed in 2.0; behavior is identical.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Coerce non-string inputs (e.g. numbers) to str so the measure can
    # score them; str/unicode and bytes pass through unchanged.
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)
    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def textdistance_jaro_winkler_distance(candidates, inp, min_score, winkler):
    """Score every candidate against ``inp`` and keep the good matches.

    Uses Jaro-Winkler when ``winkler`` is truthy, plain Jaro otherwise.
    Returns a list of ``(candidate, score)`` pairs whose score is at
    least ``min_score``, in candidate order.
    """
    measure = py_stringmatching.JaroWinkler() if winkler else py_stringmatching.Jaro()
    scored = ((cand, measure.get_raw_score(cand, inp)) for cand in candidates)
    return [pair for pair in scored if pair[1] >= min_score]
def jaro_winkler_score(self, str_pair, sim_score=True):
    """
    calculate jaro winkler similarity between two strings

    :return: similarity score or raw score (0 to 1)
    """
    first, second = self._check_input(str_pair)
    measure = sm.JaroWinkler()
    # Normalized similarity by default; raw score on request.
    if sim_score:
        return measure.get_sim_score(first, second)
    return measure.get_raw_score(first, second)
def monge_elkan_score(self, str_pair, sim_func=None):
    """
    calculate monge elkan similarity between two single sets of tokens

    :param str_pair: pair of token lists to compare
    :param sim_func: optional inner similarity function; defaults to
        JaroWinkler raw score. Resolved lazily so no measure object is
        built at import/definition time (the old default
        ``sm.JaroWinkler().get_raw_score`` was evaluated when the module
        loaded, even if this method was never called).
    :return: raw_score
    """
    e1, e2 = self._check_input(str_pair, type_=list)
    if sim_func is None:
        sim_func = sm.JaroWinkler().get_raw_score
    me = sm.MongeElkan(sim_func=sim_func)
    return me.get_raw_score(e1, e2)
def jaro_winkler(s1, s2):
    """Compute the Jaro-Winkler measure of two strings.

    Returns NaN if either input is missing (None/NaN), otherwise the raw
    Jaro-Winkler score.
    """
    # float('nan') replaces pd.np.NaN — the pd.np alias was deprecated in
    # pandas 1.0 and removed in 2.0; behavior is identical.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')
    # Normalize both inputs to unicode strings before scoring.
    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)
    measure = sm.JaroWinkler()
    return measure.get_raw_score(s1, s2)
def extract_jarowinkler_distance(queried_name, predicted_name):
    """Element-wise Jaro-Winkler score between two parallel name sequences.

    Returns a float numpy array of len(queried_name); positions where
    scoring fails are set to NaN (the index is printed for inspection).
    """
    jw = sm.JaroWinkler()
    res = np.empty(len(queried_name), dtype=float)
    for i in tqdm(range(len(queried_name))):
        try:
            res[i] = jw.get_raw_score(queried_name[i], predicted_name[i])
        except Exception:
            # Was a bare `except:` that also left res[i] as uninitialized
            # np.empty garbage; mark the failure explicitly with NaN.
            print(i)
            res[i] = np.nan
    return res
def __init__(self):
    # One instance per supported string-similarity measure, in a fixed order.
    measure_classes = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    self.similarity_function = [cls() for cls in measure_classes]
    # Tokenizer shared by the set-based measures; return_set=True dedupes tokens.
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
def jaro_winkler(s1, s2):
    """Compute the Jaro-Winkler measure of two strings.

    Returns NaN if either input is missing (None/NaN); non-string inputs
    are coerced to str before scoring.
    """
    # float('nan') replaces pd.np.NaN — the pd.np alias was deprecated in
    # pandas 1.0 and removed in 2.0; behavior is identical.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Coerce non-string inputs (e.g. numbers) to str; str/unicode and
    # bytes pass through unchanged.
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)
    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(), sm.WhitespaceTokenizer(return_set=True)) jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True)) dice = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.WhitespaceTokenizer(return_set=True)) diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.QgramTokenizer(qval=3, return_set=True)) cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True)) cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True)) LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein()) sw = FVC.stringMatchTitles('SW', sm.SmithWaterman()) nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch()) jw = FVC.stringMatchTitles('JW', sm.JaroWinkler()) def writeToCSV(fileName, header, tableList): wr = csv.writer(open(fileName, 'wb'), quoting=csv.QUOTE_ALL) wr.writerow(header) for row in tableList: wr.writerow(row) # Given a set of feature vector components, records precision and recall over several # classifiers. Records output to a table and vertical bar plot. def modelExperiment(insampleData, outsampleData,
    Args:
        artist_name - the artist's name to be examined (str)
    Return:
        a hopeful noise-free artist name (str)
    '''
    # Empty/None names pass through unchanged.
    if not artist_name:
        return artist_name
    # Strip a trailing bracketed number such as " (2)" or " [3]" —
    # presumably a disambiguation suffix — then trim whitespace.
    artist_name = re.sub('(\(|\[)+ *[0-9]+ *(\)|\])+ *$', '', artist_name).strip()
    return artist_name


# an object used to compute the edit distance between two strings
lev = sm.Levenshtein()
# an object used to compute the Jaro-Winkler distance between two strings
jw = sm.JaroWinkler()


def maximize_assignment(matrix):
    '''
    Solve the assignment problem presented by the similarity matrix using
    hungarian algorithm, and produce a mapping the maximize the similarity

    Args:
        matrix - a float similarity matrix representing an assignment problem (float[][])
    Return:
        the maximized consine score of the similarity matrix (float)
        the optimal mapping in the form of a list of (row, col) tuples ((int, int)[])
    '''
    # Convert similarities to costs so Munkres (a minimizer) effectively
    # maximizes total similarity.
    max_matrix = make_cost_matrix(matrix)
    mapping = Munkres().compute(max_matrix)
    cost = 0
    # NOTE(review): the function body continues beyond this chunk — the
    # accumulation of `cost` over `mapping` is not visible here.
def make_all_string_related_features_inplace(df_before_preprocessing, df_after_preprocessing):
    """Derive string-based feature columns, mutating both frames in place.

    Adds ``*_normalized`` text columns to ``df_before_preprocessing`` and
    boolean/score feature columns to ``df_after_preprocessing``.
    Returns None; both DataFrames are modified in place.
    """
    # --- Normalize raw text columns once; all features below read the
    # --- cleaned ``*_normalized`` copies, leaving the originals intact.
    df_before_preprocessing[
        'vendor_name_expense_normalized'] = df_before_preprocessing[
        'vendor_name_expense'].apply(normalize)
    df_before_preprocessing[
        'purchase_vendor_receipts_normalized'] = df_before_preprocessing[
        'purchase_vendor_receipts'].apply(normalize)
    df_before_preprocessing[
        'travel_vendor_receipts_normalized'] = df_before_preprocessing[
        'travel_vendor_receipts'].apply(normalize)
    df_before_preprocessing[
        'expense_type_name_expense_normalized'] = df_before_preprocessing[
        'expense_type_name_expense'].apply(normalize)
    df_before_preprocessing[
        'expense_type_name_itemization_normalized'] = df_before_preprocessing[
        'expense_type_name_itemization'].apply(normalize)
    df_before_preprocessing[
        'expense_category_itemization_normalized'] = df_before_preprocessing[
        'expense_category_itemization'].apply(normalize)
    df_before_preprocessing[
        'expense_category_expense_normalized'] = df_before_preprocessing[
        'expense_category_expense'].apply(normalize)
    # --- Stop-word membership flags on the vendor name.
    df_after_preprocessing['vendor_name_in_hotel_name_stopwords'] = \
        df_before_preprocessing['vendor_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       POST_NORMALIZATION_STOP_WORDS_FOR_VENDOR_NAME))
    df_after_preprocessing['vendor_name_in_car_stopwords'] = \
        df_before_preprocessing['vendor_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       POST_NORMALIZATION_STOP_WORDS_FOR_CAR_VENDOR_NAME))
    df_after_preprocessing['vendor_name_in_flights_name_stopwords'] = \
        df_before_preprocessing['vendor_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       STOP_WORDS_FOR_FLIGHTS_VENDOR_NAME_EXPENSE))
    # --- Substring flags: tax / car / hotel keywords in expense type/category.
    df_after_preprocessing['tax_str_in_expense_type_name_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(lambda x: check_if_tax_in_str(x))
    df_after_preprocessing['tax_str_in_expense_type_name_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(lambda x: check_if_tax_in_str(x))
    df_after_preprocessing['expense_type_name_in_car_stopwords_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_car_stopwords_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME))
    df_after_preprocessing['car_str_in_expense_type_name_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(check_if_car_in_str)
    df_after_preprocessing['car_str_in_expense_type_name_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(check_if_car_in_str)
    # NOTE(review): the two 'hotel_str_in_expense_type_name_*' features read
    # the expense_category_* columns, not expense_type_name_* — confirm the
    # naming mismatch is intentional.
    df_after_preprocessing['hotel_str_in_expense_type_name_itemization'] = \
        df_before_preprocessing['expense_category_itemization_normalized'].apply(check_if_hotel_lodging_in_str)
    df_after_preprocessing['hotel_str_in_expense_type_name_expense'] = \
        df_before_preprocessing['expense_category_expense_normalized'].apply(check_if_hotel_lodging_in_str)
    # --- Flights / rail stop-word flags on the expense type name.
    df_after_preprocessing['expense_type_name_in_flights_stopwords_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_flights_stopwords_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_rail_stopwords_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_rail_stopwords_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(x,
                                                       STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME))
    # --- Exact-equality features (lists originally derived in analysis19_flights).
    budget_check_list = ['hotel', 'flight', 'car', 'rail']
    make_features_which_checks_if_str_equals('expense_type_name_itemization',
                                             budget_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    # from analysis19_flights
    expense_check_list = [
        'other', 'hotel', 'taxi', 'flight', 'car', 'fees_and_misc', 'rail',
        'bus', 'train'
    ]
    make_features_which_checks_if_str_equals('expense_type_name_expense',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    make_features_which_checks_if_str_equals('expense_type_name_itemization',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    make_features_which_checks_if_str_equals('expense_category_expense',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    make_features_which_checks_if_str_equals('expense_category_itemization',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    # --- Specific vendor flags.
    df_after_preprocessing[
        'airbnb_in_vendor_name_expense'] = df_before_preprocessing[
        'vendor_name_expense_normalized'].apply(check_if_airbnb_in_str)
    df_after_preprocessing[
        'omega_world_travel_in_vendor_name_expense'] = df_before_preprocessing[
        'vendor_name_expense_normalized'].apply(
            check_if_omega_world_travel_in_str)
    # matching vendorname: Monge-Elkan (with Jaro-Winkler as inner measure)
    # over whitespace tokens; take the better of the two receipt columns.
    # NOTE(review): the argument order differs between the two calls
    # (vendor-vs-purchase, then travel-vs-vendor) and Monge-Elkan is
    # asymmetric — confirm this is intentional.
    me = sm.MongeElkan(sim_func=sm.JaroWinkler().get_raw_score)
    df_after_preprocessing['mongeelkan_jaro_wink'] = df_before_preprocessing.apply(lambda row: max(
        me.get_raw_score(row[
            'vendor_name_expense_normalized'].split(), row[
            'purchase_vendor_receipts_normalized'].split()), \
        me.get_raw_score(row[
            'travel_vendor_receipts_normalized'].split(), row[
            'vendor_name_expense_normalized'].split())
    ), axis=1)