Example #1
0
def jaro_winkler(s1, s2):
    """
    Compute the Jaro Winkler measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The Jaro Winkler measure if both the strings are not missing (i.e NaN),
        else  returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> em.jaro_winkler('MARTHA', None)
        nan

    """

    # Missing input short-circuits to NaN. A plain float NaN is returned
    # instead of ``pd.np.NaN`` because the ``pd.np`` alias no longer exists in
    # pandas 2.0+; the value handed back to callers is the same float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Both inputs are passed through the project's string conversion helper
    # before scoring.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def jaro_winkler(s1, s2):
    """
    Compute the Jaro Winkler measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed. Values that are neither text nor bytes are converted
            with ``str()`` before scoring.

    Returns:
        The Jaro Winkler measure if both the strings are not missing (i.e NaN),
        else  returns NaN.
    """

    # Missing input short-circuits to NaN. A plain float NaN is returned
    # instead of ``pd.np.NaN`` because the ``pd.np`` alias no longer exists in
    # pandas 2.0+; the value handed back to callers is the same float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Leave text/bytes untouched; stringify everything else (numbers, ...).
    text_types = six.string_types + (bytes,)
    if not isinstance(s1, text_types):
        s1 = str(s1)
    if not isinstance(s2, text_types):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def textdistance_jaro_winkler_distance(candidates, inp, min_score, winkler):
  """
  Score every candidate against ``inp`` and keep the ones that score high enough.

  Args:
      candidates: iterable of candidate strings.
      inp: the string each candidate is compared with.
      min_score: inclusive lower bound a score must reach to be kept.
      winkler: when truthy the Jaro-Winkler raw score is used, otherwise the
          plain Jaro raw score.

  Returns:
      A list of ``(candidate, score)`` tuples, in the order of ``candidates``.
  """
  if winkler:
    scorer = py_stringmatching.JaroWinkler().get_raw_score
  else:
    scorer = py_stringmatching.Jaro().get_raw_score

  # Lazily pair each candidate with its score, then filter on the threshold.
  scored = ((candidate, scorer(candidate, inp)) for candidate in candidates)
  return [pair for pair in scored if pair[1] >= min_score]
Example #4
0
 def jaro_winkler_score(self, str_pair, sim_score=True):
     """
     Jaro-Winkler comparison of the two strings held in ``str_pair``.

     The pair is first passed through ``self._check_input``.

     :param str_pair: the pair of strings to compare
     :param sim_score: if truthy return the measure's similarity score,
         otherwise its raw score
     :return: similarity score or raw score (0 to 1)
     """
     first, second = self._check_input(str_pair)
     measure = sm.JaroWinkler()
     if sim_score:
         return measure.get_sim_score(first, second)
     return measure.get_raw_score(first, second)
Example #5
0
 def monge_elkan_score(self,
                       str_pair,
                       sim_func=sm.JaroWinkler().get_raw_score):
     """
     Monge-Elkan comparison of the two token collections held in ``str_pair``.

     The pair is first passed through ``self._check_input`` with ``type_=list``.

     :param str_pair: the pair of token collections to compare
     :param sim_func: secondary similarity function handed to the Monge-Elkan
         measure (defaults to the Jaro-Winkler raw score)
     :return: raw_score
     """
     tokens_a, tokens_b = self._check_input(str_pair, type_=list)
     measure = sm.MongeElkan(sim_func=sim_func)
     score = measure.get_raw_score(tokens_a, tokens_b)
     return score
def jaro_winkler(s1, s2):
    """
    Compute the Jaro Winkler measure between the two input strings.

    Args:
        s1,s2: The input values for which the similarity measure should be
            computed.

    Returns:
        The Jaro Winkler raw score if neither input is missing (None / NaN),
        else NaN.
    """
    # Missing input short-circuits to NaN. A plain float NaN is returned
    # instead of ``pd.np.NaN`` because the ``pd.np`` alias no longer exists in
    # pandas 2.0+; the value handed back to callers is the same float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Both inputs are passed through the project's string conversion helper
    # before scoring.
    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.JaroWinkler()
    return measure.get_raw_score(s1, s2)
 def extract_jarowinkler_distance(queried_name, predicted_name):
     """
     Compute the Jaro-Winkler raw score for each aligned pair of names.

     Args:
         queried_name: indexable collection of query strings; its length
             determines the length of the result.
         predicted_name: indexable collection of predicted strings, read at
             the same positions as ``queried_name``.

     Returns:
         A float array with one score per position. A position whose pair
         could not be scored holds NaN, and its index is printed.
     """
     jw = sm.JaroWinkler()
     # Pre-fill with NaN so a failed pair is an explicit missing value rather
     # than whatever bytes an uninitialised buffer happened to contain.
     res = np.full(len(queried_name), np.nan, dtype=float)
     for i in tqdm(range(len(queried_name))):
         try:
             res[i] = jw.get_raw_score(queried_name[i], predicted_name[i])
         except Exception:
             # Best effort: report the offending position and carry on.
             # ``Exception`` (not a bare except) keeps Ctrl-C working.
             print(i)
     return res
Example #8
0
    def __init__(self):
        """Create one instance of each similarity measure plus the tokenizer."""
        # Order matters to anything indexing into ``similarity_function``.
        measure_classes = (
            sm.BagDistance,
            sm.Cosine,
            sm.Dice,
            sm.Editex,
            sm.GeneralizedJaccard,
            sm.Jaccard,
            sm.Jaro,
            sm.JaroWinkler,
            sm.Levenshtein,
            sm.OverlapCoefficient,
            sm.TverskyIndex,
        )
        self.similarity_function = [
            measure_cls() for measure_cls in measure_classes
        ]

        # Tokenizer configured to return a set of tokens.
        self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
Example #9
0
def jaro_winkler(s1, s2):
    """
    Compute the Jaro Winkler measure between the two input strings.

    Args:
        s1,s2: The input values for which the similarity measure should be
            computed. Values that are neither text nor bytes are converted
            with ``str()`` before scoring.

    Returns:
        The Jaro Winkler raw score if neither input is missing (None / NaN),
        else NaN.
    """
    # Missing input short-circuits to NaN. A plain float NaN is returned
    # instead of ``pd.np.NaN`` because the ``pd.np`` alias no longer exists in
    # pandas 2.0+; the value handed back to callers is the same float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    # Leave text/bytes untouched; stringify everything else (numbers, ...).
    text_types = six.string_types + (bytes,)
    if not isinstance(s1, text_types):
        s1 = str(s1)
    if not isinstance(s2, text_types):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
# Feature-vector components. Each FVC.stringMatchExcerpts call is given a
# label, a similarity measure and a tokenizer; each FVC.stringMatchTitles call
# is given a label and a similarity measure only.

# Set-based measures on whitespace tokens and on q-grams.
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(),
                              sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                               sm.WhitespaceTokenizer(return_set=True))
# NOTE(review): this reuses the label 'Dice' from the component above, whereas
# the other q-gram variants use a 'Fuzz...' label -- confirm this is intended.
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
# NOTE(review): unlike jacq3/diceq3 no qval is passed here, so the tokenizer's
# default q is used even though the variable is named ...q3 -- confirm.
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(return_set=True))

# Sequence-based measures; no tokenizer is supplied for these.
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
    """
    Write a header row followed by the given rows to a CSV file.

    Every field is quoted (``csv.QUOTE_ALL``). An existing file at
    ``fileName`` is overwritten.

    Args:
        fileName: path of the file to create.
        header: sequence of column names, written as the first row.
        tableList: iterable of rows, each a sequence of field values.
    """
    # Text mode with newline='' is what the csv module expects on Python 3
    # (a binary handle makes csv.writer fail); the ``with`` block guarantees
    # the file is flushed and closed even if a row fails to serialise.
    with open(fileName, 'w', newline='') as csv_file:
        wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        wr.writerows(tableList)


# Given a set of feature vector components, records precision and recall over several
# classifiers.  Records output to a table and vertical bar plot.


def modelExperiment(insampleData,
                    outsampleData,
Example #11
0
	Args:
		 artist_name - the artist's name to be examined (str)
	Return:
		a hopeful noise-free artist name (str)
	'''
    if not artist_name: return artist_name
    artist_name = re.sub('(\(|\[)+ *[0-9]+ *(\)|\])+ *$', '',
                         artist_name).strip()
    return artist_name


# Module-level Levenshtein measure instance, used to compute the edit distance
# between two strings.
lev = sm.Levenshtein()

# Module-level Jaro-Winkler measure instance, used to compare two strings.
jw = sm.JaroWinkler()


def maximize_assignment(matrix):
    '''
	Solve the assignment problem presented by the similarity matrix using hungarian algorithm,
	and produce a mapping the maximize the similarity
	Args:
		matrix - a float similarity matrix representing an assignment problem (float[][])
	Return:
		the maximized consine score of the similarity matrix (float)
		the optimal mapping in the form of a list of (row, col) tuples ((int, int)[])
	'''
    max_matrix = make_cost_matrix(matrix)
    mapping = Munkres().compute(max_matrix)
    cost = 0
def make_all_string_related_features_inplace(df_before_preprocessing,
                                             df_after_preprocessing):
    """
    Derive the string-based features and store them on the given frames.

    Both arguments are modified in place: ``df_before_preprocessing`` gains a
    ``<column>_normalized`` column for every raw text column used here, and
    ``df_after_preprocessing`` receives the resulting feature columns.

    Args:
        df_before_preprocessing: frame holding the raw text columns (vendor
            names, expense type names, expense categories).
        df_after_preprocessing: frame that collects the generated features.
    """
    raw = df_before_preprocessing
    features = df_after_preprocessing

    # --- normalized copies of the raw text columns -------------------------
    for column in (
            'vendor_name_expense',
            'purchase_vendor_receipts',
            'travel_vendor_receipts',
            'expense_type_name_expense',
            'expense_type_name_itemization',
            'expense_category_itemization',
            'expense_category_expense',
    ):
        raw[column + '_normalized'] = raw[column].apply(normalize)

    def stopword_flags(source_column, stop_words):
        # Per-row result of check_membership_if_in_stopwords for one column.
        return raw[source_column].apply(
            lambda text: check_membership_if_in_stopwords(text, stop_words))

    # --- vendor name against the stop-word lists ---------------------------
    features['vendor_name_in_hotel_name_stopwords'] = stopword_flags(
        'vendor_name_expense_normalized',
        POST_NORMALIZATION_STOP_WORDS_FOR_VENDOR_NAME)
    features['vendor_name_in_car_stopwords'] = stopword_flags(
        'vendor_name_expense_normalized',
        POST_NORMALIZATION_STOP_WORDS_FOR_CAR_VENDOR_NAME)
    features['vendor_name_in_flights_name_stopwords'] = stopword_flags(
        'vendor_name_expense_normalized',
        STOP_WORDS_FOR_FLIGHTS_VENDOR_NAME_EXPENSE)

    # --- tax marker in the expense type names ------------------------------
    features['tax_str_in_expense_type_name_expense'] = raw[
        'expense_type_name_expense_normalized'].apply(
            lambda text: check_if_tax_in_str(text))
    features['tax_str_in_expense_type_name_itemization'] = raw[
        'expense_type_name_itemization_normalized'].apply(
            lambda text: check_if_tax_in_str(text))

    # --- car related features ----------------------------------------------
    features['expense_type_name_in_car_stopwords_itemization'] = stopword_flags(
        'expense_type_name_itemization_normalized',
        POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME)
    features['expense_type_name_in_car_stopwords_expense'] = stopword_flags(
        'expense_type_name_expense_normalized',
        POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME)

    features['car_str_in_expense_type_name_itemization'] = raw[
        'expense_type_name_itemization_normalized'].apply(check_if_car_in_str)
    features['car_str_in_expense_type_name_expense'] = raw[
        'expense_type_name_expense_normalized'].apply(check_if_car_in_str)

    # --- hotel / lodging marker (read from the expense *category* columns) --
    features['hotel_str_in_expense_type_name_itemization'] = raw[
        'expense_category_itemization_normalized'].apply(
            check_if_hotel_lodging_in_str)
    features['hotel_str_in_expense_type_name_expense'] = raw[
        'expense_category_expense_normalized'].apply(
            check_if_hotel_lodging_in_str)

    # --- flights and rail stop-word features -------------------------------
    features['expense_type_name_in_flights_stopwords_itemization'] = stopword_flags(
        'expense_type_name_itemization_normalized',
        STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME)
    features['expense_type_name_in_flights_stopwords_expense'] = stopword_flags(
        'expense_type_name_expense_normalized',
        STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME)

    features['expense_type_name_in_rail_stopwords_itemization'] = stopword_flags(
        'expense_type_name_itemization_normalized',
        STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME)
    features['expense_type_name_in_rail_stopwords_expense'] = stopword_flags(
        'expense_type_name_expense_normalized',
        STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME)

    # --- string-equality features ------------------------------------------
    # from anlaysis19_flights
    budget_check_list = ['hotel', 'flight', 'car', 'rail']
    make_features_which_checks_if_str_equals(
        'expense_type_name_itemization', budget_check_list, raw, features)

    # from anlaysis19_flights
    expense_check_list = [
        'other', 'hotel', 'taxi', 'flight', 'car', 'fees_and_misc', 'rail',
        'bus', 'train'
    ]
    for column in (
            'expense_type_name_expense',
            'expense_type_name_itemization',
            'expense_category_expense',
            'expense_category_itemization',
    ):
        make_features_which_checks_if_str_equals(
            column, expense_check_list, raw, features)

    # --- specific vendors ----------------------------------------------------
    features['airbnb_in_vendor_name_expense'] = raw[
        'vendor_name_expense_normalized'].apply(check_if_airbnb_in_str)
    features['omega_world_travel_in_vendor_name_expense'] = raw[
        'vendor_name_expense_normalized'].apply(
            check_if_omega_world_travel_in_str)

    # --- matching vendorname -------------------------------------------------
    me = sm.MongeElkan(sim_func=sm.JaroWinkler().get_raw_score)

    def best_vendor_similarity(row):
        # Compare the expense vendor name with both receipt vendor names
        # (token lists from str.split) and keep the larger Monge-Elkan score.
        purchase_score = me.get_raw_score(
            row['vendor_name_expense_normalized'].split(),
            row['purchase_vendor_receipts_normalized'].split())
        travel_score = me.get_raw_score(
            row['travel_vendor_receipts_normalized'].split(),
            row['vendor_name_expense_normalized'].split())
        return max(purchase_score, travel_score)

    features['mongeelkan_jaro_wink'] = raw.apply(best_vendor_similarity,
                                                 axis=1)