Ejemplo n.º 1
0
def monge_elkan(arr1, arr2):
    """
    Compute the Monge-Elkan measure between two collections of tokens.

    The score is produced by ``sm.MongeElkan()`` in its default
    configuration, i.e. with the Jaro-Winkler measure as the secondary
    similarity function.

    Args:
        arr1,arr2 (list, set, tuple or scalar): The token collections for
            which the Monge-Elkan measure should be computed. A list is used
            as is; a set, frozenset or tuple is copied into a list; any
            other non-None value is treated as a single token.

    Returns:
        The Monge-Elkan measure if both inputs are not None and do not
        contain any missing token (i.e. None/NaN), else NaN.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in 2.0, so the NaN
    # sentinel is built with the builtin float instead.
    missing = float('nan')

    if arr1 is None or arr2 is None:
        return missing

    if not isinstance(arr1, list):
        # Sets/tuples are token collections: expand them so that every token
        # is checked for missing values. Anything else is one single token.
        if isinstance(arr1, (set, frozenset, tuple)):
            arr1 = list(arr1)
        else:
            arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return missing

    if not isinstance(arr2, list):
        if isinstance(arr2, (set, frozenset, tuple)):
            arr2 = list(arr2)
        else:
            arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return missing

    # Create Monge-Elkan measure object
    measure = sm.MongeElkan()
    # Call the function to compute the Monge-Elkan measure
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 2
0
def normMongeElkanSimStrings(str1, str2):
    """Return the raw Monge-Elkan score of two strings.

    Each string is passed to ``sm.MongeElkan`` as a one-element token list.
    If either string is empty, the fixed fallback value 0.58 is returned
    instead of a computed score.
    """
    if len(str1) == 0 or len(str2) == 0:
        # Fallback constant; per the original note this is roughly the
        # average observed for target 0.
        return 0.58

    scorer = sm.MongeElkan()
    return scorer.get_raw_score([str1], [str2])
Ejemplo n.º 3
0
 def monge_elkan_score(self,
                       str_pair,
                       sim_func=sm.JaroWinkler().get_raw_score):
     """
     Compute the raw Monge-Elkan score for one pair of token collections.

     :param str_pair: the pair to compare; it is unpacked into two values by
         ``self._check_input(str_pair, type_=list)`` (presumably validated
         and/or coerced to lists there -- confirm against that helper)
     :param sim_func: secondary similarity function handed to
         ``sm.MongeElkan``; defaults to the raw Jaro-Winkler score
     :return: raw_score
     """
     tokens_a, tokens_b = self._check_input(str_pair, type_=list)
     return sm.MongeElkan(sim_func=sim_func).get_raw_score(tokens_a, tokens_b)
Ejemplo n.º 4
0
def monge_elkan(arr1, arr2):
    """
    Compute the Monge-Elkan measure between two collections of tokens.

    Args:
        arr1,arr2 (list, set, tuple or scalar): The token collections to
            compare. A list is used as is; a set, frozenset or tuple is
            copied into a list; any other non-None value is treated as a
            single token.

    Returns:
        The score returned by ``sm.MongeElkan().get_raw_score`` if both
        inputs are not None and contain no missing token (None/NaN),
        else NaN.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in 2.0, so the NaN
    # sentinel is built with the builtin float instead.
    missing = float('nan')

    if arr1 is None or arr2 is None:
        return missing

    if not isinstance(arr1, list):
        # Sets/tuples are token collections: expand them so that every token
        # is checked for missing values. Anything else is one single token.
        if isinstance(arr1, (set, frozenset, tuple)):
            arr1 = list(arr1)
        else:
            arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return missing

    if not isinstance(arr2, list):
        if isinstance(arr2, (set, frozenset, tuple)):
            arr2 = list(arr2)
        else:
            arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return missing

    # Create Monge-Elkan measure object
    measure = sm.MongeElkan()
    # Call the function to compute the Monge-Elkan measure
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 5
0
# Preview the first rows of the frame (only displayed when this runs as a
# notebook cell; the "# In[..]" markers below are notebook cell boundaries).
df.head()

# In[34]:

# Tversky index, see https://en.wikipedia.org/wiki/Tversky_index
# The two constructor arguments are the alpha and beta weights (0.3 and 0.6
# here, assuming positional order alpha, beta -- confirm against the library).
# Setting both alpha and beta to 0.5 would make it equal to Dice similarity.
tvi = sm.TverskyIndex(0.3, 0.6)
# Row-wise similarity between the 'aTokens' and 'bTokens' columns.
df['Tversky'] = df.apply(
    lambda x: tvi.get_sim_score(x['aTokens'], x['bTokens']), axis=1)
df.head()

# In[35]:

# Alternative configurations with a different secondary similarity function,
# kept for reference:
# me = sm.MongeElkan(sim_func=NeedlemanWunsch().get_raw_score)
# me = MongeElkan(sim_func=Affine().get_raw_score)
# Monge-Elkan measure in its default configuration.
me = sm.MongeElkan()
# Row-wise raw Monge-Elkan score between 'aTokens' and 'bTokens'.
df['MongeElkan'] = df.apply(
    lambda x: me.get_raw_score(x['aTokens'], x['bTokens']), axis=1)
df.head()

# In[36]:

# Module-level accumulator filled by generate_corpus() below.
corpus = []


def generate_corpus(tokens):
    """Append ``tokens`` as one entry to the module-level ``corpus`` list."""
    corpus.append(tokens)


# apply() is used purely for its side effect: every value of 'aTokens', then
# every value of 'bTokens', is appended to ``corpus``; the returned Series
# are discarded.
df['aTokens'].apply(generate_corpus)
df['bTokens'].apply(generate_corpus)
def compare_sum(user_sum, lyrics_sum):
    """Return the raw Monge-Elkan closeness score of two sentence lists.

    Both arguments are forwarded unchanged to
    ``sm.MongeElkan().get_raw_score`` (default configuration).
    """
    return sm.MongeElkan().get_raw_score(user_sum, lyrics_sum)
def make_all_string_related_features_inplace(df_before_preprocessing,
                                             df_after_preprocessing):
    """Add string-derived columns to both data frames, in place.

    Step 1 writes a ``<column>_normalized`` column (via ``normalize``) to
    ``df_before_preprocessing`` for seven raw text columns.
    Step 2 derives feature columns from those normalized columns and stores
    them on ``df_after_preprocessing``: stop-word membership flags, keyword
    flags (tax / car / hotel / airbnb / omega world travel), the features
    produced by ``make_features_which_checks_if_str_equals`` and a
    Monge-Elkan (Jaro-Winkler) vendor-name similarity.

    Args:
        df_before_preprocessing: frame holding the raw text columns; it
            gains the ``*_normalized`` columns.
        df_after_preprocessing: frame that receives the feature columns.
    """
    raw = df_before_preprocessing
    feats = df_after_preprocessing

    # --- Step 1: normalized copies of the raw text columns ---------------
    for column in (
            'vendor_name_expense',
            'purchase_vendor_receipts',
            'travel_vendor_receipts',
            'expense_type_name_expense',
            'expense_type_name_itemization',
            'expense_category_itemization',
            'expense_category_expense',
    ):
        raw[column + '_normalized'] = raw[column].apply(normalize)

    # --- Step 2a: element-wise flags from the normalized columns ---------
    def in_stopwords(stop_words):
        # Build the per-value membership test for one stop-word collection.
        return lambda value: check_membership_if_in_stopwords(value,
                                                              stop_words)

    # (feature column, normalized source column, per-value function),
    # listed in the order in which the feature columns must be created.
    # NOTE(review): the two 'hotel_str_in_expense_type_name_*' features read
    # the expense_category_* columns, not expense_type_name_* -- confirm
    # that this is intended.
    elementwise_features = [
        ('vendor_name_in_hotel_name_stopwords',
         'vendor_name_expense_normalized',
         in_stopwords(POST_NORMALIZATION_STOP_WORDS_FOR_VENDOR_NAME)),
        ('vendor_name_in_car_stopwords',
         'vendor_name_expense_normalized',
         in_stopwords(POST_NORMALIZATION_STOP_WORDS_FOR_CAR_VENDOR_NAME)),
        ('vendor_name_in_flights_name_stopwords',
         'vendor_name_expense_normalized',
         in_stopwords(STOP_WORDS_FOR_FLIGHTS_VENDOR_NAME_EXPENSE)),
        ('tax_str_in_expense_type_name_expense',
         'expense_type_name_expense_normalized',
         check_if_tax_in_str),
        ('tax_str_in_expense_type_name_itemization',
         'expense_type_name_itemization_normalized',
         check_if_tax_in_str),
        ('expense_type_name_in_car_stopwords_itemization',
         'expense_type_name_itemization_normalized',
         in_stopwords(
             POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME)),
        ('expense_type_name_in_car_stopwords_expense',
         'expense_type_name_expense_normalized',
         in_stopwords(
             POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME)),
        ('car_str_in_expense_type_name_itemization',
         'expense_type_name_itemization_normalized',
         check_if_car_in_str),
        ('car_str_in_expense_type_name_expense',
         'expense_type_name_expense_normalized',
         check_if_car_in_str),
        ('hotel_str_in_expense_type_name_itemization',
         'expense_category_itemization_normalized',
         check_if_hotel_lodging_in_str),
        ('hotel_str_in_expense_type_name_expense',
         'expense_category_expense_normalized',
         check_if_hotel_lodging_in_str),
        ('expense_type_name_in_flights_stopwords_itemization',
         'expense_type_name_itemization_normalized',
         in_stopwords(STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME)),
        ('expense_type_name_in_flights_stopwords_expense',
         'expense_type_name_expense_normalized',
         in_stopwords(STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME)),
        ('expense_type_name_in_rail_stopwords_itemization',
         'expense_type_name_itemization_normalized',
         in_stopwords(STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME)),
        ('expense_type_name_in_rail_stopwords_expense',
         'expense_type_name_expense_normalized',
         in_stopwords(STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME)),
    ]
    for feature_name, source_column, per_value in elementwise_features:
        feats[feature_name] = raw[source_column].apply(per_value)

    # --- Step 2b: string-equality features -------------------------------
    # from anlaysis19_flights
    budget_check_list = ['hotel', 'flight', 'car', 'rail']
    make_features_which_checks_if_str_equals('expense_type_name_itemization',
                                             budget_check_list, raw, feats)

    # from anlaysis19_flights
    expense_check_list = [
        'other', 'hotel', 'taxi', 'flight', 'car', 'fees_and_misc', 'rail',
        'bus', 'train'
    ]
    for column in (
            'expense_type_name_expense',
            'expense_type_name_itemization',
            'expense_category_expense',
            'expense_category_itemization',
    ):
        make_features_which_checks_if_str_equals(column, expense_check_list,
                                                 raw, feats)

    # --- Step 2c: vendor keyword flags -----------------------------------
    feats['airbnb_in_vendor_name_expense'] = raw[
        'vendor_name_expense_normalized'].apply(check_if_airbnb_in_str)
    feats['omega_world_travel_in_vendor_name_expense'] = raw[
        'vendor_name_expense_normalized'].apply(
            check_if_omega_world_travel_in_str)

    # --- Step 2d: vendor-name matching -----------------------------------
    me = sm.MongeElkan(sim_func=sm.JaroWinkler().get_raw_score)

    def best_vendor_similarity(row):
        # Higher of the two whitespace-tokenized Monge-Elkan scores:
        # expense vendor vs. purchase receipt vendor, and travel receipt
        # vendor vs. expense vendor (the argument order differs per pair).
        expense_vs_purchase = me.get_raw_score(
            row['vendor_name_expense_normalized'].split(),
            row['purchase_vendor_receipts_normalized'].split())
        travel_vs_expense = me.get_raw_score(
            row['travel_vendor_receipts_normalized'].split(),
            row['vendor_name_expense_normalized'].split())
        return max(expense_vs_purchase, travel_vs_expense)

    feats['mongeelkan_jaro_wink'] = raw.apply(best_vendor_similarity, axis=1)