def monge_elkan(arr1, arr2):
    """
    This function computes the Monge-Elkan measure between the two input
    lists/sets. Specifically, this function uses Jaro-Winkler measure as the
    secondary function to compute the similarity score.

    Args:
        arr1,arr2 (list or set): The input list or sets for which the
            Monge-Elkan measure should be computed.

    Returns:
        The Monge-Elkan measure if both the lists/set are not None and do not
        have any missing tokens (i.e NaN), else returns NaN.
    """
    # NaN (not an exception) signals "not comparable" to downstream code.
    # float('nan') replaces pd.np.NaN: the pd.np alias was deprecated in
    # pandas 0.25 and removed in pandas 2.0, so the old spelling now raises
    # AttributeError.
    if arr1 is None or arr2 is None:
        return float('nan')
    # Wrap a scalar token so pd.isnull and the measure always see a list.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return float('nan')
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return float('nan')
    # Create Monge-Elkan measure object
    measure = sm.MongeElkan()
    # Call the function to compute the Monge-Elkan measure
    return measure.get_raw_score(arr1, arr2)
def normMongeElkanSimStrings(str1, str2):
    """Return the Monge-Elkan raw score between two strings.

    Each string is treated as a single-token sequence. When either string
    is empty, a fixed fallback of 0.58 is returned instead of a computed
    score.
    """
    # Guard clause: an empty side cannot be scored meaningfully.
    # Average for target 0 around:
    if len(str1) == 0 or len(str2) == 0:
        return 0.58
    scorer = sm.MongeElkan()
    return scorer.get_raw_score([str1], [str2])
def monge_elkan_score(self, str_pair, sim_func=sm.JaroWinkler().get_raw_score):
    """Calculate the Monge-Elkan similarity between two single sets of tokens.

    :param str_pair: pair of inputs, validated/coerced by self._check_input
    :param sim_func: secondary similarity function (defaults to Jaro-Winkler)
    :return: raw_score
    """
    tokens_a, tokens_b = self._check_input(str_pair, type_=list)
    scorer = sm.MongeElkan(sim_func=sim_func)
    return scorer.get_raw_score(tokens_a, tokens_b)
def monge_elkan(arr1, arr2):
    """Compute the Monge-Elkan measure between the two input lists/sets.

    Args:
        arr1,arr2 (list or set): The inputs for which the Monge-Elkan
            measure should be computed. A bare scalar is wrapped in a list.

    Returns:
        The Monge-Elkan measure if both inputs are not None and contain no
        missing tokens (i.e. NaN); otherwise NaN.
    """
    # NaN (not an exception) signals "not comparable" to downstream code.
    # float('nan') replaces pd.np.NaN: the pd.np alias was deprecated in
    # pandas 0.25 and removed in pandas 2.0, so the old spelling now raises
    # AttributeError.
    if arr1 is None or arr2 is None:
        return float('nan')
    # Wrap a scalar token so pd.isnull and the measure always see a list.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return float('nan')
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return float('nan')
    # Create Monge-Elkan measure object
    measure = sm.MongeElkan()
    # Call the function to compute the Monge-Elkan measure
    return measure.get_raw_score(arr1, arr2)
df.head() # In[34]: # Set alpha beta https://en.wikipedia.org/wiki/Tversky_index # Setting alpha beta as 0.5 is same as Dice Similarity tvi = sm.TverskyIndex(0.3, 0.6) df['Tversky'] = df.apply( lambda x: tvi.get_sim_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[35]: # me = sm.MongeElkan(sim_func=NeedlemanWunsch().get_raw_score) # me = MongeElkan(sim_func=Affine().get_raw_score) me = sm.MongeElkan() df['MongeElkan'] = df.apply( lambda x: me.get_raw_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[36]: corpus = [] def generate_corpus(tokens): corpus.append(tokens) df['aTokens'].apply(generate_corpus) df['bTokens'].apply(generate_corpus)
def compare_sum(user_sum, lyrics_sum):
    """Take two sentence list and compare, return closeness score."""
    # Monge-Elkan (default secondary: Jaro-Winkler) over the two token lists.
    scorer = sm.MongeElkan()
    return scorer.get_raw_score(user_sum, lyrics_sum)
def make_all_string_related_features_inplace(df_before_preprocessing,
                                             df_after_preprocessing):
    """Populate string-derived feature columns on both dataframes, in place.

    Normalizes the raw vendor/expense-type/category text columns on
    ``df_before_preprocessing`` (adding ``*_normalized`` columns), then
    derives boolean-style membership/keyword features and a Monge-Elkan
    vendor-name similarity feature on ``df_after_preprocessing``.

    Args:
        df_before_preprocessing: frame holding the raw text columns; gains
            the ``*_normalized`` columns as a side effect.
        df_after_preprocessing: frame that receives the derived features.

    Returns:
        None. Both frames are mutated in place.
    """
    # --- Step 1: normalize the raw strings once; everything below reads
    # these *_normalized columns.
    df_before_preprocessing[
        'vendor_name_expense_normalized'] = df_before_preprocessing[
            'vendor_name_expense'].apply(normalize)
    df_before_preprocessing[
        'purchase_vendor_receipts_normalized'] = df_before_preprocessing[
            'purchase_vendor_receipts'].apply(normalize)
    df_before_preprocessing[
        'travel_vendor_receipts_normalized'] = df_before_preprocessing[
            'travel_vendor_receipts'].apply(normalize)
    df_before_preprocessing[
        'expense_type_name_expense_normalized'] = df_before_preprocessing[
            'expense_type_name_expense'].apply(normalize)
    df_before_preprocessing[
        'expense_type_name_itemization_normalized'] = df_before_preprocessing[
            'expense_type_name_itemization'].apply(normalize)
    df_before_preprocessing[
        'expense_category_itemization_normalized'] = df_before_preprocessing[
            'expense_category_itemization'].apply(normalize)
    df_before_preprocessing[
        'expense_category_expense_normalized'] = df_before_preprocessing[
            'expense_category_expense'].apply(normalize)

    # --- Step 2: stop-word membership features on the vendor name.
    df_after_preprocessing['vendor_name_in_hotel_name_stopwords'] = \
        df_before_preprocessing['vendor_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, POST_NORMALIZATION_STOP_WORDS_FOR_VENDOR_NAME))
    df_after_preprocessing['vendor_name_in_car_stopwords'] = \
        df_before_preprocessing['vendor_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, POST_NORMALIZATION_STOP_WORDS_FOR_CAR_VENDOR_NAME))
    df_after_preprocessing['vendor_name_in_flights_name_stopwords'] = \
        df_before_preprocessing['vendor_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, STOP_WORDS_FOR_FLIGHTS_VENDOR_NAME_EXPENSE))

    # --- Step 3: keyword-presence features on expense type / category.
    df_after_preprocessing['tax_str_in_expense_type_name_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(lambda x: check_if_tax_in_str(x))
    df_after_preprocessing['tax_str_in_expense_type_name_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(lambda x: check_if_tax_in_str(x))
    df_after_preprocessing['expense_type_name_in_car_stopwords_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_car_stopwords_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, POST_NORMALIZATION_STOP_WORDS_FOR_CAR_EXPENSE_TYPE_NAME))
    df_after_preprocessing['car_str_in_expense_type_name_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(check_if_car_in_str)
    df_after_preprocessing['car_str_in_expense_type_name_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(check_if_car_in_str)
    # NOTE(review): these two "hotel_str_in_expense_type_name_*" features
    # read the expense *category* columns, not the expense type name —
    # looks intentional but confirm against the feature spec.
    df_after_preprocessing['hotel_str_in_expense_type_name_itemization'] = \
        df_before_preprocessing['expense_category_itemization_normalized'].apply(check_if_hotel_lodging_in_str)
    df_after_preprocessing['hotel_str_in_expense_type_name_expense'] = \
        df_before_preprocessing['expense_category_expense_normalized'].apply(check_if_hotel_lodging_in_str)
    df_after_preprocessing['expense_type_name_in_flights_stopwords_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_flights_stopwords_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, STOP_WORDS_FOR_FLIGHTS_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_rail_stopwords_itemization'] = \
        df_before_preprocessing['expense_type_name_itemization_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME))
    df_after_preprocessing['expense_type_name_in_rail_stopwords_expense'] = \
        df_before_preprocessing['expense_type_name_expense_normalized'].apply(
            lambda x: check_membership_if_in_stopwords(
                x, STOP_WORDS_FOR_RAIL_EXPENSE_TYPE_NAME))

    # --- Step 4: exact-equality features (from analysis19_flights).
    budget_check_list = ['hotel', 'flight', 'car', 'rail']
    make_features_which_checks_if_str_equals('expense_type_name_itemization',
                                             budget_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    # from analysis19_flights
    expense_check_list = [
        'other', 'hotel', 'taxi', 'flight', 'car', 'fees_and_misc', 'rail',
        'bus', 'train'
    ]
    make_features_which_checks_if_str_equals('expense_type_name_expense',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    make_features_which_checks_if_str_equals('expense_type_name_itemization',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    make_features_which_checks_if_str_equals('expense_category_expense',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)
    make_features_which_checks_if_str_equals('expense_category_itemization',
                                             expense_check_list,
                                             df_before_preprocessing,
                                             df_after_preprocessing)

    # --- Step 5: specific-vendor flags.
    df_after_preprocessing[
        'airbnb_in_vendor_name_expense'] = df_before_preprocessing[
            'vendor_name_expense_normalized'].apply(check_if_airbnb_in_str)
    df_after_preprocessing[
        'omega_world_travel_in_vendor_name_expense'] = df_before_preprocessing[
            'vendor_name_expense_normalized'].apply(
                check_if_omega_world_travel_in_str)

    # --- Step 6: matching vendorname.
    # Monge-Elkan with Jaro-Winkler as the secondary function; take the max
    # of (expense vs purchase receipts) and (travel receipts vs expense).
    me = sm.MongeElkan(sim_func=sm.JaroWinkler().get_raw_score)
    df_after_preprocessing['mongeelkan_jaro_wink'] = df_before_preprocessing.apply(lambda row: max(
        me.get_raw_score(row[
            'vendor_name_expense_normalized'].split(), row[
            'purchase_vendor_receipts_normalized'].split()), \
        me.get_raw_score(row[
            'travel_vendor_receipts_normalized'].split(), row[
            'vendor_name_expense_normalized'].split())
    ), axis=1)