def get_jw_category(df, possible_combinations):
    '''
    Computes relative frequencies of the Jaro-Winkler category tuples
    found in a dataframe of paired records.

    Every row is scored on restaurant, city and address with
    jellyfish.jaro_winkler, each score is turned into a category with
    util.get_jw_category, and the entry for the resulting
    (restaurant, city, address) tuple is increased by 1 / len(df).

    Inputs:
        df (Pandas Dataframe): either matches or unmatches dataframe with
            columns z_restaurant, z_city, z_address, f_restaurant,
            f_city and f_address
        possible_combinations (dict): dictionary of all possible tuple
            combinations. Value pair is expected to be 0 for each tuple
            key. The dictionary is copied, not modified.

    Outputs:
        new_d (dict): dictionary mapping tuple combinations to relative
            frequencies. For an empty df this is an unchanged copy of
            possible_combinations.

    Raises:
        KeyError: if a computed tuple is not a key of
            possible_combinations.
    '''
    new_d = possible_combinations.copy()

    n_rows = len(df)
    if n_rows == 0:
        # Nothing to count, and 1 / n_rows would divide by zero.
        return new_d

    weight = 1 / n_rows

    # Walk the columns in lockstep so that every row is visited exactly
    # once, whatever the dataframe's index labels are.
    rows = zip(df['z_restaurant'], df['f_restaurant'],
               df['z_city'], df['f_city'],
               df['z_address'], df['f_address'])
    for z_rest, f_rest, z_city, f_city, z_addr, f_addr in rows:
        r_score = jellyfish.jaro_winkler(z_rest, f_rest)
        c_score = jellyfish.jaro_winkler(z_city, f_city)
        a_score = jellyfish.jaro_winkler(z_addr, f_addr)
        tup = (util.get_jw_category(r_score),
               util.get_jw_category(c_score),
               util.get_jw_category(a_score))
        new_d[tup] += weight

    return new_d
def _add_vector_columns(scored_df):
    '''
    Adds vector_x, vector_y and vector_z columns to scored_df in place.

    The columns hold util.get_jw_category of jw_restaurant_names,
    jw_address and jw_city respectively. A dataframe with no rows is left
    untouched, so an empty placeholder without the score columns is
    accepted.
    '''
    if len(scored_df) == 0:
        return

    column_pairs = (("jw_restaurant_names", "vector_x"),
                    ("jw_address", "vector_y"),
                    ("jw_city", "vector_z"))
    for score_col, vector_col in column_pairs:
        # Whole-column assignment replaces the per-cell
        # DataFrame.set_value calls, which pandas no longer provides.
        scored_df[vector_col] = [util.get_jw_category(score)
                                 for score in scored_df[score_col]]


def get_vector_values(matches_df, unmatches_df, val=0):
    '''
    Adds category vector columns to dataframes of Jaro-Winkler scores.

    Input:
        matches_df and unmatches_df: dataframes with jw_restaurant_names,
            jw_address and jw_city score columns. Both are modified in
            place, whatever the value of val.
        val - int (if 0, return both dataframes; otherwise return only
            the second dataframe in params)

    Output:
        (matches_df, unmatches_df) when val == 0, else unmatches_df,
        with vector_x, vector_y and vector_z columns added
    '''
    _add_vector_columns(matches_df)
    _add_vector_columns(unmatches_df)

    # val lets the function be used on a single scored dataframe: pass it
    # second and ask for only that one back.
    if val == 0:
        return matches_df, unmatches_df
    return unmatches_df
def get_jw(string_1, string_2):
    '''
    Scores two strings with Jaro-Winkler and returns the category of
    that score.

    Inputs:
        string_1: (string) The first given string.
        string_2: (string) The second given string.

    Returns: The value util.get_jw_category gives for the
        jellyfish.jaro_winkler score of the two strings (the category,
        not the raw score).
    '''
    return util.get_jw_category(jellyfish.jaro_winkler(string_1, string_2))
def create_vectors(match_scores, unmatch_scores, match_total=50,
                   unmatch_total=1000):
    '''
    Function creates category vectors from match and unmatch dataframes
    and reports how often each vector occurs in each of them.

    Inputs:
        match_scores: dataframe of scores for known matches, with
            'name', 'city' and 'address' columns
        unmatch_scores: dataframe of scores for known unmatches, with
            the same columns
        match_total: number the match counts are divided by
            (defaults to 50)
        unmatch_total: number the unmatch counts are divided by
            (defaults to 1000)

    Outputs:
        probabilities: dataframe with columns 'vector', 'match_prob' and
            'unmatch_prob', one row for each (name, city, address)
            category tuple with categories 0, 1 and 2
    '''
    categories = range(3)
    all_vectors = [(i, j, k)
                   for i in categories
                   for j in categories
                   for k in categories]
    match_dict = dict.fromkeys(all_vectors, 0)
    unmatch_dict = dict.fromkeys(all_vectors, 0)

    # Each dataframe is tallied into its OWN dictionary; unmatch rows must
    # not be added to the match counts.
    tallies = ((match_scores, match_dict), (unmatch_scores, unmatch_dict))
    for scores, counts in tallies:
        for _, row in scores.iterrows():
            category_vector = (util.get_jw_category(row['name']),
                               util.get_jw_category(row['city']),
                               util.get_jw_category(row['address']))
            counts[category_vector] += 1

    match_probs = pd.DataFrame(list(match_dict.items()),
                               columns=['vect', 'prob'])
    unmatch_probs = pd.DataFrame(list(unmatch_dict.items()),
                                 columns=['vect', 'prob'])

    match_probs['prob'] = match_probs['prob'] / match_total
    unmatch_probs['prob'] = unmatch_probs['prob'] / unmatch_total

    probabilities = match_probs.merge(unmatch_probs, how='outer', on='vect')
    probabilities = probabilities.fillna(value=0)
    probabilities.columns = ['vector', 'match_prob', 'unmatch_prob']

    return probabilities
def partition_tuples(zagat, fodors, match_tuples,
                     unmatch_tuples, possible_tuples):
    '''
    Iterates through all possible combinations of entries from zagat and
    fodors dataframes and computes their category tuples. Sends each
    combination to its respective dataframe.

    Inputs:
        zagat(Pandas Dataframe): zagat dataframe with 'restaurant',
            'city' and 'address' columns
        fodors(Pandas Dataframe): fodors dataframe with the same columns
        match_tuples(list): list of tuples to be classified as matches
        unmatch_tuples(list): list of tuples to be classified as
            unmatches
        possible_tuples(list): list of tuples to be classified as
            possible matches

    Outputs:
        matches_df: dataframe of matches
        possible_df: dataframe of possible matches
        unmatches_df: dataframe of non matches

        Each has columns z_restaurant, z_city, z_address, f_restaurant,
        f_city and f_address. A pair whose tuple is in none of the three
        lists is dropped.
    '''
    column_index = ['z_restaurant', 'z_city', 'z_address',
                    'f_restaurant', 'f_city', 'f_address']
    fields = ['restaurant', 'city', 'address']

    # Pull the needed fields out once, as plain tuples, so that every row
    # of both dataframes (including the last) is paired and the inner
    # loop does no dataframe lookups.
    zagat_records = (list(zip(*(zagat[field] for field in fields)))
                     if len(zagat) else [])
    fodors_records = (list(zip(*(fodors[field] for field in fields)))
                      if len(fodors) else [])

    matches_rows = []
    unmatches_rows = []
    possible_rows = []

    for z_restaurant, z_city, z_address in zagat_records:
        for f_restaurant, f_city, f_address in fodors_records:
            r_score = jellyfish.jaro_winkler(z_restaurant, f_restaurant)
            c_score = jellyfish.jaro_winkler(z_city, f_city)
            a_score = jellyfish.jaro_winkler(z_address, f_address)
            tup = (util.get_jw_category(r_score),
                   util.get_jw_category(c_score),
                   util.get_jw_category(a_score))

            pair = [z_restaurant, z_city, z_address,
                    f_restaurant, f_city, f_address]
            if tup in match_tuples:
                matches_rows.append(pair)
            elif tup in unmatch_tuples:
                unmatches_rows.append(pair)
            elif tup in possible_tuples:
                possible_rows.append(pair)

    matches_df = pd.DataFrame(data=matches_rows, columns=column_index)
    unmatches_df = pd.DataFrame(data=unmatches_rows, columns=column_index)
    possible_df = pd.DataFrame(data=possible_rows, columns=column_index)

    return matches_df, possible_df, unmatches_df
def determining_matches(probabilities, zagat, fodor,
                        mu, lambda_, outfile, block_on=None):
    '''
    Function determines matches between zagat and fodor data frames
    depending on the partitions for vectors created by the
    partitioning_vectors function, and saves the matches to a csv file.

    Inputs:
        probabilities, zagat, fodor (dataframes); zagat and fodor need
            'restaurant', 'city' and 'street' columns
        mu, lambda_ (acceptable lower and upper probabilities, passed on
            to partitioning_vectors)
        outfile (filename to save to)
        block_on = None (optional blocking parameter; accepted but not
            used by this function)

    Outputs:
        (match_count, poss_count, unmatch_count): number of record pairs
        classified as match, possible match and unmatch. Pairs whose
        vector is in none of the three partitions are not counted.

    The csv written to outfile has the columns zagat_name,
    zagat_address, fodor_name and fodor_addr, one row per matched pair.
    '''
    matches_v, poss_match_v, unmatch_v = partitioning_vectors(
        probabilities, mu, lambda_)

    match_count = 0
    poss_count = 0
    unmatch_count = 0
    matches = {'zagat_name': [], 'zagat_address': [],
               'fodor_name': [], 'fodor_addr': []}

    # The fodor fields are needed once per zagat row, so read them out of
    # the dataframe a single time instead of on every outer iteration.
    fodor_records = [(row['restaurant'], row['city'], row['street'])
                     for _, row in fodor.iterrows()]

    for _, zagat_row in zagat.iterrows():
        # Read fields from the row itself; the index label returned by
        # iterrows is not necessarily a valid position for iloc.
        zagat_name = zagat_row['restaurant']
        zagat_city = zagat_row['city']
        zagat_address = zagat_row['street']

        for fodor_name, fodor_city, fodor_address in fodor_records:
            name_score = jellyfish.jaro_winkler(zagat_name, fodor_name)
            city_score = jellyfish.jaro_winkler(zagat_city, fodor_city)
            addr_score = jellyfish.jaro_winkler(zagat_address,
                                                fodor_address)

            # NOTE(review): the vector is ordered (name, address, city);
            # confirm this is the ordering used when `probabilities`
            # was built.
            vector = (util.get_jw_category(name_score),
                      util.get_jw_category(addr_score),
                      util.get_jw_category(city_score))

            if vector in matches_v:
                match_count += 1
                matches['zagat_name'].append(zagat_name)
                matches['zagat_address'].append(zagat_address)
                matches['fodor_name'].append(fodor_name)
                matches['fodor_addr'].append(fodor_address)
            elif vector in poss_match_v:
                poss_count += 1
            elif vector in unmatch_v:
                unmatch_count += 1

    matches_csv = pd.DataFrame(data=matches)
    matches_csv.to_csv(outfile)

    return match_count, poss_count, unmatch_count