コード例 #1
0
def get_jw_category(df, possible_combinations):
    '''
    Computes the relative frequency of each Jaro-Winkler category tuple
    (restaurant, city, address) over the rows of df.

    inputs:
        df (Pandas Dataframe): either matches or unmatches dataframe. Must
          contain the columns z_restaurant, f_restaurant, z_city, f_city,
          z_address and f_address.
        possible_combinations (dict): dictionary of all possible tuple
          combinations. Value pair is 0 for each tuple key. The dictionary
          is copied, not modified.

    outputs:
        new_d (dict): dictionary mapping tuple combinations to relative
          frequencies
    '''
    new_d = possible_combinations.copy()

    total = len(df)
    if total == 0:
        # Nothing to count; also avoids dividing by zero below.
        return new_d

    # Every row contributes the same share of the total frequency.
    share = 1 / total

    # Iterate positionally over every row (including the last one) so the
    # result does not depend on the dataframe having a 0..n-1 index.
    paired_columns = zip(df['z_restaurant'], df['f_restaurant'],
                         df['z_city'], df['f_city'],
                         df['z_address'], df['f_address'])

    for z_rest, f_rest, z_city, f_city, z_addr, f_addr in paired_columns:
        r_score = jellyfish.jaro_winkler(z_rest, f_rest)
        c_score = jellyfish.jaro_winkler(z_city, f_city)
        a_score = jellyfish.jaro_winkler(z_addr, f_addr)

        tup = (util.get_jw_category(r_score),
               util.get_jw_category(c_score),
               util.get_jw_category(a_score))
        new_d[tup] += share

    return new_d
コード例 #2
0
ファイル: record_linkage.py プロジェクト: lbvalcke/Classwork
def _assign_vector_columns(scored_df):
    '''
    Writes the vector_x / vector_y / vector_z category columns into
    scored_df, one row at a time, and returns the same dataframe.
    '''
    for index, row in scored_df.iterrows():
        x = util.get_jw_category(row["jw_restaurant_names"])
        y = util.get_jw_category(row["jw_address"])
        z = util.get_jw_category(row["jw_city"])
        # DataFrame.set_value was removed in pandas 1.0; .at is the
        # supported scalar setter.
        scored_df.at[index, "vector_x"] = x
        scored_df.at[index, "vector_y"] = y
        scored_df.at[index, "vector_z"] = z
    return scored_df


def get_vector_values(matches_df, unmatches_df, val=0):
    '''
    Adds Jaro-Winkler category columns to both dataframes in place.

    Input:
        matches_df and unmatches_df with Jaro-Winkler scores calculated
          (columns jw_restaurant_names, jw_address and jw_city)
        val - int (if 0, return vector values for both dfs;
                   otherwise, return vector values for second df in params)
    Output:
        parameter dataframes with vector_x, vector_y and vector_z columns:
        (matches_df, unmatches_df) when val is 0, else unmatches_df only.
        Both dataframes are updated regardless of val.
    '''
    _assign_vector_columns(matches_df)
    _assign_vector_columns(unmatches_df)

    # to make the function applicable for use on any scored dataframe
    # specify return conditions
    if val == 0:
        return matches_df, unmatches_df
    return unmatches_df
コード例 #3
0
ファイル: record_linkage.py プロジェクト: pinderk/SampleWork
def get_jw(string_1, string_2):
    '''
    Maps the Jaro-Winkler score of two strings to its category.

    Inputs:
        string_1: (string) The first string to compare.
        string_2: (string) The second string to compare.

    Returns: The value util.get_jw_category gives for the Jaro-Winkler
        score of the two strings (the category, not the raw score).
    '''
    return util.get_jw_category(jellyfish.jaro_winkler(string_1, string_2))
コード例 #4
0
ファイル: record_linkage.py プロジェクト: rteehas/CS-121-122
def _count_category_vectors(scores, counts):
    '''
    Adds one to counts[(name, city, address)] for each row of scores, where
    each component is the Jaro-Winkler category of that row's score.
    '''
    for _, row in scores.iterrows():
        category_vector = (util.get_jw_category(row['name']),
                           util.get_jw_category(row['city']),
                           util.get_jw_category(row['address']))
        counts[category_vector] += 1


def create_vectors(match_scores, unmatch_scores):
    '''
    Function creates vectors from match and unmatch dataframes.

    Inputs:
        match_scores: dataframe with 'name', 'city' and 'address' score
            columns for the known matches
        unmatch_scores: dataframe with the same columns for the known
            unmatches

    Outputs:
        probabilities: dataframe with one row per (name, city, address)
            category vector and the columns 'vector', 'match_prob' and
            'unmatch_prob'

    '''
    # Start every one of the 3 x 3 x 3 category vectors at a zero count.
    all_vectors = [(i, j, k)
                   for i in range(3)
                   for j in range(3)
                   for k in range(3)]
    match_dict = {vector: 0 for vector in all_vectors}
    unmatch_dict = {vector: 0 for vector in all_vectors}

    _count_category_vectors(match_scores, match_dict)
    # Unmatch rows must be tallied in unmatch_dict, not match_dict.
    _count_category_vectors(unmatch_scores, unmatch_dict)

    match_probs = pd.DataFrame(list(match_dict.items()),
                               columns=['vect', 'prob'])
    unmatch_probs = pd.DataFrame(list(unmatch_dict.items()),
                                 columns=['vect', 'prob'])

    # NOTE(review): 50 and 1000 are presumably the sizes of the match and
    # unmatch samples -- confirm against the caller before changing them.
    match_probs['prob'] = match_probs['prob'].div(50, axis='index')
    unmatch_probs['prob'] = unmatch_probs['prob'].div(1000, axis='index')

    probabilities = match_probs.merge(unmatch_probs, how='outer', on='vect')
    probabilities = probabilities.fillna(value=0)
    probabilities.columns = ['vector', 'match_prob', 'unmatch_prob']

    return probabilities
コード例 #5
0
def partition_tuples(zagat, fodors, match_tuples,
                     unmatch_tuples, possible_tuples):
    '''
    Iterates through all possible combinations of entries from zagat and
    fodors dataframes and computes tuples. Sends each possible combination
    to its respective dataframe

    Inputs:
        zagat(Pandas Dataframe): zagat dataframe with 'restaurant', 'city'
         and 'address' columns
        fodors(Pandas Dataframe): fodors dataframe with the same columns
        match_tuples(list): list of tuples to be classified as matches
        unmatch_tuples(list): list of tuples to be classified as unmatches
        possible_tuples(list): list of tuples to be classified as possible
         matches

    Outputs:
        matches_df: dataframe of matches
        possible_df: dataframe of possible matches
        unmatches_df: dataframe of non matches

    '''
    column_index = ['z_restaurant', 'z_city', 'z_address',
                    'f_restaurant', 'f_city', 'f_address']
    matches_rows = []
    unmatches_rows = []
    possible_rows = []

    # Pull the three fields out once, positionally, so every row (including
    # the last one of each dataframe) is compared and the result does not
    # depend on the dataframes having a 0..n-1 index.
    zagat_records = list(zip(zagat['restaurant'], zagat['city'],
                             zagat['address']))
    fodors_records = list(zip(fodors['restaurant'], fodors['city'],
                              fodors['address']))

    for z_restaurant, z_city, z_address in zagat_records:
        for f_restaurant, f_city, f_address in fodors_records:
            r_score = jellyfish.jaro_winkler(z_restaurant, f_restaurant)
            c_score = jellyfish.jaro_winkler(z_city, f_city)
            a_score = jellyfish.jaro_winkler(z_address, f_address)

            tup = (util.get_jw_category(r_score),
                   util.get_jw_category(c_score),
                   util.get_jw_category(a_score))

            pair = [z_restaurant, z_city, z_address,
                    f_restaurant, f_city, f_address]

            # A tuple found in none of the three collections is dropped.
            if tup in match_tuples:
                matches_rows.append(pair)
            elif tup in unmatch_tuples:
                unmatches_rows.append(pair)
            elif tup in possible_tuples:
                possible_rows.append(pair)

    matches_df = pd.DataFrame(data=matches_rows, columns=column_index)
    unmatches_df = pd.DataFrame(data=unmatches_rows, columns=column_index)
    possible_df = pd.DataFrame(data=possible_rows, columns=column_index)

    return matches_df, possible_df, unmatches_df
コード例 #6
0
ファイル: record_linkage.py プロジェクト: rteehas/CS-121-122
def determining_matches(probabilities, zagat, fodor,
                        mu, lambda_, outfile, block_on=None):
    '''
    Function determines matches between zagat and fodor data
    frames depending on the partitions for vectors created
    by the partitioning_vectors function.

    Inputs:
    probabilities, zagat, fodor (dataframes); zagat and fodor must have
    'restaurant', 'city' and 'street' columns

    mu, lambda_ (acceptable lower and upper probabilities)

    outfile (filename to save to)

    block_on = None (optional blocking parameter; currently unused)

    Returns:
    (match_count, poss_count, unmatch_count): number of zagat/fodor pairs
    whose vector fell in the match, possible-match and unmatch partitions.
    The matched pairs themselves are written to outfile as CSV.

    '''
    match_count = 0
    poss_count = 0
    unmatch_count = 0

    matches = {'zagat_name': [], 'zagat_addr': [],
               'fodor_name': [], 'fodor_addr': []}

    matches_v, poss_match_v, unmatch_v = partitioning_vectors(
        probabilities, mu, lambda_)

    # Extract the compared fields once and positionally; index labels are
    # not guaranteed to be valid positions for iloc.
    zagat_records = list(zip(zagat['restaurant'], zagat['city'],
                             zagat['street']))
    fodor_records = list(zip(fodor['restaurant'], fodor['city'],
                             fodor['street']))

    for zagat_name, zagat_city, zagat_address in zagat_records:
        for fodor_name, fodor_city, fodor_address in fodor_records:
            name_score = jellyfish.jaro_winkler(zagat_name, fodor_name)
            city_score = jellyfish.jaro_winkler(zagat_city, fodor_city)
            addr_score = jellyfish.jaro_winkler(zagat_address,
                                                fodor_address)

            name_category = util.get_jw_category(name_score)
            city_category = util.get_jw_category(city_score)
            address_category = util.get_jw_category(addr_score)

            # Component order is (name, city, address).
            # NOTE(review): presumably this must match the order used when
            # the probabilities table was built -- confirm with the caller.
            vector = (name_category, city_category, address_category)

            if vector in matches_v:
                match_count += 1
                matches['zagat_name'].append(zagat_name)
                matches['zagat_addr'].append(zagat_address)
                matches['fodor_name'].append(fodor_name)
                matches['fodor_addr'].append(fodor_address)
            elif vector in poss_match_v:
                poss_count += 1
            elif vector in unmatch_v:
                unmatch_count += 1

    matches_csv = pd.DataFrame(data=matches)
    matches_csv.to_csv(outfile)

    return match_count, poss_count, unmatch_count