def test_iteration_known_data_generating_process(spark, gamma_settings_4,
                                                 params_4, sqlite_con_4):

    dfpd = pd.read_sql("select * from df", sqlite_con_4)
    df_gammas = spark.createDataFrame(dfpd)

    gamma_settings_4["retain_matching_columns"] = False
    gamma_settings_4["em_convergence"] = 0.001
    gamma_settings_4["max_iterations"] = 40

    df_e = iterate(
        df_gammas,
        params_4,
        gamma_settings_4,
        spark,
        compute_ll=False,
    )

    assert params_4.iteration < 20

    assert params_4.params["π"]["gamma_col_2_levels"]["prob_dist_match"][
        "level_0"]["probability"] == pytest.approx(0.05, abs=0.01)
    assert params_4.params["π"]["gamma_col_5_levels"]["prob_dist_match"][
        "level_0"]["probability"] == pytest.approx(0.1, abs=0.01)
    assert params_4.params["π"]["gamma_col_20_levels"]["prob_dist_match"][
        "level_0"]["probability"] == pytest.approx(0.05, abs=0.01)

    assert params_4.params["π"]["gamma_col_2_levels"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.05, abs=0.01)
    assert params_4.params["π"]["gamma_col_5_levels"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.2, abs=0.01)
    assert params_4.params["π"]["gamma_col_20_levels"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.5, abs=0.01)
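# Context for the assertions above: the gammas in sqlite_con_4 are assumed to
# be drawn from a known data-generating process, so the EM estimates can be
# checked against the true parameters. A self-contained sketch of simulating
# such gammas (illustrative only - the real fixtures live elsewhere in the
# test suite, and the column name below is an assumption):
import random

import pandas as pd


def simulate_gammas(n_rows, p_match, m_dist, u_dist):
    # For each comparison: decide match/non-match with probability p_match,
    # then draw a gamma level from the matching (m) or non-matching (u)
    # distribution over levels.
    rows = []
    levels = range(len(m_dist))
    for _ in range(n_rows):
        dist = m_dist if random.random() < p_match else u_dist
        rows.append({"gamma_col_2_levels": random.choices(levels, dist)[0]})
    return pd.DataFrame(rows)


# e.g. matches rarely disagree (level 0 prob 0.05), non-matches rarely agree
sim = simulate_gammas(10000, p_match=0.3, m_dist=[0.05, 0.95], u_dist=[0.95, 0.05])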
def get_scored_comparisons(self):
    """Use the EM algorithm to estimate model parameters and return match probabilities.

    Returns:
        DataFrame: A spark dataframe including a match probability column
    """

    df_comparison = self._get_df_comparison()

    df_gammas = add_gammas(df_comparison, self.settings, self.spark)

    df_gammas = self.break_lineage_blocked_comparisons(df_gammas, self.spark)

    df_e = iterate(
        df_gammas,
        self.params,
        self.settings,
        self.spark,
        compute_ll=False,
        save_state_fn=self.save_state_fn,
    )

    # In case the user's break lineage function has persisted it
    df_gammas.unpersist()

    df_e = self.break_lineage_scored_comparisons(df_e, self.spark)

    df_e_adj = self._make_term_frequency_adjustments(df_e)

    df_e.unpersist()

    return df_e_adj
def get_scored_comparisons(self):
    """Use the EM algorithm to estimate model parameters and return match probabilities.

    Note: Does not compute term frequency adjustments.

    Returns:
        DataFrame: A spark dataframe including a match probability column
    """

    df_comparison = self._get_df_comparison()

    df_gammas = add_gammas(df_comparison, self.settings, self.spark)

    df_gammas.persist()

    df_e = iterate(
        df_gammas,
        self.params,
        self.settings,
        self.spark,
        compute_ll=False,
        save_state_fn=self.save_state_fn,
    )

    df_gammas.unpersist()

    return df_e
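# Usage sketch for the method above. The exact Splink constructor signature
# is an assumption (it is not shown in these snippets); settings, df and
# spark are the usual settings dict, input dataframe and SparkSession.
from splink import Splink

linker = Splink(settings, df, spark)  # hypothetical wiring - check the class definition

# Runs blocking, gamma computation and EM iteration end-to-end, returning one
# row per pairwise comparison with a match_probability column.
df_e = linker.get_scored_comparisons()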
def estimate(df_gammas: DataFrame, settings: dict, spark: SparkSession):
    """Take a pandas dataframe of gammas and estimate a splink model

    Args:
        df_gammas (DataFrame): Pandas dataframe of df_gammas
        settings (dict): Splink settings dictionary
        spark (SparkSession): SparkSession object
    """

    settings["retain_matching_columns"] = False

    df = spark.createDataFrame(df_gammas)

    model = Model(settings, spark)
    df_e = iterate(df, model, spark)
    return df_e, model
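# A minimal usage sketch for estimate(). The gamma column names and the tiny
# dataframe below are illustrative, and the settings dict may need further
# keys depending on what Model expects; spark is an existing SparkSession.
import pandas as pd

df_gammas_example = pd.DataFrame({
    "unique_id_l": [1, 2],
    "unique_id_r": [3, 4],
    "gamma_name": [1, 0],  # 1 = agree, 0 = disagree (illustrative)
    "gamma_dob": [1, 1],
})

settings_example = {
    "link_type": "dedupe_only",
    "comparison_columns": [{"col_name": "name"}, {"col_name": "dob"}],
}

df_e, model = estimate(df_gammas_example, settings_example, spark)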
def test_term_frequency_adjustments(spark):

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "name",
                "term_frequency_adjustments": True,
                "m_probabilities": [
                    0.1,  # Amongst matches, 10% have typos
                    0.9,  # The remaining 90% agree on name
                ],
                "u_probabilities": [
                    4 / 5,  # Amongst non-matches, 80% of the time names differ
                    1 / 5,  # But 20% of the time names 'collide'
                    # WE WANT THESE U PROBABILITIES TO BE DEPENDENT ON NAME.
                ],
            },
            {
                "col_name": "cat_12",
                "m_probabilities": [0.05, 0.95],
                "u_probabilities": [11 / 12, 1 / 12],
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
        "em_convergence": 0.001,
    }

    from string import ascii_letters
    import statistics
    import random
    from splink.settings import complete_settings_dict

    settings = complete_settings_dict(settings, spark="supress_warnings")

    def is_match(settings):
        p = settings["proportion_of_matches"]
        return random.choices([0, 1], [1 - p, p])[0]

    def get_row_portion(match, comparison_col, skew="auto"):
        # The problem is that at the moment we're guaranteeing that a match on
        # john is just as likely as a match on james.  What we want is to
        # generate more 'collisions' for john than robin, i.e. if it's a
        # non-match, we want more gamma = 1 on name for john.

        if match:
            gamma_pdist = comparison_col["m_probabilities"]
        else:
            gamma_pdist = comparison_col["u_probabilities"]

        # To decide whether gamma = 0 or 1 in the case of skew, we first need
        # to decide what value the left hand column will take (i.e. what
        # probability it has of selection).

        # How many distinct values should we choose?
        num_values = int(round(1 / comparison_col["u_probabilities"][1]))

        if skew == "auto":
            skew = comparison_col["term_frequency_adjustments"]

        if skew:
            # First value most frequent, last value least frequent
            prob_dist = range(1, num_values + 1)[::-1]
            # Normalise
            prob_dist = [p / sum(prob_dist) for p in prob_dist]

            index_of_value = random.choices(range(num_values), prob_dist)[0]
            if not match:  # If it's a u probability
                this_prob = prob_dist[index_of_value]
                gamma_pdist = [1 - this_prob, this_prob]

        else:
            prob_dist = [1 / num_values] * num_values
            index_of_value = random.choices(range(num_values), prob_dist)[0]

        levels = comparison_col["num_levels"]
        gamma = random.choices(range(levels), gamma_pdist)[0]

        values = ascii_letters[:26]
        if num_values > 26:
            # aa, ab etc
            values = [a + b for a in ascii_letters[:26] for b in ascii_letters[:26]]
        values = values[:num_values]

        if gamma == 1:
            value_1 = values[index_of_value]
            value_2 = value_1

        if gamma == 0:
            value_1 = values[index_of_value]
            same_value = True
            while same_value:
                value_2 = random.choices(values, prob_dist)[0]
                if value_1 != value_2:
                    same_value = False

        cname = comparison_col["col_name"]
        return {
            f"{cname}_l": value_1,
            f"{cname}_r": value_2,
            f"gamma_{cname}": gamma,
        }

    import uuid

    rows = []
    for uid in range(100000):
        m = is_match(settings)
        row = {
            "unique_id_l": str(uuid.uuid4()),
            "unique_id_r": str(uuid.uuid4()),
            "match": m,
        }
        for cc in settings["comparison_columns"]:
            row_portion = get_row_portion(m, cc)
            row = {**row, **row_portion}
        rows.append(row)

    all_rows = pd.DataFrame(rows)
    df_gammas = spark.createDataFrame(all_rows)

    settings["comparison_columns"][1]["term_frequency_adjustments"] = True

    from splink import Splink
    from splink.params import Params
    from splink.iterate import iterate
    from splink.term_frequencies import make_adjustment_for_term_frequencies

    # We have a table of gammas - need to work from there within splink
    params = Params(settings, spark)

    df_e = iterate(df_gammas, params, settings, spark, compute_ll=False)

    df_e_adj = make_adjustment_for_term_frequencies(
        df_e, params, settings, retain_adjustment_columns=True, spark=spark)

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select name_l, name_tf_adj, count(*)
    from df_e_adj
    where name_l = name_r
    group by name_l, name_tf_adj
    order by name_l
    """
    df = spark.sql(sql).toPandas()
    df = df.set_index("name_l")
    df_dict = df.to_dict(orient="index")

    assert df_dict["a"]["name_tf_adj"] < 0.5

    assert df_dict["e"]["name_tf_adj"] > 0.5
    # Arbitrary numbers, but we do expect a big uplift here
    assert df_dict["e"]["name_tf_adj"] > 0.6
    assert df_dict["e"]["name_tf_adj"] < 0.95

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select cat_12_l, cat_12_tf_adj, count(*) as count
    from df_e_adj
    where cat_12_l = cat_12_r
    group by cat_12_l, cat_12_tf_adj
    order by cat_12_l
    """
    df = spark.sql(sql).toPandas()

    # Keep these loose because when generating random data anything can happen!
    assert df["cat_12_tf_adj"].max() < 0.55
    assert df["cat_12_tf_adj"].min() > 0.45

    # Test adjustments are applied correctly when there is a single adjustment
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l != cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    def bayes(p1, p2):
        return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]))

    # Test adjustments are applied correctly when there are multiple adjustments
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l = cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    double_b = bayes(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]),
        df_dict["cat_12_tf_adj"])

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(double_b)
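# To make the adjustment arithmetic in the assertions above concrete, a small
# worked example of the bayes() helper: 0.5 is the neutral adjustment that
# leaves the prior unchanged, while values above/below 0.5 pull it up/down.
import pytest


def bayes(p1, p2):
    # Combine two probabilities as independent evidence: the odds multiply
    return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))


prior = 0.8
assert bayes(prior, 0.5) == pytest.approx(0.8)               # neutral: prior unchanged
assert bayes(prior, 0.7) == pytest.approx(0.9032, abs=1e-4)  # adjusted up
assert bayes(prior, 0.3) == pytest.approx(0.6316, abs=1e-4)  # adjusted down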
def test_iterate(spark, sqlite_con_1, params_1, gamma_settings_1):

    original_params = copy.deepcopy(params_1.params)

    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    rules = [
        "l.mob = r.mob",
        "l.surname = r.surname",
    ]
    gamma_settings_1["blocking_rules"] = rules

    df_comparison = block_using_rules(gamma_settings_1, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, gamma_settings_1, spark)

    gamma_settings_1["max_iterations"] = 1
    df_e = iterate(df_gammas, params_1, gamma_settings_1, spark)

    assert params_1.params["λ"] == pytest.approx(0.540922141)

    assert params_1.params["π"]["gamma_mob"]["prob_dist_match"]["level_0"][
        "probability"] == pytest.approx(0.087438272, abs=0.0001)
    assert params_1.params["π"]["gamma_surname"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.160167628, abs=0.0001)

    first_it_params = copy.deepcopy(params_1.params)

    df_e_pd = df_e.toPandas()
    df_e_pd = df_e_pd.sort_values(["unique_id_l", "unique_id_r"])

    correct_list = [
        0.658602114,
        0.796821727,
        0.796821727,
        0.189486495,
        0.189486495,
        0.658602114,
        0.495063367,
        0.495063367,
    ]
    result_list = list(df_e_pd["match_probability"].astype(float))

    for i in zip(result_list, correct_list):
        assert i[0] == pytest.approx(i[1], abs=0.0001)

    # Does it still work with another iteration?
    gamma_settings_1["max_iterations"] = 1
    df_e = iterate(df_gammas, params_1, gamma_settings_1, spark)

    assert params_1.params["λ"] == pytest.approx(0.534993426, abs=0.0001)
    assert params_1.params["π"]["gamma_mob"]["prob_dist_match"]["level_0"][
        "probability"] == pytest.approx(0.088546179, abs=0.0001)
    assert params_1.params["π"]["gamma_surname"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.109234086, abs=0.0001)

    ## Test whether the params object is correctly storing the iteration history
    assert params_1.param_history[0] == original_params
    assert params_1.param_history[1] == first_it_params

    ## Now test whether the params dict is correctly converted to a dataframe
    data = params_1._convert_params_dict_to_dataframe(original_params)

    val1 = {
        "gamma": "gamma_mob",
        "match": 0,
        "value_of_gamma": "level_0",
        "probability": 0.8,
        "value": 0,
        "column": "mob",
    }
    val2 = {
        "gamma": "gamma_surname",
        "match": 1,
        "value_of_gamma": "level_1",
        "probability": 0.2,
        "value": 1,
        "column": "surname",
    }

    assert val1 in data
    assert val2 in data

    correct_list = [
        {"iteration": 0, "λ": 0.4},
        {"iteration": 1, "λ": 0.540922141},
    ]
    result_list = params_1._iteration_history_df_lambdas()
    for i in zip(result_list, correct_list):
        assert i[0]["iteration"] == i[1]["iteration"]
        assert i[0]["λ"] == pytest.approx(i[1]["λ"])

    result_list = params_1._iteration_history_df_gammas()

    val1 = {
        "iteration": 0,
        "gamma": "gamma_mob",
        "match": 0,
        "value_of_gamma": "level_0",
        "probability": 0.8,
        "value": 0,
        "column": "mob",
    }
    assert val1 in result_list

    val2 = {
        "iteration": 1,
        "gamma": "gamma_surname",
        "match": 0,
        "value_of_gamma": "level_1",
        "probability": 0.160167628,
        "value": 1,
        "column": "surname",
    }

    for r in result_list:
        if (r["iteration"] == 1 and r["gamma"] == "gamma_surname"
                and r["match"] == 0 and r["value"] == 1):
            record = r

    for k, v in record.items():
        expected_value = val2[k]
        if k == "probability":
            assert v == pytest.approx(expected_value, abs=0.0001)
        else:
            assert v == expected_value

    # Test whether saving and loading parameters works
    import tempfile

    dir = tempfile.TemporaryDirectory()
    fname = os.path.join(dir.name, "params.json")

    params_1.save_params_to_json_file(fname)

    from splink.params import load_params_from_json

    p = load_params_from_json(fname)
    assert p.params["λ"] == pytest.approx(params_1.params["λ"])
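# For readers unfamiliar with the params layout exercised above: λ is the
# estimated proportion of matches, and π holds one probability distribution
# over gamma levels for matches and one for non-matches, per comparison
# column. A sketch of walking that structure (the nesting is inferred from
# the assertions above; the numbers are illustrative):
example_params = {
    "λ": 0.54,
    "π": {
        "gamma_mob": {
            "prob_dist_match": {
                "level_0": {"probability": 0.09},
                "level_1": {"probability": 0.91},
            },
            "prob_dist_non_match": {
                "level_0": {"probability": 0.80},
                "level_1": {"probability": 0.20},
            },
        },
    },
}


def print_m_u(params_dict):
    # π -> gamma_<col> -> prob_dist_match / prob_dist_non_match -> level_<i>
    print("λ (proportion of matches):", params_dict["λ"])
    for gamma_col, dists in params_dict["π"].items():
        for level, d in dists["prob_dist_match"].items():
            m = d["probability"]
            u = dists["prob_dist_non_match"][level]["probability"]
            print(f"{gamma_col} {level}: m={m:.2f}, u={u:.2f}")


print_m_u(example_params)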
def test_term_frequency_adjustments(spark):

    # The strategy is going to be to create a fake dataframe
    # where we have different levels to model frequency imbalance:
    # gamma=3 is where name matches and name is robin (unusual name)
    # gamma=2 is where name matches and name is matt (normal name)
    # gamma=1 is where name matches and name is john (v common name)

    # We simulate the term frequency imbalance
    # by pooling this together, setting all gamma > 0
    # to equal 1

    # We then expect that term frequency adjustments
    # should adjust up the robins but adjust down the johns

    # We also expect that the tf adjusted match probability should be more accurate

    forename_probs = _probabilities_from_freqs([3, 2, 1])
    surname_probs = _probabilities_from_freqs([10, 5, 1])

    settings_true = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {
                "col_name": "forename",
                "term_frequency_adjustments": True,
                "m_probabilities": forename_probs["m_probabilities"],
                "u_probabilities": forename_probs["u_probabilities"],
                "num_levels": 4,
            },
            {
                "col_name": "surname",
                "term_frequency_adjustments": True,
                "m_probabilities": surname_probs["m_probabilities"],
                "u_probabilities": surname_probs["u_probabilities"],
                "num_levels": 4,
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
    }

    settings_true = complete_settings_dict(settings_true, spark)

    df = generate_df_gammas_random(10000, settings_true)

    # Create new binary columns that binarise the more granular gammas to 0 and 1
    df["gamma_forename_binary"] = df["gamma_forename"].where(
        df["gamma_forename"] == 0, 1)
    df["gamma_surname_binary"] = df["gamma_surname"].where(
        df["gamma_surname"] == 0, 1)

    # Populate non-matches with a random value,
    # then assign left and right values based on the gamma values
    df["forename_binary_l"] = df["unique_id_l"]
    df["forename_binary_r"] = df["unique_id_r"]

    f1 = df["gamma_forename"] == 3
    df.loc[f1, "forename_binary_l"] = "Robin"
    df.loc[f1, "forename_binary_r"] = "Robin"

    f1 = df["gamma_forename"] == 2
    df.loc[f1, "forename_binary_l"] = "Matt"
    df.loc[f1, "forename_binary_r"] = "Matt"

    f1 = df["gamma_forename"] == 1
    df.loc[f1, "forename_binary_l"] = "John"
    df.loc[f1, "forename_binary_r"] = "John"

    # Populate non-matches with a random value
    df["surname_binary_l"] = df["unique_id_l"]
    df["surname_binary_r"] = df["unique_id_r"]

    f1 = df["gamma_surname"] == 3
    df.loc[f1, "surname_binary_l"] = "Linacre"
    df.loc[f1, "surname_binary_r"] = "Linacre"

    f1 = df["gamma_surname"] == 2
    df.loc[f1, "surname_binary_l"] = "Hughes"
    df.loc[f1, "surname_binary_r"] = "Hughes"

    f1 = df["gamma_surname"] == 1
    df.loc[f1, "surname_binary_l"] = "Smith"
    df.loc[f1, "surname_binary_r"] = "Smith"

    # cat_20
    df["cat_20_l"] = df["unique_id_l"]
    df["cat_20_r"] = df["unique_id_r"]

    f1 = df["gamma_cat_20"] == 1
    df.loc[f1, "cat_20_l"] = "a"
    df.loc[f1, "cat_20_r"] = "a"

    df = add_match_prob(df, settings_true)
    df["match_probability"] = df["true_match_probability_l"]

    df_e = spark.createDataFrame(df)

    def four_to_two(probs):
        return [probs[0], sum(probs[1:])]

    settings_binary = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {
                "col_name": "forename_binary",
                "term_frequency_adjustments": True,
                "num_levels": 2,
                "m_probabilities": four_to_two(forename_probs["m_probabilities"]),
                "u_probabilities": four_to_two(forename_probs["u_probabilities"]),
            },
            {
                "col_name": "surname_binary",
                "term_frequency_adjustments": True,
                "num_levels": 2,
                "m_probabilities": four_to_two(surname_probs["m_probabilities"]),
                "u_probabilities": four_to_two(surname_probs["u_probabilities"]),
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
        "retain_intermediate_calculation_columns": True,
        "max_iterations": 0,
        "additional_columns_to_retain": ["true_match_probability"],
    }

    # Can't use linker = Splink() because we have df_gammas, not df
    settings_binary = complete_settings_dict(settings_binary, spark)
    model = Model(settings_binary, spark)
    df_e = iterate(df_e, model, spark)

    df_e = make_adjustment_for_term_frequencies(
        df_e, model, spark, retain_adjustment_columns=True)

    df = df_e.toPandas()

    #########
    # Tests start here
    #########

    # Test that the overall squared error is lower for the tf adjusted match probability
    df["e1"] = (df["match_probability"] - df["true_match_probability_l"]) ** 2
    df["e2"] = (df["tf_adjusted_match_prob"] - df["true_match_probability_l"]) ** 2
    assert df["e1"].sum() > df["e2"].sum()

    # We expect Johns to be adjusted down...
    f1 = df["forename_binary_l"] == "John"
    df_filtered = df[f1]
    adj = df_filtered["forename_binary_tf_adj"].mean()
    assert adj < 0.5

    # ...and Robins to be adjusted up
    f1 = df["forename_binary_l"] == "Robin"
    df_filtered = df[f1]
    adj = df_filtered["forename_binary_tf_adj"].mean()
    assert adj > 0.5

    # We expect Smiths to be adjusted down...
    f1 = df["surname_binary_l"] == "Smith"
    df_filtered = df[f1]
    adj = df_filtered["surname_binary_tf_adj"].mean()
    assert adj < 0.5

    # ...and Linacres to be adjusted up
    f1 = df["surname_binary_l"] == "Linacre"
    df_filtered = df[f1]
    adj = df_filtered["surname_binary_tf_adj"].mean()
    assert adj > 0.5

    # Check adjustments are applied correctly
    f1 = df["forename_binary_l"] == "Robin"
    f2 = df["surname_binary_l"] == "Linacre"
    df_filtered = df[f1 & f2]
    row = df_filtered.head(1).to_dict(orient="records")[0]

    prior = row["match_probability"]
    posterior = row["tf_adjusted_match_prob"]

    b1 = row["forename_binary_tf_adj"]
    b2 = row["surname_binary_tf_adj"]

    expected_post = (prior * b1 * b2 /
                     (prior * b1 * b2 + (1 - prior) * (1 - b1) * (1 - b2)))
    assert posterior == pytest.approx(expected_post)

    # We expect match probability to equal the tf adjusted match probability
    # in cases where neither surname nor forename match
    f1 = df["surname_binary_l"] != df["surname_binary_r"]
    f2 = df["forename_binary_l"] != df["forename_binary_r"]
    df_filtered = df[f1 & f2]
    sum_difference = (df_filtered["tf_adjusted_match_prob"] -
                      df_filtered["match_probability"]).sum()
    assert 0 == pytest.approx(sum_difference)
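# The expected_post formula above is two sequential Bayes updates written as
# one joint expression. A quick self-contained check that the joint form and
# the chained pairwise form agree (the numbers are illustrative):
import pytest


def bayes(p1, p2):
    # Pairwise Bayes update, as in the previous test
    return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))


prior, b1, b2 = 0.7, 0.8, 0.6

joint = (prior * b1 * b2 /
         (prior * b1 * b2 + (1 - prior) * (1 - b1) * (1 - b2)))
chained = bayes(bayes(prior, b1), b2)

assert joint == pytest.approx(chained)  # both give 0.9333...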