@pytest.fixture
def gamma_settings_1():
    gamma_settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.1, 0.9],
                "u_probabilities": [0.8, 0.2],
            },
            {
                "col_name": "surname",
                "num_levels": 3,
                "case_expression": """
                case
                when surname_l is null or surname_r is null then -1
                when surname_l = surname_r then 2
                when substr(surname_l, 1, 3) = substr(surname_r, 1, 3) then 1
                else 0
                end
                as gamma_surname
                """,
                "m_probabilities": [0.1, 0.2, 0.7],
                "u_probabilities": [0.5, 0.25, 0.25],
            },
        ],
        "blocking_rules": [],
    }
    gamma_settings = complete_settings_dict(gamma_settings, spark="supress_warnings")
    yield gamma_settings
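# Illustrative sketch (not part of the suite's assertions): under the
# Fellegi-Sunter model that splink implements, each gamma level's m/u ratio is
# the Bayes factor by which that level shifts the match odds. The helper name
# below is hypothetical, used only to show the arithmetic:
def _bayes_factors_sketch(m_probabilities, u_probabilities):
    # Bayes factor for each gamma level is m / u
    return [m / u for m, u in zip(m_probabilities, u_probabilities)]

# For the "mob" column above: [0.1 / 0.8, 0.9 / 0.2] == [0.125, 4.5], so
# agreement on mob multiplies the odds of a match by 4.5.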
def test_main_api(spark, sqlite_con_1):
    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 2,
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    linker = Splink(settings, spark, df=df)
    df_e = linker.get_scored_comparisons()

    linker.save_model_as_json("saved_model.json", overwrite=True)
    linker_2 = load_from_json("saved_model.json", spark=spark, df=df)
    df_e = linker_2.get_scored_comparisons()

    from splink.intuition import intuition_report

    params = linker.params
    row_dict = df_e.toPandas().sample(1).to_dict(orient="records")[0]
    print(intuition_report(row_dict, params))

    linker.params._print_m_u_probs()
def test_tiny_numbers(spark, sqlite_con_1):
    # Regression test, see https://github.com/moj-analytical-services/splink/issues/48
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [
                    5.9380419956766985e-25,
                    1 - 5.9380419956766985e-25,
                ],
                "u_probabilities": [0.8, 0.2],
            },
            {"col_name": "surname", "num_levels": 2},
        ],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
    }
    settings = complete_settings_dict(settings, spark=None)

    df_comparison = block_using_rules(settings, df=df, spark=spark)
    df_gammas = add_gammas(df_comparison, settings, spark)
    params = Params(settings, spark="supress_warnings")
    df_e = run_expectation_step(df_gammas, params, settings, spark)
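# Why tiny m-probabilities warrant a regression test: multiplying many
# near-zero likelihoods underflows float64, whereas summing their logs stays
# finite. This is generic floating-point behaviour, demonstrated standalone
# below; it is not a claim about how splink implements the expectation step.
def test_tiny_numbers_underflow_illustration():
    import math

    tiny = 5.9380419956766985e-25
    assert tiny ** 20 == 0.0  # the product underflows to exactly zero
    assert math.isfinite(math.log(tiny) * 20)  # the log-space total does not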
@pytest.fixture
def gamma_settings_4():
    gamma_settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.9,
        "comparison_columns": [
            {
                "col_name": "col_2_levels",
                "num_levels": 2,
                "case_expression": sql_gen_case_smnt_strict_equality_2("col_2_levels"),
            },
            {
                "col_name": "col_5_levels",
                "num_levels": 2,
                "case_expression": sql_gen_case_smnt_strict_equality_2("col_5_levels"),
            },
            {
                "col_name": "col_20_levels",
                "num_levels": 2,
                "case_expression": sql_gen_case_smnt_strict_equality_2("col_20_levels"),
            },
        ],
        "blocking_rules": [],
    }
    gamma_settings = complete_settings_dict(gamma_settings, spark="supress_warnings")
    yield gamma_settings
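# For reference, a strict-equality two-level case statement follows the same
# shape as the hand-written surname expression in gamma_settings_1; the exact
# SQL that sql_gen_case_smnt_strict_equality_2 emits may differ, so treat this
# as an assumed sketch:
#
#   case
#   when col_2_levels_l is null or col_2_levels_r is null then -1
#   when col_2_levels_l = col_2_levels_r then 1
#   else 0
#   end as gamma_col_2_levels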
def test_link_option_link(spark, link_dedupe_data_repeat_ids):
    settings = {
        "link_type": "link_only",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": ["l.first_name = r.first_name", "l.surname = r.surname"],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)

    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 2, 2, 3, 3]
    assert list(df["unique_id_r"]) == [1, 3, 2, 3, 2, 3]

    # Test the cartesian version (no blocking rules)
    settings = {
        "link_type": "link_only",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": [],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)

    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 1, 2, 2, 2, 3, 3, 3]
    assert list(df["unique_id_r"]) == [1, 2, 3, 1, 2, 3, 1, 2, 3]
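# Conceptually, blocking with the two rules above is the union of an inner
# join on first_name and an inner join on surname, de-duplicated on the id
# pair. A minimal pandas sketch of that idea (a hypothetical helper, not
# splink's Spark SQL implementation; note pandas matches nulls in merge keys,
# unlike SQL equality):
def _blocked_pairs_sketch(dfpd_l, dfpd_r):
    keep = ["unique_id_l", "unique_id_r"]
    on_first_name = dfpd_l.merge(dfpd_r, on="first_name", suffixes=("_l", "_r"))
    on_surname = dfpd_l.merge(dfpd_r, on="surname", suffixes=("_l", "_r"))
    return pd.concat([on_first_name[keep], on_surname[keep]]).drop_duplicates()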
def test_no_blocking(spark, link_dedupe_data):
    settings = {
        "link_type": "link_only",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": [],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data)
    df_l = spark.createDataFrame(dfpd_l)
    df_r = spark.createDataFrame(dfpd_r)

    df_comparison = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df_comparison.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 1, 2, 2, 2]
    assert list(df["unique_id_r"]) == [7, 8, 9, 7, 8, 9]
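# With no blocking rules, link_only falls back to the full cartesian product:
# every left record is paired with every right record, so the 2 x 3 rows here
# yield the 6 comparisons asserted above. A hedged pandas equivalent
# (requires pandas >= 1.2) would be:
#
#   pairs = dfpd_l.merge(dfpd_r, how="cross", suffixes=("_l", "_r"))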
def test_link_option_dedupe_only(spark, link_dedupe_data_repeat_ids):
    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": ["l.first_name = r.first_name", "l.surname = r.surname"],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df = spark.createDataFrame(dfpd)

    df = block_using_rules(settings, spark, df=df)
    df = df.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [2]
    assert list(df["unique_id_r"]) == [3]
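# Note the single (2, 3) pair: deduping a table against itself should emit
# each candidate pair once, with no self-pairs and no mirrored (3, 2)
# duplicate. A common way to achieve this - an assumption about splink's
# generated SQL rather than a verified detail - is an ordering condition in
# the self-join:
#
#   ... and l.unique_id < r.unique_id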
def test_term_frequency_adjustments(spark):
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "name",
                "term_frequency_adjustments": True,
                "m_probabilities": [
                    0.1,  # Amongst matches, 10% have typos
                    0.9,  # The remaining 90% agree exactly
                ],
                "u_probabilities": [
                    4 / 5,  # Amongst non-matches, 80% of the time the names differ
                    1 / 5,  # But 20% of the time names 'collide'.
                    # We want these u probabilities to be dependent on the name.
                ],
            },
            {
                "col_name": "cat_12",
                "m_probabilities": [0.05, 0.95],
                "u_probabilities": [11 / 12, 1 / 12],
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
        "em_convergence": 0.001,
    }

    from string import ascii_letters
    import random

    from splink.settings import complete_settings_dict

    settings = complete_settings_dict(settings, spark="supress_warnings")

    def is_match(settings):
        p = settings["proportion_of_matches"]
        return random.choices([0, 1], [1 - p, p])[0]

    def get_row_portion(match, comparison_col, skew="auto"):
        # The problem is that at the moment we're guaranteeing that a match on
        # 'john' is just as likely as a match on 'james'. What we want is to
        # generate more 'collisions' for 'john' than for 'robin', i.e. for a
        # non-match we want gamma = 1 on name to be more likely for 'john'.
        if match:
            gamma_pdist = comparison_col["m_probabilities"]
        else:
            gamma_pdist = comparison_col["u_probabilities"]

        # To decide whether gamma is 0 or 1 under skew, we first need to
        # decide which value the left-hand column takes (i.e. with what
        # probability each value is selected).

        # How many distinct values should we choose?
        num_values = int(round(1 / comparison_col["u_probabilities"][1]))

        if skew == "auto":
            skew = comparison_col["term_frequency_adjustments"]

        if skew:
            # First value most frequent, last value least frequent
            prob_dist = range(1, num_values + 1)[::-1]
            # Normalise
            prob_dist = [p / sum(prob_dist) for p in prob_dist]

            index_of_value = random.choices(range(num_values), prob_dist)[0]
            if not match:  # If it's a u probability
                this_prob = prob_dist[index_of_value]
                gamma_pdist = [1 - this_prob, this_prob]
        else:
            prob_dist = [1 / num_values] * num_values
            index_of_value = random.choices(range(num_values), prob_dist)[0]

        levels = comparison_col["num_levels"]
        gamma = random.choices(range(levels), gamma_pdist)[0]

        values = ascii_letters[:26]
        if num_values > 26:
            # aa, ab, etc.
            values = [a + b for a in ascii_letters[:26] for b in ascii_letters[:26]]
        values = values[:num_values]

        if gamma == 1:
            value_1 = values[index_of_value]
            value_2 = value_1

        if gamma == 0:
            value_1 = values[index_of_value]
            same_value = True
            while same_value:
                value_2 = random.choices(values, prob_dist)[0]
                if value_1 != value_2:
                    same_value = False

        cname = comparison_col["col_name"]
        return {
            f"{cname}_l": value_1,
            f"{cname}_r": value_2,
            f"gamma_{cname}": gamma,
        }

    import uuid

    rows = []
    for _ in range(100000):
        m = is_match(settings)
        row = {
            "unique_id_l": str(uuid.uuid4()),
            "unique_id_r": str(uuid.uuid4()),
            "match": m,
        }
        for cc in settings["comparison_columns"]:
            row_portion = get_row_portion(m, cc)
            row = {**row, **row_portion}
        rows.append(row)

    all_rows = pd.DataFrame(rows)
    df_gammas = spark.createDataFrame(all_rows)

    settings["comparison_columns"][1]["term_frequency_adjustments"] = True

    from splink.params import Params
    from splink.iterate import iterate
    from splink.term_frequencies import make_adjustment_for_term_frequencies

    # We already have a table of gammas, so work from there within splink
    params = Params(settings, spark)

    df_e = iterate(df_gammas, params, settings, spark, compute_ll=False)

    df_e_adj = make_adjustment_for_term_frequencies(
        df_e, params, settings, retain_adjustment_columns=True, spark=spark
    )

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select name_l, name_tf_adj, count(*)
    from df_e_adj
    where name_l = name_r
    group by name_l, name_tf_adj
    order by name_l
    """
    df = spark.sql(sql).toPandas()

    df = df.set_index("name_l")
    df_dict = df.to_dict(orient="index")

    assert df_dict["a"]["name_tf_adj"] < 0.5

    # Arbitrary numbers, but we do expect a big uplift here
    assert df_dict["e"]["name_tf_adj"] > 0.5
    assert df_dict["e"]["name_tf_adj"] > 0.6
    assert df_dict["e"]["name_tf_adj"] < 0.95

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select cat_12_l, cat_12_tf_adj, count(*) as count
    from df_e_adj
    where cat_12_l = cat_12_r
    group by cat_12_l, cat_12_tf_adj
    order by cat_12_l
    """
    df = spark.sql(sql).toPandas()

    # Keep these loose because when generating random data anything can happen!
    assert df["cat_12_tf_adj"].max() < 0.55
    assert df["cat_12_tf_adj"].min() > 0.45

    # Test adjustments are applied correctly when there is one
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l != cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    def bayes(p1, p2):
        return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"])
    )

    # Test adjustments are applied correctly when there are multiple
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l = cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    double_b = bayes(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]),
        df_dict["cat_12_tf_adj"],
    )
    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(double_b)
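# A minimal, Spark-free sanity check of the Bayes-update arithmetic the
# assertions above rely on (added for illustration; the in-test bayes()
# helper uses the same formula):
def test_bayes_update_sanity():
    def bayes(p1, p2):
        return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))

    assert bayes(0.9, 0.5) == pytest.approx(0.9)  # 0.5 is a neutral adjustment
    assert bayes(0.9, 0.8) == pytest.approx(0.72 / 0.74)  # uplift to ~0.973
    assert bayes(0.9, 0.2) < 0.9  # adjustments below 0.5 pull the probability down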
def test_link_option_link_dedupe(spark, link_dedupe_data_repeat_ids):
    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": ["l.first_name = r.first_name", "l.surname = r.surname"],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)

    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()

    df["u_l"] = df["unique_id_l"].astype(str) + df["_source_table_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(str) + df["_source_table_r"].str.slice(0, 1)
    df = df.sort_values(
        ["_source_table_l", "_source_table_r", "unique_id_l", "unique_id_r"]
    )

    assert list(df["u_l"]) == ["2l", "1l", "1l", "2l", "2l", "3l", "3l", "1r", "2r"]
    assert list(df["u_r"]) == ["3l", "1r", "3r", "2r", "3r", "2r", "3r", "3r", "3r"]

    # Same with no blocking rules, which should give the cartesian product
    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": [],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)

    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()

    df["u_l"] = df["unique_id_l"].astype(str) + df["_source_table_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(str) + df["_source_table_r"].str.slice(0, 1)
    df = df.sort_values(
        ["_source_table_l", "unique_id_l", "_source_table_r", "unique_id_r"]
    )

    assert list(df["u_l"]) == [
        "1l", "1l", "1l", "1l", "1l",
        "2l", "2l", "2l", "2l",
        "3l", "3l", "3l",
        "1r", "1r", "2r",
    ]
    assert list(df["u_r"]) == [
        "2l", "3l", "1r", "2r", "3r",
        "3l", "1r", "2r", "3r",
        "1r", "2r", "3r",
        "2r", "3r", "3r",
    ]

    # Same with blocking_rules omitted entirely, which should also fall back
    # to the cartesian product
    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)

    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()

    df["u_l"] = df["unique_id_l"].astype(str) + df["_source_table_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(str) + df["_source_table_r"].str.slice(0, 1)
    df = df.sort_values(
        ["_source_table_l", "unique_id_l", "_source_table_r", "unique_id_r"]
    )

    assert list(df["u_l"]) == [
        "1l", "1l", "1l", "1l", "1l",
        "2l", "2l", "2l", "2l",
        "3l", "3l", "3l",
        "1r", "1r", "2r",
    ]
    assert list(df["u_r"]) == [
        "2l", "3l", "1r", "2r", "3r",
        "3l", "1r", "2r", "3r",
        "1r", "2r", "3r",
        "2r", "3r", "3r",
    ]
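# The 15 pairs asserted in the cartesian cases are exactly C(6, 2) = 15: for
# link_and_dedupe the two 3-row tables are treated as one pool of 6 records
# (with _source_table_l/_source_table_r disambiguating repeated ids) and every
# unordered pair is compared - within df_l, within df_r, and across tables.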
def test_add_gammas(db):
    gamma_settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {"col_name": "fname", "num_levels": 2},
            {
                "col_name": "sname",
                "num_levels": 3,
                "case_expression": """
                case
                when sname_l is null or sname_r is null then -1
                when sname_l = sname_r then 2
                when substr(sname_l, 1, 3) = substr(sname_r, 1, 3) then 1
                else 0
                end
                as gamma_sname
                """,
            },
        ],
        "blocking_rules": [],
        "retain_matching_columns": False,
    }
    gamma_settings = complete_settings_dict(gamma_settings, spark="supress_warnings")

    sql = _sql_gen_add_gammas(gamma_settings, table_name="test2")
    db.execute(sql)
    result = db.fetchall()
    result = [dict(r) for r in result]

    correct_answer = [
        {"unique_id_l": 1, "unique_id_r": 2, "gamma_fname": 1, "gamma_sname": 2},
        {"unique_id_l": 3, "unique_id_r": 4, "gamma_fname": 1, "gamma_sname": 1},
        {"unique_id_l": 5, "unique_id_r": 6, "gamma_fname": -1, "gamma_sname": -1},
        {"unique_id_l": 7, "unique_id_r": 8, "gamma_fname": 0, "gamma_sname": 0},
    ]

    pd_correct = pd.DataFrame(correct_answer)
    pd_correct = pd_correct.sort_values(["unique_id_l", "unique_id_r"])
    pd_result = pd.DataFrame(result)
    pd_result = pd_result.sort_values(["unique_id_l", "unique_id_r"])
    assert_frame_equal(pd_correct, pd_result)

    gamma_settings["retain_matching_columns"] = True
    sql = _sql_gen_add_gammas(gamma_settings, table_name="test2")
    db.execute(sql)
    result = db.fetchone()
    col_names = list(dict(result).keys())

    correct_col_names = [
        "unique_id_l",
        "unique_id_r",
        "fname_l",
        "fname_r",
        "gamma_fname",
        "sname_l",
        "sname_r",
        "gamma_sname",
    ]
    assert col_names == correct_col_names
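# Reading the expected gammas above: for the three-level sname comparison,
# gamma_sname is 2 for an exact match, 1 when only the first three characters
# agree, 0 for a mismatch, and -1 when either side is null (nulls carry no
# evidence either way).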