def test_main_api(spark, sqlite_con_1):
    """Smoke test of the end-to-end API: score, save the model, reload, rescore."""
    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 2,
    }
    settings = complete_settings_dict(settings, spark=None)

    # Load the fixture table out of sqlite and convert it to a Spark dataframe.
    pandas_df = pd.read_sql("select * from test1", sqlite_con_1)
    spark_df = spark.createDataFrame(pandas_df)

    linker = Splink(settings, spark, df=spark_df)
    scored = linker.get_scored_comparisons()

    # Round-trip the trained model through JSON and rescore with the reload.
    linker.save_model_as_json("saved_model.json", overwrite=True)
    reloaded = load_from_json("saved_model.json", spark=spark, df=spark_df)
    scored = reloaded.get_scored_comparisons()

    from splink.intuition import intuition_report

    # Spot-check the intuition report on one randomly sampled comparison row.
    sampled_row = scored.toPandas().sample(1).to_dict(orient="records")[0]
    print(intuition_report(sampled_row, linker.params))
    linker.params._print_m_u_probs()
def test_tiny_numbers(spark):
    """Extremely small m probabilities must not underflow match probability to 0.

    Regression test, see
    https://github.com/moj-analytical-services/splink/issues/48
    """
    people = [
        {"unique_id": 1, "mob": 10, "surname": "Linacre"},
        {"unique_id": 2, "mob": 10, "surname": "Linacre"},
        {"unique_id": 3, "mob": 10, "surname": "Linacer"},
        {"unique_id": 4, "mob": 7, "surname": "Smith"},
        {"unique_id": 5, "mob": 8, "surname": "Smith"},
        {"unique_id": 6, "mob": 8, "surname": "Smith"},
        {"unique_id": 7, "mob": 8, "surname": "Jones"},
    ]
    df = spark.createDataFrame(Row(**p) for p in people)

    # Deliberately tiny m probability for the matching level of "mob".
    tiny = 5.9380419956766985e-25
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [tiny, 1 - tiny],
                "u_probabilities": [0.8, 0.2],
            },
            {
                "col_name": "surname",
                "num_levels": 2,
            },
        ],
        "blocking_rules": [
            "l.mob = r.mob",
            "l.surname = r.surname",
        ],
    }

    linker = Splink(settings, df, spark)
    scored = linker.manually_apply_fellegi_sunter_weights().toPandas()

    lowest = scored["match_probability"].min()
    # The smallest probability should be vanishingly small, but never exactly 0.
    assert lowest > 0.0
    assert lowest < 1e-20
def test_freq_adj_divzero(spark, nulls_df):
    """Term-frequency adjustments on an all-null column must not divide by zero."""
    # Settings request term_freq_adjustments on column 'always_none'.
    settings = {
        "link_type": "dedupe_only",
        "blocking_rules": [
            "l.surname = r.surname",
        ],
        "comparison_columns": [
            {"col_name": "firstname", "num_levels": 3},
            {
                "col_name": "surname",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
            {
                "col_name": "always_none",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
        ],
        "additional_columns_to_retain": ["unique_id"],
        "max_iterations": 1,
    }

    # Create the column in a way that could trigger a div by zero on the
    # average adjustment calculation before the fix: every value is null.
    nulls_df = nulls_df.withColumn("always_none", f.lit(None))

    survived = True
    try:
        linker = Splink(settings, nulls_df, spark)
        linker.get_scored_comparisons()
    except ZeroDivisionError:
        survived = False
    assert survived is True
def test_freq_adj_divzero(spark, sparkdf):
    """Term-frequency adjustments on an all-null column must not divide by zero.

    Regression test: requesting term_frequency_adjustments on a column whose
    values are entirely null used to raise ZeroDivisionError in the average
    adjustment calculation.

    NOTE(review): a test with this exact name is defined earlier in this
    source; if both live in the same module, pytest collects only the last
    definition and silently skips the other — consider renaming one of them.
    NOTE(review): unlike the sibling test, this one only constructs the
    linker and never calls get_scored_comparisons() — confirm construction
    alone exercised the bug.
    """
    settings = {
        "link_type": "dedupe_only",
        "blocking_rules": [
            "l.surname = r.surname",
        ],
        "comparison_columns": [
            {"col_name": "firstname", "num_levels": 3},
            {
                "col_name": "surname",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
            {
                "col_name": "weird",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
        ],
        "additional_columns_to_retain": ["unique_id"],
        "em_convergence": 0.01,
    }

    sparkdf = sparkdf.withColumn("unique_id", f.monotonically_increasing_id())
    # Create column "weird" in a way that could trigger a div by zero on the
    # average adjustment calculation before the fix: every value is null.
    sparkdf = sparkdf.withColumn("weird", f.lit(None))

    raised = False
    try:
        Splink(settings, spark, df=sparkdf)
    except ZeroDivisionError:
        raised = True
    # Fixed PEP 8 E712 violation: was `assert (notpassing == False)`.
    assert not raised
def test_fix_u(spark):
    """fix_u_probabilities / fix_m_probabilities should pin parameters during EM.

    We expect u on the cartesian product of MoB to be around 1/12.
    """
    rows = [
        {"unique_id": 1, "mob": "1", "first_name": "a", "surname": "a"},
        {"unique_id": 2, "mob": "2", "first_name": "b", "surname": "b"},
        {"unique_id": 3, "mob": "3", "first_name": "c", "surname": "c"},
        {"unique_id": 4, "mob": "4", "first_name": "d", "surname": "d"},
        {"unique_id": 5, "mob": "5", "first_name": "e", "surname": "e"},
        {"unique_id": 6, "mob": "6", "first_name": "f", "surname": "f"},
        {"unique_id": 7, "mob": "7", "first_name": "g", "surname": "g"},
        {"unique_id": 9, "mob": "9", "first_name": "h", "surname": "h"},
        # NOTE(review): this record appears twice with the same unique_id (and
        # id 8 is skipped) — presumably deliberate test data; worth confirming.
        {"unique_id": 10, "mob": "10", "first_name": "i", "surname": "i"},
        {"unique_id": 10, "mob": "10", "first_name": "i", "surname": "i"},
    ]
    df = spark.createDataFrame(Row(**r) for r in rows)

    # Case 1: u fixed on mob, u free on first_name.
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": True,
            },
            {"col_name": "first_name", "u_probabilities": [0.8, 0.2]},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }
    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    # The fixed "u_probabilities" in the latest parameters are still 0.8, 0.2...
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] == pytest.approx(0.8)
    assert mob["u_probabilities"][1] == pytest.approx(0.2)

    # ...while the unfixed first_name u values have been updated by EM.
    first_name = linker.model.current_settings_obj.get_comparison_column(
        "first_name")
    assert first_name["u_probabilities"][0] != 0.8
    assert first_name["u_probabilities"][1] != 0.2

    # Case 2: fix_u_probabilities False — the starting u values should move.
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": False,
            },
            {"col_name": "first_name"},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }
    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    # The "u_probabilities" in the latest parameters are no longer 0.8, 0.2.
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] != 0.8
    assert mob["u_probabilities"][0] != 0.2

    # Case 3: m fixed but u free on the same column.
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.04, 0.96],
                "fix_m_probabilities": True,
                "u_probabilities": [0.75, 0.25],
                "fix_u_probabilities": False,
            },
            {"col_name": "first_name"},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }
    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    # u was free, so it moved away from its starting values...
    assert mob["u_probabilities"][0] != 0.75
    assert mob["u_probabilities"][1] != 0.25
    # ...while the fixed m values are untouched.
    assert mob["m_probabilities"][0] == pytest.approx(0.04)
    assert mob["m_probabilities"][1] == pytest.approx(0.96)
# NOTE(review): script-style fragment. `it_num`, OUTPUT_PATH,
# write_local_file_to_s3, persist_params_settings, settings, spark and df are
# defined elsewhere (possibly the first two statements below are the tail of
# persist_params_settings, whose def is not visible here — confirm).
path = os.path.join(OUTPUT_PATH, f"params/saved_params_iteration_{it_num}.json")
write_local_file_to_s3("saved_params.json", path, overwrite=True)


# Lineage breaking functions: write intermediate results to parquet and read
# them straight back, truncating Spark's logical plan between EM stages.
def blocked_comparisons_to_s3(df, spark):
    """Round-trip the blocked comparisons (gammas) through parquet on S3."""
    df = df.repartition(50)
    path = os.path.join(OUTPUT_PATH, "data/df_gammas/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new


def scored_comparisons_to_s3(df, spark):
    """Round-trip the scored comparisons through parquet on S3."""
    path = os.path.join(OUTPUT_PATH, "data/df_e/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new


from splink import Splink

linker = Splink(
    settings,
    spark,
    df=df,
    save_state_fn=persist_params_settings,
    break_lineage_blocked_comparisons=blocked_comparisons_to_s3,
    break_lineage_scored_comparisons=scored_comparisons_to_s3
)
df_e = linker.get_scored_comparisons()
# Persist the final parameters and the scored comparisons.
write_local_file_to_s3("saved_params.json", path, overwrite=True)
df_e.write.mode("overwrite").parquet(path)
# NOTE(review): script-style fragment — get_spark, bn (a pandas dataframe of
# names, presumably) and conn are defined elsewhere in the surrounding script.
spark = get_spark()
bn = spark.createDataFrame(bn)

# Dedupe on company name, retaining filing_num for inspection; term-frequency
# adjustments are enabled on the name column.
settings = {
    "link_type": "dedupe_only",
    "additional_columns_to_retain": ["filing_num"],
    "comparison_columns": [
        {
            "col_name": "name",
            "term_frequency_adjustments": True
        },
    ],
    "blocking_rules": ["l.name = r.name"]
}

linker = Splink(settings, df_or_dfs=bn, spark=spark)
df_e = linker.get_scored_comparisons()
print(df_e.head(10))
print(df_e.count())
print(df_e.columns)

# The bare triple-quoted string below holds an unfinished address-dedupe
# experiment; it continues beyond this fragment and has no runtime effect.
""" ad = get_addresses(conn) ad['unique_id'] = (ad.address1+ad.filing_num).map(hash) ad = spark.createDataFrame(ad) settings = { "link_type": "dedupe_only", "additional_columns_to_retain":[ "filing_num"
def test_main_api(spark):
    """End-to-end smoke test: train, score, save, reload and report."""
    people = [
        {"unique_id": 1, "mob": 10, "surname": "Linacre"},
        {"unique_id": 2, "mob": 10, "surname": "Linacre"},
        {"unique_id": 3, "mob": 10, "surname": "Linacer"},
        {"unique_id": 4, "mob": 7, "surname": "Smith"},
        {"unique_id": 5, "mob": 8, "surname": "Smith"},
        {"unique_id": 6, "mob": 8, "surname": "Smith"},
        {"unique_id": 7, "mob": 8, "surname": "Jones"},
    ]
    df = spark.createDataFrame(Row(**p) for p in people)

    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    scored = linker.get_scored_comparisons()

    # Round-trip the model through JSON and make sure the reload still scores.
    linker.save_model_as_json("saved_model.json", overwrite=True)
    reloaded = load_from_json("saved_model.json", df, spark=spark)
    scored = reloaded.get_scored_comparisons()

    # Exercise the reporting helpers on one randomly sampled comparison row.
    sampled = scored.toPandas().sample(1).to_dict(orient="records")[0]
    intuition_report(sampled, linker.model)
    bayes_factor_chart(sampled, linker.model)
def test_nulls(spark):
    """Check manually applied Fellegi-Sunter scores when records contain nulls."""
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "fname",
                "m_probabilities": [0.4, 0.6],
                "u_probabilities": [0.65, 0.35],
            },
            {
                "col_name": "sname",
                "m_probabilities": [0.25, 0.75],
                "u_probabilities": [0.7, 0.3],
            },
            {
                "col_name": "dob",
                "m_probabilities": [0.4, 0.6],
                "u_probabilities": [0.65, 0.35],
            },
        ],
        "blocking_rules": [],
    }

    # Progressively more null fields from record 1 (no nulls) to 4 (all null).
    people = [
        {"unique_id": 1, "fname": "Rob", "sname": "Jones", "dob": "1980-01-01"},
        {"unique_id": 2, "fname": "Rob", "sname": "Jones", "dob": None},
        {"unique_id": 3, "fname": "Rob", "sname": None, "dob": None},
        {"unique_id": 4, "fname": None, "sname": None, "dob": None},
    ]
    df = spark.createDataFrame(Row(**p) for p in people)

    linker = Splink(settings, df, spark)
    scored = linker.manually_apply_fellegi_sunter_weights().toPandas()

    # Four records under empty blocking rules give six pairwise comparisons.
    expected = [0.322580645, 0.16, 0.1, 0.16, 0.1, 0.1]
    assert list(scored["match_probability"]) == pytest.approx(expected)