Example 1
    def get_scored_comparisons(self):
        """Use the EM algorithm to estimate model parameters and return match probabilities.

        Note: Term frequency adjustments are applied to the match probabilities before the results are returned.

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """

        df_comparison = self._get_df_comparison()

        df_gammas = add_gammas(df_comparison, self.settings, self.spark)

        df_gammas = self.break_lineage_blocked_comparisons(
            df_gammas, self.spark)

        df_e = iterate(
            df_gammas,
            self.params,
            self.settings,
            self.spark,
            compute_ll=False,
            save_state_fn=self.save_state_fn,
        )

        # In case the user's break lineage function has persisted it
        df_gammas.unpersist()

        df_e = self.break_lineage_scored_comparisons(df_e, self.spark)

        df_e_adj = self._make_term_frequency_adjustments(df_e)

        df_e.unpersist()

        return df_e_adj
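
The break_lineage_blocked_comparisons and break_lineage_scored_comparisons hooks called above are not shown in this example. A minimal sketch of what such a hook could look like, using Spark checkpointing to truncate the query plan between EM iterations; this is an illustrative assumption, not necessarily splink's actual implementation, and it presumes a checkpoint directory has been configured:

# Hypothetical lineage-breaking hook (illustrative only, not splink's code).
# Checkpointing materialises the gammas and truncates their query plan, so the
# iterative EM steps do not repeatedly re-evaluate the blocking/gamma pipeline.
# Assumes spark.sparkContext.setCheckpointDir(...) has been called beforehand.
def break_lineage_blocked_comparisons(df_gammas, spark):
    return df_gammas.checkpoint(eager=True)
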
Example 2
def test_expectation(spark, sqlite_con_1, params_1, gamma_settings_1):
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    gamma_settings_1["blocking_rules"] = [
        "l.mob = r.mob",
        "l.surname = r.surname",
    ]

    df_comparison = block_using_rules(gamma_settings_1, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, gamma_settings_1, spark)

    df_e = run_expectation_step(df_gammas, params_1, gamma_settings_1, spark)

    df_e_pd = df_e.toPandas()
    df_e_pd = df_e_pd.sort_values(["unique_id_l", "unique_id_r"])

    correct_list = [
        0.893617021,
        0.705882353,
        0.705882353,
        0.189189189,
        0.189189189,
        0.893617021,
        0.375,
        0.375,
    ]
    result_list = list(df_e_pd["match_probability"].astype(float))

    for result, correct in zip(result_list, correct_list):
        assert result == pytest.approx(correct)
Example 3
def test_tiny_numbers(spark, sqlite_con_1):

    # Regression test, see https://github.com/moj-analytical-services/splink/issues/48

    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [5.9380419956766985e-25, 1 - 5.9380419956766985e-25],
                "u_probabilities": [0.8, 0.2],
            },
            {"col_name": "surname", "num_levels": 2,},
        ],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname",],
    }

    settings = complete_settings_dict(settings, spark=None)

    df_comparison = block_using_rules(settings, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, settings, spark)
    params = Params(settings, spark="supress_warnings")

    # The test passes if the expectation step runs without error despite the
    # extremely small m probabilities
    df_e = run_expectation_step(df_gammas, params, settings, spark)
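
For context on the magnitudes involved: the match probability computed by the expectation step is a ratio in which the m probabilities enter the numerator, so an m probability of about 5.9e-25 pushes the result extremely close to zero. A rough single-column illustration of that arithmetic (illustrative only, not splink's internal code):

# Single-column illustration: a pair whose mob comparison falls in level 0.
lam = 0.4                      # proportion_of_matches
m0 = 5.9380419956766985e-25    # m probability for mob level 0
u0 = 0.8                       # u probability for mob level 0
match_probability = (lam * m0) / (lam * m0 + (1 - lam) * u0)
print(match_probability)       # ~4.9e-25: tiny, but still a well-defined float
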
Example 4
    def get_scored_comparisons(self):
        """Use the EM algorithm to estimate model parameters and return match probabilities.

        Note: Does not compute term frequency adjustments.

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """

        df_comparison = self._get_df_comparison()

        df_gammas = add_gammas(df_comparison, self.settings, self.spark)

        df_gammas.persist()

        df_e = iterate(
            df_gammas,
            self.params,
            self.settings,
            self.spark,
            compute_ll=False,
            save_state_fn=self.save_state_fn,
        )
        df_gammas.unpersist()
        return df_e
Example 5
    def manually_apply_fellegi_sunter_weights(self):
        """Compute match probabilities from m and u probabilities specified in the splink settings object

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """
        df_comparison = self._get_df_comparison()
        df_gammas = add_gammas(df_comparison, self.settings, self.spark)
        return run_expectation_step(df_gammas, self.params, self.settings,
                                    self.spark)
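
For reference, the Fellegi-Sunter calculation behind run_expectation_step can be summarised, for a record pair with comparison vector $\gamma$, as:

\[
P(\text{match} \mid \gamma) = \frac{\lambda \prod_i m_i(\gamma_i)}{\lambda \prod_i m_i(\gamma_i) + (1 - \lambda) \prod_i u_i(\gamma_i)}
\]

where $\lambda$ is proportion_of_matches and $m_i(\gamma_i)$, $u_i(\gamma_i)$ are the m and u probabilities for comparison column $i$ at the observed gamma level.
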
Example 6
    def manually_apply_fellegi_sunter_weights(self):
        """Compute match probabilities from m and u probabilities specified in the splink settings object

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """
        df_comparison = block_using_rules(self.settings_dict, self.df,
                                          self.spark)
        df_gammas = add_gammas(df_comparison, self.settings_dict, self.spark)
        # see https://github.com/moj-analytical-services/splink/issues/187
        df_gammas = self.break_lineage_blocked_comparisons(
            df_gammas, self.spark)
        return run_expectation_step(df_gammas, self.model, self.spark)
Example 7
def test_iterate(spark, sqlite_con_1, params_1, gamma_settings_1):

    original_params = copy.deepcopy(params_1.params)
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    rules = [
        "l.mob = r.mob",
        "l.surname = r.surname",
    ]

    gamma_settings_1["blocking_rules"] = rules

    df_comparison = block_using_rules(gamma_settings_1, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, gamma_settings_1, spark)

    gamma_settings_1["max_iterations"] = 1
    df_e = iterate(df_gammas, params_1, gamma_settings_1, spark)

    assert params_1.params["λ"] == pytest.approx(0.540922141)

    assert params_1.params["π"]["gamma_mob"]["prob_dist_match"]["level_0"][
        "probability"] == pytest.approx(0.087438272, abs=0.0001)
    assert params_1.params["π"]["gamma_surname"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.160167628, abs=0.0001)

    first_it_params = copy.deepcopy(params_1.params)

    df_e_pd = df_e.toPandas()
    df_e_pd = df_e_pd.sort_values(["unique_id_l", "unique_id_r"])

    correct_list = [
        0.658602114,
        0.796821727,
        0.796821727,
        0.189486495,
        0.189486495,
        0.658602114,
        0.495063367,
        0.495063367,
    ]
    result_list = list(df_e_pd["match_probability"].astype(float))

    for result, correct in zip(result_list, correct_list):
        assert result == pytest.approx(correct, abs=0.0001)

    # Does it still work with another iteration?
    gamma_settings_1["max_iterations"] = 1
    df_e = iterate(df_gammas, params_1, gamma_settings_1, spark)
    assert params_1.params["λ"] == pytest.approx(0.534993426, abs=0.0001)

    assert params_1.params["π"]["gamma_mob"]["prob_dist_match"]["level_0"][
        "probability"] == pytest.approx(0.088546179, abs=0.0001)
    assert params_1.params["π"]["gamma_surname"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.109234086, abs=0.0001)

    ## Test whether the params object is correctly storing the iteration history

    assert params_1.param_history[0] == original_params
    assert params_1.param_history[1] == first_it_params

    ## Now test whether the params dict is correctly converted to a flat list
    ## of records by _convert_params_dict_to_dataframe

    data = params_1._convert_params_dict_to_dataframe(original_params)
    val1 = {
        "gamma": "gamma_mob",
        "match": 0,
        "value_of_gamma": "level_0",
        "probability": 0.8,
        "value": 0,
        "column": "mob",
    }
    val2 = {
        "gamma": "gamma_surname",
        "match": 1,
        "value_of_gamma": "level_1",
        "probability": 0.2,
        "value": 1,
        "column": "surname",
    }

    assert val1 in data
    assert val2 in data

    correct_list = [
        {"iteration": 0, "λ": 0.4},
        {"iteration": 1, "λ": 0.540922141},
    ]

    result_list = params_1._iteration_history_df_lambdas()

    for result, correct in zip(result_list, correct_list):
        assert result["iteration"] == correct["iteration"]
        assert result["λ"] == pytest.approx(correct["λ"])

    result_list = params_1._iteration_history_df_gammas()

    val1 = {
        "iteration": 0,
        "gamma": "gamma_mob",
        "match": 0,
        "value_of_gamma": "level_0",
        "probability": 0.8,
        "value": 0,
        "column": "mob",
    }
    assert val1 in result_list

    val2 = {
        "iteration": 1,
        "gamma": "gamma_surname",
        "match": 0,
        "value_of_gamma": "level_1",
        "probability": 0.160167628,
        "value": 1,
        "column": "surname",
    }

    for r in result_list:
        if (r["iteration"] == 1 and r["gamma"] == "gamma_surname"
                and r["match"] == 0 and r["value"] == 1):
            record = r

    for k, v in record.items():
        expected_value = val2[k]
        if k == "probability":
            assert v == pytest.approx(expected_value, abs=0.0001)
        else:
            assert v == expected_value

    # Test whether saving and loading parameters works
    import tempfile

    tmp_dir = tempfile.TemporaryDirectory()
    fname = os.path.join(tmp_dir.name, "params.json")

    params_1.save_params_to_json_file(fname)

    from splink.params import load_params_from_json

    p = load_params_from_json(fname)
    assert p.params["λ"] == pytest.approx(params_1.params["λ"])

def test_add_gammas(spark):

    rows = [
        {
            "unique_id_l": 1,
            "unique_id_r": 2,
            "fname_l": "robin",
            "fname_r": "robin",
            "sname_l": "linacre",
            "sname_r": "linacre",
        },
        {
            "unique_id_l": 3,
            "unique_id_r": 4,
            "fname_l": "robin",
            "fname_r": "robin",
            "sname_l": "linacrr",
            "sname_r": "linacre",
        },
        {
            "unique_id_l": 5,
            "unique_id_r": 6,
            "fname_l": None,
            "fname_r": None,
            "sname_l": None,
            "sname_r": "linacre",
        },
        {
            "unique_id_l": 7,
            "unique_id_r": 8,
            "fname_l": "robin",
            "fname_r": "julian",
            "sname_l": "linacre",
            "sname_r": "smith",
        },
    ]

    df = spark.createDataFrame(Row(**x) for x in rows)

    gamma_settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {"col_name": "fname", "num_levels": 2},
            {
                "col_name": "sname",
                "num_levels": 3,
                "case_expression": """
                    case
                    when sname_l is null or sname_r is null then -1
                    when sname_l = sname_r then 2
                    when substr(sname_l, 1, 3) = substr(sname_r, 1, 3) then 1
                    else 0
                    end
                    as gamma_sname
                    """,
            },
        ],
        "blocking_rules": [],
        "retain_matching_columns": False,
    }

    df_gammas = add_gammas(df, gamma_settings, spark)

    correct_answer = [
        {"unique_id_l": 1, "unique_id_r": 2, "gamma_fname": 1, "gamma_sname": 2},
        {"unique_id_l": 3, "unique_id_r": 4, "gamma_fname": 1, "gamma_sname": 1},
        {"unique_id_l": 5, "unique_id_r": 6, "gamma_fname": -1, "gamma_sname": -1},
        {"unique_id_l": 7, "unique_id_r": 8, "gamma_fname": 0, "gamma_sname": 0},
    ]

    pd_correct = pd.DataFrame(correct_answer)
    pd_correct = pd_correct.sort_values(["unique_id_l", "unique_id_r"])
    pd_correct = pd_correct.astype(int)
    pd_result = df_gammas.toPandas()
    pd_result = pd_result.sort_values(["unique_id_l", "unique_id_r"])
    pd_result = pd_result.astype(int)

    assert_frame_equal(pd_correct, pd_result)

    gamma_settings["retain_matching_columns"] = True
    df_gammas = add_gammas(df, gamma_settings, spark)

    result = df_gammas.toPandas()
    col_names = list(result.columns)
    correct_col_names = [
        "unique_id_l",
        "unique_id_r",
        "fname_l",
        "fname_r",
        "gamma_fname",
        "sname_l",
        "sname_r",
        "gamma_sname",
    ]
    assert col_names == correct_col_names

    # With a source dataset column
    gamma_settings["source_dataset_column_name"] = "source_ds"
    df = df.withColumn("source_ds_l", lit("ds"))
    df = df.withColumn("source_ds_r", lit("ds"))

    df_gammas = add_gammas(df, gamma_settings, spark)

    result = df_gammas.toPandas()
    col_names = list(result.columns)
    correct_col_names = [
        "source_ds_l",
        "unique_id_l",
        "source_ds_r",
        "unique_id_r",
        "fname_l",
        "fname_r",
        "gamma_fname",
        "sname_l",
        "sname_r",
        "gamma_sname",
    ]

    assert col_names == correct_col_names
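
The expected gamma values in this test follow the same pattern as the sname case expression: -1 when either side is null, the highest level for an exact match, and 0 otherwise. An illustrative Python stand-in for the default 2-level comparison used for fname (splink itself generates a SQL case expression, not this function):

# Illustrative re-implementation of a default 2-level comparison, matching the
# gamma_fname values asserted above (not the SQL that splink actually builds).
def gamma_fname(fname_l, fname_r):
    if fname_l is None or fname_r is None:
        return -1
    return 1 if fname_l == fname_r else 0

assert gamma_fname("robin", "robin") == 1
assert gamma_fname(None, None) == -1
assert gamma_fname("robin", "julian") == 0
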
def test_expectation_and_maximisation(spark):
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.1, 0.9],
                "u_probabilities": [0.8, 0.2],
            },
            {
                "custom_name": "surname",
                "custom_columns_used": ["surname"],
                "num_levels": 3,
                "case_expression": """
                    case
                    when surname_l is null or surname_r is null then -1
                    when surname_l = surname_r then 2
                    when substr(surname_l, 1, 3) = substr(surname_r, 1, 3) then 1
                    else 0
                    end
                    as gamma_surname
                    """,
                "m_probabilities": [0.1, 0.2, 0.7],
                "u_probabilities": [0.5, 0.25, 0.25],
            },
        ],
        "blocking_rules": [
            "l.mob = r.mob",
            "l.surname = r.surname",
        ],
        "retain_intermediate_calculation_columns": True,
    }

    rows = [
        {"unique_id": 1, "mob": 10, "surname": "Linacre"},
        {"unique_id": 2, "mob": 10, "surname": "Linacre"},
        {"unique_id": 3, "mob": 10, "surname": "Linacer"},
        {"unique_id": 4, "mob": 7, "surname": "Smith"},
        {"unique_id": 5, "mob": 8, "surname": "Smith"},
        {"unique_id": 6, "mob": 8, "surname": "Smith"},
        {"unique_id": 7, "mob": 8, "surname": "Jones"},
    ]

    df_input = spark.createDataFrame(Row(**x) for x in rows)
    df_input.persist()
    params = Model(settings, spark)

    df_comparison = block_using_rules(
        params.current_settings_obj.settings_dict, df_input, spark)
    df_gammas = add_gammas(df_comparison,
                           params.current_settings_obj.settings_dict, spark)
    df_gammas.persist()
    df_e = run_expectation_step(df_gammas, params, spark)
    df_e = df_e.sort("unique_id_l", "unique_id_r")

    df_e.persist()

    ################################################
    # Test probabilities correctly assigned
    ################################################

    df = df_e.toPandas()
    cols_to_keep = [
        "prob_gamma_mob_match",
        "prob_gamma_mob_non_match",
        "prob_gamma_surname_match",
        "prob_gamma_surname_non_match",
    ]
    pd_df_result = df[cols_to_keep][:4]

    df_correct = [
        {
            "prob_gamma_mob_match": 0.9,
            "prob_gamma_mob_non_match": 0.2,
            "prob_gamma_surname_match": 0.7,
            "prob_gamma_surname_non_match": 0.25,
        },
        {
            "prob_gamma_mob_match": 0.9,
            "prob_gamma_mob_non_match": 0.2,
            "prob_gamma_surname_match": 0.2,
            "prob_gamma_surname_non_match": 0.25,
        },
        {
            "prob_gamma_mob_match": 0.9,
            "prob_gamma_mob_non_match": 0.2,
            "prob_gamma_surname_match": 0.2,
            "prob_gamma_surname_non_match": 0.25,
        },
        {
            "prob_gamma_mob_match": 0.1,
            "prob_gamma_mob_non_match": 0.8,
            "prob_gamma_surname_match": 0.7,
            "prob_gamma_surname_non_match": 0.25,
        },
    ]

    pd_df_correct = pd.DataFrame(df_correct)

    assert_frame_equal(pd_df_correct, pd_df_result)

    ################################################
    # Test match probabilities correctly calculated
    ################################################

    result_list = list(df["match_probability"])
    # See https://github.com/moj-analytical-services/splink/blob/master/tests/expectation_maximisation_test_answers.xlsx
    # for derivation of these numbers
    correct_list = [
        0.893617021,
        0.705882353,
        0.705882353,
        0.189189189,
        0.189189189,
        0.893617021,
        0.375,
        0.375,
    ]
    assert result_list == pytest.approx(correct_list)
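
The first of these values can be reproduced by hand from the settings above: records 1 and 2 agree on both mob (gamma_mob = 1) and surname (gamma_surname = 2), so the Fellegi-Sunter ratio with λ = 0.4 gives:

# Hand check of the first expected match probability (pair 1-2 agrees on both
# columns, so the top m/u levels are used for mob and surname).
lam = 0.4
numerator = lam * 0.9 * 0.7                        # λ · m_mob(1) · m_surname(2)
denominator = numerator + (1 - lam) * 0.2 * 0.25   # + (1 − λ) · u_mob(1) · u_surname(2)
print(numerator / denominator)                     # 0.8936170212..., as asserted above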

    ################################################
    # Test new probabilities correctly calculated
    ################################################

    run_maximisation_step(df_e, params, spark)

    new_lambda = params.current_settings_obj["proportion_of_matches"]

    # See https://github.com/moj-analytical-services/splink/blob/master/tests/expectation_maximisation_test_answers.xlsx
    # for derivation of these numbers
    assert new_lambda == pytest.approx(0.540922141)
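
This re-estimated λ equals the mean of the eight match probabilities produced by the expectation step, which can also be checked by hand:

# Hand check: the maximisation step's new λ is the mean of the expected
# match probabilities from the previous expectation step.
probs = [
    0.893617021, 0.705882353, 0.705882353, 0.189189189,
    0.189189189, 0.893617021, 0.375, 0.375,
]
print(sum(probs) / len(probs))  # 0.5409221407..., matching the assertion above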

    rows = [
        ["mob", 0, 0.087438272, 0.441543191],
        ["mob", 1, 0.912561728, 0.558456809],
        ["surname", 0, 0.173315146, 0.340356209],
        ["surname", 1, 0.326240275, 0.160167628],
        ["surname", 2, 0.500444578, 0.499476163],
    ]

    settings_obj = params.current_settings_obj

    for r in rows:
        cc = settings_obj.get_comparison_column(r[0])
        level_dict = cc.level_as_dict(r[1])
        assert level_dict["m_probability"] == pytest.approx(r[2])
        assert level_dict["u_probability"] == pytest.approx(r[3])

    ################################################
    # Test revised probabilities correctly used
    ################################################

    df_e = run_expectation_step(df_gammas, params, spark)
    df_e = df_e.sort("unique_id_l", "unique_id_r")
    result_list = list(df_e.toPandas()["match_probability"])

    correct_list = [
        0.658602114,
        0.796821727,
        0.796821727,
        0.189486495,
        0.189486495,
        0.658602114,
        0.495063367,
        0.495063367,
    ]
    assert result_list == pytest.approx(correct_list)

    run_maximisation_step(df_e, params, spark)
    new_lambda = params.current_settings_obj["proportion_of_matches"]
    assert new_lambda == pytest.approx(0.534993426)

    rows = [
        ["mob", 0, 0.088546179, 0.435753788],
        ["mob", 1, 0.911453821, 0.564246212],
        ["surname", 0, 0.231340865, 0.27146747],
        ["surname", 1, 0.372351177, 0.109234086],
        ["surname", 2, 0.396307958, 0.619298443],
    ]

    settings_obj = params.current_settings_obj

    for r in rows:
        cc = settings_obj.get_comparison_column(r[0])
        level_dict = cc.level_as_dict(r[1])
        assert level_dict["m_probability"] == pytest.approx(r[2])
        assert level_dict["u_probability"] == pytest.approx(r[3])

    ################################################
    # Test whether saving and loading params works
    # (If we load params, does the expectation step yield same answer)
    ################################################
    import tempfile

    tmp_dir = tempfile.TemporaryDirectory()
    fname = os.path.join(tmp_dir.name, "params.json")

    df_e = run_expectation_step(df_gammas, params, spark)
    params.save_model_to_json_file(fname)

    from splink.model import load_model_from_json

    p = load_model_from_json(fname)

    df_e_2 = run_expectation_step(df_gammas, p, spark)

    assert_frame_equal(df_e.toPandas(), df_e_2.toPandas())