def test_main_api(spark, sqlite_con_1):
    """Smoke test of the end-to-end API: score, save the model, reload, rescore."""
    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 2,
    }
    settings = complete_settings_dict(settings, spark=None)

    # Load the fixture table out of sqlite and convert it to a Spark dataframe.
    pandas_df = pd.read_sql("select * from test1", sqlite_con_1)
    spark_df = spark.createDataFrame(pandas_df)

    linker = Splink(settings, spark, df=spark_df)
    scored = linker.get_scored_comparisons()

    # Round-trip the trained model through JSON and rescore with the reload.
    linker.save_model_as_json("saved_model.json", overwrite=True)
    reloaded = load_from_json("saved_model.json", spark=spark, df=spark_df)
    scored = reloaded.get_scored_comparisons()

    from splink.intuition import intuition_report

    # Spot-check the intuition report on one randomly sampled comparison row.
    sampled_row = scored.toPandas().sample(1).to_dict(orient="records")[0]
    print(intuition_report(sampled_row, linker.params))
    linker.params._print_m_u_probs()
def test_tiny_numbers(spark):
    """Extremely small m probabilities must not underflow match probability to 0.

    Regression test, see
    https://github.com/moj-analytical-services/splink/issues/48
    """
    people = [
        {"unique_id": 1, "mob": 10, "surname": "Linacre"},
        {"unique_id": 2, "mob": 10, "surname": "Linacre"},
        {"unique_id": 3, "mob": 10, "surname": "Linacer"},
        {"unique_id": 4, "mob": 7, "surname": "Smith"},
        {"unique_id": 5, "mob": 8, "surname": "Smith"},
        {"unique_id": 6, "mob": 8, "surname": "Smith"},
        {"unique_id": 7, "mob": 8, "surname": "Jones"},
    ]
    df = spark.createDataFrame(Row(**p) for p in people)

    # Deliberately tiny m probability for the matching level of "mob".
    tiny = 5.9380419956766985e-25
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [tiny, 1 - tiny],
                "u_probabilities": [0.8, 0.2],
            },
            {
                "col_name": "surname",
                "num_levels": 2,
            },
        ],
        "blocking_rules": [
            "l.mob = r.mob",
            "l.surname = r.surname",
        ],
    }

    linker = Splink(settings, df, spark)
    scored = linker.manually_apply_fellegi_sunter_weights().toPandas()

    lowest = scored["match_probability"].min()
    # The smallest probability should be vanishingly small, but never exactly 0.
    assert lowest > 0.0
    assert lowest < 1e-20
def test_freq_adj_divzero(spark, nulls_df):
    """Term-frequency adjustments on an all-null column must not divide by zero."""
    # Settings request term_freq_adjustments on column 'always_none'.
    settings = {
        "link_type": "dedupe_only",
        "blocking_rules": [
            "l.surname = r.surname",
        ],
        "comparison_columns": [
            {"col_name": "firstname", "num_levels": 3},
            {
                "col_name": "surname",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
            {
                "col_name": "always_none",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
        ],
        "additional_columns_to_retain": ["unique_id"],
        "max_iterations": 1,
    }

    # Create the column in a way that could trigger a div by zero on the
    # average adjustment calculation before the fix: every value is null.
    nulls_df = nulls_df.withColumn("always_none", f.lit(None))

    survived = True
    try:
        linker = Splink(settings, nulls_df, spark)
        linker.get_scored_comparisons()
    except ZeroDivisionError:
        survived = False
    assert survived is True
def test_freq_adj_divzero(spark, sparkdf):
    """Term-frequency adjustments on an all-null column must not divide by zero.

    Regression test: requesting term_frequency_adjustments on a column whose
    values are entirely null used to raise ZeroDivisionError in the average
    adjustment calculation.

    NOTE(review): a test with this exact name is defined earlier in this
    source; if both live in the same module, pytest collects only the last
    definition and silently skips the other — consider renaming one of them.
    NOTE(review): unlike the sibling test, this one only constructs the
    linker and never calls get_scored_comparisons() — confirm construction
    alone exercised the bug.
    """
    settings = {
        "link_type": "dedupe_only",
        "blocking_rules": [
            "l.surname = r.surname",
        ],
        "comparison_columns": [
            {"col_name": "firstname", "num_levels": 3},
            {
                "col_name": "surname",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
            {
                "col_name": "weird",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
        ],
        "additional_columns_to_retain": ["unique_id"],
        "em_convergence": 0.01,
    }

    sparkdf = sparkdf.withColumn("unique_id", f.monotonically_increasing_id())
    # Create column "weird" in a way that could trigger a div by zero on the
    # average adjustment calculation before the fix: every value is null.
    sparkdf = sparkdf.withColumn("weird", f.lit(None))

    raised = False
    try:
        Splink(settings, spark, df=sparkdf)
    except ZeroDivisionError:
        raised = True
    # Fixed PEP 8 E712 violation: was `assert (notpassing == False)`.
    assert not raised
def test_fix_u(spark):
    """fix_u_probabilities / fix_m_probabilities should pin parameters during EM.

    We expect u on the cartesian product of MoB to be around 1/12.
    """
    rows = [
        {"unique_id": 1, "mob": "1", "first_name": "a", "surname": "a"},
        {"unique_id": 2, "mob": "2", "first_name": "b", "surname": "b"},
        {"unique_id": 3, "mob": "3", "first_name": "c", "surname": "c"},
        {"unique_id": 4, "mob": "4", "first_name": "d", "surname": "d"},
        {"unique_id": 5, "mob": "5", "first_name": "e", "surname": "e"},
        {"unique_id": 6, "mob": "6", "first_name": "f", "surname": "f"},
        {"unique_id": 7, "mob": "7", "first_name": "g", "surname": "g"},
        {"unique_id": 9, "mob": "9", "first_name": "h", "surname": "h"},
        # NOTE(review): this record appears twice with the same unique_id (and
        # id 8 is skipped) — presumably deliberate test data; worth confirming.
        {"unique_id": 10, "mob": "10", "first_name": "i", "surname": "i"},
        {"unique_id": 10, "mob": "10", "first_name": "i", "surname": "i"},
    ]
    df = spark.createDataFrame(Row(**r) for r in rows)

    # Case 1: u fixed on mob, u free on first_name.
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": True,
            },
            {"col_name": "first_name", "u_probabilities": [0.8, 0.2]},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }
    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    # The fixed "u_probabilities" in the latest parameters are still 0.8, 0.2...
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] == pytest.approx(0.8)
    assert mob["u_probabilities"][1] == pytest.approx(0.2)

    # ...while the unfixed first_name u values have been updated by EM.
    first_name = linker.model.current_settings_obj.get_comparison_column(
        "first_name")
    assert first_name["u_probabilities"][0] != 0.8
    assert first_name["u_probabilities"][1] != 0.2

    # Case 2: fix_u_probabilities False — the starting u values should move.
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": False,
            },
            {"col_name": "first_name"},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }
    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    # The "u_probabilities" in the latest parameters are no longer 0.8, 0.2.
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] != 0.8
    assert mob["u_probabilities"][0] != 0.2

    # Case 3: m fixed but u free on the same column.
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.04, 0.96],
                "fix_m_probabilities": True,
                "u_probabilities": [0.75, 0.25],
                "fix_u_probabilities": False,
            },
            {"col_name": "first_name"},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }
    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    # u was free, so it moved away from its starting values...
    assert mob["u_probabilities"][0] != 0.75
    assert mob["u_probabilities"][1] != 0.25
    # ...while the fixed m values are untouched.
    assert mob["m_probabilities"][0] == pytest.approx(0.04)
    assert mob["m_probabilities"][1] == pytest.approx(0.96)
# NOTE(review): script-style fragment. `it_num`, OUTPUT_PATH,
# write_local_file_to_s3, persist_params_settings, settings, spark and df are
# defined elsewhere (possibly the first two statements below are the tail of
# persist_params_settings, whose def is not visible here — confirm).
path = os.path.join(OUTPUT_PATH, f"params/saved_params_iteration_{it_num}.json")
write_local_file_to_s3("saved_params.json", path, overwrite=True)


# Lineage breaking functions: write intermediate results to parquet and read
# them straight back, truncating Spark's logical plan between EM stages.
def blocked_comparisons_to_s3(df, spark):
    """Round-trip the blocked comparisons (gammas) through parquet on S3."""
    df = df.repartition(50)
    path = os.path.join(OUTPUT_PATH, "data/df_gammas/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new


def scored_comparisons_to_s3(df, spark):
    """Round-trip the scored comparisons through parquet on S3."""
    path = os.path.join(OUTPUT_PATH, "data/df_e/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new


from splink import Splink

linker = Splink(
    settings,
    spark,
    df=df,
    save_state_fn=persist_params_settings,
    break_lineage_blocked_comparisons=blocked_comparisons_to_s3,
    break_lineage_scored_comparisons=scored_comparisons_to_s3
)
df_e = linker.get_scored_comparisons()
# Persist the final parameters and the scored comparisons.
write_local_file_to_s3("saved_params.json", path, overwrite=True)
df_e.write.mode("overwrite").parquet(path)
# NOTE(review): script-style fragment — get_spark, bn (a pandas dataframe of
# names, presumably) and conn are defined elsewhere in the surrounding script.
spark = get_spark()
bn = spark.createDataFrame(bn)

# Dedupe on company name, retaining filing_num for inspection; term-frequency
# adjustments are enabled on the name column.
settings = {
    "link_type": "dedupe_only",
    "additional_columns_to_retain": ["filing_num"],
    "comparison_columns": [
        {
            "col_name": "name",
            "term_frequency_adjustments": True
        },
    ],
    "blocking_rules": ["l.name = r.name"]
}

linker = Splink(settings, df_or_dfs=bn, spark=spark)
df_e = linker.get_scored_comparisons()
print(df_e.head(10))
print(df_e.count())
print(df_e.columns)

# The bare triple-quoted string below holds an unfinished address-dedupe
# experiment; it continues beyond this fragment and has no runtime effect.
""" ad = get_addresses(conn) ad['unique_id'] = (ad.address1+ad.filing_num).map(hash) ad = spark.createDataFrame(ad) settings = { "link_type": "dedupe_only", "additional_columns_to_retain":[ "filing_num"
def test_main_api(spark):
    """End-to-end smoke test: train, score, save, reload and report."""
    people = [
        {"unique_id": 1, "mob": 10, "surname": "Linacre"},
        {"unique_id": 2, "mob": 10, "surname": "Linacre"},
        {"unique_id": 3, "mob": 10, "surname": "Linacer"},
        {"unique_id": 4, "mob": 7, "surname": "Smith"},
        {"unique_id": 5, "mob": 8, "surname": "Smith"},
        {"unique_id": 6, "mob": 8, "surname": "Smith"},
        {"unique_id": 7, "mob": 8, "surname": "Jones"},
    ]
    df = spark.createDataFrame(Row(**p) for p in people)

    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    scored = linker.get_scored_comparisons()

    # Round-trip the model through JSON and make sure the reload still scores.
    linker.save_model_as_json("saved_model.json", overwrite=True)
    reloaded = load_from_json("saved_model.json", df, spark=spark)
    scored = reloaded.get_scored_comparisons()

    # Exercise the reporting helpers on one randomly sampled comparison row.
    sampled = scored.toPandas().sample(1).to_dict(orient="records")[0]
    intuition_report(sampled, linker.model)
    bayes_factor_chart(sampled, linker.model)
def test_nulls(spark):
    """Check manually applied Fellegi-Sunter scores when records contain nulls."""
    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "fname",
                "m_probabilities": [0.4, 0.6],
                "u_probabilities": [0.65, 0.35],
            },
            {
                "col_name": "sname",
                "m_probabilities": [0.25, 0.75],
                "u_probabilities": [0.7, 0.3],
            },
            {
                "col_name": "dob",
                "m_probabilities": [0.4, 0.6],
                "u_probabilities": [0.65, 0.35],
            },
        ],
        "blocking_rules": [],
    }

    # Progressively more null fields from record 1 (no nulls) to 4 (all null).
    people = [
        {"unique_id": 1, "fname": "Rob", "sname": "Jones", "dob": "1980-01-01"},
        {"unique_id": 2, "fname": "Rob", "sname": "Jones", "dob": None},
        {"unique_id": 3, "fname": "Rob", "sname": None, "dob": None},
        {"unique_id": 4, "fname": None, "sname": None, "dob": None},
    ]
    df = spark.createDataFrame(Row(**p) for p in people)

    linker = Splink(settings, df, spark)
    scored = linker.manually_apply_fellegi_sunter_weights().toPandas()

    # Four records under empty blocking rules give six pairwise comparisons.
    expected = [0.322580645, 0.16, 0.1, 0.16, 0.1, 0.1]
    assert list(scored["match_probability"]) == pytest.approx(expected)