Example #1
def test_vertically_concat(spark):
    data_list_1 = [{"a": 1, "b": 1}, {"a": 2, "b": 2}]
    data_list_2 = [{"a": 3, "b": 3}, {"a": 4, "b": 4}]
    data_list_3 = [{"b": 1, "a": 10}, {"b": 2, "a": 20}]
    data_list_4 = [{"b": 1, "a": 10, "c": 1}, {"b": 2, "a": 20, "c": 2}]
    data_list_5 = [{"a": "a", "b": 1}, {"a": "b", "b": 2}]

    df1 = spark.createDataFrame(Row(**x) for x in data_list_1)
    df2 = spark.createDataFrame(Row(**x) for x in data_list_2)
    df3 = spark.createDataFrame(Row(**x) for x in data_list_3)
    df4 = spark.createDataFrame(Row(**x) for x in data_list_4)
    df5 = spark.createDataFrame(Row(**x) for x in data_list_5)

    df = vertically_concatenate_datasets([df1, df2, df3, df1])

    dfpd = df.toPandas()
    # Rows come back in concatenation order: df1, df2, df3, df1

    assert list(dfpd["a"]) == [1, 2, 3, 4, 10, 20, 1, 2]
    assert list(dfpd["b"]) == [1, 2, 3, 4, 1, 2, 1, 2]

    # Raises because the schemas differ: df4 has an extra column "c"
    with pytest.raises(ValueError):
        vertically_concatenate_datasets([df4, df2]).show()

    # Raises because column "a" has mismatched types (string in df5, integer in df1)
    with pytest.raises(ValueError):
        vertically_concatenate_datasets([df1, df5]).show()
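
For reference, the imports this test appears to rely on; the splink module path below is an assumption inferred from the function name and may differ between versions:

import pytest
from pyspark.sql import Row

# assumed location of the helper under test; check your splink installation
from splink.vertically_concat import vertically_concatenate_datasets
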
Example #2
def test_link_dedupe(spark, link_dedupe_data, link_dedupe_data_repeat_ids):

    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": ["l.first_name = r.first_name", "l.surname = r.surname"],
    }
    settings = complete_settings_dict(settings, spark=spark)
    df_l = link_dedupe_data["df_l"]
    df_r = link_dedupe_data["df_r"]
    df = vertically_concatenate_datasets([df_l, df_r])
    df_comparison = block_using_rules(settings, df, spark)
    df = df_comparison.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 2, 2, 7, 8]
    assert list(df["unique_id_r"]) == [7, 9, 8, 9, 9, 9]

    df_l = link_dedupe_data_repeat_ids["df_l"]
    df_r = link_dedupe_data_repeat_ids["df_r"]
    df = vertically_concatenate_datasets([df_l, df_r])
    df = block_using_rules(settings, df, spark)
    df = df.toPandas()
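    # Build composite ids like "1l" / "2r": the numeric unique_id plus the first
    # letter of the source_dataset value ("l" for the left input, "r" for the right)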
    df["u_l"] = df["unique_id_l"].astype(str) + df["source_dataset_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(str) + df["source_dataset_r"].str.slice(0, 1)

    df = df.sort_values(
        ["source_dataset_l", "source_dataset_r", "unique_id_l", "unique_id_r"]
    )

    assert list(df["u_l"]) == ["2l", "1l", "1l", "2l", "2l", "3l", "3l", "1r", "2r"]
    assert list(df["u_r"]) == ["3l", "1r", "3r", "2r", "3r", "2r", "3r", "3r", "3r"]

    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": [],
    }
    settings = complete_settings_dict(settings, spark=spark)

    df_l = link_dedupe_data_repeat_ids["df_l"]
    df_r = link_dedupe_data_repeat_ids["df_r"]
    df = vertically_concatenate_datasets([df_l, df_r])
    df = block_using_rules(settings, df, spark)
    df = df.toPandas()

    df["u_l"] = df["unique_id_l"].astype(str) + df["source_dataset_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(str) + df["source_dataset_r"].str.slice(0, 1)
    df = df.sort_values(
        ["source_dataset_l", "unique_id_l", "source_dataset_r", "unique_id_r"]
    )
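    # With no blocking rules, link_and_dedupe generates every candidate pair:
    # within the left input, within the right input, and across the two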
    # fmt: off
    assert list(df["u_l"]) == ["1l", "1l", "1l", "1l", "1l", "2l", "2l", "2l", "2l", "3l", "3l", "3l", "1r", "1r", "2r"]
    assert list(df["u_r"]) == ["2l", "3l", "1r", "2r", "3r", "3l", "1r", "2r", "3r", "1r", "2r", "3r", "2r", "3r", "3r"]
Example #3
def test_link_only(spark, link_dedupe_data, link_dedupe_data_repeat_ids):

    settings = {
        "link_type": "link_only",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": ["l.first_name = r.first_name", "l.surname = r.surname"],
    }
    settings = complete_settings_dict(settings, spark)
    df_l = link_dedupe_data["df_l"]
    df_r = link_dedupe_data["df_r"]
    df = vertically_concatenate_datasets([df_l, df_r])
    df_comparison = block_using_rules(settings, df, spark)
    df = df_comparison.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 2, 2]
    assert list(df["unique_id_r"]) == [7, 9, 8, 9]

    df_l = link_dedupe_data_repeat_ids["df_l"]
    df_r = link_dedupe_data_repeat_ids["df_r"]
    df = vertically_concatenate_datasets([df_l, df_r])
    df_comparison = block_using_rules(settings, df, spark)
    df = df_comparison.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 2, 2, 3, 3]
    assert list(df["unique_id_r"]) == [1, 3, 2, 3, 2, 3]

    settings = {
        "link_type": "link_only",
        "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
        "blocking_rules": [],
    }
    settings = complete_settings_dict(settings, spark)
    df = vertically_concatenate_datasets([df_l, df_r])
    df_comparison = block_using_rules(settings, df, spark)
    df = df_comparison.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])
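    # With no blocking rules, link_only compares every left record with every
    # right record (3 x 3 = 9 candidate pairs)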

    assert list(df["unique_id_l"]) == [1, 1, 1, 2, 2, 2, 3, 3, 3]
    assert list(df["unique_id_r"]) == [1, 2, 3, 1, 2, 3, 1, 2, 3]
Example #4
    def __init__(
        self,
        settings: dict,
        df_or_dfs: Union[DataFrame, List[DataFrame]],
        spark: SparkSession,
        save_state_fn: Callable = None,
        break_lineage_blocked_comparisons: Callable = default_break_lineage_blocked_comparisons,
        break_lineage_scored_comparisons: Callable = default_break_lineage_scored_comparisons,
    ):
        """Splink data linker

        Provides easy access to the core user-facing functionality of splink

        Args:
            settings (dict): splink settings dictionary
            df_or_dfs (Union[DataFrame, List[DataFrame]]): Either a single Spark dataframe to dedupe, or a list of Spark dataframes to link and/or dedupe. Where `link_type` is `dedupe_only`, this should be a single dataframe to dedupe. Where `link_type` is `link_only` or `link_and_dedupe`, it should be a list of dataframes. Requires conformant dataframes (i.e. they must have the same columns)
            spark (SparkSession): SparkSession object
            save_state_fn (function, optional): A function provided by the user that takes one argument, model (i.e. a Model from splink.model), and is executed each iteration. This is a hook that allows the user to save the state between iterations, which is mostly useful for very large jobs that may need to be restarted from where they left off if they fail.
            break_lineage_blocked_comparisons (function, optional): Large jobs will likely run into memory errors unless the lineage is broken after blocking.  This is a user-provided function that takes one argument - df - and allows the user to break lineage.  For example, the function might save df to the AWS s3 file system, and then reload it from the saved files.
            break_lineage_scored_comparisons (function, optional): Large jobs will likely run into memory errors unless the lineage is broken after comparisons are scored and before term frequency adjustments.  This is a user-provided function that takes one argument - df - and allows the user to break lineage.  For example, the function might save df to the AWS s3 file system, and then reload it from the saved files.
        """

        self.spark = spark
        self.break_lineage_blocked_comparisons = break_lineage_blocked_comparisons
        self.break_lineage_scored_comparisons = break_lineage_scored_comparisons
        _check_jaro_registered(spark)

        validate_settings_against_schema(settings)
        validate_link_type(df_or_dfs, settings)

        self.model = Model(settings, spark)
        self.settings_dict = self.model.current_settings_obj.settings_dict
        self.settings_dict = normalise_probabilities(self.settings_dict)
        validate_probabilities(self.settings_dict)
        # dfs is a list of dfs irrespective of whether input was a df or list of dfs
        if isinstance(df_or_dfs, DataFrame):
            dfs = [df_or_dfs]
        else:
            dfs = df_or_dfs

        self.df = vertically_concatenate_datasets(dfs)
        validate_input_datasets(self.df, self.model.current_settings_obj)
        self.save_state_fn = save_state_fn
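
A minimal usage sketch for the constructor above, assuming it belongs to splink's Splink class and that get_scored_comparisons() is the scoring entry point (both are assumptions; check the splink docs for your version):

from splink import Splink  # assumed import path

settings = {
    "link_type": "dedupe_only",
    "comparison_columns": [{"col_name": "first_name"}, {"col_name": "surname"}],
    "blocking_rules": ["l.first_name = r.first_name"],
}

linker = Splink(settings, df, spark)    # df: a single Spark DataFrame to dedupe
df_e = linker.get_scored_comparisons()  # assumed method returning scored pairs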