def test_jaro_warning(spark):
    # The spark fixture registers the jaro_winkler_sim UDF, so the check passes
    assert _check_jaro_registered(spark)

    # Dropping the UDF should cause the check to fail and emit a UserWarning
    spark.sql("drop temporary function jaro_winkler_sim")
    with pytest.warns(UserWarning):
        assert not _check_jaro_registered(spark)
    from pyspark.sql.types import DoubleType

    # Re-register the Java UDF so subsequent tests still have it available
    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        DoubleType(),
    )
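
The test above assumes a spark fixture that has already registered the jaro_winkler_sim Java UDF. A minimal sketch of such a fixture is shown below; the fixture scope and the jar path are assumptions, not part of the original code.

import pytest
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType


@pytest.fixture(scope="session")
def spark():
    # Hypothetical setup: the Scala similarity-UDF jar must be on the classpath
    # (the jar path below is an assumption)
    session = (
        SparkSession.builder
        .config("spark.jars", "jars/scala-udf-similarity.jar")
        .getOrCreate()
    )
    # Register the Jaro-Winkler similarity UDF that splink's string comparisons rely on
    session.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        DoubleType(),
    )
    yield session
    session.stop()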
Example #2
    def __init__(
        self,
        settings: dict,
        df_or_dfs: Union[DataFrame, List[DataFrame]],
        spark: SparkSession,
        save_state_fn: Callable = None,
        break_lineage_blocked_comparisons:
        Callable = default_break_lineage_blocked_comparisons,
        break_lineage_scored_comparisons:
        Callable = default_break_lineage_scored_comparisons,
    ):
        """Splink data linker

        Provides easy access to the core user-facing functionality of splink

        Args:
            settings (dict): splink settings dictionary
            df_or_dfs (Union[DataFrame, List[DataFrame]]): Either a single Spark dataframe to dedupe, or a list of Spark dataframes to link and/or dedupe. Where `link_type` is `dedupe_only`, this should be a single dataframe to dedupe. Where `link_type` is `link_only` or `link_and_dedupe`, it should be a list of dfs.  Requires conformant dataframes (i.e. they must have the same columns)
            spark (SparkSession): SparkSession object
            save_state_fn (function, optional):  A function provided by the user that takes one argument, model (i.e. a Model from splink.model), and is executed each iteration.  This is a hook that allows the user to save the state between iterations, which is mostly useful for very large jobs which may need to be restarted from where they left off if they fail.
            break_lineage_blocked_comparisons (function, optional): Large jobs will likely run into memory errors unless the lineage is broken after blocking.  This is a user-provided function that takes one argument - df - and allows the user to break lineage.  For example, the function might save df to the AWS s3 file system, and then reload it from the saved files.
            break_lineage_scored_comparisons (function, optional): Large jobs will likely run into memory errors unless the lineage is broken after comparisons are scored and before term frequency adjustments.  This is a user-provided function that takes one argument - df - and allows the user to break lineage.  For example, the function might save df to the AWS s3 file system, and then reload it from the saved files.
        """

        self.spark = spark
        self.break_lineage_blocked_comparisons = break_lineage_blocked_comparisons
        self.break_lineage_scored_comparisons = break_lineage_scored_comparisons
        _check_jaro_registered(spark)

        validate_settings_against_schema(settings)
        validate_link_type(df_or_dfs, settings)

        self.model = Model(settings, spark)
        self.settings_dict = self.model.current_settings_obj.settings_dict
        self.settings_dict = normalise_probabilities(self.settings_dict)
        validate_probabilities(self.settings_dict)
        # dfs is a list of dfs irrespective of whether input was a df or list of dfs
        if isinstance(df_or_dfs, DataFrame):
            dfs = [df_or_dfs]
        else:
            dfs = df_or_dfs

        self.df = vertically_concatenate_datasets(dfs)
        validate_input_datasets(self.df, self.model.current_settings_obj)
        self.save_state_fn = save_state_fn
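
The break_lineage_blocked_comparisons and break_lineage_scored_comparisons hooks described above each take a single dataframe and return a dataframe whose lineage has been broken. A minimal sketch of such a function, assuming write access to a checkpoint path (the path is hypothetical):

from pyspark.sql import SparkSession


def break_lineage_via_parquet(df):
    # Write the intermediate result out and read it back, which truncates the
    # accumulated Spark query plan before the next stage of the job
    path = "/tmp/splink_checkpoint"  # hypothetical location; use e.g. an s3:// path on a cluster
    df.write.mode("overwrite").parquet(path)
    return SparkSession.builder.getOrCreate().read.parquet(path)

Such a function would be passed as break_lineage_blocked_comparisons=break_lineage_via_parquet (and/or the scored variant) when constructing the linker.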
Example #3
    def __init__(self,
                 settings: dict,
                 spark: SparkSession,
                 df_l: DataFrame = None,
                 df_r: DataFrame = None,
                 df: DataFrame = None,
                 save_state_fn: Callable = None,
                 break_lineage_blocked_comparisons:
                 Callable = default_break_lineage_blocked_comparisons,
                 break_lineage_scored_comparisons:
                 Callable = default_break_lineage_scored_comparisons):
        """splink data linker

        Provides easy access to the core user-facing functionality of splink

        Args:
            settings (dict): splink settings dictionary
            spark (SparkSession): SparkSession object
            df_l (DataFrame, optional): A dataframe to link/dedupe. Where `link_type` is `link_only` or `link_and_dedupe`, one of the two dataframes to link. Should be omitted where `link_type` is `dedupe_only`.
            df_r (DataFrame, optional): A dataframe to link/dedupe. Where `link_type` is `link_only` or `link_and_dedupe`, one of the two dataframes to link. Should be omitted where `link_type` is `dedupe_only`.
            df (DataFrame, optional): The dataframe to dedupe. Where `link_type` is `dedupe_only`, the dataframe to dedupe. Should be omitted where `link_type` is `link_only` or `link_and_dedupe`.
            save_state_fn (function, optional):  A function provided by the user that takes two arguments, params and settings, and is executed each iteration.  This is a hook that allows the user to save the state between iterations, which is mostly useful for very large jobs which may need to be restarted from where they left off if they fail.
            break_lineage_blocked_comparisons (function, optional): Large jobs will likely run into memory errors unless the lineage is broken after blocking.  This is a user-provided function that takes one argument - df - and allows the user to break lineage.  For example, the function might save df to the AWS s3 file system, and then reload it from the saved files.
            break_lineage_scored_comparisons (function, optional): Large jobs will likely run into memory errors unless the lineage is broken after comparisons are scored and before term frequency adjustments.  This is a user-provided function that takes one argument - df - and allows the user to break lineage.  For example, the function might save df to the AWS s3 file system, and then reload it from the saved files.
        """

        self.spark = spark
        self.break_lineage_blocked_comparisons = break_lineage_blocked_comparisons
        self.break_lineage_scored_comparisons = break_lineage_scored_comparisons
        _check_jaro_registered(spark)

        settings = complete_settings_dict(settings, spark)
        validate_settings(settings)
        self.settings = settings

        self.params = Params(settings, spark)

        self.df_r = df_r
        self.df_l = df_l
        self.df = df
        self.save_state_fn = save_state_fn
        self._check_args()
Example #4
    def __init__(
        self,
        settings: dict,
        spark: SparkSession,
        df_l: DataFrame = None,
        df_r: DataFrame = None,
        df: DataFrame = None,
        save_state_fn: Callable = None,
    ):
        """splink data linker

        Provides easy access to the core user-facing functionality of splink

        Args:
            settings (dict): splink settings dictionary
            spark (SparkSession): SparkSession object
            df_l (DataFrame, optional): A dataframe to link/dedupe. Where `link_type` is `link_only` or `link_and_dedupe`, one of the two dataframes to link. Should be omitted where `link_type` is `dedupe_only`.
            df_r (DataFrame, optional): A dataframe to link/dedupe. Where `link_type` is `link_only` or `link_and_dedupe`, one of the two dataframes to link. Should be omitted where `link_type` is `dedupe_only`.
            df (DataFrame, optional): The dataframe to dedupe. Where `link_type` is `dedupe_only`, the dataframe to dedupe. Should be omitted where `link_type` is `link_only` or `link_and_dedupe`.
            save_state_fn (function, optional):  A function provided by the user that takes two arguments, params and settings, and is executed each iteration.  This is a hook that allows the user to save the state between iterations, which is mostly useful for very large jobs which may need to be restarted from where they left off if they fail.

        """

        self.spark = spark
        _check_jaro_registered(spark)

        settings = complete_settings_dict(settings, spark)
        validate_settings(settings)
        self.settings = settings

        self.params = Params(settings, spark)

        self.df_r = df_r
        self.df_l = df_l
        self.df = df
        self.save_state_fn = save_state_fn
        self._check_args()
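
The save_state_fn hook in this variant receives params and settings on each iteration. A minimal sketch, assuming the settings dict is JSON-serialisable and the params object can be pickled (the output paths are hypothetical):

import json
import pickle


def save_state(params, settings):
    # Persist enough state to restart a long-running job from the last completed iteration
    with open("/tmp/splink_settings.json", "w") as f:
        json.dump(settings, f)
    with open("/tmp/splink_params.pkl", "wb") as f:
        pickle.dump(params, f)  # assumes the Params object is picklable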
Example #5
def test_case_statements(spark, sqlite_con_3):

    assert _check_jaro_registered(spark)

    spark.sql("drop temporary function jaro_winkler_sim")
    with pytest.warns(UserWarning):
        assert not _check_jaro_registered(spark)
    from pyspark.sql.types import DoubleType

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        DoubleType(),
    )

    assert _check_jaro_registered(spark)

    dfpd = pd.read_sql("select * from str_comp", sqlite_con_3)
    df = spark.createDataFrame(dfpd)
    df.createOrReplaceTempView("str_comp")

    case_statement = sql_gen_case_stmt_levenshtein_3("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 2
    assert df.loc[1, "gamma_str_col"] == 1
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_case_stmt_levenshtein_4("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 3
    assert df.loc[1, "gamma_str_col"] == 2
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_gammas_case_stmt_jaro_2("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 1
    assert df.loc[1, "gamma_str_col"] == 1
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_gammas_case_stmt_jaro_3("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 2
    assert df.loc[1, "gamma_str_col"] == 2
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_gammas_case_stmt_jaro_4("str_col",
                                                     "str_col",
                                                     threshold3=0.001)
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 3
    assert df.loc[1, "gamma_str_col"] == 3
    assert df.loc[2, "gamma_str_col"] == 1
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    data = [{
        "surname_l": "smith",
        "forename1_l": "john",
        "forename2_l": "david",
        "surname_r": "smith",
        "forename1_r": "john",
        "forename2_r": "david"
    }, {
        "surname_l": "smith",
        "forename1_l": "john",
        "forename2_l": "david",
        "surname_r": "smithe",
        "forename1_r": "john",
        "forename2_r": "david"
    }, {
        "surname_l": "smith",
        "forename1_l": "john",
        "forename2_l": "david",
        "surname_r": "john",
        "forename1_r": "smith",
        "forename2_r": "david"
    }, {
        "surname_l": "smith",
        "forename1_l": "john",
        "forename2_l": "david",
        "surname_r": "john",
        "forename1_r": "david",
        "forename2_r": "smithe"
    }, {
        "surname_l": "linacre",
        "forename1_l": "john",
        "forename2_l": "david",
        "surname_r": "linaker",
        "forename1_r": "john",
        "forename2_r": "david"
    }, {
        "surname_l": "smith",
        "forename1_l": "john",
        "forename2_l": "david",
        "surname_r": "john",
        "forename1_r": "david",
        "forename2_r": "smarty"
    }]
    dfpd = pd.DataFrame(data)
    df = spark.createDataFrame(dfpd)
    df.createOrReplaceTempView("df_names")

    sql = sql_gen_gammas_name_inversion_4("surname",
                                          ["forename1", "forename2"],
                                          "surname")

    df_results = spark.sql(f"select {sql} from df_names").toPandas()
    assert df_results.loc[0, "gamma_surname"] == 3
    assert df_results.loc[1, "gamma_surname"] == 3
    assert df_results.loc[2, "gamma_surname"] == 2
    assert df_results.loc[3, "gamma_surname"] == 2
    assert df_results.loc[4, "gamma_surname"] == 1
    assert df_results.loc[5, "gamma_surname"] == 0