Ejemplo n.º 1
0
    def test_alias_linked_column_values(self):
        '''
        Linked values read from the temp SQL table should come back
        aliased to the user-edited values held in the spec.

        The temp table stores the initial linkage (A - spam, B - eggs)
        while the spec's original_values for C1 list "repl_A" in place
        of "A"; the frame returned by alias_linked_column_values is
        expected to use "repl_A" wherever the input had "A".
        '''

        # __init__ is patched out so the generator can be built without a
        # real spec; the attributes the method reads are set by hand below.
        with patch(
                "exhibit.core.linkage.hierarchical._LinkedDataGenerator.__init__"
        ) as mock_init:
            mock_init.return_value = None
            test_LDG = tm._LinkedDataGenerator(Mock, Mock, Mock)

        test_dict = {
            "columns": {
                "C1": {
                    "anonymising_set":
                    "random",
                    "original_values":
                    pd.DataFrame(
                        data={"C1": ["repl_A", "B", MISSING_DATA_STR]}),
                    "paired_columns": []
                },
                "C2": {
                    "anonymising_set":
                    "random",
                    "original_values":
                    pd.DataFrame(
                        data={"C2": ["eggs", "spam", MISSING_DATA_STR]}),
                    "paired_columns": []
                },
            }
        }

        # A - spam, B - eggs is initial linkage that was put into SQLdb
        test_linked_df = pd.DataFrame(data={
            "C1": ["A", "A", "B", "B"],
            "C2": ["spam", "spam", "eggs", "eggs"]
        })

        # repl_A - spam, B - eggs is user-edited linkage that exists only
        # in spec
        expected_df = pd.DataFrame(
            data={
                "C1": ["repl_A", "repl_A", "B", "B"],
                "C2": ["spam", "spam", "eggs", "eggs"]
            })

        setattr(test_LDG, "spec_dict", test_dict)
        setattr(test_LDG, "table_name", "temp_1234_0")
        setattr(test_LDG, "id", "1234")
        setattr(test_LDG, "linked_group", (0, ["C1", "C2"]))
        setattr(test_LDG, "linked_cols", ["C1", "C2"])

        create_temp_table(table_name="temp_1234_0",
                          col_names=["C1", "C2"],
                          data=[("A", "spam"), ("B", "eggs")])

        # Drop the temp table even when the assertion fails, so a failing
        # run does not leave the table behind for later tests.
        try:
            assert_frame_equal(
                left=test_LDG.alias_linked_column_values(test_linked_df),
                right=expected_df)
        finally:
            db_util.drop_tables(["temp_1234_0"])
Ejemplo n.º 2
0
    def test_weights_for_linked_columns_with_mixed_inline_limits(self):
        '''
        Weights table for two linked columns whose values are stored
        differently: LinkCat1 carries its values (and NumC weights) inline
        in the spec, while LinkCat2 is marked ORIGINAL_VALUES_DB and its
        values live in a temporary SQL table.

        LinkCat1 weights are expected to match the inline NumC column;
        LinkCat2 weights are expected to be 0.2 for every value, missing
        data included.
        '''

        # The numeric suffix matches the id of the linked group declared
        # in the spec below (assumed to be how the table is located by
        # the code under test - confirm against generate_weights_table).
        linked_table_name = "temp_test_id_weights_1"

        data = [("A", "A1"), ("A", "A2"), ("B", "B1"), ("B", "B2"),
                ("B", "B3"), (MISSING_DATA_STR, MISSING_DATA_STR)]

        create_temp_table(table_name=linked_table_name,
                          col_names=["LinkCat1", "LinkCat2"],
                          data=data)

        # Register the table that was actually created; a single variable
        # keeps the created and the registered names from drifting apart.
        # NOTE(review): _temp_tables is presumably dropped in tearDown.
        self._temp_tables.append(linked_table_name)

        values = pd.DataFrame(data={
            "LinkCat1": ["A", "B", MISSING_DATA_STR],
            "NumC": [0.1, 0.9, 0.0]
        })

        test_dict = {
            "metadata": {
                "numerical_columns": ["NumC"],
                "inline_limit": 3,
                "id": "test_id_weights"
            },
            "columns": {
                "LinkCat1": {
                    "type": "categorical",
                    "original_values": values,
                    "uniques": 2,
                    "anonymising_set": "random"
                },
                "LinkCat2": {
                    "type": "categorical",
                    "original_values": ORIGINAL_VALUES_DB,
                    "uniques": 5,
                    "anonymising_set": "random"
                },
                "NumC": {
                    "type": "continuous",
                }
            },
            "linked_columns": [(1, ["LinkCat1", "LinkCat2"])]
        }

        test_cols = ["LinkCat1", "LinkCat2"]
        test_wt = tm.generate_weights_table(test_dict, test_cols)

        # Inline column: weights come straight from the NumC values above.
        self.assertEqual(
            test_wt[("NumC", "LinkCat1", MISSING_DATA_STR)]["weights"].weight,
            0.0)
        self.assertEqual(test_wt[("NumC", "LinkCat1", "B")]["weights"].weight,
                         0.9)

        # DB-backed column: the same weight of 0.2 is expected for missing
        # data and for a regular value.
        self.assertEqual(
            test_wt[("NumC", "LinkCat2", MISSING_DATA_STR)]["weights"].weight,
            0.2)
        self.assertEqual(test_wt[("NumC", "LinkCat2", "B1")]["weights"].weight,
                         0.2)
Ejemplo n.º 3
0
    def test_equal_weight_for_single_column_exceeding_inline_limit(self):
        '''
        Missing data is a special value that might or might not appear
        in the actually generated data, hence when calculating equal
        weights, we ignore it and only divide 1 by the total number
        of valid unique values in the column.

        With five valid values in CatC, both a regular value and the
        missing data placeholder are expected to get a weight of 0.2.
        '''

        temp_table = "temp_test_id_weights_CatC"
        category_values = ("A", "B", "C", "D", "E", MISSING_DATA_STR)

        # One single-column row per category value.
        create_temp_table(
            table_name=temp_table,
            col_names=["CatC"],
            data=[(value, ) for value in category_values],
        )

        self._temp_tables.append(temp_table)

        metadata = {
            "numerical_columns": ["NumC"],
            "inline_limit": 1,
            "id": "test_id_weights",
        }
        columns = {
            "CatC": {
                "type": "categorical",
                "original_values": ORIGINAL_VALUES_DB,
                "uniques": 5,
                "anonymising_set": "random",
            },
            "NumC": {
                "type": "continuous",
            },
        }
        spec = {"metadata": metadata, "columns": columns}

        weights_table = tm.generate_weights_table(spec, ["CatC"])

        # Look both weights up before asserting on either of them.
        missing_weight, valid_weight = [
            weights_table[("NumC", "CatC", key)]["weights"].weight
            for key in (MISSING_DATA_STR, "A")
        ]

        self.assertEqual(missing_weight, 0.2)
        self.assertEqual(valid_weight, 0.2)
Ejemplo n.º 4
0
    def test_temp_table_insertion(self):
        '''
        Rows written to a temporary lookup table should come back with
        surrounding whitespace removed.

        When values are formatted for the spec, extra whitespace is
        stripped, so the same has to happen when values are put in
        the SQL db; the first source row therefore has a trailing
        space in "A " that must not survive the round trip.
        '''

        source_rows = [("A ", "B"), ("A", "B")]

        stored_rows = tm.create_temp_table(
            table_name="test_table",
            col_names=["A", "B"],
            data=source_rows,
            db_uri="file:test_db?mode=memory",
            return_table=True,
        )

        # Both rows are expected to be identical once stripped.
        self.assertListEqual([("A", "B")] * 2, stored_rows)
Ejemplo n.º 5
0
    def test_random_column_with_missing_pairs_sql(self):
        '''
        Edge case: the spec lists two paired columns for the base column,
        but only one of them is stored in the SQL table next to it;
        the anonymising set is "random".

        The generated frame is expected to contain all three columns,
        with the paired column absent from SQL (test_C2) holding the
        same values as the base column.
        '''

        row_count = 100
        col_name = "test_Root"

        col_attrs = {
            "type": "categorical",
            "paired_columns": ["test_C1", "test_C2"],
            "uniques": 10,
            "original_values": pd.DataFrame(),
            "anonymising_set": "random",
            "cross_join_all_unique_values": False,
        }
        spec = {
            "metadata": {
                "inline_limit": 5,
                "id": 1234
            },
            "columns": {
                col_name: col_attrs
            }
        }

        # Skip the real constructor; only the attributes read by the
        # method under test are attached afterwards.
        path = "exhibit.core.generate.categorical.CategoricalDataGenerator.__init__"
        with patch(path) as mock_init:
            mock_init.return_value = None
            generator = tm.CategoricalDataGenerator(Mock(), Mock())

        generator.spec_dict = spec
        generator.num_rows = row_count
        generator.rng = np.random.default_rng(seed=0)

        with tempfile.TemporaryDirectory() as td:

            db_path = abspath(join(td, "test.db"))

            # Only test_C1 is stored alongside the base column.
            create_temp_table(table_name="temp_1234_test_Root",
                              col_names=["test_Root", "test_C1"],
                              data=[("A ", "B"), ("A", "B")],
                              db_uri=db_path,
                              return_table=False)

            result = generator._generate_from_sql(col_name,
                                                  col_attrs,
                                                  db_uri=db_path)

            expected = pd.DataFrame(
                data={
                    "test_Root": ["A"] * row_count,
                    "test_C1": ["B"] * row_count,
                    "test_C2": ["A"] * row_count
                })

            assert_frame_equal(left=expected, right=result)