Ejemplo n.º 1
0
    def test_primitive_compiler_report(self):
        """Check that ``report`` honours ``remove_disabled_flag``.

        For a statistic that was switched off through the options, the test
        asserts that it is absent from ``report["statistics"]`` when
        ``remove_disabled_flag=True`` and present when it is ``False``.
        Two cases are covered: float ``precision`` and text ``vocab``.
        """
        # --- float column, ``precision`` disabled ---
        float_options = StructuredOptions()
        float_data = pd.Series(["2.6", "-1.8"])
        float_compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            float_data)
        float_options.float.precision.is_enabled = False
        # Re-create the profile, this time passing the modified options.
        float_compiler._create_profile(float_data, float_options)

        trimmed_report = float_compiler.report(remove_disabled_flag=True)
        self.assertNotIn("precision", trimmed_report["statistics"])

        complete_report = float_compiler.report(remove_disabled_flag=False)
        self.assertIn("precision", complete_report["statistics"])

        # --- text column, ``vocab`` disabled ---
        text_options = StructuredOptions()
        text_data = pd.Series(["abcd", "aa", "abcd", "aa", "b", "dfd"])
        text_compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            text_data)
        text_options.text.vocab.is_enabled = False
        # Re-create the profile, this time passing the modified options.
        text_compiler._create_profile(text_data, text_options)

        trimmed_report = text_compiler.report(remove_disabled_flag=True)
        self.assertNotIn("vocab", trimmed_report["statistics"])

        complete_report = text_compiler.report(remove_disabled_flag=False)
        self.assertIn("vocab", complete_report["statistics"])
Ejemplo n.º 2
0
    def test_disabling_columns_during_primitive_diff(self):
        """Diff primitive-type compilers while column types are disabled.

        Scenarios, in order:

        1. ``int`` disabled in the first compiler only.
        2. ``int`` disabled in both compilers.
        3. ``int``, ``float``, ``text`` and ``datetime`` disabled in the
           first compiler only.
        4. the same four types disabled in both compilers (empty diff).
        """
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["5", "15"])
        options = StructuredOptions()

        # Test disabled column in one compiler
        options.int.is_enabled = False
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data1, options)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)
        expected_diff = {
            "data_type_representation": {
                "datetime": "unchanged",
                "float": "unchanged",
                "text": "unchanged",
                "int": [None, 1.0],
            },
            "data_type": ["float", "int"],
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test disabled column in both compilers
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data2, options)
        expected_diff = {
            "data_type_representation": {
                "datetime": "unchanged",
                "float": "unchanged",
                "text": "unchanged",
            },
            "data_type": "unchanged",
            "statistics": {
                "min": -7.0,
                "max": -13.0,
                "sum": -20.0,
                "mean": -10.0,
                "median": -10,
                "mode": [[-2, -1, 1, 2], [], [5, 15]],
                "median_absolute_deviation": -3.5,
                "variance": -46.666666666666664,
                "stddev": data1.astype(int).std() - data2.astype(int).std(),
                "precision": {
                    "min": "unchanged",
                    "max": -1,
                    "mean": -0.5,
                    "var": -0.5,
                    "std": -0.71,
                    "sample_size": 2,
                    "margin_of_error": -1.6,
                },
                "t-test": {
                    "t-statistic": -1.9674775073518591,
                    "conservative": {
                        "df": 1,
                        "p-value": 0.29936264581081673
                    },
                    "welch": {
                        "df": 1.0673824509440946,
                        "p-value": 0.28696889329266506
                    },
                },
            },
        }
        profile_diff = compiler1.diff(compiler2)
        expected_stats = expected_diff["statistics"]
        actual_stats = profile_diff["statistics"]

        # median, mode and median_absolute_deviation are compared
        # approximately; each is popped from both dicts so that the final
        # exact dict comparison only sees the remaining keys.
        self.assertAlmostEqual(
            expected_stats.pop("median"),
            actual_stats.pop("median"),
            places=2,
        )

        expected_mode = expected_stats.pop("mode")
        actual_mode = actual_stats.pop("mode")
        # Require the same number of groups first; otherwise extra groups in
        # the actual diff would never be inspected by the pairwise loop.
        self.assertEqual(len(expected_mode), len(actual_mode))
        for expected_group, actual_group in zip(expected_mode, actual_mode):
            # Both sides are sorted, so element order inside a group is not
            # part of the assertion.
            np.testing.assert_almost_equal(
                sorted(expected_group), sorted(actual_group), 2)

        self.assertAlmostEqual(
            expected_stats.pop("median_absolute_deviation"),
            actual_stats.pop("median_absolute_deviation"),
            places=2,
        )
        self.assertDictEqual(expected_diff, profile_diff)

        # Test disabling all columns in one compiler
        options.float.is_enabled = False
        options.text.is_enabled = False
        options.datetime.is_enabled = False
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data1, options)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)
        expected_diff = {
            "data_type_representation": {
                "datetime": [None, 0.0],
                "int": [None, 1.0],
                "float": [None, 1.0],
                "text": [None, 1.0],
            },
            "data_type": [None, "int"],
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test disabling all columns in all compilers
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data2, options)
        expected_diff = {}
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
Ejemplo n.º 3
0
    def test_diff_primitive_compilers(self):
        """Diff two column compilers across several input combinations.

        Scenarios, in order:

        1. columns whose expected data types are ``int`` and ``text``.
        2. columns whose expected data types are ``int`` and ``datetime``.
        3. two columns with the same expected data type, where the diff
           also carries a ``statistics`` section.
        4. compilers of different classes, where ``diff`` must raise
           ``TypeError`` with a specific message.
        """
        # Test different data types
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(["YO YO YO", "HELLO"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            'data_type_representation': {
                'datetime': 'unchanged',
                'int': 1.0,
                'float': 1.0,
                'text': 'unchanged'
            },
            'data_type': ['int', 'text']
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test different data types with datetime specifically
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(["01/12/1967", "11/9/2024"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            'data_type_representation': {
                'datetime': -1.0,
                'int': 1.0,
                'float': 1.0,
                'text': 'unchanged'
            },
            'data_type': ['int', 'datetime']
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test same data types
        data1 = pd.Series(['-2', '15', '1', '2'])
        data2 = pd.Series(['5', '-1'])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            'data_type_representation': {
                'datetime': 'unchanged',
                'int': 'unchanged',
                'float': 'unchanged',
                'text': 'unchanged'
            },
            'data_type': 'unchanged',
            'statistics': {
                'min': -1.0,
                'max': 10.0,
                'sum': 12.0,
                'mean': 2.0,
                'median': -0.5,
                'mode': [[-2, 15, 1, 2], [], [5, -1]],
                'median_absolute_deviation': -1,
                'variance': 38.666666666666664,
                'stddev': 3.285085839971525,
                't-test': {
                    't-statistic': 0.4155260166386663,
                    'conservative': {
                        'df': 1,
                        'p-value': 0.749287157907667
                    },
                    'welch': {
                        'df': 3.6288111187629117,
                        'p-value': 0.7011367179395704
                    }
                }
            }
        }
        profile_diff = compiler1.diff(compiler2)
        expected_stats = expected_diff['statistics']
        actual_stats = profile_diff['statistics']

        # median, mode and median_absolute_deviation are compared
        # approximately; each is popped from both dicts so that the final
        # exact dict comparison only sees the remaining keys.
        self.assertAlmostEqual(expected_stats.pop('median'),
                               actual_stats.pop('median'),
                               places=2)

        expected_mode = expected_stats.pop('mode')
        actual_mode = actual_stats.pop('mode')
        # Require the same number of groups first; otherwise extra groups in
        # the actual diff would never be inspected by the pairwise loop.
        self.assertEqual(len(expected_mode), len(actual_mode))
        for expected_group, actual_group in zip(expected_mode, actual_mode):
            # Both sides are sorted, so element order inside a group is not
            # part of the assertion.
            np.testing.assert_almost_equal(sorted(expected_group),
                                           sorted(actual_group), 2)

        self.assertAlmostEqual(
            expected_stats.pop('median_absolute_deviation'),
            actual_stats.pop('median_absolute_deviation'),
            places=2)
        self.assertDictEqual(expected_diff, profile_diff)

        # Test different compilers
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(['5', '15'])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
        # Assert type error is properly called
        with self.assertRaises(TypeError) as exc:
            compiler1.diff(compiler2)
        self.assertEqual(
            str(exc.exception), "`ColumnPrimitiveTypeProfileCompiler` and "
            "`ColumnStatsProfileCompiler` are not of the same "
            "profile compiler type.")
Ejemplo n.º 4
0
    def test_diff_primitive_compilers(self):
        """Diff two column compilers across several input combinations.

        Scenarios, in order:

        1. columns whose expected data types are ``int`` and ``text``.
        2. columns whose expected data types are ``int`` and ``datetime``.
        3. two columns with the same expected data type, where the diff
           also carries a ``statistics`` section.
        4. compilers of different classes, where ``diff`` must raise
           ``TypeError`` with a specific message.
        """
        # Test different data types
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["YO YO YO", "HELLO"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            "data_type_representation": {
                "datetime": "unchanged",
                "int": 1.0,
                "float": 1.0,
                "text": "unchanged",
            },
            "data_type": ["int", "text"],
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test different data types with datetime specifically
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["01/12/1967", "11/9/2024"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            "data_type_representation": {
                "datetime": -1.0,
                "int": 1.0,
                "float": 1.0,
                "text": "unchanged",
            },
            "data_type": ["int", "datetime"],
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test same data types
        data1 = pd.Series(["-2", "15", "1", "2"])
        data2 = pd.Series(["5", "-1"])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            "data_type_representation": {
                "datetime": "unchanged",
                "int": "unchanged",
                "float": "unchanged",
                "text": "unchanged",
            },
            "data_type": "unchanged",
            "statistics": {
                "min": -1.0,
                "max": 10.0,
                "sum": 12.0,
                "mean": 2.0,
                "median": -0.5,
                "mode": [[-2, 15, 1, 2], [], [5, -1]],
                "median_absolute_deviation": -1,
                "variance": 38.666666666666664,
                "stddev": 3.285085839971525,
                "t-test": {
                    "t-statistic": 0.4155260166386663,
                    "conservative": {
                        "df": 1,
                        "p-value": 0.749287157907667
                    },
                    "welch": {
                        "df": 3.6288111187629117,
                        "p-value": 0.7011367179395704
                    },
                },
            },
        }
        profile_diff = compiler1.diff(compiler2)
        expected_stats = expected_diff["statistics"]
        actual_stats = profile_diff["statistics"]

        # median, mode and median_absolute_deviation are compared
        # approximately; each is popped from both dicts so that the final
        # exact dict comparison only sees the remaining keys.
        self.assertAlmostEqual(
            expected_stats.pop("median"),
            actual_stats.pop("median"),
            places=2,
        )

        expected_mode = expected_stats.pop("mode")
        actual_mode = actual_stats.pop("mode")
        # Require the same number of groups first; otherwise extra groups in
        # the actual diff would never be inspected by the pairwise loop.
        self.assertEqual(len(expected_mode), len(actual_mode))
        for expected_group, actual_group in zip(expected_mode, actual_mode):
            # Both sides are sorted, so element order inside a group is not
            # part of the assertion.
            np.testing.assert_almost_equal(
                sorted(expected_group), sorted(actual_group), 2)

        self.assertAlmostEqual(
            expected_stats.pop("median_absolute_deviation"),
            actual_stats.pop("median_absolute_deviation"),
            places=2,
        )
        self.assertDictEqual(expected_diff, profile_diff)

        # Test different compilers
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["5", "15"])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
        # Assert type error is properly called
        with self.assertRaises(TypeError) as exc:
            compiler1.diff(compiler2)
        self.assertEqual(
            str(exc.exception),
            "`ColumnPrimitiveTypeProfileCompiler` and "
            "`ColumnStatsProfileCompiler` are not of the same "
            "profile compiler type.",
        )
Ejemplo n.º 5
0
    def test_disabling_columns_during_primitive_diff(self):
        """Diff primitive-type compilers while column types are disabled.

        Scenarios, in order:

        1. ``int`` disabled in the first compiler only.
        2. ``int`` disabled in both compilers.
        3. ``int``, ``float``, ``text`` and ``datetime`` disabled in the
           first compiler only.
        4. the same four types disabled in both compilers (empty diff).
        """
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(['5', '15'])
        options = StructuredOptions()

        # Test disabled column in one compiler
        options.int.is_enabled = False
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data1, options)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)
        expected_diff = {
            'data_type_representation': {
                'datetime': 'unchanged',
                'float': 'unchanged',
                'text': 'unchanged',
                'int': [None, 1.0]
            },
            'data_type': ['float', 'int']
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test disabled column in both compilers
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data2, options)
        expected_diff = {
            'data_type_representation': {
                'datetime': 'unchanged',
                'float': 'unchanged',
                'text': 'unchanged'
            },
            'data_type': "unchanged",
            'statistics': {
                'min': -7.0,
                'max': -13.0,
                'sum': -20.0,
                'mean': -10.0,
                'median': -10,
                'mode': [[-2, -1, 1, 2], [], [5, 15]],
                'median_absolute_deviation': -3.5,
                'variance': -46.666666666666664,
                'stddev': data1.astype(int).std() - data2.astype(int).std(),
                'precision': {
                    'min': 'unchanged',
                    'max': -1,
                    'mean': -0.5,
                    'var': -0.5,
                    'std': -0.71,
                    'sample_size': 2,
                    'margin_of_error': -1.6
                },
                't-test': {
                    't-statistic': -1.9674775073518591,
                    'conservative': {
                        'df': 1,
                        'p-value': 0.29936264581081673
                    },
                    'welch': {
                        'df': 1.0673824509440946,
                        'p-value': 0.28696889329266506
                    }
                }
            }
        }
        profile_diff = compiler1.diff(compiler2)
        expected_stats = expected_diff['statistics']
        actual_stats = profile_diff['statistics']

        # median, mode and median_absolute_deviation are compared
        # approximately; each is popped from both dicts so that the final
        # exact dict comparison only sees the remaining keys.
        self.assertAlmostEqual(expected_stats.pop('median'),
                               actual_stats.pop('median'),
                               places=2)

        expected_mode = expected_stats.pop('mode')
        actual_mode = actual_stats.pop('mode')
        # Require the same number of groups first; otherwise extra groups in
        # the actual diff would never be inspected by the pairwise loop.
        self.assertEqual(len(expected_mode), len(actual_mode))
        for expected_group, actual_group in zip(expected_mode, actual_mode):
            # Both sides are sorted, so element order inside a group is not
            # part of the assertion.
            np.testing.assert_almost_equal(sorted(expected_group),
                                           sorted(actual_group), 2)

        self.assertAlmostEqual(
            expected_stats.pop('median_absolute_deviation'),
            actual_stats.pop('median_absolute_deviation'),
            places=2)
        self.assertDictEqual(expected_diff, profile_diff)

        # Test disabling all columns in one compiler
        options.float.is_enabled = False
        options.text.is_enabled = False
        options.datetime.is_enabled = False
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data1, options)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)
        expected_diff = {
            'data_type_representation': {
                'datetime': [None, 0.0],
                'int': [None, 1.0],
                'float': [None, 1.0],
                'text': [None, 1.0]
            },
            'data_type': [None, 'int']
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test disabling all columns in all compilers
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(
            data2, options)
        expected_diff = {}
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))