コード例 #1
0
    def test_profile_merge_with_different_options(self):
        # Creating first profiler with default options
        options = IntOptions()
        options.max.is_enabled = False
        options.min.is_enabled = False

        data = [2, 4, 6, 8]
        df = pd.Series(data).apply(str)
        profiler1 = IntColumn("Int", options=options)
        profiler1.update(df)
        profiler1.match_count = 0

        # Creating second profiler with separate options
        options = IntOptions()
        options.min.is_enabled = False
        data2 = [10, 15]
        df2 = pd.Series(data2).apply(str)
        profiler2 = IntColumn("Int", options=options)
        profiler2.update(df2)

        # Asserting warning when adding 2 profilers with different options
        with self.assertWarnsRegex(
                RuntimeWarning, "max is disabled because it is not enabled in"
                " both profiles."):
            profiler3 = profiler1 + profiler2

        # Assert that these features are still merged
        profile = profiler3.profile
        self.assertIsNotNone(profiler3.histogram_selection)
        self.assertIsNotNone(profile['variance'])
        self.assertIsNotNone(profiler3.sum)

        # Assert that these features are not calculated
        self.assertIsNone(profiler3.max)
        self.assertIsNone(profiler3.min)
コード例 #2
0
    def test_profile_merge_edge_case(self):
        data = [2.0, 12.5, 'not a float', 6.0, 'not a float']
        df = pd.Series(data).apply(str)
        profiler1 = IntColumn(name="Int")
        profiler1.update(df)
        profiler1.match_count = 0

        data2 = [10.0, 3.5, 'not a float', 15.0, 'not a float']
        df2 = pd.Series(data2).apply(str)
        profiler2 = IntColumn(name="Int")
        profiler2.update(df2)

        profiler3 = profiler1 + profiler2
        self.assertEqual(profiler3.stddev, profiler2.stddev)

        # test merge with empty data
        df1 = pd.Series([], dtype=object)
        profiler1 = IntColumn("Int")
        profiler1.update(df1)

        df2 = pd.Series([], dtype=object)
        profiler2 = IntColumn("Int")
        profiler2.update(df2)

        profiler = profiler1 + profiler2
        self.assertEqual(profiler.min, None)
        self.assertEqual(profiler.max, None)
        self.assertIsNone(profiler.histogram_selection)

        df3 = pd.Series([2, 3]).apply(str)
        profiler3 = IntColumn("Int")
        profiler3.update(df3)

        profiler = profiler1 + profiler3
        self.assertEqual(profiler.min, 2)
        self.assertEqual(profiler.max, 3)

        df4 = pd.Series([4, 5]).apply(str)
        profiler4 = IntColumn("Int")
        profiler4.update(df4)

        profiler = profiler3 + profiler4
        self.assertEqual(profiler.min, 2)
        self.assertEqual(profiler.max, 5)
コード例 #3
0
    def test_profile_merge_no_bin_overlap(self):

        data = [2, 'not an int', 6, 'not an int']
        df = pd.Series(data).apply(str)
        profiler1 = IntColumn("Int")
        profiler1.update(df)
        profiler1.match_count = 0

        data2 = [10, 'not an int', 15, 'not an int']
        df2 = pd.Series(data2).apply(str)
        profiler2 = IntColumn("Int")
        profiler2.update(df2)

        # set bin names so no overlap
        profiler1.histogram_bin_method_names = ['No overlap 1']
        profiler2.histogram_bin_method_names = ['No overlap 2']

        with self.assertRaisesRegex(
                ValueError, 'Profiles have no overlapping bin methods '
                'and therefore cannot be added together.'):
            profiler1 + profiler2
コード例 #4
0
    def test_profile_merge_edge_case(self):
        data = [2.0, 12.5, 'not a float', 6.0, 'not a float']
        df = pd.Series(data).apply(str)
        profiler1 = IntColumn(name="Int")
        profiler1.update(df)
        profiler1.match_count = 0

        data2 = [10.0, 3.5, 'not a float', 15.0, 'not a float']
        df2 = pd.Series(data2).apply(str)
        profiler2 = IntColumn(name="Int")
        profiler2.update(df2)

        profiler3 = profiler1 + profiler2
        self.assertEqual(profiler3.stddev, profiler2.stddev)

        # test merge with empty data
        df1 = pd.Series([], dtype=object)
        profiler1 = IntColumn("Int")
        profiler1.update(df1)

        df2 = pd.Series([], dtype=object)
        profiler2 = IntColumn("Int")
        profiler2.update(df2)

        profiler = profiler1 + profiler2
        self.assertEqual(profiler.min, None)
        self.assertEqual(profiler.max, None)
        self.assertTrue(np.isnan(profiler.skewness))
        self.assertTrue(np.isnan(profiler.kurtosis))
        self.assertIsNone(profiler.histogram_selection)

        df3 = pd.Series([2, 3]).apply(str)
        profiler3 = IntColumn("Int")
        profiler3.update(df3)

        profiler = profiler1 + profiler3
        self.assertEqual(profiler.min, 2)
        self.assertEqual(profiler.max, 3)
        self.assertTrue(np.isnan(profiler.skewness))
        self.assertTrue(np.isnan(profiler.kurtosis))
        self.assertEqual(profiler.num_zeros, 0)
        self.assertEqual(profiler.num_negatives, 0)

        df4 = pd.Series([4, 5]).apply(str)
        profiler4 = IntColumn("Int")
        profiler4.update(df4)

        profiler = profiler3 + profiler4
        self.assertEqual(profiler.min, 2)
        self.assertEqual(profiler.max, 5)
        self.assertEqual(profiler.skewness, 0)
        self.assertAlmostEqual(profiler.kurtosis, -1.2)
        self.assertEqual(profiler.num_zeros, 0)
        self.assertEqual(profiler.num_negatives, 0)

        df5 = pd.Series([0, 0, -1]).apply(str)
        profiler5 = IntColumn("Int")
        profiler5.update(df5)

        profiler = profiler4 + profiler5
        self.assertEqual(profiler.min, -1)
        self.assertEqual(profiler.max, 5)
        self.assertEqual(profiler.num_zeros, 2)
        self.assertEqual(profiler.num_negatives, 1)