Exemple #1
0
    def test_base_props(self):
        """Check base properties of a profile before and after batch updates.

        Verifies the compiler types stored under ``profiles``, the keys of
        each compiler's ``_profiles`` mapping, and that ``null_count`` and
        ``sample_size`` triple once the same column is added two more times.
        """
        column = self.aws_dataset.src
        profile = StructuredDataProfile(column, sample_size=len(column))

        type_compiler = profile.profiles['data_type_profile']
        self.assertIsInstance(type_compiler,
                              ColumnPrimitiveTypeProfileCompiler)
        stats_compiler = profile.profiles['data_stats_profile']
        self.assertIsInstance(stats_compiler, ColumnStatsProfileCompiler)

        six.assertCountEqual(
            self,
            ['int', 'float', 'datetime', 'text'],
            list(profile.profiles['data_type_profile']._profiles.keys()))
        six.assertCountEqual(
            self,
            ['category', 'order'],
            list(profile.profiles['data_stats_profile']._profiles.keys()))

        self.assertEqual(3, profile.null_count)
        self.assertEqual(2999, profile.sample_size)

        # the row collections stored per null type should add up to the
        # overall null count
        null_row_total = sum(
            len(rows) for rows in profile.null_types_index.values())
        self.assertEqual(3, null_row_total)

        # test updated base props with batch addition: the same column is
        # fed in two more times
        for _ in range(2):
            profile.update_profile(column)

        self.assertEqual(3 * 3, profile.null_count)
        self.assertEqual(2999 * 3, profile.sample_size)
 def test_data_labeler_toggle(self):
     """Check that disabling the data labeler option drops its profile.

     A profile built with default options is expected to contain the
     ``'data_label_profile'`` key, while one built with
     ``data_labeler.is_enabled = False`` is expected not to.
     """
     column = self.aws_dataset.src

     labeler_off = StructuredOptions()
     labeler_off.data_labeler.is_enabled = False

     default_profile = StructuredDataProfile(
         column, sample_size=len(column))
     toggled_profile = StructuredDataProfile(
         column, sample_size=len(column), options=labeler_off)

     self.assertIn('data_label_profile', default_profile.profiles)
     self.assertNotIn('data_label_profile', toggled_profile.profiles)
Exemple #3
0
    def test_null_count(self):
        """Check ``null_count`` for a partial sample and for a full sample.

        The column alternates between ``1`` and NaN for 20 rows.
        """
        alternating = pd.Series([1, float('nan')] * 10)

        # subset of the full sample size; the seed is fixed immediately
        # before construction so the expected count is reproducible
        random.seed(0)
        partial_profile = StructuredDataProfile(alternating, sample_size=10)
        self.assertEqual(6, partial_profile.null_count)

        # full sample size: every NaN row is expected to be counted
        full_profile = StructuredDataProfile(
            alternating, sample_size=len(alternating))
        self.assertEqual(10, full_profile.null_count)
 def test_index_overlap_for_update_profile(self, *mocks):
     """Check index handling when an update reuses already-seen indices.

     The same five-row series is profiled and then passed to
     ``update_profile``; the assertions expect the second batch's null
     rows to appear at shifted positions rather than collide.
     """
     series = pd.Series([0, None, 1, 2, None])
     profiler = StructuredDataProfile(series)

     # state after the initial batch
     self.assertEqual(0, profiler._min_id)
     self.assertEqual(4, profiler._max_id)
     self.assertDictEqual(profiler.null_types_index, {'nan': {1, 4}})

     profiler.update_profile(series)

     # Expected: the new batch's indices are shifted by max_id + 1 (5),
     # so its two None rows move from indices 1, 4 to 6, 9.
     self.assertEqual(0, profiler._min_id)
     self.assertEqual(9, profiler._max_id)
     self.assertDictEqual(
         profiler.null_types_index, {'nan': {1, 4, 6, 9}})
    def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
        """
        Tests whether a column of repeated sentences, whose last three
        entries are replaced by sentences containing uppercase letters,
        still identifies as categorical.
        """
        unique_total = (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
            + 1)
        distinct_sentences = [
            self.test_sentence + str(number)
            for number in range(1, unique_total + 1)
        ]
        repeats = int(
            float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) + 2
        sentences = distinct_sentences * repeats

        # overwrite the final three entries, walking backwards from the end
        uppercase_prefixes = (
            self.test_sentence_upper1,
            self.test_sentence_upper2,
            self.test_sentence_upper3,
        )
        for back, prefix in enumerate(uppercase_prefixes):
            sentences[-(back + 1)] = prefix + str(repeats - back)

        expected_category_total = len(set(sentences))
        profile = StructuredDataProfile(pd.Series(sentences))
        stats_compiler = profile.profiles['data_stats_profile']
        category_profile = stats_compiler._profiles["category"]

        self.assertEqual(True, category_profile.is_match)
        self.assertEqual(expected_category_total,
                         len(category_profile.categories))
    def test_index_overlap_for_merge(self, *mocks):
        """Check index handling when two profiles with equal indices merge.

        Two profiles are built from the same five-row series and added;
        the merged profile is expected to hold shifted indices while both
        operands keep their original state.
        """
        series = pd.Series([0, None, 1, 2, None])
        left = StructuredDataProfile(series)
        right = StructuredDataProfile(series)

        merged = left + right

        # Ensure merged profile included shifted indices
        self.assertEqual(0, merged._min_id)
        self.assertEqual(9, merged._max_id)
        self.assertDictEqual(merged.null_types_index, {'nan': {1, 4, 6, 9}})

        # Ensure original profiles not overwritten
        for operand in (left, right):
            self.assertEqual(0, operand._min_id)
            self.assertEqual(4, operand._max_id)
            self.assertDictEqual(operand.null_types_index, {'nan': {1, 4}})
Exemple #7
0
    def test_categorical_mapping(self):
        """Check categories and null bookkeeping across successive updates.

        After the initial series and after each of two updates, the
        category list plus the null types must match the unique stringified
        values seen so far, and the number of null types and of rows
        indexed under ``"nan"`` must match the expected counts.
        """
        first = pd.Series([
            "abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", np.nan,
        ])
        second = pd.Series([
            "1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b", "ee",
        ])
        third = pd.Series([
            "NaN", "b", "nan", "c", None,
        ])

        profile = StructuredDataProfile(first)
        category_profile = \
            profile.profiles['data_stats_profile']._profiles["category"]

        def check_state(expected_values, null_type_total, nan_row_total):
            # shared assertions for the profile's current state
            six.assertCountEqual(
                self,
                expected_values,
                category_profile.categories + profile.null_types)
            self.assertEqual(null_type_total, len(profile.null_types))
            self.assertEqual(
                nan_row_total, len(profile.null_types_index["nan"]))

        # initial series only
        expected = first.apply(str).unique().tolist()
        check_state(expected, 1, 1)

        # after adding the second series
        expected = pd.concat([first, second]).apply(str).unique().tolist()
        profile.update_profile(second)
        check_state(expected, 4, 2)

        # after adding the third series
        expected = \
            pd.concat([first, second, third]).apply(str).unique().tolist()
        profile.update_profile(third)
        check_state(expected, 4, 3)

        # rows indexed under "NaN" are counted separately from "nan"
        self.assertNotEqual(3, len(profile.null_types_index["NaN"]))
    def test_clean_data_and_get_base_stats(self):
        """Check ``clean_data_and_get_base_stats`` on a sliced, labelled series.

        The input is ``data[1:]`` of a string-indexed series, so the value
        ``1.0`` (label ``'a'``) is not part of what gets cleaned.
        """
        data = pd.Series([1, None, 3, 4, None, 6],
                         index=['a', 'b', 'c', 'd', 'e', 'f'])

        # validate that sliced data (whose labels are not a 0-based integer
        # range) is still handled; per the original note this previously
        # raised errors in the sampling step
        df_series, base_stats = \
            StructuredDataProfile.clean_data_and_get_base_stats(
                df_series=data[1:], sample_size=6, min_true_samples=0)
        self.assertTrue(np.issubdtype(np.object_, df_series.dtype))

        # assertCountEqual on two dicts only compares their keys, so the
        # keys and every value are verified explicitly here.
        self.assertCountEqual(
            ['sample', 'sample_size', 'null_count', 'null_types'],
            list(base_stats.keys()))
        # sample and null-row ordering is not asserted, only membership
        self.assertCountEqual(['4.0', '6.0', '3.0'], base_stats['sample'])
        self.assertEqual(5, base_stats['sample_size'])
        self.assertEqual(2, base_stats['null_count'])
        self.assertCountEqual(['nan'], list(base_stats['null_types'].keys()))
        self.assertCountEqual(['e', 'b'], base_stats['null_types']['nan'])
Exemple #9
0
    def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
        """
        Tests that a column built from one more distinct sentence than
        MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL, repeated
        ``int(1 / CATEGORICAL_THRESHOLD_DEFAULT) - 1`` times, is not
        matched by the category profiler.
        """
        unique_total = (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
            + 1)
        distinct_sentences = [
            self.test_sentence + str(number)
            for number in range(1, unique_total + 1)
        ]
        repeats = int(
            float(1) / CategoricalColumn._CATEGORICAL_THRESHOLD_DEFAULT) - 1
        sentences = distinct_sentences * repeats

        profile = StructuredDataProfile(pd.Series(sentences))
        stats_compiler = profile.profiles['data_stats_profile']
        category_profile = stats_compiler._profiles["category"]
        self.assertEqual(False, category_profile.is_match)
Exemple #10
0
    def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(
            self):
        """
        Tests that a column whose number of distinct long sentences is one
        below MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL is matched
        by the category profiler, with one category per distinct sentence.
        """
        sentence_total = (
            CategoricalColumn._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL
            - 1)
        long_sentences = [
            self.test_sentence_long + str(number)
            for number in range(1, sentence_total + 1)
        ]
        expected_category_total = len(set(long_sentences))

        profile = StructuredDataProfile(pd.Series(long_sentences))
        stats_compiler = profile.profiles['data_stats_profile']
        category_profile = stats_compiler._profiles["category"]

        self.assertEqual(True, category_profile.is_match)
        self.assertEqual(expected_category_total,
                         len(category_profile.categories))
    def test_column_names(self):
        """Check that a profile takes its name from the series it is built on.

        Also checks that updating a profile with a series of a different
        name raises ``ValueError`` carrying the expected message.
        """
        data = [['a', 1], ['b', 2], ['c', 3]]
        df = pd.DataFrame(data, columns=['letter', 'number'])
        profile1 = StructuredDataProfile(df['letter'])
        profile2 = StructuredDataProfile(df['number'])
        self.assertEqual(profile1.name, 'letter')
        self.assertEqual(profile2.name, 'number')

        # a series created without a name: the profile name should mirror
        # whatever the series reports
        df_series = pd.Series([1, 2, 3, 4, 5])
        profile = StructuredDataProfile(df_series)
        self.assertEqual(profile.name, df_series.name)

        # Ensure issue raised. assertRaisesRegex verifies the message text;
        # the previous assertTrue(<non-empty str>, context) could never fail
        # because its second argument is only the failure message.
        profile = StructuredDataProfile(df['letter'])
        with self.assertRaisesRegex(
                ValueError,
                'Column names have changed, col number does not match '
                'prior name letter'):
            profile.update_profile(df['number'])
    def test_min_max_id_properly_update(self, *mocks):
        """Check ``_min_id`` / ``_max_id`` on init, after merge, after update.

        A seven-row series is split into its first two rows and the
        remaining five; both halves keep their original index labels.
        """
        series = pd.Series([1, None, 3, 4, 5, None, 1])
        head, tail = series[:2], series[2:]
        head_profile = StructuredDataProfile(head)
        tail_profile = StructuredDataProfile(tail)

        # Base initialization
        self.assertEqual(0, head_profile._min_id)
        self.assertEqual(1, head_profile._max_id)
        self.assertEqual(2, tail_profile._min_id)
        self.assertEqual(6, tail_profile._max_id)

        # Needs to work with merge
        merged = head_profile + tail_profile
        self.assertEqual(0, merged._min_id)
        self.assertEqual(6, merged._max_id)

        # Needs to work with update_profile
        updated = StructuredDataProfile(series[:2])
        updated.update_profile(series[2:])
        self.assertEqual(0, updated._min_id)
        self.assertEqual(6, updated._max_id)
    def test_add_profilers(self, *mocks):
        """Check ``+`` on profiles: error cases, a merge, and sampling props.

        The error cases cover a non-profile operand, mismatched names, and
        mismatched profile sets. The success case replaces ``profiles``
        with plain integers so the merged value can be asserted directly.
        """
        series = pd.Series([1, None, 3, 4, 5, None])
        left = StructuredDataProfile(series[:2])
        right = StructuredDataProfile(series[2:])

        # test incorrect type
        wrong_type_msg = ('`StructuredDataProfile` and `int` are '
                          'not of the same profiler type.')
        with self.assertRaisesRegex(TypeError, wrong_type_msg):
            left + 3

        # test mismatched names
        left.name = 'profile1'
        right.name = 'profile2'
        name_mismatch_msg = ('Structured profile names are unmatched: '
                             'profile1 != profile2')
        with self.assertRaisesRegex(ValueError, name_mismatch_msg):
            left + right

        # test mismatched profiles due to options
        right.name = 'profile1'
        left._profiles = dict(test1=mock.Mock())
        right.profiles.pop('data_label_profile')
        options_mismatch_msg = ('Structured profilers were not setup with '
                                'the same options, hence they do not '
                                'calculate the same profiles and cannot be '
                                'added together.')
        with self.assertRaisesRegex(ValueError, options_mismatch_msg):
            left + right

        # test success
        left.profiles = dict(test=1)
        right.profiles = dict(test=2)
        merged = left + right
        self.assertEqual(3, merged.profiles['test'])
        self.assertCountEqual(['5.0', '4.0', '3.0', '1.0'], merged.sample)
        self.assertEqual(6, merged.sample_size)
        self.assertEqual(2, merged.null_count)
        self.assertListEqual(['nan'], merged.null_types)
        self.assertDictEqual({'nan': {1, 5}}, merged.null_types_index)

        # test add with different sampling properties; the expected merged
        # values below are the larger of each pair
        left._min_sample_size = 10
        right._min_sample_size = 100
        left._sampling_ratio = 0.5
        right._sampling_ratio = 0.3
        left._min_true_samples = 11
        right._min_true_samples = 1
        merged = left + right
        self.assertEqual(100, merged._min_sample_size)
        self.assertEqual(0.5, merged._sampling_ratio)
        self.assertEqual(11, merged._min_true_samples)