Beispiel #1
0
    def setUp(self):
        """Create the factory, three mock builders and the AI built from them.

        Builder A lists B in ``dependent_on``, B lists C, and C lists nothing.
        Each builder's ``describe`` mock returns ``{<builder_type>: 1}``.
        """
        def make_builder(kind, depends_on):
            # Mock constrained to the Builder spec, with describe/build
            # replaced by plain mocks so calls can be inspected.
            builder = mock.Mock(spec=Builder)
            builder.dependent_on = depends_on
            builder.builder_type = kind
            builder.describe = mock.Mock()
            builder.describe.return_value = {kind: 1}
            builder.build = mock.Mock()
            return builder

        self.factory = AIFactory.AIFactory(builders=[],
                                           project_name='test',
                                           log_dir='test/dir')

        # NOTE(review): this metadata object is configured but not passed to
        # anything within this method.
        column_meta = MetaData()
        column_meta.define_numerical_columns(
            ['feature_2', 'feature_3', 'target_1'])
        column_meta.define_categorical_columns(['feature_1'])

        self.builder_A = make_builder('A', ['B'])
        self.builder_B = make_builder('B', ['C'])
        self.builder_C = make_builder('C', [])

        all_builders = [self.builder_A, self.builder_B, self.builder_C]
        self.artie = self.factory.create_AI(all_builders)
 def setUp(self):
     """Build a DataModel over two columns, both declared numerical."""
     column_meta = MetaData()
     column_meta.define_numerical_columns(['num_1', 'num_2'])

     self.df = pd.DataFrame({
         'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
         'num_2': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
     })

     # The model is created from the frame first; metadata is attached after.
     self.data_model = DataModel(self.df)
     self.data_model.metadata = column_meta
    def setUp(self):
        """Build a DataModel with a numerical column and a multi-cat column.

        ``mh_1`` holds comma-separated strings and is declared through
        ``define_multiple_cat_columns``.
        """
        column_meta = MetaData()
        column_meta.define_numerical_columns(['num_1'])
        column_meta.define_multiple_cat_columns(['mh_1'])

        numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 50]
        currencies = [
            'EUR,USD', 'USD,JPY,AUD', 'EUR', 'EUR,GBP,AUD', 'USD',
            'EUR,JPY', 'EUR,GBP', 'USD,JPY', 'EUR,GBP', 'USD',
        ]
        self.df = pd.DataFrame({'num_1': numbers, 'mh_1': currencies})

        # The model is created from the frame first; metadata is attached after.
        self.data_model = DataModel(self.df)
        self.data_model.metadata = column_meta
    def setUp(self):
        """Build a DataModel with one numerical and one categorical column."""
        column_meta = MetaData()
        column_meta.define_numerical_columns(['num_1'])
        column_meta.define_categorical_columns(['cat_1'])

        numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 50]
        currencies = [
            'EUR', 'USD', 'EUR', 'EUR', 'USD',
            'EUR', 'EUR', 'USD', 'EUR', 'USD',
        ]
        self.df = pd.DataFrame({'num_1': numbers, 'cat_1': currencies})

        # The model is created from the frame first; metadata is attached after.
        self.data_model = DataModel(self.df)
        self.data_model.metadata = column_meta
    def testScrubbing(self):
        """Scrub a list-valued column and verify the generated columns.

        After validating and scrubbing ``list_1`` the test expects:
        * seven dataframe columns in total, including one ``list_1_<value>``
          column for each of the five distinct list values,
        * ``list_1_EUR`` to hold 1 for rows whose list contains 'EUR' and 0
          otherwise,
        * the metadata's ``binary_columns`` to contain exactly those five
          generated column names.
        """
        data = {
            'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
            'list_1': [['EUR', 'USD'], ['USD', 'JPY', 'AUD'], ['EUR'],
                       ['EUR', 'GBP', 'AUD'], ['USD'], ['EUR', 'JPY'],
                       ['EUR', 'GBP'], ['USD', 'JPY'], ['EUR', 'GBP'],
                       ['USD']],
        }

        metadata = MetaData()
        metadata.define_numerical_columns(['num_1'])
        metadata.define_multiple_cat_columns(['list_1'])

        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

        scrubber = MultipleCatListToMultipleHotScrubber(col_name='list_1')
        scrubber.validate(self.data_model)
        scrubber.scrub(self.data_model)

        new_df = self.data_model.get_dataframe()
        columns = list(new_df.columns.values)

        expected_new_columns = [
            'list_1_EUR', 'list_1_GBP', 'list_1_USD', 'list_1_JPY',
            'list_1_AUD',
        ]

        # test new columns
        self.assertEqual(len(columns), 7)
        for column_name in expected_new_columns:
            self.assertIn(column_name, columns)

        # check column contents
        self.assertEqual(new_df['list_1_EUR'].tolist(),
                         [1, 0, 1, 1, 0, 1, 1, 0, 1, 0])

        # test metadata: membership is checked against the metadata's binary
        # columns (previously these assertions re-checked the dataframe
        # columns, so the metadata contents were never verified).
        binary_columns = self.data_model.metadata.binary_columns
        self.assertEqual(len(binary_columns), 5)
        for column_name in expected_new_columns:
            self.assertIn(column_name, binary_columns)
Beispiel #6
0
class TestMetaData(unittest.TestCase):
    """Tests for MetaData column categorisation and its string rendering."""

    def setUp(self):
        """Create a MetaData instance from a seven-column, two-row dataframe."""
        self._data = {
            'numerical_1': [1, 2],
            'numerical_2': [3, 4],
            'numerical_3': [3, 4],
            'categorical_1': [7, 8],
            'categorical_2': [5, 6],
            'categorical_3': [5, 6],
            'unknown_1': [9, 10]
        }

        self._dataframe = pd.DataFrame(data=self._data)
        self.meta_data = MetaData(self._dataframe)

    def test_categorize_columns(self):
        """Re-defining column types must leave each column in its last type."""
        md = self.meta_data

        # First pass: assign every column its intended type.
        md.define_categorical_columns(['categorical_1', 'categorical_2'])
        md.define_numerical_columns(['numerical_1', 'numerical_2'])
        md.define_categorical_columns(['categorical_3'])
        md.define_numerical_columns(['numerical_3'])

        # Second pass: deliberately assign the wrong types.
        md.define_categorical_columns(['numerical_1', 'numerical_2'])
        md.define_numerical_columns(
            ['categorical_1', 'categorical_2', 'unknown_1'])

        # Third pass: restore the intended types, unknown column included.
        md.define_categorical_columns(['categorical_1', 'categorical_2'])
        md.define_numerical_columns(['numerical_1', 'numerical_2'])
        md.define_categorical_columns(['categorical_3'])
        md.define_numerical_columns(['numerical_3'])
        md.define_unknown_columns(['unknown_1'])

        expected_categorical = [
            'categorical_1', 'categorical_2', 'categorical_3']
        expected_numerical = ['numerical_1', 'numerical_2', 'numerical_3']
        expected_uncategorized = ['unknown_1']

        self.assertListEqual(expected_categorical, md.categorical_columns)
        self.assertListEqual(expected_numerical, md.numerical_columns)
        self.assertListEqual(expected_uncategorized,
                             md.uncategorized_columns)

    def test_unknown_is_default(self):
        """A column that was never defined reports the type 'unknown'."""
        column_type = self.meta_data.get_column_type('categorical_2')
        self.assertEqual(column_type, 'unknown')

    def test_to_string(self):
        """str(MetaData) lists one '<column>: <type>' line per column."""
        self.meta_data.define_categorical_columns(
            ['categorical_1', 'categorical_2', 'categorical_3'])
        self.meta_data.define_numerical_columns(
            ['numerical_1', 'numerical_2', 'numerical_3'])

        # Expected lines, in the order they must appear in the output.
        expected_types = [
            ('categorical_1', MetaData.CATEGORICAL_DATA_TYPE),
            ('categorical_2', MetaData.CATEGORICAL_DATA_TYPE),
            ('categorical_3', MetaData.CATEGORICAL_DATA_TYPE),
            ('numerical_1', MetaData.NUMERICAL_DATA_TYPE),
            ('numerical_2', MetaData.NUMERICAL_DATA_TYPE),
            ('numerical_3', MetaData.NUMERICAL_DATA_TYPE),
            ('unknown_1', MetaData.UNKNOWN_DATA_TYPE),
        ]
        expected_string = '\nmetadata:' + ''.join(
            '\n' + column + ': ' + data_type
            for column, data_type in expected_types)

        stringified_metadata = str(self.meta_data)
        self.assertEqual(
            expected_string, stringified_metadata,
            'Metadata __str__ function does not meet expectations.')