Exemple #1
0
 def test_tabular_preprocess(self):
     """Check the transformed fit dictionary when early preprocessing is on.

     The fit dictionary is built with ``is_small_preprocess=True``. The last
     pipeline step (the trainer) is dropped, the remaining steps are fit and
     applied, and the result must not carry a 'preprocess_transforms' key.
     """
     n_samples, n_features = 10, 15

     dataset_properties = {
         'numerical_columns': list(range(n_features)),
         'categorical_columns': [],
         'task_type': 'tabular_classification',
     }

     fit_dictionary = {
         # Data and split description
         'X_train': np.random.random((n_samples, n_features)),
         'y_train': np.random.random(n_samples),
         'train_indices': list(range(6)),
         'val_indices': list(range(6, n_samples)),
         'is_small_preprocess': True,
         'numerical_columns': list(range(n_features)),
         'categorical_columns': [],
         'num_features': n_features,
         'num_classes': 2,
         'categories': [],
         # Training configuration
         'job_id': 'test',
         'device': 'cpu',
         'budget_type': 'epochs',
         'epochs': 10,
         'torch_num_threads': 1,
         'early_stopping': 20,
         'dataset_properties': dataset_properties,
         'split_id': 0,
         'backend': self.backend,
     }

     pipeline = TabularClassificationPipeline(
         dataset_properties=dataset_properties)

     # Drop the final step (the trainer) so only the earlier steps run.
     pipeline.steps.pop()

     fitted_pipeline = pipeline.fit(fit_dictionary)
     transformed = fitted_pipeline.transform(fit_dictionary)

     self.assertNotIn('preprocess_transforms', transformed)
    def test_pipeline_transform(self, fit_dictionary_tabular):
        """
        In the context of autopytorch, transform expands a fit dictionary with
        the components that were previously fit. That makes it a convenient way
        to verify that fit works properly.
        This test was added because components were not always being added to
        the fit dictionary.
        """
        dataset_properties = fit_dictionary_tabular['dataset_properties']
        pipeline = TabularClassificationPipeline(
            dataset_properties=dataset_properties)

        # Configure the pipeline with a sampled configuration.
        search_space = pipeline.get_hyperparameter_search_space()
        pipeline.set_hyperparameters(search_space.sample_configuration())

        # Stub out the per-epoch training call while fitting.
        trainer_choice = pipeline.named_steps['trainer'].choice
        with unittest.mock.patch.object(trainer_choice, 'train_epoch',
                                        return_value=(1, {})):
            # Fit on a copy: we do not want the same early preprocessing
            # operation applied to the fit dictionary itself.
            pipeline.fit(fit_dictionary_tabular.copy())

        transformed = pipeline.transform(fit_dictionary_tabular)

        # First, nothing from the input may be lost (subset check on items).
        original_items = fit_dictionary_tabular.items()
        assert original_items <= transformed.items()

        # Then the pipeline should have added the following keys
        expected_keys = {
            'imputer',
            'encoder',
            'scaler',
            'tabular_transformer',
            'preprocess_transforms',
            'network',
            'optimizer',
            'lr_scheduler',
            'train_data_loader',
            'val_data_loader',
            'run_summary',
        }
        assert expected_keys.issubset(transformed.keys())

        # Then we need to have transformations being created.
        created_transforms = get_preprocess_transforms(transformed)
        assert len(created_transforms) > 0

        # The transformations must stay in the dictionary for inference.
        assert 'preprocess_transforms' in transformed
Exemple #3
0
    def test_tabular_no_preprocess(self):
        """Check the transformed fit dictionary when early preprocessing is off.

        The dataset properties set ``is_small_preprocess`` to False. The last
        pipeline step (the trainer) is dropped, the remaining steps are fit and
        applied, and the result must hold a list under 'preprocess_transforms'
        whose last entry exposes a ``BaseEstimator`` as ``preprocessor``.
        """
        n_samples, n_features = 10, 15

        dataset_properties = {
            'numerical_columns': list(range(n_features)),
            'categorical_columns': [],
            'task_type': TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
            'output_type': OUTPUT_TYPES_TO_STRING[MULTICLASS],
            'is_small_preprocess': False,
            'input_shape': (n_features, ),
            'output_shape': 2,
            'categories': [],
            'issparse': False,
        }

        fit_dictionary = {
            # Data and split description
            'X_train': np.random.random((n_samples, n_features)),
            'y_train': np.random.random(n_samples),
            'train_indices': list(range(6)),
            'val_indices': list(range(6, n_samples)),
            'dataset_properties': dataset_properties,
            # Training configuration
            'num_run': 16,
            'device': 'cpu',
            'budget_type': 'epochs',
            'epochs': 10,
            'torch_num_threads': 1,
            'early_stopping': 20,
            'split_id': 0,
            'backend': self.backend,
        }

        pipeline = TabularClassificationPipeline(
            dataset_properties=dataset_properties)

        # Drop the final step (the trainer) so only the earlier steps run.
        pipeline.steps.pop()

        fitted_pipeline = pipeline.fit(fit_dictionary)
        transformed = fitted_pipeline.transform(fit_dictionary)

        self.assertIn('preprocess_transforms', transformed)

        transforms = transformed['preprocess_transforms']
        self.assertIsInstance(transforms, list)

        last_preprocessor = transforms[-1].preprocessor
        self.assertIsInstance(last_preprocessor, BaseEstimator)