Example #1
0
 def test_tabular_preprocess(self):
     """Fit the pipeline (trainer removed) with ``is_small_preprocess=True``.

     After fitting and transforming the fit dictionary, the result must not
     contain a 'preprocess_transforms' key.
     """
     dataset_properties = dict(
         numerical_columns=list(range(15)),
         categorical_columns=[],
         task_type='tabular_classification',
     )
     fit_dictionary = {
         'X_train': np.random.random((10, 15)),
         'y_train': np.random.random(10),
         'train_indices': [0, 1, 2, 3, 4, 5],
         'val_indices': [6, 7, 8, 9],
         'is_small_preprocess': True,
         'numerical_columns': list(range(15)),
         'categorical_columns': [],
         'num_features': 15,
         'num_classes': 2,
         'categories': [],
         # Training configuration
         'job_id': 'test',
         'device': 'cpu',
         'budget_type': 'epochs',
         'epochs': 10,
         'torch_num_threads': 1,
         'early_stopping': 20,
         'dataset_properties': dataset_properties,
         'split_id': 0,
         'backend': self.backend,
     }
     pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
     # Remove the trainer (the last step of the pipeline)
     pipeline.steps.pop()
     transformed = pipeline.fit(fit_dictionary).transform(fit_dictionary)
     self.assertNotIn('preprocess_transforms', transformed.keys())
 def test_set_choices_updates(self, fit_dictionary_tabular):
     """Restrict every ``__choice__`` hyperparameter to one option.

     The updates are collected from the default search space, handed to a
     new pipeline, and checked with ``_assert_pipeline_search_space``.
     """
     dataset_properties = dict(
         numerical_columns=[1],
         categorical_columns=[2],
         task_type='tabular_classification',
     )
     default_pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
     hyperparameters = default_pipeline.get_hyperparameter_search_space()._hyperparameters

     updates = HyperparameterSearchSpaceUpdates()
     for full_name, hyperparameter in hyperparameters.items():
         if '__choice__' not in full_name:
             continue
         # Names look like "<node>:<hyperparameter...>"
         node_name, *remainder = full_name.split(':')
         hyperparameter_name = ':'.join(remainder)
         # Using NoEmbedding is safer for this test
         # to avoid forbidden configuration errors
         if node_name == 'network_embedding' and hyperparameter_name == '__choice__':
             only_option = 'NoEmbedding'
         else:
             only_option = hyperparameter.choices[0]
         updates.append(node_name=node_name,
                        hyperparameter=hyperparameter_name,
                        value_range=(only_option, ),
                        default_value=only_option)

     updated_pipeline = TabularClassificationPipeline(
         dataset_properties=dataset_properties,
         search_space_updates=updates)
     self._assert_pipeline_search_space(updated_pipeline, updates)
    def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular):
        """Check the network -> optimizer -> lr_scheduler hand-off through X.

        Each step is fitted in turn. The optimizer must reject a fit dictionary
        without 'network', and the lr_scheduler one without 'optimizer'.
        """
        # A randomly sampled configuration is sufficient for this check
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'])
        search_space = pipeline.get_hyperparameter_search_space()
        pipeline.set_hyperparameters(search_space.sample_configuration())

        # Network: fitting it must add a "network" entry to X
        assert 'network' in pipeline.named_steps.keys()
        fit_dictionary_tabular.update(
            network_embedding=torch.nn.Linear(3, 3),
            network_backbone=torch.nn.Linear(3, 4),
            network_head=torch.nn.Linear(4, 1),
        )
        network_step = pipeline.named_steps['network']
        X = network_step.fit(fit_dictionary_tabular, None).transform(fit_dictionary_tabular)
        assert 'network' in X

        # Optimizer: fitting must fail when X carries no network ...
        assert 'optimizer' in pipeline.named_steps.keys()
        optimizer_step = pipeline.named_steps['optimizer']
        missing_network = r"To fit .+?, expected fit dictionary to have 'network' but got .*"
        with pytest.raises(ValueError, match=missing_network):
            optimizer_step.fit({'dataset_properties': {}}, None)

        # ... and succeed once the network is part of X
        X = optimizer_step.fit(X, None).transform(X)
        assert 'optimizer' in X
        assert isinstance(optimizer_step.choice.get_optimizer(), torch.optim.Optimizer)

        # LR scheduler: fitting must fail when X carries no optimizer ...
        assert 'lr_scheduler' in pipeline.named_steps.keys()
        scheduler_step = pipeline.named_steps['lr_scheduler']
        missing_optimizer = r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*"
        with pytest.raises(ValueError, match=missing_optimizer):
            scheduler_step.fit({'dataset_properties': {}}, None)

        # ... and succeed once the optimizer is part of X
        X = scheduler_step.fit(X, None).transform(X)
        assert 'lr_scheduler' in X

        scheduler_choice = scheduler_step.choice
        if isinstance(scheduler_choice, NoScheduler):
            pytest.skip("This scheduler does not support `get_scheduler`")
        lr_scheduler = scheduler_choice.get_scheduler()
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            pytest.skip("This scheduler is not a child of _LRScheduler")
        assert isinstance(lr_scheduler, _LRScheduler)
Example #4
0
    def test_default_configuration(self):
        """Fit a pipeline on which no configuration was explicitly set.

        No hyperparameters are set before ``fit``, so the pipeline has to rely
        on the default configuration of its search space.
        """
        pipeline = TabularClassificationPipeline(
            dataset_properties=self.dataset_properties)

        # First half of the samples is used for training, the rest for validation
        n_samples = self.X.shape[0]
        split_point = n_samples // 2
        fit_dictionary = dict(
            num_features=self.num_features,
            num_classes=self.num_classes,
            numerical_columns=list(range(self.num_features)),
            categorical_columns=[],
            categories=[],
            X_train=self.X,
            y_train=self.y,
            train_indices=list(range(split_point)),
            val_indices=list(range(split_point, n_samples)),
            is_small_preprocess=False,
            # Training configuration
            dataset_properties=self.dataset_properties,
            job_id='example_tabular_classification_1',
            device='cpu',
            budget_type='epochs',
            epochs=5,
            torch_num_threads=1,
            early_stopping=20,
            working_dir='/tmp',
            use_tensorboard_logger=True,
            use_pynisher=False,
            metrics_during_training=True,
            split_id=0,
            backend=self.backend,
        )
        pipeline.fit(fit_dictionary)
Example #5
0
    def test_get_fit_requirements(self):
        """``get_fit_requirements`` must return a list of ``FitRequirement`` objects."""
        pipeline = TabularClassificationPipeline(
            dataset_properties=dict(numerical_columns=[], categorical_columns=[]))
        requirements = pipeline.get_fit_requirements()

        # The container has to be a list and every entry a FitRequirement
        self.assertIsInstance(requirements, list)
        for entry in requirements:
            self.assertIsInstance(entry, FitRequirement)
    def test_default_configuration(self, fit_dictionary_tabular,
                                   is_small_preprocess):
        """Fit a pipeline without setting any configuration.

        The trainer's ``train_epoch`` is mocked out to return ``(1, {})``,
        so only the default configuration wiring is exercised.
        """
        fit_dictionary_tabular['is_small_preprocess'] = is_small_preprocess

        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'])

        trainer = pipeline.named_steps['trainer'].choice
        with unittest.mock.patch.object(trainer, 'train_epoch', return_value=(1, {})):
            pipeline.fit(fit_dictionary_tabular)
    def test_pipeline_predict_proba(self, fit_dictionary_tabular):
        """Fit a randomly configured pipeline and check ``predict_proba``.

        ``train_epoch`` is mocked out during ``fit``. The probabilities must
        come back as a numpy array of shape (n_samples, output_shape).
        """
        features = fit_dictionary_tabular['X_train'].copy()
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'])

        config = pipeline.get_hyperparameter_search_space().sample_configuration()
        pipeline.set_hyperparameters(config)

        try:
            trainer = pipeline.named_steps['trainer'].choice
            with unittest.mock.patch.object(trainer, 'train_epoch', return_value=(1, {})):
                pipeline.fit(fit_dictionary_tabular)
        except Exception as e:
            pytest.fail(f"Failed on config={config} with {e}")

        # One row per input sample and one column per output, where the number
        # of outputs is read from "output_shape" in dataset_properties
        n_outputs = fit_dictionary_tabular["dataset_properties"]["output_shape"]
        expected_output_shape = (features.shape[0], n_outputs)

        prediction = pipeline.predict_proba(features)
        assert isinstance(prediction, np.ndarray)
        assert prediction.shape == expected_output_shape
    def test_pipeline_predict(self, fit_dictionary_tabular):
        """Fit a randomly configured pipeline and check the shape of ``predict``.

        ``train_epoch`` is mocked out during ``fit``. The prediction must be a
        numpy array of shape (n_samples, output_shape).
        """
        features = fit_dictionary_tabular['X_train'].copy()
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'])

        config = pipeline.get_hyperparameter_search_space().sample_configuration()
        pipeline.set_hyperparameters(config)

        trainer = pipeline.named_steps['trainer'].choice
        with unittest.mock.patch.object(trainer, 'train_epoch', return_value=(1, {})):
            pipeline.fit(fit_dictionary_tabular)

        # One row per input sample and one column per output, where the number
        # of outputs is read from "output_shape" in dataset_properties
        n_outputs = fit_dictionary_tabular["dataset_properties"]["output_shape"]
        expected_output_shape = (features.shape[0], n_outputs)

        prediction = pipeline.predict(features)
        assert isinstance(prediction, np.ndarray)
        assert prediction.shape == expected_output_shape
    def test_get_fit_requirements(self, fit_dictionary_tabular):
        """``get_fit_requirements`` must return a list of ``FitRequirement`` objects."""
        pipeline = TabularClassificationPipeline(
            dataset_properties=dict(
                numerical_columns=[],
                categorical_columns=[],
                task_type='tabular_classification',
            ))
        requirements = pipeline.get_fit_requirements()

        # The container has to be a list and every entry a FitRequirement
        assert isinstance(requirements, list)
        for entry in requirements:
            assert isinstance(entry, FitRequirement)
 def test_remove_key_check_requirements(self, fit_dictionary_tabular):
     """Fitting must raise a descriptive ValueError when a required key is missing from X."""
     pipeline = TabularClassificationPipeline(
         dataset_properties=fit_dictionary_tabular['dataset_properties'])
     expected_error = r"To fit .+?, expected fit dictionary to have"
     keys_to_drop = ('num_run', 'device', 'split_id', 'torch_num_threads',
                     'dataset_properties')
     for missing_key in keys_to_drop:
         # Work on a shallow copy so the fixture keeps all its keys
         incomplete_dictionary = fit_dictionary_tabular.copy()
         del incomplete_dictionary[missing_key]
         with pytest.raises(ValueError, match=expected_error):
             pipeline.fit(incomplete_dictionary)
def test_pipeline_score(fit_dictionary_tabular_dummy):
    """Train the default configuration on dummy data and require an accuracy of at least 0.8."""
    X, y = (fit_dictionary_tabular_dummy[key].copy() for key in ('X_train', 'y_train'))

    # Train for more epochs so that performance can be tested
    fit_dictionary_tabular_dummy['epochs'] = 50

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular_dummy['dataset_properties'])

    config = pipeline.get_hyperparameter_search_space().get_default_configuration()
    pipeline.set_hyperparameters(config)

    pipeline.fit(fit_dictionary_tabular_dummy)

    # The fitted network has to be a torch Module
    network = pipeline.named_steps['network'].get_network()
    assert isinstance(network, torch.nn.Module)

    # A decent score is expected on this dummy data
    accuracy = pipeline.score(X, y)
    assert accuracy >= 0.8, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}"
Example #12
0
    def test_network_optimizer_lr_handshake(self):
        """Check the network -> optimizer -> lr_scheduler hand-off through X.

        Each step is fitted in turn. The optimizer must reject a fit dictionary
        without 'network', and the lr_scheduler one without 'optimizer'.
        """

        # Create the pipeline to check. A random config should be sufficient
        dataset_properties = {
            'numerical_columns': [],
            'categorical_columns': [],
            'task_type': 'tabular_classification'
        }
        pipeline = TabularClassificationPipeline(
            dataset_properties=dataset_properties)
        cs = pipeline.get_hyperparameter_search_space()
        config = cs.sample_configuration()
        pipeline.set_hyperparameters(config)

        # Make sure that fitting a network adds a "network" to X
        self.assertIn('network', pipeline.named_steps.keys())
        X = pipeline.named_steps['network'].fit(
            {
                'num_features': 10,
                'num_classes': 2,
                'X_train': self.X,
                'y_train': self.y
            }, None).transform({})
        self.assertIn('network', X)

        # Then fitting an optimizer should fail if there is no network:
        self.assertIn('optimizer', pipeline.named_steps.keys())
        with self.assertRaisesRegex(
                ValueError,
                r"To fit .+?, expected fit dictionary to have 'network' but got .*"
        ):
            pipeline.named_steps['optimizer'].fit({}, None)

        # No error when the network is passed
        X = pipeline.named_steps['optimizer'].fit(X, None).transform(X)
        self.assertIn('optimizer', X)

        # Then fitting an lr_scheduler should fail if there is no optimizer:
        self.assertIn('lr_scheduler', pipeline.named_steps.keys())
        with self.assertRaisesRegex(
                ValueError,
                r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*"
        ):
            pipeline.named_steps['lr_scheduler'].fit({}, None)

        # No error when the optimizer is passed
        X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X)
        # The optimizer must still be there ...
        self.assertIn('optimizer', X)
        # ... and the scheduler step must have added its own product. The
        # original only re-checked 'optimizer' here, which never verified
        # the scheduler hand-off.
        # NOTE(review): assumes the scheduler's transform stores its result
        # under the 'lr_scheduler' key - confirm against the component.
        self.assertIn('lr_scheduler', X)
Example #13
0
def _get_classification_dataset_requirements(info: Dict[str, Any], include: Dict[str, List[str]],
                                             exclude: Dict[str, List[str]]) -> List[FitRequirement]:
    """Return the dataset requirements of the classification pipeline matching the task.

    Args:
        info: Dataset information. 'task_type', 'output_type' and 'issparse'
            are always read; 'numerical_columns' and 'categorical_columns'
            are read for tabular tasks only.
        include: Forwarded unchanged to the pipeline constructor.
        exclude: Forwarded unchanged to the pipeline constructor.

    Returns:
        The result of the pipeline's ``get_dataset_requirements()``.

    Raises:
        ValueError: If the task type is neither a tabular nor an image task.
    """
    task_type = info['task_type']
    dataset_properties = {
        'task_type': task_type,
        'output_type': info['output_type'],
        'sparse': info['issparse'],
    }

    if task_type in TABULAR_TASKS:
        # Tabular pipelines additionally need to know the column layout
        dataset_properties['numerical_columns'] = info['numerical_columns']
        dataset_properties['categorical_columns'] = info['categorical_columns']
        pipeline_class = TabularClassificationPipeline
    elif task_type in IMAGE_TASKS:
        pipeline_class = ImageClassificationPipeline
    else:
        raise ValueError("Task_type not supported")

    pipeline = pipeline_class(dataset_properties=dataset_properties,
                              include=include, exclude=exclude)
    return pipeline.get_dataset_requirements()
Example #14
0
def _get_classification_configuration_space(info: Dict[str, Any], include: Dict[str, List[str]],
                                            exclude: Dict[str, List[str]],
                                            search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
                                            ) -> ConfigurationSpace:
    """Return the hyperparameter search space of the classification pipeline matching the task.

    Args:
        info: Dataset properties. ``info['task_type']`` is translated through
            ``STRING_TO_TASK_TYPES``, and ``info`` itself is passed to the
            pipeline as ``dataset_properties``.
        include: Forwarded unchanged to the pipeline constructor.
        exclude: Forwarded unchanged to the pipeline constructor.
        search_space_updates: Optional updates forwarded to the pipeline.

    Returns:
        The result of the pipeline's ``get_hyperparameter_search_space()``.

    Raises:
        ValueError: If the task type is neither a tabular nor an image task.
    """
    task_type = STRING_TO_TASK_TYPES[info['task_type']]

    if task_type in TABULAR_TASKS:
        pipeline_class = TabularClassificationPipeline
    elif task_type in IMAGE_TASKS:
        pipeline_class = ImageClassificationPipeline
    else:
        raise ValueError("Task_type not supported")

    pipeline = pipeline_class(dataset_properties=info,
                              include=include, exclude=exclude,
                              search_space_updates=search_space_updates)
    return pipeline.get_hyperparameter_search_space()
Example #15
0
def random_search_and_save(fit_dictionary: typing.Dict[str, typing.Any],
                           backend: Backend, num_models: int) -> None:
    """
    A function to generate randomly fitted pipelines.
    It inefficiently passes the data in the fit dictionary, as there is no datamanager yet.

    It uses the backend to save the models and predictions for the ensemble selection

    Args:
        fit_dictionary: Fit dictionary handed to every pipeline. This function
            reads 'val_indices', 'dataset_properties', 'seed' and 'epochs' from it.
        backend: Used to load the datamanager and to store the ensemble
            targets and each fitted run.
        num_models: Number of random configurations to sample, fit and save.
    """

    # Ensemble selection will evaluate performance on the OOF predictions. Store the OOF
    # Ground truth
    datamanager = backend.load_datamanager()
    X_train, y_train = datamanager.train_tensors
    X_test, y_test = (None, None)
    if datamanager.test_tensors is not None:
        X_test, y_test = datamanager.test_tensors
    val_indices = fit_dictionary['val_indices']
    targets = np.take(y_train, val_indices, axis=0)
    backend.save_targets_ensemble(targets)

    for idx in range(num_models):
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary['dataset_properties'])

        # Sample a random configuration
        pipeline_cs = pipeline.get_hyperparameter_search_space()
        config = pipeline_cs.sample_configuration()
        pipeline.set_hyperparameters(config)

        # Fit the sample configuration
        pipeline.fit(fit_dictionary)

        # Predict using the fit model
        ensemble_predictions = pipeline.predict(
            np.take(X_train, val_indices, axis=0))
        # Test tensors are optional, so only predict on them when they exist
        # instead of handing None to the pipeline.
        test_predictions = None
        if X_test is not None:
            test_predictions = pipeline.predict(X_test)

        # NOTE(review): assumes save_numrun_to_dir accepts
        # test_predictions=None the same way it is given
        # valid_predictions=None - confirm against the backend.
        backend.save_numrun_to_dir(
            seed=fit_dictionary['seed'],
            idx=idx,
            budget=fit_dictionary['epochs'],
            model=pipeline,
            cv_model=None,
            ensemble_predictions=ensemble_predictions,
            valid_predictions=None,
            test_predictions=test_predictions,
        )

        if test_predictions is None or y_test is None:
            # Without test data there is nothing to score against
            print(f"Fitted a pipeline {idx} (no test data, score not computed)")
        else:
            score = accuracy_score(y_test, np.argmax(test_predictions, axis=1))
            print(f"Fitted a pipeline {idx} with score = {score}")
def test_train_pipeline_with_runtime(fit_dictionary_tabular_dummy):
    """Train the default configuration under a runtime budget instead of an epoch budget.

    The fit dictionary is switched to ``budget_type='runtime'`` with a budget
    of 5, and the trainer's budget tracker and run summary are inspected.
    """

    # Convert the training to runtime
    fit_dictionary_tabular_dummy.pop('epochs', None)
    fit_dictionary_tabular_dummy.update(
        budget_type='runtime',
        runtime=5,
        early_stopping=-1,
    )

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular_dummy['dataset_properties'])

    default_config = pipeline.get_hyperparameter_search_space().get_default_configuration()
    pipeline.set_hyperparameters(default_config)

    pipeline.fit(fit_dictionary_tabular_dummy)

    trainer = pipeline.named_steps['trainer']
    run_summary = trainer.run_summary
    budget_tracker = trainer.budget_tracker

    assert budget_tracker.budget_type == 'runtime'
    assert budget_tracker.max_runtime == 5
    assert budget_tracker.is_max_time_reached()

    # There is no epoch limitation
    assert not budget_tracker.is_max_epoch_reached(epoch=np.inf)

    # The original author expected more than 200 epochs to pass in 5 seconds
    # on this dataset; the check only requires more than 100 recorded starts
    assert len(run_summary.performance_tracker['start_time']) > 100
Example #17
0
    def test_pipeline_fit(self):
        """Fit the pipeline with a randomly sampled configuration.

        After fitting, the trainer step must expose a run summary.
        """

        pipeline = TabularClassificationPipeline(
            dataset_properties=self.dataset_properties)
        sampled_config = pipeline.get_hyperparameter_search_space().sample_configuration()
        pipeline.set_hyperparameters(sampled_config)

        # First half of the samples is used for training, the rest for validation
        n_samples = self.X.shape[0]
        split_point = n_samples // 2
        fit_dictionary = dict(
            num_features=self.num_features,
            num_classes=self.num_classes,
            numerical_columns=list(range(self.num_features)),
            categorical_columns=[],
            categories=[],
            X_train=self.X,
            y_train=self.y,
            train_indices=list(range(split_point)),
            val_indices=list(range(split_point, n_samples)),
            is_small_preprocess=False,
            # Training configuration
            dataset_properties=self.dataset_properties,
            job_id='example_tabular_classification_1',
            device='cpu',
            budget_type='epochs',
            epochs=5,
            torch_num_threads=1,
            early_stopping=20,
            working_dir='/tmp',
            use_tensorboard_logger=True,
            use_pynisher=False,
            metrics_during_training=True,
            split_id=0,
            backend=self.backend,
        )
        pipeline.fit(fit_dictionary)

        # To make sure we fitted the model, there should be a
        # run summary object with accuracy
        trainer = pipeline.named_steps['trainer']
        self.assertIsNotNone(trainer.run_summary)
    def test_pipeline_fit(self, fit_dictionary_tabular):
        """Fit the pipeline with a randomly sampled configuration.

        Afterwards the trainer's run summary and the fitted network are inspected.
        """

        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'])
        config = pipeline.get_hyperparameter_search_space().sample_configuration()
        pipeline.set_hyperparameters(config)
        try:
            pipeline.fit(fit_dictionary_tabular)
        except Exception as e:
            pytest.fail(f"Failed due to {e} for config={config}")

        # To make sure we fitted the model, there should be a
        # run summary object with accuracy
        run_summary = pipeline.named_steps['trainer'].run_summary
        assert run_summary is not None

        # Make sure that performance was properly captured
        tracker = run_summary.performance_tracker
        assert tracker['train_loss'][1] > 0
        assert run_summary.total_parameter_count > 0
        assert 'accuracy' in tracker['train_metrics'][1]

        # Make sure a network was fit
        network = pipeline.named_steps['network'].get_network()
        assert isinstance(network, torch.nn.Module)
    def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor):
        """
        Fit a tabular classification pipeline whose feature preprocessor
        is restricted, through ``include``, to the given ``preprocessor``.
        """

        fit_dictionary_tabular['epochs'] = 1

        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'],
            include={'feature_preprocessor': [preprocessor]})
        config = pipeline.get_hyperparameter_search_space().sample_configuration()
        pipeline.set_hyperparameters(config)
        try:
            pipeline.fit(fit_dictionary_tabular)
        except Exception as e:
            pytest.fail(f"For config {config} failed with {e}")

        # To make sure we fitted the model, there should be a
        # run summary object with accuracy
        run_summary = pipeline.named_steps['trainer'].run_summary
        assert run_summary is not None

        # The chosen component must be the one that was included
        chosen_preprocessor = pipeline.named_steps['feature_preprocessor'].choice
        assert preprocessor == chosen_preprocessor.__class__.__name__
Example #20
0
 def test_remove_key_check_requirements(self):
     """Makes sure that when a key is removed from X, correct error is outputted

     Every key except the data loader ones is removed in turn from a copy of
     the fit dictionary. If fitting then raises a ValueError, its message must
     follow the "To fit ..., expected fit dictionary to have ..." format.
     """
     pipeline = TabularClassificationPipeline(
         dataset_properties=self.dataset_properties)
     X = {
         'num_features': self.num_features,
         'num_classes': self.num_classes,
         'numerical_columns': list(range(self.num_features)),
         'categorical_columns': [],
         'categories': [],
         'X_train': self.X,
         'y_train': self.y,
         'train_indices': list(range(self.X.shape[0] // 2)),
         'val_indices': list(range(self.X.shape[0] // 2, self.X.shape[0])),
         'is_small_preprocess': False,
         # Training configuration
         'dataset_properties': self.dataset_properties,
         'job_id': 'example_tabular_classification_1',
         'device': 'cpu',
         'budget_type': 'epochs',
         'epochs': 5,
         'torch_num_threads': 1,
         'early_stopping': 20,
         'working_dir': '/tmp',
         'use_tensorboard_logger': True,
         'use_pynisher': False,
         'metrics_during_training': True,
         'split_id': 0,
         'backend': self.backend,
     }
     for key in X:
         # skip tests for data loader requirements as data loader has different check_requirements
         # The original `key == 'y_train' or 'val_indices'` was always truthy
         # (a non-empty string), so every key was skipped and nothing was tested.
         if key in ('y_train', 'val_indices'):
             continue
         X_copy = X.copy()
         X_copy.pop(key)
         # NOTE(review): a missing key is only checked when fit raises a
         # ValueError; keys whose removal does not raise pass silently.
         try:
             pipeline.fit(X_copy)
         except ValueError as msg:
             self.assertRegex(
                 str(msg),
                 r"To fit .+?, expected fit dictionary to have .+? but got .*"
             )
 def test_apply_search_space_updates(self, fit_dictionary_tabular,
                                     search_space_updates):
     """Build a pipeline with the given search space updates and check they were applied."""
     pipeline = TabularClassificationPipeline(
         dataset_properties=dict(
             numerical_columns=[1],
             categorical_columns=[2],
             task_type='tabular_classification',
         ),
         search_space_updates=search_space_updates,
     )
     self._assert_pipeline_search_space(pipeline, search_space_updates)
Example #22
0
    def test_tabular_no_preprocess(self):
        """Fit the pipeline (trainer removed) with ``is_small_preprocess=False``.

        The transformed fit dictionary must carry a 'preprocess_transforms'
        list whose last entry wraps a ``BaseEstimator`` preprocessor.
        """
        dataset_properties = dict(
            numerical_columns=list(range(15)),
            categorical_columns=[],
            task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
            output_type=OUTPUT_TYPES_TO_STRING[MULTICLASS],
            is_small_preprocess=False,
            input_shape=(15, ),
            output_shape=2,
            categories=[],
            issparse=False,
        )
        fit_dictionary = {
            'X_train': np.random.random((10, 15)),
            'y_train': np.random.random(10),
            'train_indices': [0, 1, 2, 3, 4, 5],
            'val_indices': [6, 7, 8, 9],
            'dataset_properties': dataset_properties,
            # Training configuration
            'num_run': 16,
            'device': 'cpu',
            'budget_type': 'epochs',
            'epochs': 10,
            'torch_num_threads': 1,
            'early_stopping': 20,
            'split_id': 0,
            'backend': self.backend,
        }

        pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
        # Remove the trainer (the last step of the pipeline)
        pipeline.steps.pop()
        transformed = pipeline.fit(fit_dictionary).transform(fit_dictionary)

        self.assertIn('preprocess_transforms', transformed.keys())
        transforms = transformed['preprocess_transforms']
        self.assertIsInstance(transforms, list)
        self.assertIsInstance(transforms[-1].preprocessor, BaseEstimator)
    def test_set_range_search_space_updates(self, fit_dictionary_tabular):
        """Override every non-``__choice__`` hyperparameter and check the pipeline.

        Categorical hyperparameters are pinned to their first choice, all
        others get the range (0, 1) with default 1.
        """
        dataset_properties = dict(
            numerical_columns=[1],
            categorical_columns=[2],
            task_type='tabular_classification',
        )
        default_pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
        hyperparameters = default_pipeline.get_hyperparameter_search_space()._hyperparameters

        updates = HyperparameterSearchSpaceUpdates()
        for full_name, hyperparameter in hyperparameters.items():
            if '__choice__' in full_name:
                continue
            # Names look like "<node>:<hyperparameter...>"
            node_name, *remainder = full_name.split(':')
            hyperparameter_name = ':'.join(remainder)

            # For network nodes, strip a trailing "_<something with a digit>"
            # suffix from the hyperparameter name
            stem, separator, suffix = hyperparameter_name.rpartition('_')
            if separator and 'network' in node_name and any(ch.isnumeric() for ch in suffix):
                hyperparameter_name = stem

            if isinstance(hyperparameter, CategoricalHyperparameter):
                first_choice = hyperparameter.choices[0]
                value_range, default_value = (first_choice, ), first_choice
            else:
                value_range, default_value = (0, 1), 1
            updates.append(node_name=node_name,
                           hyperparameter=hyperparameter_name,
                           value_range=value_range,
                           default_value=default_value)

        updated_pipeline = TabularClassificationPipeline(
            dataset_properties=dataset_properties,
            search_space_updates=updates)

        try:
            self._assert_pipeline_search_space(updated_pipeline, updates)
        except AssertionError as e:
            # As we are setting num_layers to 1 for fully connected
            # head, units_layer does not exist in the configspace
            assert 'fully_connected:units_layer' in e.args[0], e.args[0]
Example #24
0
def test_show_models(fit_dictionary_tabular):
    """
    Check the text returned by ``show_models`` for an ensemble that holds a
    single default-configured tabular classification pipeline of weight 1.0.
    """
    api = BaseTask()
    api.ensemble_ = MagicMock()

    default_pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])
    api.models_ = [default_pipeline]
    api.ensemble_.get_models_with_weights.return_value = [
        (1.0, api.models_[0]),
    ]

    # Expect the default configuration
    # NOTE(review): the unescaped '|' characters act as regex alternation, so
    # a match of any single alternative satisfies the search -- confirm
    # whether literal column separators were intended.
    expected = (
        r"0\s+|\s+SimpleImputer,OneHotEncoder,NoScaler,NoFeaturePreprocessing\s+"
        r"|\s+no embedding,ShapedMLPBackbone,FullyConnectedHead,nn.Sequential\s+|\s+1"
    )
    rendered = api.show_models()
    found = re.search(expected, rendered)
    assert found is not None
Example #25
0
    def build_pipeline(
        self,
        dataset_properties: Dict[str, Any],
    ) -> TabularClassificationPipeline:
        """
        Create the tabular classification pipeline for the given dataset.

        Args:
            dataset_properties (Dict[str, Any]):
                Dataset properties, forwarded unchanged to the pipeline
                constructor.

        Returns:
            TabularClassificationPipeline:
                A new pipeline constructed from ``dataset_properties``.
        """
        pipeline = TabularClassificationPipeline(
            dataset_properties=dataset_properties)
        return pipeline
 def test_error_search_space_updates(self, fit_dictionary_tabular,
                                     error_search_space_updates):
     """
     Check the error raised when a pipeline is built with the
     ``error_search_space_updates`` fixture: it must be a ValueError whose
     message reports an unknown hyperparameter for a component.
     """
     dataset_properties = dict(
         numerical_columns=[1],
         categorical_columns=[2],
         task_type='tabular_classification',
     )
     expected_message = (
         r'Unknown hyperparameter for component .*?\. Expected update '
         r'hyperparameter to be in \[.*?\] got .+')
     # NOTE(review): if the constructor raises nothing, this test passes
     # without checking anything -- confirm whether that is intended.
     try:
         TabularClassificationPipeline(
             dataset_properties=dataset_properties,
             search_space_updates=error_search_space_updates)
     except Exception as error:
         assert isinstance(error, ValueError)
         assert re.match(expected_message, error.args[0])
    def test_pipeline_transform(self, fit_dictionary_tabular):
        """
        Check that transform() enriches the fit dictionary after fitting.

        In the context of autopytorch, transform expands a fit dictionary with
        the components that were previously fit, which makes it a convenient
        way to verify that fit works properly. This test guards against
        components not being properly added to the fit dictionary.
        """
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'])
        sampled_config = pipeline.get_hyperparameter_search_space(
        ).sample_configuration()
        pipeline.set_hyperparameters(sampled_config)

        # Replace the trainer's train_epoch with a mock for the fit call
        train_epoch_patcher = unittest.mock.patch.object(
            pipeline.named_steps['trainer'].choice, 'train_epoch')
        with train_epoch_patcher as patch_train:
            patch_train.return_value = 1, {}
            # Fit on a copy: we do not want to make the same early
            # preprocessing operation to the original fit dictionary
            pipeline.fit(fit_dictionary_tabular.copy())

        transformed = pipeline.transform(fit_dictionary_tabular)

        # First, nothing from the input may be lost
        # (items-view subset containment check)
        assert fit_dictionary_tabular.items() <= transformed.items()

        # Then the pipeline should have added the following keys
        added_keys = {
            'imputer', 'encoder', 'scaler', 'tabular_transformer',
            'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler',
            'train_data_loader', 'val_data_loader', 'run_summary'
        }
        present_keys = set(transformed.keys())
        assert added_keys.issubset(present_keys)

        # Then we need to have transformations being created.
        created_transforms = get_preprocess_transforms(transformed)
        assert len(created_transforms) > 0

        # We expect the transformations to be in the pipeline at anytime for inference
        assert 'preprocess_transforms' in transformed.keys()
Example #28
0
def _get_classification_dataset_requirements(info: Dict[str, Any],
                                             include: Optional[Dict] = None,
                                             exclude: Optional[Dict] = None,
                                             search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
                                             ) -> List[FitRequirement]:
    """
    Return the dataset requirements of the classification pipeline that
    matches the task type stored in ``info``.

    Args:
        info (Dict[str, Any]):
            Dataset properties; ``info['task_type']`` is looked up in
            ``STRING_TO_TASK_TYPES`` and the whole dict is passed to the
            pipeline as ``dataset_properties``.
        include (Optional[Dict]):
            Forwarded unchanged to the pipeline constructor.
        exclude (Optional[Dict]):
            Forwarded unchanged to the pipeline constructor.
        search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
            Forwarded unchanged to the pipeline constructor.

    Returns:
        List[FitRequirement]:
            The result of ``get_dataset_requirements()`` on the pipeline.

    Raises:
        ValueError: If the task type is neither a tabular nor an image task.
    """
    task_type = STRING_TO_TASK_TYPES[info['task_type']]

    # Select the pipeline class first; construction is identical for both
    if task_type in TABULAR_TASKS:
        pipeline_class = TabularClassificationPipeline
    elif task_type in IMAGE_TASKS:
        pipeline_class = ImageClassificationPipeline
    else:
        raise ValueError("Task_type not supported")

    pipeline = pipeline_class(
        dataset_properties=info,
        include=include, exclude=exclude,
        search_space_updates=search_space_updates)
    return pipeline.get_dataset_requirements()
    def test_read_and_update_search_space(self, fit_dictionary_tabular,
                                          search_space_updates):
        """
        Round-trip search space updates through a file and check that a
        pipeline built from the parsed updates stores them unchanged.

        The updates are written into a private temporary directory that is
        removed when the block exits, so the test leaves no file behind in
        the shared system temp directory and cannot collide with another
        run using the same file name.
        """
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, 'updates.txt')
            # Write to disk
            search_space_updates.save_as_file(path=path)
            assert os.path.exists(path=path)

            # Read from disk
            file_search_space_updates = parse_hyperparameter_search_space_updates(
                updates_file=path)

        assert isinstance(file_search_space_updates,
                          HyperparameterSearchSpaceUpdates)
        dataset_properties = {
            'numerical_columns': [1],
            'categorical_columns': [2],
            'task_type': 'tabular_classification'
        }
        pipeline = TabularClassificationPipeline(
            dataset_properties=dataset_properties,
            search_space_updates=file_search_space_updates)
        assert file_search_space_updates == pipeline.search_space_updates
Example #30
0
# Save data via backend to fit the pipeline
# Wrap the train and test splits in a TabularDataset.
datamanager = TabularDataset(
    X=X_train,
    Y=y_train,
    X_test=X_test,
    Y_test=y_test,
)

# Create the backend with its temporary and output directories under ./tmp.
# delete_tmp_folder_after_terminate=False presumably keeps the temporary
# folder after the run -- confirm against the backend documentation.
backend = create(
    temporary_directory='./tmp/autoPyTorch_tabular_classification_tmp',
    output_directory='./tmp/autoPyTorch_tabular_classification_out',
    delete_tmp_folder_after_terminate=False)
# Hand the dataset to the backend.
backend.save_datamanager(datamanager)

# Instantiate the pipeline from the dataset properties.
pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)

# Create a fit dictionary
fit_dictionary = {
    'categorical_columns': categorical_columns,
    'numerical_columns': numerical_columns,
    'num_features': X.shape[1],
    'num_classes': len(np.unique(y)),
    'is_small_preprocess': True,
    'categories': categories,
    'X_train': X_train,
    'y_train': y_train,
    'train_indices': train_indices,
    'val_indices': val_indices,
    'X_test': X_test,
    'y_test': y_test,