def test_set_choices_updates(self, fit_dictionary_tabular):
    dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
                          'task_type': 'tabular_classification'}
    config_dict = TabularClassificationPipeline(dataset_properties=dataset_properties). \
        get_hyperparameter_search_space()._hyperparameters
    updates = HyperparameterSearchSpaceUpdates()
    for i, (name, hyperparameter) in enumerate(config_dict.items()):
        if '__choice__' not in name:
            continue
        name = name.split(':')
        hyperparameter_name = ':'.join(name[1:])
        # Using NoEmbedding is safer for this test
        # to avoid forbidden configuration errors
        if name[0] == 'network_embedding' and hyperparameter_name == '__choice__':
            value_range = ('NoEmbedding',)
            default_value = 'NoEmbedding'
        else:
            value_range = (hyperparameter.choices[0],)
            default_value = hyperparameter.choices[0]
        updates.append(node_name=name[0], hyperparameter=hyperparameter_name,
                       value_range=value_range, default_value=default_value)
    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties,
                                             search_space_updates=updates)
    self._assert_pipeline_search_space(pipeline, updates)
def test_train_pipeline_with_runtime(fit_dictionary_tabular_dummy):
    """This test makes sure that the pipeline is able to train with a runtime
    budget (instead of an epoch budget) on dummy data given the default configuration"""
    # Convert the training budget from epochs to runtime
    fit_dictionary_tabular_dummy.pop('epochs', None)
    fit_dictionary_tabular_dummy['budget_type'] = 'runtime'
    fit_dictionary_tabular_dummy['runtime'] = 5
    fit_dictionary_tabular_dummy['early_stopping'] = -1

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular_dummy['dataset_properties'])

    cs = pipeline.get_hyperparameter_search_space()
    config = cs.get_default_configuration()
    pipeline.set_hyperparameters(config)
    pipeline.fit(fit_dictionary_tabular_dummy)

    run_summary = pipeline.named_steps['trainer'].run_summary
    budget_tracker = pipeline.named_steps['trainer'].budget_tracker
    assert budget_tracker.budget_type == 'runtime'
    assert budget_tracker.max_runtime == 5
    assert budget_tracker.is_max_time_reached()

    # There is no epoch limitation
    assert not budget_tracker.is_max_epoch_reached(epoch=np.inf)

    # Well over 100 epochs should have passed within 5 seconds on this dataset
    assert len(run_summary.performance_tracker['start_time']) > 100
def test_pipeline_fit(self, fit_dictionary_tabular):
    """This test makes sure that the pipeline is able to fit
    given random combinations of hyperparameters across the pipeline"""
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)
    try:
        pipeline.fit(fit_dictionary_tabular)
    except Exception as e:
        pytest.fail(f"Failed due to {e} for config={config}")

    # To make sure we fitted the model, there should be a
    # run summary object with accuracy
    run_summary = pipeline.named_steps['trainer'].run_summary
    assert run_summary is not None

    # Make sure that performance was properly captured
    assert run_summary.performance_tracker['train_loss'][1] > 0
    assert run_summary.total_parameter_count > 0
    assert 'accuracy' in run_summary.performance_tracker['train_metrics'][1]

    # Make sure a network was fit
    assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module)
def test_pipeline_predict(self, fit_dictionary_tabular):
    """This test makes sure that the pipeline is able to predict
    given a random configuration"""
    X = fit_dictionary_tabular['X_train'].copy()
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])

    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)

    with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice,
                                    'train_epoch') as patch_train:
        patch_train.return_value = 1, {}
        pipeline.fit(fit_dictionary_tabular)

    # we expect the output to have the same batch size as the test input,
    # and number of outputs per batch sample equal to the number of outputs
    expected_output_shape = (X.shape[0],
                             fit_dictionary_tabular["dataset_properties"]["output_shape"])

    prediction = pipeline.predict(X)
    assert isinstance(prediction, np.ndarray)
    assert prediction.shape == expected_output_shape
def test_tabular_preprocess(self):
    dataset_properties = {
        'numerical_columns': list(range(15)),
        'categorical_columns': [],
        'task_type': 'tabular_classification'
    }
    X = dict(
        X_train=np.random.random((10, 15)),
        y_train=np.random.random(10),
        train_indices=[0, 1, 2, 3, 4, 5],
        val_indices=[6, 7, 8, 9],
        is_small_preprocess=True,
        numerical_columns=list(range(15)),
        categorical_columns=[],
        num_features=15,
        num_classes=2,
        categories=[],
        # Training configuration
        job_id='test',
        device='cpu',
        budget_type='epochs',
        epochs=10,
        torch_num_threads=1,
        early_stopping=20,
        dataset_properties=dataset_properties,
        split_id=0,
        backend=self.backend,
    )

    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
    # Remove the trainer
    pipeline.steps.pop()
    pipeline = pipeline.fit(X)
    X = pipeline.transform(X)

    self.assertNotIn('preprocess_transforms', X.keys())
def test_pipeline_predict_proba(self, fit_dictionary_tabular):
    """This test makes sure that the pipeline is able to fit
    given random combinations of hyperparameters across the pipeline
    and then predict using predict_proba
    """
    X = fit_dictionary_tabular['X_train'].copy()
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])

    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)

    try:
        with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice,
                                        'train_epoch') as patch_train:
            patch_train.return_value = 1, {}
            pipeline.fit(fit_dictionary_tabular)
    except Exception as e:
        pytest.fail(f"Failed on config={config} with {e}")

    # we expect the output to have the same batch size as the test input,
    # and number of outputs per batch sample equal to the number of classes
    # ("output_shape" in dataset_properties)
    expected_output_shape = (X.shape[0],
                             fit_dictionary_tabular["dataset_properties"]["output_shape"])

    prediction = pipeline.predict_proba(X)
    assert isinstance(prediction, np.ndarray)
    assert prediction.shape == expected_output_shape
def test_pipeline_fit(self):
    """This test makes sure that the pipeline is able to fit
    given random combinations of hyperparameters across the pipeline"""
    pipeline = TabularClassificationPipeline(
        dataset_properties=self.dataset_properties)
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)
    pipeline.fit({
        'num_features': self.num_features,
        'num_classes': self.num_classes,
        'numerical_columns': list(range(self.num_features)),
        'categorical_columns': [],
        'categories': [],
        'X_train': self.X,
        'y_train': self.y,
        'train_indices': list(range(self.X.shape[0] // 2)),
        'val_indices': list(range(self.X.shape[0] // 2, self.X.shape[0])),
        'is_small_preprocess': False,
        # Training configuration
        'dataset_properties': self.dataset_properties,
        'job_id': 'example_tabular_classification_1',
        'device': 'cpu',
        'budget_type': 'epochs',
        'epochs': 5,
        'torch_num_threads': 1,
        'early_stopping': 20,
        'working_dir': '/tmp',
        'use_tensorboard_logger': True,
        'use_pynisher': False,
        'metrics_during_training': True,
        'split_id': 0,
        'backend': self.backend,
    })

    # To make sure we fitted the model, there should be a
    # run summary object with accuracy
    self.assertIsNotNone(pipeline.named_steps['trainer'].run_summary)
def _get_classification_dataset_requirements(info: Dict[str, Any],
                                             include: Dict[str, List[str]],
                                             exclude: Dict[str, List[str]]
                                             ) -> List[FitRequirement]:
    task_type = info['task_type']
    output_type = info['output_type']
    sparse = info['issparse']

    dataset_properties = {
        'task_type': task_type,
        'output_type': output_type,
        'sparse': sparse
    }
    if task_type in TABULAR_TASKS:
        dataset_properties.update({'numerical_columns': info['numerical_columns'],
                                   'categorical_columns': info['categorical_columns']})
        return TabularClassificationPipeline(
            dataset_properties=dataset_properties,
            include=include, exclude=exclude). \
            get_dataset_requirements()
    elif task_type in IMAGE_TASKS:
        return ImageClassificationPipeline(
            dataset_properties=dataset_properties,
            include=include, exclude=exclude). \
            get_dataset_requirements()
    else:
        raise ValueError("Task_type not supported")
def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor):
    """This test ensures that a tabular classification pipeline
    can be fit with all preprocessors in the include"""
    fit_dictionary_tabular['epochs'] = 1

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'],
        include={'feature_preprocessor': [preprocessor]})
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)
    try:
        pipeline.fit(fit_dictionary_tabular)
    except Exception as e:
        pytest.fail(f"For config {config} failed with {e}")

    # To make sure we fitted the model, there should be a
    # run summary object with accuracy
    run_summary = pipeline.named_steps['trainer'].run_summary
    assert run_summary is not None
    assert preprocessor == pipeline.named_steps['feature_preprocessor'].choice.__class__.__name__
def test_pipeline_score(fit_dictionary_tabular_dummy):
    """This test makes sure that the pipeline is able to achieve a decent score
    on dummy data given the default configuration"""
    X = fit_dictionary_tabular_dummy['X_train'].copy()
    y = fit_dictionary_tabular_dummy['y_train'].copy()

    # increase number of epochs to test for performance
    fit_dictionary_tabular_dummy['epochs'] = 50

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular_dummy['dataset_properties'])

    cs = pipeline.get_hyperparameter_search_space()
    config = cs.get_default_configuration()
    pipeline.set_hyperparameters(config)

    pipeline.fit(fit_dictionary_tabular_dummy)

    # Ensure that the network is an instance of torch Module
    assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module)

    accuracy = pipeline.score(X, y)

    # we should be able to get a decent score on this dummy data
    assert accuracy >= 0.8, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}"
def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular):
    """Fitting a network should put the network in the X"""
    # Create the pipeline to check. A random config should be sufficient
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)

    # Make sure that fitting a network adds a "network" to X
    assert 'network' in pipeline.named_steps.keys()
    fit_dictionary_tabular['network_embedding'] = torch.nn.Linear(3, 3)
    fit_dictionary_tabular['network_backbone'] = torch.nn.Linear(3, 4)
    fit_dictionary_tabular['network_head'] = torch.nn.Linear(4, 1)
    X = pipeline.named_steps['network'].fit(
        fit_dictionary_tabular, None).transform(fit_dictionary_tabular)
    assert 'network' in X

    # Then fitting an optimizer should fail if there is no network:
    assert 'optimizer' in pipeline.named_steps.keys()
    with pytest.raises(
            ValueError,
            match=r"To fit .+?, expected fit dictionary to have 'network' but got .*"):
        pipeline.named_steps['optimizer'].fit({'dataset_properties': {}}, None)

    # No error when a network is passed
    X = pipeline.named_steps['optimizer'].fit(X, None).transform(X)
    assert 'optimizer' in X
    assert isinstance(pipeline.named_steps['optimizer'].choice.get_optimizer(),
                      torch.optim.Optimizer)

    # Likewise, fitting an lr_scheduler should fail if there is no optimizer:
    assert 'lr_scheduler' in pipeline.named_steps.keys()
    with pytest.raises(
            ValueError,
            match=r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*"):
        pipeline.named_steps['lr_scheduler'].fit({'dataset_properties': {}}, None)

    # No error when an optimizer is passed
    X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X)
    assert 'lr_scheduler' in X
    if isinstance(pipeline.named_steps['lr_scheduler'].choice, NoScheduler):
        pytest.skip("This scheduler does not support `get_scheduler`")
    lr_scheduler = pipeline.named_steps['lr_scheduler'].choice.get_scheduler()
    if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        pytest.skip("This scheduler is not a child of _LRScheduler")
    assert isinstance(lr_scheduler, _LRScheduler)
def test_apply_search_space_updates(self, fit_dictionary_tabular, search_space_updates):
    dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
                          'task_type': 'tabular_classification'}
    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties,
                                             search_space_updates=search_space_updates)
    self._assert_pipeline_search_space(pipeline, search_space_updates)
def test_default_configuration(self):
    """Makes sure that when no config is set, we can trust the
    default configuration from the space"""
    pipeline = TabularClassificationPipeline(
        dataset_properties=self.dataset_properties)
    pipeline.fit({
        'num_features': self.num_features,
        'num_classes': self.num_classes,
        'numerical_columns': list(range(self.num_features)),
        'categorical_columns': [],
        'categories': [],
        'X_train': self.X,
        'y_train': self.y,
        'train_indices': list(range(self.X.shape[0] // 2)),
        'val_indices': list(range(self.X.shape[0] // 2, self.X.shape[0])),
        'is_small_preprocess': False,
        # Training configuration
        'dataset_properties': self.dataset_properties,
        'job_id': 'example_tabular_classification_1',
        'device': 'cpu',
        'budget_type': 'epochs',
        'epochs': 5,
        'torch_num_threads': 1,
        'early_stopping': 20,
        'working_dir': '/tmp',
        'use_tensorboard_logger': True,
        'use_pynisher': False,
        'metrics_during_training': True,
        'split_id': 0,
        'backend': self.backend,
    })
def test_get_fit_requirements(self):
    dataset_properties = {'numerical_columns': [], 'categorical_columns': []}
    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
    fit_requirements = pipeline.get_fit_requirements()

    # check if fit requirements is a list of FitRequirement named tuples
    self.assertIsInstance(fit_requirements, list)
    for requirement in fit_requirements:
        self.assertIsInstance(requirement, FitRequirement)
def test_set_range_search_space_updates(self, fit_dictionary_tabular):
    dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
                          'task_type': 'tabular_classification'}
    config_dict = TabularClassificationPipeline(dataset_properties=dataset_properties). \
        get_hyperparameter_search_space()._hyperparameters
    updates = HyperparameterSearchSpaceUpdates()
    for i, (name, hyperparameter) in enumerate(config_dict.items()):
        if '__choice__' in name:
            continue
        name = name.split(':')
        hyperparameter_name = ':'.join(name[1:])
        if '_' in hyperparameter_name:
            # Strip the numeric layer suffix (e.g. 'units_layer_1' -> 'units_layer')
            # for hyperparameters that belong to network nodes
            if any(l_.isnumeric() for l_ in hyperparameter_name.split('_')[-1]) \
                    and 'network' in name[0]:
                hyperparameter_name = '_'.join(hyperparameter_name.split('_')[:-1])
        if isinstance(hyperparameter, CategoricalHyperparameter):
            value_range = (hyperparameter.choices[0],)
            default_value = hyperparameter.choices[0]
        else:
            value_range = (0, 1)
            default_value = 1
        updates.append(node_name=name[0], hyperparameter=hyperparameter_name,
                       value_range=value_range, default_value=default_value)
    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties,
                                             search_space_updates=updates)

    try:
        self._assert_pipeline_search_space(pipeline, updates)
    except AssertionError as e:
        # As we are setting num_layers to 1 for fully connected
        # head, units_layer does not exist in the configspace
        assert 'fully_connected:units_layer' in e.args[0], e.args[0]
def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess):
    """Makes sure that when no config is set, we can trust the
    default configuration from the space"""
    fit_dictionary_tabular['is_small_preprocess'] = is_small_preprocess

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])

    with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice,
                                    'train_epoch') as patch_train:
        patch_train.return_value = 1, {}
        pipeline.fit(fit_dictionary_tabular)
def random_search_and_save(fit_dictionary: typing.Dict[str, typing.Any],
                           backend: Backend,
                           num_models: int) -> None:
    """
    A function to generate randomly fitted pipelines.
    It inefficiently passes the data via the fit dictionary,
    as there is no datamanager yet.

    It uses the backend to save the models and predictions for
    the ensemble selection
    """
    # Ensemble selection will evaluate performance on the OOF predictions,
    # so store the out-of-fold ground truth
    datamanager = backend.load_datamanager()
    X_train, y_train = datamanager.train_tensors
    X_test, y_test = (None, None)
    if datamanager.test_tensors is not None:
        X_test, y_test = datamanager.test_tensors
    targets = np.take(y_train, fit_dictionary['val_indices'], axis=0)
    backend.save_targets_ensemble(targets)

    for idx in range(num_models):
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary['dataset_properties'])

        # Sample a random configuration
        pipeline_cs = pipeline.get_hyperparameter_search_space()
        config = pipeline_cs.sample_configuration()
        pipeline.set_hyperparameters(config)

        # Fit the sampled configuration
        pipeline.fit(fit_dictionary)

        # Predict using the fitted model
        ensemble_predictions = pipeline.predict(
            np.take(X_train, fit_dictionary['val_indices'], axis=0))
        test_predictions = pipeline.predict(X_test)

        backend.save_numrun_to_dir(
            seed=fit_dictionary['seed'],
            idx=idx,
            budget=fit_dictionary['epochs'],
            model=pipeline,
            cv_model=None,
            ensemble_predictions=ensemble_predictions,
            valid_predictions=None,
            test_predictions=test_predictions,
        )

        score = accuracy_score(y_test, np.argmax(test_predictions, axis=1))
        print(f"Fitted a pipeline {idx} with score = {score}")

    return
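# A minimal usage sketch for random_search_and_save (an assumption for illustration, not part
# of the original script): it reuses the backend `create` utility and the fit-dictionary layout
# shown elsewhere in this section; directory names, `datamanager`, `fit_dictionary`, and
# `num_models` are placeholders.
backend = create(
    temporary_directory='./tmp/random_search_tmp',
    output_directory='./tmp/random_search_out',
    delete_tmp_folder_after_terminate=False)
backend.save_datamanager(datamanager)  # a TabularDataset built from the raw train/test splits
random_search_and_save(fit_dictionary, backend, num_models=5)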
def test_remove_key_check_requirements(self, fit_dictionary_tabular):
    """Makes sure that when a key is removed from X, the correct error is raised"""
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])
    for key in ['num_run', 'device', 'split_id', 'torch_num_threads', 'dataset_properties']:
        fit_dictionary_tabular_copy = fit_dictionary_tabular.copy()
        fit_dictionary_tabular_copy.pop(key)
        with pytest.raises(ValueError,
                           match=r"To fit .+?, expected fit dictionary to have"):
            pipeline.fit(fit_dictionary_tabular_copy)
def test_get_fit_requirements(self, fit_dictionary_tabular):
    dataset_properties = {'numerical_columns': [], 'categorical_columns': [],
                          'task_type': 'tabular_classification'}
    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
    fit_requirements = pipeline.get_fit_requirements()

    # check if fit requirements is a list of FitRequirement named tuples
    assert isinstance(fit_requirements, list)
    for requirement in fit_requirements:
        assert isinstance(requirement, FitRequirement)
def test_network_optimizer_lr_handshake(self):
    """Fitting a network should put the network in the X"""
    # Create the pipeline to check. A random config should be sufficient
    dataset_properties = {'numerical_columns': [], 'categorical_columns': [],
                          'task_type': 'tabular_classification'}
    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)

    # Make sure that fitting a network adds a "network" to X
    self.assertIn('network', pipeline.named_steps.keys())
    X = pipeline.named_steps['network'].fit(
        {'num_features': 10, 'num_classes': 2, 'X_train': self.X, 'y_train': self.y},
        None
    ).transform({})
    self.assertIn('network', X)

    # Then fitting an optimizer should fail if there is no network:
    self.assertIn('optimizer', pipeline.named_steps.keys())
    with self.assertRaisesRegex(
            ValueError,
            r"To fit .+?, expected fit dictionary to have 'network' but got .*"):
        pipeline.named_steps['optimizer'].fit({}, None)

    # No error when a network is passed
    X = pipeline.named_steps['optimizer'].fit(X, None).transform(X)
    self.assertIn('optimizer', X)

    # Likewise, fitting an lr_scheduler should fail if there is no optimizer:
    self.assertIn('lr_scheduler', pipeline.named_steps.keys())
    with self.assertRaisesRegex(
            ValueError,
            r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*"):
        pipeline.named_steps['lr_scheduler'].fit({}, None)

    # No error when an optimizer is passed
    X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X)
    self.assertIn('lr_scheduler', X)
def test_show_models(fit_dictionary_tabular):
    api = BaseTask()
    api.ensemble_ = MagicMock()
    api.models_ = [TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])]
    api.ensemble_.get_models_with_weights.return_value = [(1.0, api.models_[0])]
    # Expect the default configuration (the literal '|' column separators are escaped
    # so they are not interpreted as regex alternation)
    expected = (r"0\s+\|\s+SimpleImputer,OneHotEncoder,NoScaler,NoFeaturePreprocessing\s+"
                r"\|\s+no embedding,ShapedMLPBackbone,FullyConnectedHead,nn.Sequential\s+\|\s+1")
    assert re.search(expected, api.show_models()) is not None
def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
    """
    Build pipeline according to current task and for the passed dataset properties

    Args:
        dataset_properties (Dict[str, Any])

    Returns:
        TabularClassificationPipeline:
            Pipeline compatible with the given dataset properties.
    """
    return TabularClassificationPipeline(dataset_properties=dataset_properties)
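# A minimal call sketch (illustrative assumption, not from the original file): `task` stands in
# for whatever object exposes build_pipeline, and the dataset_properties keys mirror the ones
# used elsewhere in this section.
dataset_properties = {
    'numerical_columns': [0, 1, 2],
    'categorical_columns': [3],
    'task_type': 'tabular_classification',
}
pipeline = task.build_pipeline(dataset_properties)
cs = pipeline.get_hyperparameter_search_space()
config = cs.sample_configuration()
pipeline.set_hyperparameters(config)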
def test_error_search_space_updates(self, fit_dictionary_tabular, error_search_space_updates):
    dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
                          'task_type': 'tabular_classification'}
    # An update for an unknown hyperparameter must raise a ValueError
    with pytest.raises(ValueError,
                       match=r'Unknown hyperparameter for component .*?\. Expected update '
                             r'hyperparameter to be in \[.*?\] got .+'):
        _ = TabularClassificationPipeline(
            dataset_properties=dataset_properties,
            search_space_updates=error_search_space_updates)
def _get_classification_configuration_space(
    info: Dict[str, Any],
    include: Dict[str, List[str]],
    exclude: Dict[str, List[str]],
    search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
) -> ConfigurationSpace:
    if STRING_TO_TASK_TYPES[info['task_type']] in TABULAR_TASKS:
        pipeline = TabularClassificationPipeline(dataset_properties=info,
                                                 include=include,
                                                 exclude=exclude,
                                                 search_space_updates=search_space_updates)
        return pipeline.get_hyperparameter_search_space()
    elif STRING_TO_TASK_TYPES[info['task_type']] in IMAGE_TASKS:
        return ImageClassificationPipeline(
            dataset_properties=info,
            include=include, exclude=exclude,
            search_space_updates=search_space_updates). \
            get_hyperparameter_search_space()
    else:
        raise ValueError("Task_type not supported")
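# Illustrative call sketch (an assumption, not from the source): `info` uses the string task
# type implied by the STRING_TO_TASK_TYPES lookup, and the remaining keys mirror the dataset
# properties used throughout these tests; the concrete values are made up for the example.
info = {
    'task_type': 'tabular_classification',
    'output_type': 'multiclass',
    'issparse': False,
    'numerical_columns': [0, 1],
    'categorical_columns': [2],
}
cs = _get_classification_configuration_space(info, include={}, exclude={},
                                             search_space_updates=None)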
def test_pipeline_transform(self, fit_dictionary_tabular):
    """
    In the context of autopytorch, transform() expands a fit dictionary with
    components that were previously fit. We can use this as a nice way to make
    sure that fit works properly. This code is added in light of components
    not properly added to the fit dictionary
    """
    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'])
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)

    with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice,
                                    'train_epoch') as patch_train:
        patch_train.return_value = 1, {}
        # We do not want to apply the same early preprocessing operation to the fit dictionary
        pipeline.fit(fit_dictionary_tabular.copy())

    transformed_fit_dictionary_tabular = pipeline.transform(fit_dictionary_tabular)

    # First, we do not lose anyone! (We use a fancy subset containment check)
    assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items()

    # Then the pipeline should have added the following keys
    expected_keys = {'imputer', 'encoder', 'scaler', 'tabular_transformer',
                     'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler',
                     'train_data_loader', 'val_data_loader', 'run_summary'}
    assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys()))

    # Then we need to have transformations being created.
    assert len(get_preprocess_transforms(transformed_fit_dictionary_tabular)) > 0

    # We expect the transformations to be in the pipeline at any time for inference
    assert 'preprocess_transforms' in transformed_fit_dictionary_tabular.keys()
def test_remove_key_check_requirements(self):
    """Makes sure that when a key is removed from X, the correct error is raised"""
    pipeline = TabularClassificationPipeline(dataset_properties=self.dataset_properties)
    X = {
        'num_features': self.num_features,
        'num_classes': self.num_classes,
        'numerical_columns': list(range(self.num_features)),
        'categorical_columns': [],
        'categories': [],
        'X_train': self.X,
        'y_train': self.y,
        'train_indices': list(range(self.X.shape[0] // 2)),
        'val_indices': list(range(self.X.shape[0] // 2, self.X.shape[0])),
        'is_small_preprocess': False,
        # Training configuration
        'dataset_properties': self.dataset_properties,
        'job_id': 'example_tabular_classification_1',
        'device': 'cpu',
        'budget_type': 'epochs',
        'epochs': 5,
        'torch_num_threads': 1,
        'early_stopping': 20,
        'working_dir': '/tmp',
        'use_tensorboard_logger': True,
        'use_pynisher': False,
        'metrics_during_training': True,
        'split_id': 0,
        'backend': self.backend,
    }

    for key in X.keys():
        # skip tests for data loader requirements as the data loader has its own check_requirements
        if key in ('y_train', 'val_indices'):
            continue
        X_copy = X.copy()
        X_copy.pop(key)
        with self.assertRaisesRegex(
                ValueError,
                r"To fit .+?, expected fit dictionary to have .+? but got .*"):
            pipeline.fit(X_copy)
def _get_classification_dataset_requirements(
    info: Dict[str, Any],
    include: Optional[Dict] = None,
    exclude: Optional[Dict] = None,
    search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
) -> List[FitRequirement]:
    task_type = STRING_TO_TASK_TYPES[info['task_type']]
    if task_type in TABULAR_TASKS:
        return TabularClassificationPipeline(
            dataset_properties=info,
            include=include, exclude=exclude,
            search_space_updates=search_space_updates). \
            get_dataset_requirements()
    elif task_type in IMAGE_TASKS:
        return ImageClassificationPipeline(
            dataset_properties=info,
            include=include, exclude=exclude,
            search_space_updates=search_space_updates). \
            get_dataset_requirements()
    else:
        raise ValueError("Task_type not supported")
def test_tabular_no_preprocess(self):
    dataset_properties = {
        'numerical_columns': list(range(15)),
        'categorical_columns': [],
        'task_type': TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
        'output_type': OUTPUT_TYPES_TO_STRING[MULTICLASS],
        'is_small_preprocess': False,
        'input_shape': (15,),
        'output_shape': 2,
        'categories': [],
        'issparse': False
    }
    X = dict(
        X_train=np.random.random((10, 15)),
        y_train=np.random.random(10),
        train_indices=[0, 1, 2, 3, 4, 5],
        val_indices=[6, 7, 8, 9],
        dataset_properties=dataset_properties,
        # Training configuration
        num_run=16,
        device='cpu',
        budget_type='epochs',
        epochs=10,
        torch_num_threads=1,
        early_stopping=20,
        split_id=0,
        backend=self.backend,
    )

    pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)
    # Remove the trainer
    pipeline.steps.pop()
    pipeline = pipeline.fit(X)
    X = pipeline.transform(X)

    self.assertIn('preprocess_transforms', X.keys())
    self.assertIsInstance(X['preprocess_transforms'], list)
    self.assertIsInstance(X['preprocess_transforms'][-1].preprocessor, BaseEstimator)
def test_read_and_update_search_space(self, fit_dictionary_tabular, search_space_updates):
    import tempfile
    path = tempfile.gettempdir()
    path = os.path.join(path, 'updates.txt')

    # Write to disk
    search_space_updates.save_as_file(path=path)
    assert os.path.exists(path=path)

    # Read from disk
    file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path)
    assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates)

    dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
                          'task_type': 'tabular_classification'}
    pipeline = TabularClassificationPipeline(
        dataset_properties=dataset_properties,
        search_space_updates=file_search_space_updates)
    assert file_search_space_updates == pipeline.search_space_updates
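# A minimal sketch of building HyperparameterSearchSpaceUpdates in code instead of parsing a
# file; the node and hyperparameter names below are illustrative examples only, while the
# append signature mirrors the one used in the other tests in this section.
updates = HyperparameterSearchSpaceUpdates()
updates.append(node_name='data_loader',
               hyperparameter='batch_size',
               value_range=(32, 64),
               default_value=32)
pipeline = TabularClassificationPipeline(
    dataset_properties={'numerical_columns': [1], 'categorical_columns': [2],
                        'task_type': 'tabular_classification'},
    search_space_updates=updates)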
# Save data via backend to fit the pipeline
datamanager = TabularDataset(
    X=X_train, Y=y_train,
    X_test=X_test, Y_test=y_test)

backend = create(
    temporary_directory='./tmp/autoPyTorch_tabular_classification_tmp',
    output_directory='./tmp/autoPyTorch_tabular_classification_out',
    delete_tmp_folder_after_terminate=False)
backend.save_datamanager(datamanager)

pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)

# Create a fit dictionary
fit_dictionary = {
    'categorical_columns': categorical_columns,
    'numerical_columns': numerical_columns,
    'num_features': X.shape[1],
    'num_classes': len(np.unique(y)),
    'is_small_preprocess': True,
    'categories': categories,
    'X_train': X_train,
    'y_train': y_train,
    'train_indices': train_indices,
    'val_indices': val_indices,
    'X_test': X_test,
    'y_test': y_test,
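# Continuation sketch (an assumption; the excerpt above is truncated before the fit dictionary
# is closed): once the remaining entries are filled in, the pipeline is typically configured
# and fit the same way as in the tests earlier in this section.
# cs = pipeline.get_hyperparameter_search_space()
# config = cs.sample_configuration()
# pipeline.set_hyperparameters(config)
# pipeline.fit(fit_dictionary)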