def test_get_dataset_properties(self):
    # Get data to train
    fit_dictionary = get_data_to_train()

    # Build a repository with random fitted models. If a previous run left
    # the directories behind, creating the backend raises FileExistsError,
    # in which case we skip rather than fail the test.
    try:
        backend = create(
            temporary_directory='/tmp/autoPyTorch_ensemble_test_tmp',
            output_directory='/tmp/autoPyTorch_ensemble_test_out',
            delete_tmp_folder_after_terminate=False)
    except FileExistsError:
        self.skipTest("Run directories already exist")
    fit_dictionary['backend'] = backend

    # Create the directory structure
    backend._make_internals_directory()

    # Create a datamanager for this toy problem
    datamanager = TabularDataset(
        X=fit_dictionary['X_train'], Y=fit_dictionary['y_train'],
        X_test=fit_dictionary['X_test'], Y_test=fit_dictionary['y_test'],
    )
    backend.save_datamanager(datamanager)
    datamanager = backend.load_datamanager()

    info = {
        'task_type': datamanager.task_type,
        'output_type': datamanager.output_type,
        'issparse': datamanager.issparse,
        'numerical_columns': datamanager.numerical_columns,
        'categorical_columns': datamanager.categorical_columns,
    }
    dataset_requirements = get_dataset_requirements(info)
    dataset_properties = datamanager.get_dataset_properties(dataset_requirements)

    # Every requirement must be answered by a property of a supported type
    self.assertIsInstance(dataset_properties, dict)
    for dataset_requirement in dataset_requirements:
        self.assertIn(dataset_requirement.name, dataset_properties.keys())
        self.assertIsInstance(dataset_properties[dataset_requirement.name],
                              dataset_requirement.supported_types)
@pytest.fixture
def backend(request):
    test_dir = os.path.dirname(__file__)
    tmp = os.path.join(
        test_dir, '.tmp__%s__%s' % (request.module.__name__, request.node.name))
    output = os.path.join(
        test_dir, '.output__%s__%s' % (request.module.__name__, request.node.name))

    # Make sure the folders we want to create do not already exist; retry the
    # removal for up to ten seconds in case another process still holds them.
    for directory in (tmp, output):
        for _ in range(10):
            if os.path.exists(directory):
                try:
                    shutil.rmtree(directory)
                    break
                except OSError:
                    time.sleep(1)

    backend = create(
        tmp,
        output,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
    )

    def get_finalizer(tmp_dir, output_dir):
        def session_run_at_end():
            for directory in (tmp_dir, output_dir):
                for _ in range(10):
                    if os.path.exists(directory):
                        try:
                            shutil.rmtree(directory)
                            break
                        except OSError:
                            time.sleep(1)
        return session_run_at_end

    request.addfinalizer(get_finalizer(tmp, output))

    return backend
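# NOTE: hypothetical usage sketch, not part of the original file: pytest
# injects the fixture above by parameter name. The test name and the
# `internals_directory` attribute (present on the auto-sklearn-style Backend
# that `create` returns) are assumptions for illustration only.
def test_backend_starts_clean(backend):
    # The fixture hands over a fresh Backend whose run directories were wiped
    # before creation and are removed again by the registered finalizer.
    backend._make_internals_directory()
    assert os.path.exists(backend.internals_directory)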
def setUp(self):
    self.num_features = 4
    self.num_classes = 2
    self.X, self.y = make_classification(n_samples=200,
                                         n_features=self.num_features,
                                         n_informative=3,
                                         n_redundant=1,
                                         n_repeated=0,
                                         n_classes=self.num_classes,
                                         n_clusters_per_class=2,
                                         shuffle=True,
                                         random_state=0)
    self.dataset_properties = {
        'task_type': 'tabular_classification',
        'output_type': 'binary',
        'numerical_columns': list(range(self.num_features)),
        'categorical_columns': [],
    }

    # Create run dir
    tmp_dir = '/tmp/autoPyTorch_ensemble_test_tmp'
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    output_dir = '/tmp/autoPyTorch_ensemble_test_out'
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    self.backend = create(temporary_directory=tmp_dir,
                          output_directory=output_dir,
                          delete_tmp_folder_after_terminate=False)

    # Create the directory structure
    self.backend._make_internals_directory()

    # Create a datamanager for this toy problem
    datamanager = TabularDataset(
        X=self.X, Y=self.y,
        X_test=self.X, Y_test=self.y,
    )
    self.backend.save_datamanager(datamanager)
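# NOTE: a minimal tearDown sketch, an assumption not shown in the original:
# it removes the fixed /tmp run directories created in setUp so repeated
# test runs start from a clean state.
def tearDown(self):
    for directory in ('/tmp/autoPyTorch_ensemble_test_tmp',
                      '/tmp/autoPyTorch_ensemble_test_out'):
        shutil.rmtree(directory, ignore_errors=True)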
    'task_type': 'tabular_classification',
    'categorical_columns': categorical_columns,
    'numerical_columns': numerical_columns,
    'output_type': output_type,
}

# Save data via backend to fit the pipeline
datamanager = TabularDataset(
    X=X_train, Y=y_train,
    X_test=X_test, Y_test=y_test,
)
backend = create(
    temporary_directory='./tmp/autoPyTorch_tabular_classification_tmp',
    output_directory='./tmp/autoPyTorch_tabular_classification_out',
    delete_tmp_folder_after_terminate=False)
backend.save_datamanager(datamanager)

pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)

# Create a fit dictionary
fit_dictionary = {
    'categorical_columns': categorical_columns,
    'numerical_columns': numerical_columns,
    'num_features': X.shape[1],
    'num_classes': len(np.unique(y)),
    'is_small_preprocess': True,
    'categories': categories,
    'X_train': X_train,
    'y_train': y_train,
    )
    score = accuracy_score(y_test, np.argmax(test_predictions, axis=1))
    print(f"Fitted a pipeline {idx} with score = {score}")
    return


if __name__ == "__main__":
    # Get data to train
    fit_dictionary = get_data_to_train()

    # Build a repository with random fitted models
    backend = create(temporary_directory='./tmp/autoPyTorch_ensemble_test_tmp',
                     output_directory='./tmp/autoPyTorch_ensemble_test_out',
                     delete_tmp_folder_after_terminate=False)
    fit_dictionary['backend'] = backend

    # Create the directory structure
    backend._make_internals_directory()

    # Create a datamanager for this toy problem
    datamanager = TabularDataset(
        X=fit_dictionary['X_train'], Y=fit_dictionary['y_train'],
        X_test=fit_dictionary['X_test'], Y_test=fit_dictionary['y_test'],
    )
    backend.save_datamanager(datamanager)
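    # NOTE: hypothetical follow-up, not in the original script: reload the
    # datamanager through the backend to confirm the save above round-trips.
    # It uses only calls already shown elsewhere in this file
    # (load_datamanager, task_type).
    loaded_datamanager = backend.load_datamanager()
    assert loaded_datamanager.task_type == datamanager.task_type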