def test_pSMAC_wrong_arguments(self):
    """Fitting in shared mode without the required folders must fail.

    Covers ``shared_mode=True`` with no ``tmp_folder`` and
    ``shared_mode=True`` with a ``tmp_folder`` but no ``output_folder``.
    """
    features = np.zeros((100, 100))
    targets = np.zeros((100, ))

    # No folders at all: the tmp_folder check is expected to trigger.
    with self.assertRaisesRegex(
        ValueError,
        "If shared_mode == True tmp_folder must not "
        "be None.",
    ):
        AutoSklearnClassifier(
            shared_mode=True,
        ).fit(features, targets)

    # tmp_folder given, output_folder missing.
    with self.assertRaisesRegex(
        ValueError,
        "If shared_mode == True output_folder must not "
        "be None.",
    ):
        AutoSklearnClassifier(
            shared_mode=True,
            tmp_folder='/tmp/duitaredxtvbedb',
        ).fit(features, targets)
def test_cv_results(tmp_dir, output_dir):
    """Fit a short run on iris and validate ``cv_results_`` and parameters.

    Checks the container types inside ``cv_results_`` (including the two
    extra scoring functions), that ``fit`` does not mutate the constructor
    parameters, and that the estimator is recognised as a classifier and
    exposes ``classes_``.

    Parameters
    ----------
    tmp_dir, output_dir
        Folders handed to the classifier as ``tmp_folder`` and
        ``output_folder``.
    """
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
        scoring_functions=[
            autosklearn.metrics.precision,
            autosklearn.metrics.roc_auc,
        ],
    )

    # Snapshot the parameters before fitting so mutation can be detected.
    params = cls.get_params()
    original_params = copy.deepcopy(params)

    cls.fit(X_train, Y_train)

    cv_results = cls.cv_results_
    assert isinstance(cv_results, dict), type(cv_results)
    assert isinstance(cv_results['mean_test_score'], np.ndarray), \
        type(cv_results['mean_test_score'])
    assert isinstance(cv_results['mean_fit_time'], np.ndarray), \
        type(cv_results['mean_fit_time'])
    assert isinstance(cv_results['params'], list), type(cv_results['params'])
    assert isinstance(cv_results['rank_test_scores'], np.ndarray), \
        type(cv_results['rank_test_scores'])
    assert isinstance(cv_results['metric_precision'], npma.MaskedArray), \
        type(cv_results['metric_precision'])
    assert isinstance(cv_results['metric_roc_auc'], npma.MaskedArray), \
        type(cv_results['metric_roc_auc'])
    cv_result_items = [
        isinstance(val, npma.MaskedArray)
        for key, val in cv_results.items()
        if key.startswith('param_')
    ]
    assert all(cv_result_items), cv_results.items()

    # Compare the state of the model parameters with the original parameters
    new_params = clone(cls).get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # Taken from Sklearn code:
        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit."
            % (cls, param_name, original_value, new_value))

    # Comply with https://scikit-learn.org/dev/glossary.html#term-classes
    # The return value was previously discarded, so nothing was checked.
    assert is_classifier(cls)
    assert hasattr(cls, 'classes_')
def test_metadata_directory(self):
    """Check how ``metadata_directory`` is stored and validated.

    A user-specified value is kept on the estimator, the default is
    ``None``, and fitting with a non-existent directory raises ValueError.
    """
    limits = dict(time_left_for_this_task=30, per_run_time_limit=5)

    # User-specified metadata directory is stored as given.
    automl1 = AutoSklearnClassifier(
        metadata_directory="pyMetaLearn/metadata_dir",
        **limits
    )
    self.assertEqual(automl1.metadata_directory,
                     "pyMetaLearn/metadata_dir")

    # Without the argument the attribute stays None.
    automl2 = AutoSklearnClassifier(**limits)
    self.assertIsNone(automl2.metadata_directory)

    # A directory that does not exist is rejected when fitting.
    nonexistent_dir = "nonexistent_dir"
    automl3 = AutoSklearnClassifier(
        metadata_directory=nonexistent_dir,
        **limits
    )
    X, y = load_breast_cancer(return_X_y=True)
    expected_message = ("The specified metadata directory "
                        "\'%s\' does not exist!" % nonexistent_dir)
    with self.assertRaisesRegex(ValueError, expected_message):
        automl3.fit(X=X, y=y)
def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                     automl_output_folder):
    """Fit one shared-mode ``AutoSklearnClassifier`` for ``seed``.

    Seed 0 starts from 25 meta-learning configurations; every other seed
    uses none and asks SMAC for a random initial incumbent. The training
    data is read from the module-level ``X_train`` / ``y_train``.
    """
    is_first_seed = seed == 0
    initial_configurations_via_metalearning = 25 if is_first_seed else 0
    smac_scenario_args = (
        {} if is_first_seed else {'initial_incumbent': 'RANDOM'}
    )

    # Time and memory limits are not passed, so the library defaults apply.
    settings = dict(
        shared_mode=True,  # tmp folder will be shared between seeds
        tmp_folder=automl_tmp_folder,
        output_folder=automl_output_folder,
        delete_tmp_folder_after_terminate=False,
        # ensembles will be built when all optimization runs are finished
        ensemble_size=0,
        initial_configurations_via_metalearning=(
            initial_configurations_via_metalearning
        ),
        seed=seed,
        smac_scenario_args=smac_scenario_args,
    )
    automl = AutoSklearnClassifier(**settings)
    automl.fit(X_train, y_train, dataset_name=dataset_name)
def test_cv_results(self):
    """Fit a short run on iris and check the types inside ``cv_results_``."""
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
    output = os.path.join(self.test_dir, '..', '.out_cv_results')
    self._setUp(tmp)
    self._setUp(output)
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                per_run_time_limit=5,
                                output_folder=output,
                                tmp_folder=tmp,
                                shared_mode=False,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0)
    cls.fit(X_train, Y_train)
    cv_results = cls.cv_results_
    self.assertIsInstance(cv_results, dict)
    self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
    self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
    self.assertIsInstance(cv_results['params'], list)
    self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)
    # A bare list is truthy whenever it is non-empty, so the per-column
    # results have to be reduced with all() for the check to mean anything.
    self.assertTrue(all(
        isinstance(val, npma.MaskedArray)
        for key, val in cv_results.items()
        if key.startswith('param_')
    ))
    del cls
    self._tearDown(tmp)
    self._tearDown(output)
def __init__(self, name: str, model_params: Dict[str, Any]) -> None:
    """Initialise the base class and build the wrapped classifier.

    ``model_params`` is forwarded both to the parent constructor and, as
    keyword arguments, to ``AutoSklearnClassifier``.
    """
    super().__init__(name, model_params)
    classifier = AutoSklearnClassifier(**model_params)
    self._model = classifier
def zeroconf_fit_ensemble(y):
    """Build an ensemble from the shared folder and return the classifier.

    Uses the module-level ``atsklrn_tempdir`` as both tmp and output folder,
    calls ``fit_ensemble`` for a binary task, waits 20 seconds and prints
    the resulting models via ``p``.
    """
    p("Building ensemble")

    seed = 1
    classifier_settings = dict(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    ensemble = AutoSklearnClassifier(**classifier_settings)

    ensemble.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15,
    )

    sleep(20)
    p("Ensemble built")

    p("Show models")
    model_description = str(ensemble.show_models())
    p(model_description)
    return ensemble
def main(working_directory, time_limit, per_run_time_limit, task_id, seed):
    """Run auto-sklearn on one task and write its performance trajectory.

    Parameters
    ----------
    working_directory : str
        Root folder; results go to ``<working_directory>/<seed>/<task_id>``.
    time_limit, per_run_time_limit
        Forwarded as ``time_left_for_this_task`` / ``per_run_time_limit``.
    task_id
        Passed to ``load_task`` and used as the dataset name.
    seed
        Seed handed to the classifier; also names the per-seed folder.

    Raises
    ------
    OSError
        If the per-seed directory cannot be created and does not exist.
    """
    configuration_output_dir = os.path.join(working_directory, str(seed))
    try:
        os.makedirs(configuration_output_dir)
    except OSError:
        # Only "already exists" is benign; anything else (for example a
        # permission error) must not be reported as an existing directory.
        if not os.path.isdir(configuration_output_dir):
            raise
        print(
            "Direcotry {0} aleardy created.".format(configuration_output_dir))

    # NOTE(review): tmp_dir is not created here; presumably auto-sklearn
    # creates it as its tmp_folder before the CSV below is written - verify.
    tmp_dir = os.path.join(configuration_output_dir, str(task_id))

    automl_arguments = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': per_run_time_limit,
        'initial_configurations_via_metalearning': 0,
        'ensemble_size': 0,
        'seed': seed,
        'ml_memory_limit': 3072,
        'resampling_strategy': 'holdout',
        'resampling_strategy_arguments': {
            'train_size': 0.67
        },
        'tmp_folder': tmp_dir,
        'delete_tmp_folder_after_terminate': False,
        'disable_evaluator_output': False,
    }

    X_train, y_train, X_test, y_test, cat = load_task(task_id)

    automl = AutoSklearnClassifier(**automl_arguments)

    automl.fit(X_train,
               y_train,
               dataset_name=str(task_id),
               X_test=X_test,
               y_test=y_test,
               metric=balanced_accuracy)

    with open(os.path.join(tmp_dir, "score_vanilla.csv"), 'w') as fh:
        T = 0
        fh.write("Time,Train Performance,Test Performance\n")
        # Initial row: time 0 with train and test performance 0.
        best_loss = 1
        fh.write("{0},{1},{2}\n".format(T, 0, 0))
        # Accumulate runtime over all runs and emit a row whenever a run
        # improves on the best loss seen so far.
        for value in automl._automl.runhistory_.data.values():
            t = value.time
            loss = value.cost

            T += t

            if loss < best_loss:
                fh.write("{0},{1},{2}\n".format(
                    T, 1 - loss,
                    1 - value.additional_info.get('test_loss', 1.0)))
                best_loss = loss
class AutoSklearnBaselineModel(Model):
    """Model wrapper that delegates to an ``AutoSklearnClassifier``."""

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        """Initialise the base model and build the wrapped classifier."""
        super().__init__(name, model_params)
        self._model = AutoSklearnClassifier(**model_params)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit the wrapped classifier on ``X`` and ``y``."""
        self._model.fit(X, y)

    def save(self, path: str) -> None:
        """Pickle this whole wrapper object to ``path``."""
        with open(path, 'wb') as handle:
            pickle.dump(self, handle)

    @classmethod
    def load(cls, path: str):
        """Unpickle a wrapper previously written by ``save``.

        NOTE: unpickling can execute arbitrary code; only load files from
        a trusted source.
        """
        with open(path, 'rb') as handle:
            restored = pickle.load(handle)
        return cast(AutoSklearnBaselineModel, restored)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped classifier's predictions for ``X``."""
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped classifier's ``predict_proba`` for ``X``."""
        return self._model.predict_proba(X)
def test_grid_scores(self):
    """``grid_scores_`` is derived from the (mocked) SMAC run history."""
    output = os.path.join(self.test_dir, '..', '.tmp_grid_scores')
    self._setUp(output)

    cls = AutoSklearnClassifier(time_left_for_this_task=15,
                                per_run_time_limit=15,
                                output_folder=output,
                                tmp_folder=output,
                                shared_mode=False,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0)
    cls_ = cls.build_automl()
    automl = cls_._automl
    automl._proc_smac = mock.MagicMock()

    # Minimal stand-ins for the run history key and value records.
    RunKey = collections.namedtuple(
        'RunKey', ['config_id', 'instance_id', 'seed'])
    RunValue = collections.namedtuple(
        'RunValue', ['cost', 'time', 'status', 'additional_info'])

    runhistory = {RunKey(1, 1, 1): RunValue(1, 1, 1, '')}
    automl._proc_smac.runhistory.data = runhistory
    grid_scores_ = automl.grid_scores_

    self.assertIsInstance(grid_scores_[0], _CVScoreTuple)
    # In the runhistory we store losses, thus the score is zero
    self.assertEqual(grid_scores_[0].mean_validation_score, 0)
    self.assertEqual(grid_scores_[0].cv_validation_scores, [0])
    self.assertIsInstance(grid_scores_[0].parameters, mock.MagicMock)

    del automl
    self._tearDown(output)
def test_binary(tmp_dir, output_dir, dask_client):
    """Fit on the binary iris variant and check predictions and outputs."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_binary=True)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=40,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(
        X_train,
        Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name='binary_test_dataset',
    )

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)

    # Test-set prediction files are looked up in the output folder.
    prediction_pattern = os.path.join(
        output_dir, 'binary_test_dataset_test_*.predict')
    output_files = glob.glob(prediction_pattern)
    assert len(output_files) > 0, (output_files,
                                   print_debug_information(automl))
def spawn_classifier(seed, dataset_name):
    """Run one auto-sklearn instance; intended as a subprocess target.

    auto-sklearn does not spawn worker processes itself. This function is
    called several times from the main block, once per process, and each
    call fits one classifier on the module-level ``X_train`` / ``y_train``.
    """
    # Only the seed-0 process starts from the meta-learning configurations,
    # so the processes do not all evaluate the same configurations.
    initial_configurations_via_metalearning = 25 if seed == 0 else 0

    # What differs from a regular auto-sklearn run:
    # 1. every classifier writes to the same output directory
    # 2. shared_mode=True enables sharing of data between models
    # 3. each AutoSklearnClassifier instance must have a different seed!
    settings = dict(
        # sec., how long should this seed fit process run
        time_left_for_this_task=120,
        # sec., each model may only take this long before it's killed
        per_run_time_limit=60,
        # MB, memory limit imposed on each call to a ML algorithm
        ml_memory_limit=1024,
        # tmp folder will be shared between seeds
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        # ensembles will be built when all optimization runs are finished
        ensemble_size=0,
        initial_configurations_via_metalearning=(
            initial_configurations_via_metalearning
        ),
        seed=seed,
    )
    automl = AutoSklearnClassifier(**settings)
    automl.fit(X_train, y_train, dataset_name=dataset_name)
def test_binary(self):
    """Fit on the binary iris variant and check predictions and outputs."""
    # The folder names used to be swapped relative to their roles (tmp
    # pointed at '.out_binary_fit' and output at '.tmp_binary_fit').
    tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    output = os.path.join(self.test_dir, '..', '.out_binary_fit')
    self._setUp(output)
    self._setUp(tmp)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)

    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')
    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, ))
    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    # The test-set predictions are expected in the output folder.
    output_files = os.listdir(output)
    self.assertIn('binary_test_dataset_test_1.predict', output_files)
def test_cv_results(self):
    """Fit a short run on iris and check the types inside ``cv_results_``."""
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
    output = os.path.join(self.test_dir, '..', '.out_cv_results')
    self._setUp(tmp)
    self._setUp(output)
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                per_run_time_limit=5,
                                output_folder=output,
                                tmp_folder=tmp,
                                shared_mode=False,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0)
    cls.fit(X_train, Y_train)
    cv_results = cls.cv_results_
    self.assertIsInstance(cv_results, dict)
    self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
    self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
    self.assertIsInstance(cv_results['params'], list)
    self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)
    # assertTrue on a list only checks that it is non-empty; reduce the
    # per-column checks with all() so a wrong type actually fails.
    self.assertTrue(all(
        isinstance(val, npma.MaskedArray)
        for key, val in cv_results.items()
        if key.startswith('param_')
    ))
    del cls
    self._tearDown(tmp)
    self._tearDown(output)
def test_cv_results(tmp_dir, output_dir):
    """Fit a short run on iris and check the types inside ``cv_results_``."""
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    settings = dict(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    cls = AutoSklearnClassifier(**settings)
    cls.fit(X_train, Y_train)

    cv_results = cls.cv_results_
    assert isinstance(cv_results, dict), type(cv_results)
    assert isinstance(cv_results['mean_test_score'], np.ndarray), \
        type(cv_results['mean_test_score'])
    assert isinstance(cv_results['mean_fit_time'], np.ndarray), \
        type(cv_results['mean_fit_time'])
    assert isinstance(cv_results['params'], list), type(cv_results['params'])
    assert isinstance(cv_results['rank_test_scores'], np.ndarray), \
        type(cv_results['rank_test_scores'])

    # Every 'param_*' column must be a masked array.
    cv_result_items = []
    for key, val in cv_results.items():
        if key.startswith('param_'):
            cv_result_items.append(isinstance(val, npma.MaskedArray))
    assert all(cv_result_items), cv_results.items()
def test_grid_scores(self):
    """``grid_scores_`` is derived from the (mocked) ``runhistory_``."""
    output = os.path.join(self.test_dir, '..', '.tmp_grid_scores')
    self._setUp(output)

    cls = AutoSklearnClassifier(time_left_for_this_task=15,
                                per_run_time_limit=5,
                                output_folder=output,
                                tmp_folder=output,
                                shared_mode=False,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0)
    cls_ = cls.build_automl()
    automl = cls_._automl
    automl.runhistory_ = unittest.mock.MagicMock()

    # Minimal stand-ins for the run history key and value records.
    RunKey = collections.namedtuple(
        'RunKey', ['config_id', 'instance_id', 'seed'])
    RunValue = collections.namedtuple(
        'RunValue', ['cost', 'time', 'status', 'additional_info'])

    runhistory = {RunKey(1, 1, 1): RunValue(1, 1, 1, '')}
    automl.runhistory_.data = runhistory
    grid_scores_ = automl.grid_scores_

    self.assertIsInstance(grid_scores_[0], _CVScoreTuple)
    # In the runhistory we store losses, thus the score is zero
    self.assertEqual(grid_scores_[0].mean_validation_score, 0)
    self.assertEqual(grid_scores_[0].cv_validation_scores, [0])
    self.assertIsInstance(grid_scores_[0].parameters,
                          unittest.mock.MagicMock)

    del automl
    self._tearDown(output)
def __init__(self, name: str, model_params: Dict[str, Any],
             classifier_paths: Iterable[Tuple[str, str]]) -> None:
    """Initialise the base class and create the selector classifier.

    All three arguments go to the parent constructor; ``model_params`` is
    additionally unpacked into ``AutoSklearnClassifier``.
    """
    super().__init__(name, model_params, classifier_paths)
    selector = AutoSklearnClassifier(**model_params)
    self.selector = selector
def __init__(self, **kwargs) -> None:
    """Create a dask client and an ``AutoSklearnClassifier`` using it.

    ``kwargs`` must contain ``n_jobs`` (used as the number of workers) and
    is forwarded unchanged to the classifier together with the client.
    """
    Ensemble.__init__(self)

    client_settings = dict(
        processes=False,
        n_workers=kwargs['n_jobs'],
        threads_per_worker=1,
        dashboard_address=None,
    )
    client = Client(**client_settings)

    self.model = AutoSklearnClassifier(dask_client=client, **kwargs)
def AutoSklearn(total_runtime, train_features, train_labels):
    """Fit and return a classifier restricted to a fixed estimator list.

    No preprocessing is used, ``total_runtime`` is the overall time budget
    and the fit is scored with ``balanced_accuracy``.
    """
    allowed_estimators = [
        "adaboost",
        "gaussian_nb",
        "extra_trees",
        "gradient_boosting",
        "liblinear_svc",
        "libsvm_svc",
        "random_forest",
        "k_nearest_neighbors",
        "decision_tree",
    ]
    clf = AutoSklearnClassifier(
        time_left_for_this_task=total_runtime,
        include_preprocessors=["no_preprocessing"],
        include_estimators=allowed_estimators,
    )
    clf.fit(train_features, train_labels, metric=balanced_accuracy)
    return clf
def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed,
                         *args):
    """Function that trains and tests data using auto-sklearn.

    Builds a classifier when ``m_type == 'classification'`` and a regressor
    otherwise, fits and refits it on copies of the training data, and
    returns ``predict_proba`` (classification) or ``predict`` (regression)
    for ``X_test``.
    """
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    # Feature types of every column except the target.
    categ_cols = df_types[df_types.NAME != 'target']['TYPE'].values.ravel()

    is_classification = m_type == 'classification'
    if is_classification:
        estimator_class, metric = AutoSklearnClassifier, f1_weighted
    else:
        estimator_class, metric = AutoSklearnRegressor, mean_squared_error

    # Both estimator kinds are configured identically.
    automl = estimator_class(
        time_left_for_this_task=TIME_PER_TASK,
        per_run_time_limit=int(TIME_PER_TASK/8),
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        delete_tmp_folder_after_terminate=False,
    )

    automl.fit(X_train.copy(), y_train.copy(), feat_type=categ_cols,
               metric=metric)
    automl.refit(X_train.copy(), y_train.copy())

    if is_classification:
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
def test_fit_pSMAC(self):
    """Two shared-mode runs; the second must pick up a planted dummy model.

    After the first fit a near-perfect dummy model and its ensemble
    predictions are written into the shared folder. A second classifier
    with another seed is fitted, an ensemble is built, and its score is
    checked.
    """
    output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    # Settings shared by both classifiers; only the seed differs.
    shared_settings = dict(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        output_folder=output,
        tmp_folder=output,
        shared_mode=True,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )

    automl = AutoSklearnClassifier(seed=1, **shared_settings)
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct
    backend_dir = os.path.join(output, '.auto-sklearn')
    true_targets_ensemble_path = os.path.join(backend_dir,
                                              'true_targets_ensemble.npy')
    true_targets_ensemble = np.load(true_targets_ensemble_path)
    # Change the last target so the dummy is not a perfect predictor.
    if true_targets_ensemble[-1] != 1:
        true_targets_ensemble[-1] = 1
    else:
        true_targets_ensemble[-1] = 0

    # One-hot probabilities for the (altered) ensemble targets.
    probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
    for row, label in enumerate(true_targets_ensemble):
        probas[row, label] = 1.0
    dummy_predictions_path = os.path.join(backend_dir,
                                          'predictions_ensemble',
                                          'predictions_ensemble_1_00030.npy')
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    # One-hot probabilities for the test targets, returned by the dummy.
    probas_test = np.zeros((len(Y_test), 3), dtype=float)
    for row, label in enumerate(Y_test):
        probas_test[row, label] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    backend = Backend(output, output)
    backend.save_model(dummy, 30, 1)

    automl = AutoSklearnClassifier(seed=2, **shared_settings)
    automl.fit(X_train, Y_train)
    automl.run_ensemble_builder(0, 1, 50).wait()

    score = automl.score(X_test, Y_test)

    ensembles_dir = os.path.join(output, '.auto-sklearn', 'ensembles')
    self.assertEqual(len(os.listdir(ensembles_dir)), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)
def spawn_classifier(
        seed, time, search_space, prep_space, metric, dataset_name=None):
    """Run one auto-sklearn instance and return its ``cv_results_``.

    auto-sklearn does not spawn worker processes itself. This function is
    called several times from the main block, once per process, and fits
    one classifier on the module-level ``X_train`` / ``y_train``.
    """
    # Only the seed-0 process starts from the meta-learning configurations,
    # so the processes do not all evaluate the same configurations; every
    # other seed asks SMAC for a random initial incumbent instead.
    first_seed = seed == 0
    initial_configurations_via_metalearning = 25 if first_seed else 0
    smac_scenario_args = (
        {} if first_seed else {'initial_incumbent': 'RANDOM'}
    )

    # What differs from a regular auto-sklearn run:
    # 1. every classifier writes to the same output directory
    # 2. shared_mode=True enables sharing of data between models
    # 3. each AutoSklearnClassifier instance must have a different seed!
    settings = dict(
        # sec., how long should this seed fit process run
        time_left_for_this_task=time,
        # sec., each model may only take this long before it's killed
        per_run_time_limit=15,
        # MB, memory limit imposed on each call to a ML algorithm
        ml_memory_limit=1024,
        # tmp folder will be shared between seeds
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        # ensembles will be built when all optimization runs are finished
        ensemble_size=0,
        include_estimators=search_space,
        exclude_estimators=None,
        include_preprocessors=prep_space,
        exclude_preprocessors=None,
        initial_configurations_via_metalearning=(
            initial_configurations_via_metalearning
        ),
        seed=seed,
        smac_scenario_args=smac_scenario_args,
    )
    automl = AutoSklearnClassifier(**settings)
    automl.fit(X_train, y_train, metric=metric, dataset_name=dataset_name)
    results = automl.cv_results_
    return results
def simple():
    """Train a default classifier on the stage-2 data and dump it to disk.

    The last column of the training frame is the target; all other columns
    are features. The fitted estimator is written with ``joblib.dump`` to
    ``stage2_model.joblib`` inside the "model" resource folder.
    """
    assistant = Exp2Assistant(stage=2)
    frame = assistant.train_data
    features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1]

    # change the time, in this experiment, 1h 12h 24h 48h
    automl = AutoSklearnClassifier()
    automl.fit(features, labels)

    model_file = path.join(path_service.get_resource("model"),
                           "stage2_model.joblib")
    joblib.dump(automl, model_file)
def __init__(self, time_left_for_this_task, per_run_time_limit, folds):
    """Create the wrapped cross-validating ``AutoSklearnClassifier``.

    Parameters
    ----------
    time_left_for_this_task, per_run_time_limit
        Forwarded unchanged to the classifier.
    folds
        Number of folds passed as the ``'folds'`` resampling argument.
    """
    # The unused timestamp local that used to be computed here was removed.
    # tmp/output folders and shared mode are not set, so the library
    # defaults apply.
    self.automl = AutoSklearnClassifier(
        time_left_for_this_task=time_left_for_this_task,
        per_run_time_limit=per_run_time_limit,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': folds})
def spawn_classifier(seed, dataset_name):
    """Fit one shared-mode classifier for ``seed``.

    Reads ``tmp_folder``, ``output_folder``, ``X_train`` and ``y_train``
    from module level. No ensemble is built and no meta-learning
    configurations are used.
    """
    settings = dict(
        # sec., how long should this seed fit process run
        time_left_for_this_task=600,
        # sec., each model may only take this long before it's killed
        per_run_time_limit=60,
        # MB
        ml_memory_limit=1024,
        # tmp folder will be shared between seeds
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        # no need to build ensembles at this stage
        ensemble_size=0,
        # let seeds profit from each other's results
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    automl = AutoSklearnClassifier(**settings)
    automl.fit(X_train, y_train, dataset_name=dataset_name)
def test_classification_methods_returns_self(self):
    """``fit``, ``fit_ensemble`` and ``refit`` must each return the estimator."""
    X_train, y_train, X_test, y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        ensemble_size=0,
    )

    fit_result = automl.fit(X_train, y_train)
    self.assertIs(automl, fit_result)

    ensemble_result = automl.fit_ensemble(y_train, ensemble_size=5)
    self.assertIs(automl, ensemble_result)

    # refit is given copies of the training data.
    refit_result = automl.refit(X_train.copy(), y_train.copy())
    self.assertIs(automl, refit_result)
def test_pSMAC_wrong_arguments(self):
    """Fitting in shared mode without the required folders must fail.

    Uses ``assertRaisesRegex``; the ``assertRaisesRegexp`` alias is
    deprecated and no longer exists in recent Python versions.
    """
    # No folders at all: the tmp_folder check is expected to trigger.
    self.assertRaisesRegex(
        ValueError,
        "If shared_mode == True tmp_folder must not "
        "be None.",
        lambda shared_mode:
        AutoSklearnClassifier(shared_mode=shared_mode).fit(None, None),
        shared_mode=True)

    # tmp_folder given, output_folder missing.
    self.assertRaisesRegex(
        ValueError,
        "If shared_mode == True output_folder must not "
        "be None.",
        lambda shared_mode, tmp_folder:
        AutoSklearnClassifier(shared_mode=shared_mode,
                              tmp_folder=tmp_folder).fit(None, None),
        shared_mode=True,
        tmp_folder='/tmp/duitaredxtvbedb')
def test_classification_pandas_support(self):
    """Fit, score, refit and predict with pandas DataFrame/Series input."""
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    # axis is passed by keyword: newer pandas versions no longer accept it
    # positionally.
    X = X.dropna(axis='columns')

    # This test only make sense if input is dataframe
    self.assertIsInstance(X, pd.DataFrame)
    self.assertIsInstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        seed=5,
    )

    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    self.assertGreater(automl.score(X, y), 0.555)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl._automl[0].InputValidator.encode_target(y)
    prediction = automl._automl[0].InputValidator.encode_target(
        automl.predict(X))
    self.assertGreater(accuracy(y, prediction), 0.555)
def zeroconf_fit_ensemble(y, atsklrn_tempdir):
    """Build an ensemble from ``atsklrn_tempdir`` and return the classifier.

    Progress is logged through a logger obtained from ``utl.get_logger``
    under this function's name; the logger is re-fetched after each major
    step, as before. An error in ``fit_ensemble`` is logged and re-raised.
    """
    own_name = inspect.stack()[0][3]
    lo = utl.get_logger(own_name)
    lo.info("Building ensemble")

    seed = 1
    seed_text = str(seed)

    classifier_settings = dict(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    ensemble = AutoSklearnClassifier(**classifier_settings)

    lo.info("Done AutoSklearnClassifier - seed:" + seed_text)

    try:
        lo.debug("Start ensemble.fit_ensemble - seed:" + seed_text)
        ensemble.fit_ensemble(
            task=BINARY_CLASSIFICATION,
            y=y,
            metric=autosklearn.metrics.f1,
            precision='32',
            dataset_name='foobar',
            ensemble_size=10,
            ensemble_nbest=15,
        )
    except Exception:
        lo = utl.get_logger(own_name)
        lo.exception("Error in ensemble.fit_ensemble - seed:" + seed_text)
        raise

    lo = utl.get_logger(own_name)
    lo.debug("Done ensemble.fit_ensemble - seed:" + seed_text)

    sleep(20)
    lo.info("Ensemble built - seed:" + seed_text)

    lo.info("Show models - seed:" + seed_text)
    # Log the model description one line at a time.
    for row in str(ensemble.show_models()).split("\n"):
        lo.info(row)

    return ensemble
def test_can_pickle_classifier(self):
    """A fitted classifier must survive pickle and joblib round trips.

    The restored estimators have to reproduce the accuracy of the original
    on the test split. Skipped when ``self.travis`` is set.
    """
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    output = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                   per_run_time_limit=15,
                                   tmp_folder=output,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(
        Y_test, initial_predictions)
    self.assertGreater(initial_accuracy, 0.75)

    # Test pickle
    dump_file = os.path.join(output, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(
        Y_test, restored_predictions)
    self.assertGreater(restored_accuracy, 0.75)

    self.assertEqual(initial_accuracy, restored_accuracy)

    # Test joblib
    # sklearn.externals.joblib has been removed from scikit-learn, so the
    # standalone joblib package is used instead.
    dump_file = os.path.join(output, 'automl.dump.joblib')

    joblib.dump(automl, dump_file)

    restored_automl = joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(
        Y_test, restored_predictions)
    self.assertGreater(restored_accuracy, 0.75)

    self.assertEqual(initial_accuracy, restored_accuracy)
def test_fit_n_jobs_2(self):
    """Fit with ``n_jobs=2`` and inspect the prediction and ensemble files.

    Expects prediction files from two distinct seeds and ensemble files
    from a single seed.
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

    # test parallel Classifier to predict classes, not only indices
    Y_train += 1
    Y_test += 1

    settings = dict(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=5,
        n_jobs=2,
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'],
    )
    automl = AutoSklearnClassifier(**settings)
    automl.fit(X_train, Y_train)

    n_runs = len(automl.cv_results_['mean_test_score'])
    backend = automl._automl[0]._backend

    predictions_dir = backend._get_prediction_output_dir('ensemble')
    predictions = os.listdir(predictions_dir)
    # two instances of the dummy
    self.assertEqual(n_runs, len(predictions) - 2, msg=str(predictions))

    # The third '_'-separated field of a prediction file name is the seed.
    seeds = {
        int(predictions_file.split('.')[0].split('_')[2])
        for predictions_file in predictions
    }
    self.assertEqual(len(seeds), 2)

    ensemble_dir = automl._automl[0]._backend.get_ensemble_dir()
    ensembles = os.listdir(ensemble_dir)
    # The first '_'-separated field of an ensemble file name is the seed.
    seeds = {
        int(ensemble_file.split('.')[0].split('_')[0])
        for ensemble_file in ensembles
    }
    self.assertEqual(len(seeds), 1)
def spawn_autosklearn_classifier(X_train, y_train, seed, dataset_name,
                                 time_left_for_this_task, per_run_time_limit,
                                 feat_type, memory_limit, atsklrn_tempdir):
    """Create and fit one shared-mode classifier for ``seed``.

    Construction and fitting are logged through ``utl.get_logger`` under
    this function's name. Any exception is logged and re-raised; fit errors
    are logged twice (inner and outer handler) and the traceback is also
    printed. Sleeps ``seed`` seconds before fitting. Returns ``None``.
    """
    own_name = inspect.stack()[0][3]
    seed_text = str(seed)

    lo = utl.get_logger(own_name)
    try:
        lo.info("Start AutoSklearnClassifier seed=" + seed_text)
        settings = dict(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            ml_memory_limit=memory_limit,
            shared_mode=True,
            tmp_folder=atsklrn_tempdir,
            output_folder=atsklrn_tempdir,
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
            seed=seed,
        )
        clf = AutoSklearnClassifier(**settings)
    except Exception:
        lo.exception("Exception AutoSklearnClassifier seed=" + seed_text)
        raise

    lo = utl.get_logger(own_name)
    lo.info("Done AutoSklearnClassifier seed=" + seed_text)

    sleep(seed)

    try:
        lo.info("Starting seed=" + seed_text)
        try:
            clf.fit(
                X_train,
                y_train,
                metric=autosklearn.metrics.f1,
                feat_type=feat_type,
                dataset_name=dataset_name,
            )
        except Exception:
            lo = utl.get_logger(own_name)
            lo.exception("Error in clf.fit - seed:" + seed_text)
            raise
    except Exception:
        lo = utl.get_logger(own_name)
        lo.exception("Exception in seed=" + seed_text + ". ")
        traceback.print_exc()
        raise

    lo = utl.get_logger(own_name)
    lo.info("####### Finished seed=" + seed_text)
    return None
def test_can_pickle_classifier(self):
    """A fitted classifier must survive pickle and joblib round trips.

    Each restored estimator has to reach at least 0.75 accuracy on the test
    split and exactly match the accuracy of the original.
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
    output = os.path.join(self.test_dir, '..', '.out_can_pickle')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(
        Y_test, initial_predictions)
    self.assertGreaterEqual(initial_accuracy, 0.75)
    self.assertGreater(self._count_succeses(automl.cv_results_), 0)

    # Test pickle
    pickle_path = os.path.join(output, 'automl.dump.pkl')
    with open(pickle_path, 'wb') as f:
        pickle.dump(automl, f)
    with open(pickle_path, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(
        Y_test, restored_predictions)
    self.assertGreaterEqual(restored_accuracy, 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

    # Test joblib
    joblib_path = os.path.join(output, 'automl.dump.joblib')
    joblib.dump(automl, joblib_path)
    restored_automl = joblib.load(joblib_path)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(
        Y_test, restored_predictions)
    self.assertGreaterEqual(restored_accuracy, 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)
def spawn_autosklearn_classifier(X_train, y_train, seed, dataset_name,
                                 time_left_for_this_task, per_run_time_limit,
                                 feat_type):
    """Construct and fit one shared-mode ``AutoSklearnClassifier``.

    ``memory_limit``, ``atsklrn_tempdir`` and the printing helper ``p`` are
    not parameters of this function; they are resolved from the surrounding
    scope.  The classifier uses ``atsklrn_tempdir`` as both tmp and output
    folder, keeps both folders after termination and builds no ensemble.

    Any exception raised while fitting is reported, its traceback printed,
    and then re-raised.
    """
    settings = dict(
        time_left_for_this_task=time_left_for_this_task,
        per_run_time_limit=per_run_time_limit,
        ml_memory_limit=memory_limit,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
        seed=seed,
    )
    classifier = AutoSklearnClassifier(**settings)

    # NOTE(review): presumably staggers the workers by their seed value --
    # confirm which ``sleep`` is imported.
    sleep(seed)

    seed_text = str(seed)
    try:
        p("Starting seed=" + seed_text)
        classifier.fit(
            X_train,
            y_train,
            metric='f1_metric',
            feat_type=feat_type,
            dataset_name=dataset_name,
        )
        p("####### Finished seed=" + seed_text)
    except Exception:
        p("Exception in seed=" + seed_text + ". ")
        traceback.print_exc()
        raise
def test_feat_type_wrong_arguments(self):
    """``fit`` must reject malformed ``feat_type`` arguments.

    Each case passes a bad ``feat_type`` for a 100-feature ``X`` and
    expects a ``ValueError`` whose message matches the given pattern.

    Uses ``assertRaisesRegex``; the former ``assertRaisesRegexp`` spelling
    is a deprecated alias that no longer exists in Python 3.12+.
    """
    cls = AutoSklearnClassifier()
    X = np.zeros((100, 100))
    y = np.zeros((100, ))

    # (feat_type passed to fit, expected error message pattern)
    cases = (
        (
            [True],
            'Array feat_type does not have same number of '
            'variables as X has features. 1 vs 100.',
        ),
        (
            [True] * 100,
            'Array feat_type must only contain strings.',
        ),
        (
            ['Car'] * 100,
            'Only `Categorical` and `Numerical` are '
            'valid feature types, you passed `Car`',
        ),
    )
    for feat_type, message in cases:
        self.assertRaisesRegex(ValueError, message,
                               cls.fit, X=X, y=y, feat_type=feat_type)
def test_can_pickle_classifier(self):
    """A fitted classifier must survive a pickle and a joblib round trip.

    Fits on iris, checks the accuracy is at least 0.75, then dumps and
    restores the estimator with ``pickle`` and with ``joblib`` and checks
    that each restored estimator reaches exactly the same accuracy.

    ``joblib`` is used directly: the vendored ``sklearn.externals.joblib``
    was deprecated in scikit-learn 0.21 and removed in 0.23.
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
    output = os.path.join(self.test_dir, '..', '.out_can_pickle')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                      initial_predictions)
    self.assertGreaterEqual(initial_accuracy, 0.75)

    def check_restored(restored):
        # A restored estimator must predict as well as the original one.
        restored_predictions = restored.predict(X_test)
        restored_accuracy = sklearn.metrics.accuracy_score(
            Y_test, restored_predictions)
        self.assertGreaterEqual(restored_accuracy, 0.75)
        self.assertEqual(initial_accuracy, restored_accuracy)

    # Test pickle
    dump_file = os.path.join(output, 'automl.dump.pkl')
    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)
    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)
    check_restored(restored_automl)

    # Test joblib
    dump_file = os.path.join(output, 'automl.dump.joblib')
    joblib.dump(automl, dump_file)
    restored_automl = joblib.load(dump_file)
    check_restored(restored_automl)
def main():
    """Run four auto-sklearn workers in parallel, then build an ensemble.

    The digits data is split with ``random_state=1``.  Four processes are
    started, each running the callable returned by
    ``get_spawn_classifier`` with its loop index and the dataset name
    ``'digits'``.  After all of them have joined, a further shared-mode
    classifier pointing at the same ``tmp_folder``/``output_folder``
    (resolved from the surrounding scope) builds the ensemble, and its
    models and test accuracy are printed.
    """
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    split = sklearn.model_selection.train_test_split(X, y, random_state=1)
    X_train, X_test, y_train, y_test = split

    spawn_classifier = get_spawn_classifier(X_train, y_train)
    workers = []
    for worker_seed in range(4):  # set this at roughly half of your cores
        worker = multiprocessing.Process(
            target=spawn_classifier,
            args=(worker_seed, 'digits'),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed
    # now if necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    test_accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
    print("Accuracy score", test_accuracy)
def test_fit(self):
    """Fit on iris and expect a test score of at least 0.8.

    One folder serves as both tmp and output folder.  The test also
    checks that the task recorded on the wrapped AutoML object is
    ``MULTICLASS_CLASSIFICATION`` and cleans the folder up afterwards.
    """
    workdir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(workdir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=5,
        tmp_folder=workdir,
        output_folder=workdir,
    )
    classifier.fit(X_train, Y_train)

    score = classifier.score(X_test, Y_test)
    print(classifier.show_models())

    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(classifier._automl._automl._task,
                     MULTICLASS_CLASSIFICATION)

    # Drop the estimator before removing its working folder.
    del classifier
    self._tearDown(workdir)
def test_multilabel(self):
    """Fit on the multilabel variant of iris and check the predictions.

    Expects predictions of shape ``(50, 3)``, a macro F1 score of at
    least 0.9 on the test split, and a mean predicted probability on the
    training data of about 0.33 (one decimal place).
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
    output = os.path.join(self.test_dir, '..', '.out_multilabel_fit')
    for folder in (tmp, output):
        self._setUp(folder)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_multilabel=True)

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=tmp,
        output_folder=output,
    )
    classifier.fit(X_train, Y_train)

    predicted = classifier.predict(X_test)
    self.assertEqual(predicted.shape, (50, 3))
    self.assertGreaterEqual(f1_macro(Y_test, predicted), 0.9)

    train_probabilities = classifier.predict_proba(X_train)
    self.assertAlmostEqual(np.mean(train_probabilities), 0.33, places=1)
def test_fit(self):
    """Fit on iris and expect a test score of at least 0.8.

    Skipped when ``self.travis`` is truthy.  One folder serves as both
    tmp and output folder; it is removed again at the end.
    """
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    workdir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(workdir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        tmp_folder=workdir,
        output_folder=workdir,
    )
    classifier.fit(X_train, Y_train)

    score = classifier.score(X_test, Y_test)
    print(classifier.show_models())

    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(classifier._task, MULTICLASS_CLASSIFICATION)

    # Drop the estimator before removing its working folder.
    del classifier
    self._tearDown(workdir)
def test_binary(self):
    """Fit on the binary variant of iris and check predictions and output.

    The test split is handed to ``fit`` together with the dataset name
    ``'binary_test_dataset'``.  Afterwards the test expects predictions of
    shape ``(50,)``, an accuracy of at least 0.9, and a file named
    ``binary_test_dataset_test_1.predict`` in the output folder.
    """
    # The two directory names used to be swapped (tmp pointed at the
    # '.out_' folder and output at the '.tmp_' folder); they now follow
    # the tmp -> '.tmp_*' / output -> '.out_*' scheme.
    tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    output = os.path.join(self.test_dir, '..', '.out_binary_fit')
    self._setUp(output)
    self._setUp(tmp)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_binary=True)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, ))
    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    output_files = os.listdir(output)
    self.assertIn('binary_test_dataset_test_1.predict', output_files)
def spawn_classifier(seed, dataset_name):
    """Fit one shared-mode ``AutoSklearnClassifier`` on 1000 digits samples.

    The digits data is shuffled and the first 1000 shuffled samples are
    used for training.  ``tmp_folder`` and ``output_folder`` are resolved
    from the surrounding scope.

    Args:
        seed: Seed passed to the classifier.
        dataset_name: Dataset name passed to ``fit``.
    """
    digits = sklearn.datasets.load_digits()
    X = digits.data
    y = digits.target

    # NOTE(review): the shuffle draws from NumPy's global random state and
    # is not seeded here, so the training subset depends on whatever state
    # the calling process has -- confirm this is intended for workers that
    # are meant to share one dataset.
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)

    # Only the training part is needed in a worker; the former X_test /
    # y_test slices were never used.
    train_indices = indices[:1000]
    X_train = X[train_indices]
    y_train = y[train_indices]

    automl = AutoSklearnClassifier(
        time_left_for_this_task=60,
        per_run_time_limit=60,
        ml_memory_limit=1024,
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        ensemble_size=0,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    automl.fit(X_train, y_train, dataset_name=dataset_name)
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """List inputs must reach the underlying calls as ``np.ndarray``.

    ``fit_ensemble``, ``refit`` and ``fit`` are call-recording stand-ins
    (they expose ``call_count`` and ``call_args``); presumably they are
    injected by patch decorators that are not visible here.
    """
    classifier = AutoSklearnClassifier()
    features = [[1], [2], [3]]
    targets = [1, 2, 3]

    # fit: both positional arguments have to be arrays
    classifier.fit(features, targets)
    self.assertEqual(fit.call_count, 1)
    for position in (0, 1):
        self.assertIsInstance(fit.call_args[0][position], np.ndarray)

    # refit: both positional arguments have to be arrays
    classifier.refit(features, targets)
    self.assertEqual(refit.call_count, 1)
    for position in (0, 1):
        self.assertIsInstance(refit.call_args[0][position], np.ndarray)

    # fit_ensemble: the single positional argument has to be an array
    classifier.fit_ensemble(targets)
    self.assertEqual(fit_ensemble.call_count, 1)
    self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
'initial_configurations_via_metalearning': 0, 'ensemble_size': 0, 'ensemble_nbest': 0, 'seed': seed, 'ml_memory_limit': 3072, 'resampling_strategy': 'partial-cv', 'resampling_strategy_arguments': {'folds': 10}, 'delete_tmp_folder_after_terminate': False, 'tmp_folder': tmp_dir, 'disable_evaluator_output': True, } X_train, y_train, X_test, y_test, cat = load_task(task_id) if task_type == 'classification': automl = AutoSklearnClassifier(**automl_arguments) metric = balanced_accuracy elif task_type == 'regression': automl = AutoSklearnRegressor(**automl_arguments) metric = r2 else: raise ValueError(task_type) automl.fit(X_train, y_train, dataset_name=str(task_id), metric=metric, feat_type=cat) data = automl._automl._backend.load_datamanager() # Data manager can't be replaced with save_datamanager, it has to be deleted # first os.remove(automl._automl._backend._get_datamanager_pickle_filename()) data.data['X_test'] = X_test data.data['Y_test'] = y_test
processes = [] for i in range(4): # set this at roughly half of your cores p = multiprocessing.Process(target=spawn_classifier, args=(i, "digits")) p.start() processes.append(p) for p in processes: p.join() print("Starting to build an ensemble!") automl = AutoSklearnClassifier( time_left_for_this_task=15, per_run_time_limit=15, ml_memory_limit=1024, shared_mode=True, ensemble_size=50, ensemble_nbest=200, tmp_folder=tmp_folder, output_folder=output_folder, initial_configurations_via_metalearning=0, seed=1, ) # Both the ensemble_size and ensemble_nbest parameters can be changed now if # necessary automl.fit_ensemble( y_train, task=MULTICLASS_CLASSIFICATION, metric=ACC_METRIC, precision="32", dataset_name="digits", ensemble_size=20,
def test_fit_pSMAC(self):
    """Shared-mode runs must pick up models written by another run.

    A first shared-mode classifier (seed 1) is fitted on digits.  A dummy
    model is then planted in the shared tmp folder: its ensemble
    predictions reproduce the stored true targets except for the last
    entry, and the model itself returns one-hot test probabilities.  A
    second classifier (seed 2) builds an ensemble from the shared folder;
    the test expects exactly one stored ensemble, an accuracy of at least
    0.90 and the dummy model among the ensemble's models.
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    for folder in (tmp, output):
        self._setUp(folder)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

    # test parallel Classifier to predict classes, not only indexes
    Y_train += 1
    Y_test += 1

    # Settings shared by both classifiers; only the seed differs.
    shared_settings = dict(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )

    automl = AutoSklearnClassifier(seed=1, **shared_settings)
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correctly.
    backend_dir = os.path.join(tmp, '.auto-sklearn')
    targets_path = os.path.join(backend_dir, 'true_targets_ensemble.npy')
    with open(targets_path, 'rb') as handle:
        true_targets_ensemble = np.load(handle)
    # Change the last target so the dummy predictions are not perfect.
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)

    # One-hot encode the (modified) targets as ensemble predictions.
    probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)
    for row, label in enumerate(true_targets_ensemble):
        probas[row, label] = 1.0

    dummy_predictions_path = os.path.join(
        backend_dir,
        'predictions_ensemble',
        'predictions_ensemble_1_00030.npy',
    )
    with open(dummy_predictions_path, 'wb') as handle:
        np.save(handle, probas)

    # One-hot test probabilities; the labels were shifted by one above,
    # hence the ``- 1`` when turning them back into column indexes.
    probas_test = np.zeros((len(Y_test), 10), dtype=float)
    for row, label in enumerate(Y_test):
        probas_test[row, label - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    backend = Backend(BackendContext(tmp, output, False, False, True))
    backend.save_model(dummy, 30, 1)

    # The name is rebound on purpose so the first classifier is released
    # at the same point as before.
    automl = AutoSklearnClassifier(seed=2, **shared_settings)
    automl.fit_ensemble(
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='iris',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    stored_ensembles = os.listdir(os.path.join(backend_dir, 'ensembles'))
    self.assertEqual(len(stored_ensembles), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

    models = automl._automl.models_
    classifier_types = [type(model) for model in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)
# Everything from sample 1000 onwards is held out for testing.
y_test = y[1000:]

# Start the workers; each one runs ``spawn_classifier`` with its loop index
# and the dataset name 'digits'.
processes = []
for i in range(4):  # set this at roughly half of your cores
    p = multiprocessing.Process(
        target=spawn_classifier,
        args=(i, 'digits'),
    )
    p.start()
    processes.append(p)

# Wait until every worker has finished before building the ensemble.
for p in processes:
    p.join()

print('Starting to build an ensemble!')
automl = AutoSklearnClassifier(
    time_left_for_this_task=15,
    per_run_time_limit=15,
    ml_memory_limit=1024,
    shared_mode=True,
    ensemble_size=50,
    ensemble_nbest=200,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    initial_configurations_via_metalearning=0,
    seed=1,
)

# Both the ensemble_size and ensemble_nbest parameters can be changed later
automl.fit_ensemble(
    task=MULTICLASS_CLASSIFICATION,
    metric=ACC_METRIC,
    precision='32',
    dataset_name='digits',
    ensemble_size=10,
    ensemble_nbest=10,
)

predictions = automl.predict(X_test)
print(automl.show_models())