def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Function that trains and tests data using auto-sklearn"""

    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    categ_cols = df_types[df_types.NAME != 'target']['TYPE'].values.ravel()

    if m_type == 'classification':
        automl = AutoSklearnClassifier(time_left_for_this_task=TIME_PER_TASK,
                                       per_run_time_limit=int(TIME_PER_TASK / 8),
                                       seed=seed,
                                       resampling_strategy='cv',
                                       resampling_strategy_arguments={'folds': 5},
                                       delete_tmp_folder_after_terminate=False)
    else:
        automl = AutoSklearnRegressor(time_left_for_this_task=TIME_PER_TASK,
                                      per_run_time_limit=int(TIME_PER_TASK / 8),
                                      seed=seed,
                                      resampling_strategy='cv',
                                      resampling_strategy_arguments={'folds': 5},
                                      delete_tmp_folder_after_terminate=False)

    automl.fit(X_train.copy(), y_train.copy(), feat_type=categ_cols,
               metric=f1_weighted if m_type == 'classification' else mean_squared_error)
    automl.refit(X_train.copy(), y_train.copy())

    return (automl.predict_proba(X_test) if m_type == 'classification'
            else automl.predict(X_test))

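# A minimal usage sketch for process_auto_sklearn above. It assumes the
# module-level constant TIME_PER_TASK and that df_types is a DataFrame with one
# row per column (features plus a 'target' row) holding 'NAME' and 'TYPE'
# columns, where TYPE values are auto-sklearn feature types
# ('Numerical'/'Categorical'). The toy data and column names are illustrative,
# not taken from the original project.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

TIME_PER_TASK = 120  # assumed module-level time budget in seconds

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(X, y, random_state=1)

df_types = pd.DataFrame({
    'NAME': ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'target'],
    'TYPE': ['Numerical', 'Numerical', 'Numerical', 'Numerical', 'Categorical'],
})

# Returns class probabilities for the classification case.
probas = process_auto_sklearn(X_train, X_test, y_train, df_types,
                              m_type='classification', seed=1)
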
def test_binary(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    output = os.path.join(self.test_dir, '..', '.out_binary_fit')
    self._setUp(output)
    self._setUp(tmp)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)

    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, ))

    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    output_files = os.listdir(output)
    self.assertIn('binary_test_dataset_test_1.predict', output_files)

def test_binary(tmp_dir, output_dir, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_binary=True)

    automl = AutoSklearnClassifier(time_left_for_this_task=40,
                                   per_run_time_limit=10,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)

    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0, print_debug_information(automl)

    output_files = glob.glob(
        os.path.join(output_dir, 'binary_test_dataset_test_*.predict'))
    assert len(output_files) > 0, (output_files, print_debug_information(automl))

def test_can_pickle_classifier(self):
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    output = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                   per_run_time_limit=15,
                                   tmp_folder=output,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions)
    self.assertTrue(initial_accuracy > 0.75)

    # Test pickle
    dump_file = os.path.join(output, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    self.assertTrue(restored_accuracy > 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

    # Test joblib
    dump_file = os.path.join(output, 'automl.dump.joblib')

    sklearn.externals.joblib.dump(automl, dump_file)
    restored_automl = sklearn.externals.joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    self.assertTrue(restored_accuracy > 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

def test_can_pickle_classifier(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
    output = os.path.join(self.test_dir, '..', '.out_can_pickle')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions)
    self.assertGreaterEqual(initial_accuracy, 0.75)
    self.assertGreater(self._count_succeses(automl.cv_results_), 0)

    # Test pickle
    dump_file = os.path.join(output, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    self.assertGreaterEqual(restored_accuracy, 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

    # Test joblib
    dump_file = os.path.join(output, 'automl.dump.joblib')

    joblib.dump(automl, dump_file)
    restored_automl = joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    self.assertGreaterEqual(restored_accuracy, 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

def main():
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    for i in range(4):  # set this at roughly half of your cores
        p = multiprocessing.Process(
            target=spawn_classifier,
            args=(i, 'breast_cancer'),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='breast_cancer',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

def test_binary(self):
    output = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=output,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, ))

    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

class ClassificationPipeline(Pipeline):
    """
    Classification pipeline
    """

    def __init__(self, **pipeline_constructor_params):
        self.estimator = AutoSklearnClassifier(**pipeline_constructor_params)

    def fit(self, x, y):
        self.estimator.fit(x, y)

    def run(self, x):
        return self.estimator.predict(x)

    def as_json(self):
        pipeline = self.estimator.get_models_with_weights()[0][1]
        return pipeline.config.get_dictionary()

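# A minimal sketch of how the ClassificationPipeline wrapper above might be
# used. Constructor keyword arguments are forwarded verbatim to
# AutoSklearnClassifier; the dataset and time budget here are illustrative,
# not taken from the original project.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipeline = ClassificationPipeline(time_left_for_this_task=60,
                                  per_run_time_limit=15)
pipeline.fit(X_train, y_train)       # delegates to AutoSklearnClassifier.fit
predictions = pipeline.run(X_test)   # delegates to predict
config = pipeline.as_json()          # hyperparameters of the top-weighted model
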
def test_can_pickle_classifier(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
    output = os.path.join(self.test_dir, '..', '.out_can_pickle')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions)
    self.assertGreaterEqual(initial_accuracy, 0.75)

    # Test pickle
    dump_file = os.path.join(output, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    self.assertGreaterEqual(restored_accuracy, 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

    # Test joblib
    dump_file = os.path.join(output, 'automl.dump.joblib')

    sklearn.externals.joblib.dump(automl, dump_file)
    restored_automl = sklearn.externals.joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    self.assertGreaterEqual(restored_accuracy, 0.75)
    self.assertEqual(initial_accuracy, restored_accuracy)

def test_can_pickle_classifier(tmp_dir, output_dir, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions)
    assert initial_accuracy >= 0.75
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(automl.performance_over_time_) is True

    # Test pickle
    dump_file = os.path.join(output_dir, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    assert restored_accuracy >= 0.75
    assert initial_accuracy == restored_accuracy

    # Test joblib
    dump_file = os.path.join(output_dir, 'automl.dump.joblib')

    joblib.dump(automl, dump_file)
    restored_automl = joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions)
    assert restored_accuracy >= 0.75
    assert initial_accuracy == restored_accuracy

def test_multilabel(self):
    output = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_multilabel=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=output,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, 3))

    score = f1_macro(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    probs = automl.predict_proba(X_train)
    self.assertAlmostEqual(np.mean(probs), 0.33333333333333331)

def main():
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    for i in range(4):  # set this at roughly half of your cores
        p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

def test_multilabel(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
    output = os.path.join(self.test_dir, '..', '.out_multilabel_fit')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_multilabel=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, 3))

    score = f1_macro(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    probs = automl.predict_proba(X_train)
    self.assertAlmostEqual(np.mean(probs), 0.33, places=1)

def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna('columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    # Make sure that it is at least better than random.
    # We use the same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that it is at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(automl.performance_over_time_) is True

def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna('columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    # Make sure that it is at least better than random.
    # We use the same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, extract_msg_from_log(log_file_path)

    automl.refit(X, y)

    # Make sure that it is at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0

def gelpi_avdan_autosklearn():
    train_df = pd.read_csv(
        '/home/shoe/automl_scores/TR11_Gelpi_Avdan_problem_TRAIN/11-11-2019 01:56:40/splits/train.csv'
    )
    test_df = pd.read_csv(
        '/home/shoe/automl_scores/TR11_Gelpi_Avdan_problem_TRAIN/11-11-2019 01:56:40/splits/test.csv'
    )

    X = [
        "polity2b", "polity2borigin", "loggdptarget", "logpop", "majpowhome",
        "majpoworigin", "coloniallink", "ethnictie", "ethnicPCW", "ethnicany911",
        "dyadalliance", "dyadalliancePCW", "rivalrydummy", "postCW", "post911",
        "lndyaddist", "dyadpcyear1", "dyadpcyear2", "dyadpcyear3", "dyadpcyear4",
        "year"
    ]
    y = 'incident'

    automl = AutoSklearnClassifier(time_left_for_this_task=60 * 10)

    stimulus, preprocessor = preprocess(
        train_df,
        {'problem': {"predictors": X, 'targets': [y], 'categorical': []}})

    automl.fit(stimulus, train_df[y])
    automl.refit(stimulus, train_df[y])

    stimulus_test = preprocessor.transform(test_df)

    global predictions
    predictions = automl.predict_proba(stimulus_test)
    global pred_raw
    pred_raw = automl.predict(stimulus_test)

    print(predictions)
    print(roc_auc_score(test_df[y], predictions[:, 1]))

def test_binary(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    output = os.path.join(self.test_dir, '..', '.out_binary_fit')
    self._setUp(output)
    self._setUp(tmp)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)

    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (50, ))

    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    output_files = os.listdir(output)
    self.assertIn('binary_test_dataset_test_1.predict', output_files)

def test_multilabel(tmp_dir, output_dir, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_multilabel=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, 3), print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0, print_debug_information(automl)

    score = f1_macro(Y_test, predictions)
    assert score >= 0.9, print_debug_information(automl)

    probs = automl.predict_proba(X_train)
    assert np.mean(probs) == pytest.approx(0.33, rel=1e-1)

def test_multilabel(tmp_dir, output_dir, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_multilabel=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)

    automl.fit(X_train, Y_train)

    # Log file path
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, 3), extract_msg_from_log(log_file_path)
    assert count_succeses(automl.cv_results_) > 0, extract_msg_from_log(log_file_path)

    score = f1_macro(Y_test, predictions)
    assert score >= 0.9, extract_msg_from_log(log_file_path)

    probs = automl.predict_proba(X_train)
    assert np.mean(probs) == pytest.approx(0.33, rel=1e-1)

def gleditsch_ward_autosklearn():
    train_df = pd.read_csv(
        '/home/shoe/automl_scores/TR12c_Gleditsch_Ward_Combined_problem_TRAIN/13-11-2019 01:16:06/splits/train.csv'
    )
    test_df = pd.read_csv(
        '/home/shoe/automl_scores/TR12c_Gleditsch_Ward_Combined_problem_TRAIN/13-11-2019 01:16:06/splits/test.csv'
    )

    X = [
        "pmid", "py", "py2", "py3", "terriss", "riveriss", "mariss",
        "terrAtt", "rivAtt", "marAtt", "minpol", "rbal", "lnkmdist"
    ]
    y = 'mido'

    automl = AutoSklearnClassifier(time_left_for_this_task=60 * 5)

    stimulus, preprocessor = preprocess(
        train_df,
        {'problem': {"predictors": X, 'targets': [y], 'categorical': []}})

    automl.fit(stimulus, train_df[y])
    automl.refit(stimulus, train_df[y])

    stimulus_test = preprocessor.transform(test_df)

    global predictions
    predictions = automl.predict_proba(stimulus_test)
    global pred_raw
    pred_raw = automl.predict(stimulus_test)

    print(predictions)
    print(roc_auc_score(test_df[y], predictions[:, 1]))

def test_classification_pandas_support(self):
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna('columns')

    # This test only makes sense if the input is a dataframe
    self.assertTrue(isinstance(X, pd.DataFrame))
    self.assertTrue(isinstance(y, pd.Series))
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        seed=5,
    )

    automl.fit(X, y)

    # Make sure that it is at least better than random.
    # We use the same X_train==X_test to test code quality
    self.assertTrue(automl.score(X, y) > 0.555)

    automl.refit(X, y)

    # Make sure that it is at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    self.assertTrue(accuracy(y, prediction) > 0.555)
    self.assertGreater(self._count_succeses(automl.cv_results_), 0)

def train(X, y):
    """example of auto-sklearn for a classification dataset"""
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=1
    )
    # define search
    model = AutoSklearnClassifier(
        time_left_for_this_task=30,
        # per_run_time_limit=30,
        # n_jobs=8,
    )
    # perform the search
    model.fit(X_train, y_train)
    # summarize
    print(model.sprint_statistics())
    # evaluate best model
    y_hat = model.predict(X_test)
    acc = accuracy_score(y_test, y_hat)
    print("Accuracy: %.3f" % acc)

    model_path = Path("./catanatron/players/estimator.pickle").resolve()
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

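# A minimal sketch of calling the train() helper above on a synthetic dataset.
# make_classification and its parameters are illustrative; the original caller
# presumably passes game features from the surrounding catanatron project, and
# the pickle path assumes that repository layout already exists.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
train(X, y)  # fits auto-sklearn and pickles the model to the estimator path
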
c.fit_ensemble(
    task=BINARY_CLASSIFICATION,
    y=y_train,
    metric=F1_METRIC,
    precision='32',
    dataset_name='foobar',
    ensemble_size=10,
    ensemble_nbest=15)

sleep(20)
p("Ensemble built")

p("Show models")
print(c.show_models())

p("Predicting")
y_hat = c.predict(X_test.values)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))

if df_unknown.shape[0] == 0:
    p("nothing to predict. Prediction dataset is empty.")
    exit()

p("Re-fitting on full known dataset. This can take long for a large set.")
try:
    c.refit(X.values, y)
except Exception as e:
    p("Refit failed, restarting")
    print(e)
    try:
        X = X.values
        indices = np.arange(X.shape[0])

    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,
)
anytime_model = cls

# Prequential evaluation
for i in range(24, 27):
    # Test on next batch for accuracy
    X_test = B[i].iloc[:, 0:-1]
    y_test = B[i].iloc[:, -1]
    y_test = y_test.to_numpy()

    y_hat = cls.predict(X_test)
    print("y_hat")
    print(y_hat)
    print("y_test")
    print(y_test)
    accuracy_value = sklearn.metrics.accuracy_score(y_test, y_hat)
    print("Test batch %d - Test score %f\n" % (i, accuracy_value))

    # Check for drift
    drift_in_batch = 0
    for j in range(0, len(B[i])):
        drift_detector.add_element(y_test[j] - y_hat[j])
        if drift_detector.detected_change():
            print('Drift in performance detected at sample {}'.format(j))

print("Starting to build an ensemble!") automl = AutoSklearnClassifier( time_left_for_this_task=15, per_run_time_limit=15, ml_memory_limit=1024, shared_mode=True, ensemble_size=50, ensemble_nbest=200, tmp_folder=tmp_folder, output_folder=output_folder, initial_configurations_via_metalearning=0, seed=1, ) # Both the ensemble_size and ensemble_nbest parameters can be changed now if # necessary automl.fit_ensemble( y_train, task=MULTICLASS_CLASSIFICATION, metric=ACC_METRIC, precision="32", dataset_name="digits", ensemble_size=20, ensemble_nbest=50, ) predictions = automl.predict(X_test) print(automl.show_models()) print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
""" A basic demonstration of auto-sklearn. (c) Tony Liu 2018. """ import sklearn import numpy as np from sklearn import datasets from sklearn.metrics import accuracy_score from autosklearn.classification import AutoSklearnClassifier rand_seed = 2 np.random.seed(rand_seed) # load MNIST X, y = datasets.load_digits(return_X_y=True) X_train, X_test, y_train, y_test = \ sklearn.model_selection.train_test_split(X, y, random_state=rand_seed) # this is a comment # constrain automl so that it runs in reasonable time for demo automl = AutoSklearnClassifier(time_left_for_this_task=20, per_run_time_limit=10, ml_memory_limit=1024, seed=rand_seed, ensemble_size=1, initial_configurations_via_metalearning=0) automl.fit(X_train, y_train) y_hat = automl.predict(X_test) print("Accuracy score", accuracy_score(y_test, y_hat))
print('Starting to build an ensemble!')
automl = AutoSklearnClassifier(
    time_left_for_this_task=15,
    per_run_time_limit=15,
    ml_memory_limit=1024,
    shared_mode=True,
    ensemble_size=50,
    ensemble_nbest=200,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    initial_configurations_via_metalearning=0,
    seed=1,
)

# Both the ensemble_size and ensemble_nbest parameters can be changed now if
# necessary
automl.fit_ensemble(
    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,
    precision='32',
    dataset_name='digits',
    ensemble_size=20,
    ensemble_nbest=50,
)

predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

print('[INFO] Loading digits dataset.')
X, y = load_digits(return_X_y=True)

print('[INFO] Splitting.')
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    train_size=0.8)
print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')

print('[INFO] Finding best model...')
classifier = AutoSklearnClassifier(per_run_time_limit=360,
                                   ml_memory_limit=1024 * 6,
                                   time_left_for_this_task=7200)
start = time.time()
X_train = X_train.astype('float')
classifier.fit(X_train, y_train)
print(f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.')

predictions = classifier.predict(X_test)

print('--- CLASSIFICATION REPORT: ---')
print(classification_report(y_test, predictions))

print('\n\n--- MODELS: ---')
print(classifier.show_models())

print('\n\n--- STATISTICS: ---')
print(classifier.sprint_statistics())

def test_fit_pSMAC(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

    # test parallel Classifier to predict classes, not only indexes
    Y_train += 1
    Y_test += 1

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct
    true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                              'true_targets_ensemble.npy')
    with open(true_targets_ensemble_path, 'rb') as fh:
        true_targets_ensemble = np.load(fh)
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)
    probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)

    for i, value in enumerate(true_targets_ensemble):
        probas[i, value] = 1.0
    dummy_predictions_path = os.path.join(
        tmp,
        '.auto-sklearn',
        'predictions_ensemble',
        'predictions_ensemble_1_00030.npy',
    )
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 10), dtype=float)
    for i, value in enumerate(Y_test):
        probas_test[i, value - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    context = BackendContext(tmp, output, False, False, True)
    backend = Backend(context)
    backend.save_model(dummy, 30, 1)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit_ensemble(
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='iris',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    self.assertEqual(
        len(os.listdir(os.path.join(tmp, '.auto-sklearn', 'ensembles'))), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

    models = automl._automl.models_
    classifier_types = [type(c) for c in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)

def evaluate_ml_algorithm(dataset, algo, run_id, obj_metric,
                          time_limit=600, seed=1, task_type=None):
    if algo == 'lightgbm':
        _algo = ['LightGBM']
        add_classifier(LightGBM)
    elif algo == 'logistic_regression':
        _algo = ['Logistic_Regression']
        add_classifier(Logistic_Regression)
    else:
        _algo = [algo]
    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))

    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    print(set(train_data.data[1]))

    raw_data, test_raw_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data
    feat_type = [
        'Categorical' if _type == CATEGORICAL else 'Numerical'
        for _type in raw_data.feature_types
    ]

    from autosklearn.metrics import balanced_accuracy as balanced_acc
    automl = AutoSklearnClassifier(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=180,
        n_jobs=1,
        include_estimators=_algo,
        initial_configurations_via_metalearning=0,
        ensemble_memory_limit=16384,
        ml_memory_limit=16384,
        # tmp_folder='/var/folders/0t/mjph32q55hd10x3qr_kdd2vw0000gn/T/autosklearn_tmp',
        ensemble_size=1,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    automl.fit(X.copy(), y.copy(), feat_type=feat_type, metric=balanced_acc)

    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    valid_results = automl.cv_results_['mean_test_score']
    print('Eval num: %d' % (len(valid_results)))
    validation_score = np.max(valid_results)

    # Test performance.
    automl.refit(X.copy(), y.copy())
    predictions = automl.predict(X_test)
    test_score = balanced_accuracy_score(y_test, predictions)

    # Print statistics about the auto-sklearn run such as number of
    # iterations, number of models failed with a time out.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy:', validation_score)
    print("Test Accuracy :", test_score)

    save_path = save_dir + '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric,
                                                   run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, algo, validation_score, test_score, task_type], f)

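# A minimal sketch of invoking evaluate_ml_algorithm above. The dataset name,
# metric label and run id are illustrative; CLS_TASKS, save_dir and the data
# loaders are assumed to come from the surrounding project.
evaluate_ml_algorithm(dataset='pc4',
                      algo='lightgbm',
                      run_id=0,
                      obj_metric='bal_acc',
                      time_limit=600,
                      seed=1,
                      task_type=None)
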
# # define dataset
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define search
# model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy',
#                        verbosity=2, random_state=1, n_jobs=-1)
# # perform the search
# model.fit(X, y)
# plot_confusion_matrix(model, X, y)
# # export the best model
# # model.export('tpot_best_model.py')

if __name__ == '__main__':
    # example of auto-sklearn for a classification dataset
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from autosklearn.classification import AutoSklearnClassifier

    # define dataset (synthetic example; make_classification parameters are illustrative)
    X, y = make_classification(n_samples=1000, n_features=20, random_state=1)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=1)
    # define search
    model = AutoSklearnClassifier(time_left_for_this_task=4 * 60,
                                  per_run_time_limit=60,
                                  n_jobs=8,
                                  resampling_strategy='cv',
                                  resampling_strategy_arguments={'folds': 10})
    # perform the search
    model.fit(X_train, y_train)
    # summarize
    print(model.sprint_statistics())
    print(model.cv_results_)
    # evaluate best model
    y_hat = model.predict(X_test)
    acc = accuracy_score(y_test, y_hat)
    print("Accuracy: %.3f" % acc)

def test_fit_pSMAC(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

    # test parallel Classifier to predict classes, not only indexes
    Y_train += 1
    Y_test += 1

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct
    true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                              'true_targets_ensemble.npy')
    with open(true_targets_ensemble_path, 'rb') as fh:
        true_targets_ensemble = np.load(fh)
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)
    probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)

    for i, value in enumerate(true_targets_ensemble):
        probas[i, value] = 1.0
    dummy_predictions_path = os.path.join(
        tmp,
        '.auto-sklearn',
        'predictions_ensemble',
        'predictions_ensemble_1_00030.npy',
    )
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 10), dtype=float)
    for i, value in enumerate(Y_test):
        probas_test[i, value - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    context = BackendContext(tmp, output, False, False, True)
    backend = Backend(context)
    backend.save_model(dummy, 30, 1)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit_ensemble(Y_train,
                        task=MULTICLASS_CLASSIFICATION,
                        metric=accuracy,
                        precision='32',
                        dataset_name='iris',
                        ensemble_size=20,
                        ensemble_nbest=50,
                        )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    self.assertEqual(len(os.listdir(os.path.join(tmp, '.auto-sklearn',
                                                 'ensembles'))), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

    models = automl._automl.models_
    classifier_types = [type(c) for c in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)

def fit_automl(self, run_time):
    """Runs auto-sklearn on the uploaded data and prints results.

    Side effects:
        - Enables upload_widget

    Args:
        run_time (int): The run time for auto-sklearn in seconds.

    Returns:
        automl (AutoSklearnClassifier): fitted auto-sklearn model.
    """
    automl_args = {}
    automl_args['time_left_for_this_task'] = run_time
    # TODO functionality to load this from Mongo
    automl_args['metadata_directory'] = ".metalearning/metalearning_files/"
    # automl_args['metadata_directory'] = "../metalearning/metalearning_files/"

    automl = AutoSklearnClassifier(**automl_args)

    thread = threading.Thread(target=self.update_progress,
                              args=(self.progress_widget, ))
    thread.start()

    # always load a copy of the latest dataset
    cur_data = self.data[-1].copy()
    y = cur_data.pop(0)
    X, feat_types, _ = model_utils.process_feat_types(cur_data)

    X_train = X.iloc[self.train_idxs]
    y_train = y.iloc[self.train_idxs]
    X_test = X.iloc[self.test_idxs]
    y_test = y.iloc[self.test_idxs]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with HiddenPrints():
            automl.fit(X_train, y_train, feat_type=feat_types)

    # Automl has finished fitting:
    self.models.append(copy.deepcopy(automl))

    with self.event_output_widget:
        print("FITTING COMPLETED WITH FITTING TIME PARAMETER AS ",
              int(run_time / 60), " MINUTES")

    with self.metrics_output_widget:
        y_train_hat = automl.predict(X_train)
        train_accuracy_score = metrics.accuracy_score(y_train, y_train_hat)
        y_test_hat = automl.predict(X_test)
        test_accuracy_score = metrics.accuracy_score(y_test, y_test_hat)
        thresholdout_score = model_utils.thresholdout(
            train_accuracy_score, test_accuracy_score)
        output_str = "Run {}: train acc: {:.4}, noised test acc: {:.4}\n".format(
            self.queries, train_accuracy_score, thresholdout_score)
        print(output_str)

    with self.model_output_widget:
        print("MODELS:")
        print(automl.get_models_with_weights())

    if self.textbox_upload:
        self.upload_button.disabled = False
        self.upload_text.disabled = False
    else:
        self.upload_widget.disabled = False

    if self.queries == self.budget_widget.value:
        self.on_budget_completion()

    return automl

def task_executor(task_info):
    """Execute task

    :param task_info: detail of task, dict
    """
    data_path = task_info.get("data_path")
    time_max = task_info.get("time_max")
    task_id = task_info.get("task_id")
    model_type = task_info.get("model_type")

    LOG.info("Load data, path=%s", data_path)
    status = "done"
    try:
        data_set = pd.read_csv(data_path)
        x_set = data_set[data_set.columns[:len(data_set.keys()) - 1]]
        y_set = data_set[data_set.columns[-1]]
        x_train, x_test, y_train, y_test = train_test_split(x_set, y_set,
                                                            test_size=0.3,
                                                            random_state=0)
        LOG.info("start optimizer.")
        if platform.system() == "Linux":
            from autosklearn.classification import AutoSklearnClassifier
            from autosklearn.regression import AutoSklearnRegressor
            if model_type == "Classification":
                model = AutoSklearnClassifier(
                    time_left_for_this_task=time_max + 5,
                    per_run_time_limit=int(time_max / 10),
                    include_preprocessors=["no_preprocessing"],
                )
            elif model_type == "Regression":
                model = AutoSklearnRegressor(
                    time_left_for_this_task=time_max + 5,
                    per_run_time_limit=int(time_max / 10),
                    include_preprocessors=["no_preprocessing"],
                )
            else:
                LOG.error("not support model type=%s", model_type)
                raise ValueError("not support model type")
        else:
            from sklearn.ensemble import RandomForestClassifier, \
                RandomForestRegressor
            if model_type == "Classification":
                model = RandomForestClassifier(n_estimators=500)
            elif model_type == "Regression":
                model = RandomForestRegressor(n_estimators=500)
            else:
                LOG.error("not support model type=%s", model_type)
                raise ValueError("not support model type")

        model.fit(x_train, y_train)
        prediction = model.predict(x_test)
        if model_type == "Classification":
            best_metrics = accuracy_score(y_test, prediction)
            LOG.info("The accuracy is %s", best_metrics)
        else:
            best_metrics = mean_squared_error(y_test, prediction)
            LOG.info("The mse is %s", best_metrics)
    except ServerException as server_error:
        LOG.error("Some thing wrong, reason=%s", server_error)
        best_metrics = 0
        status = "failed"

    update = dict(end_time=int(time.time()),
                  best_metrics=best_metrics,
                  status=status)
    Task.objects.filter(task_id=task_id).update(**update)

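# A minimal sketch of the task_info dict that task_executor above expects.
# The keys mirror the .get() calls in the function; the concrete values and
# the CSV path are illustrative, not from the original service.
task_info = {
    "data_path": "/tmp/example_dataset.csv",  # CSV with the target in the last column
    "time_max": 120,                          # optimisation budget in seconds
    "task_id": "demo-task-001",               # used to update the Task record afterwards
    "model_type": "Classification",           # or "Regression"
}
task_executor(task_info)
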
# XGBOOST params - max_depth, min_child_weight, gamma
# clfs = [XGB(max_depth=x) for x in range(1,10)]+\
#        [XGB(min_child_weight=x) for x in range(1,10)]+\
#        [XGB(gamma=x) for x in np.linspace(0,1,10)]

X = data[0]
X = getRelevantData(X, 'vel_acc')
X_f = getFeatures(X, 'mean_std_max3fftpeaks')
y = data[1]
groups = data[2]

scores = []
clf = AutoC()
cv = GroupShuffleSplit(n_splits=1, test_size=0.2)
for train_index, test_index in cv.split(X_f, y, groups):
    # Split data to train and test set
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

print("{:.5f} accuracy".format(np.mean(scores)))

def test_fit_pSMAC(self):
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

    # test parallel Classifier to predict classes, not only indices
    Y_train += 1
    Y_test += 1

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    n_models_fit = len(automl.cv_results_['mean_test_score'])
    cv_results = automl.cv_results_['mean_test_score']

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    n_models_fit_2 = len(automl.cv_results_['mean_test_score'])

    # Check that the results from the first run were actually read by the
    # second run
    self.assertGreater(n_models_fit_2, n_models_fit)
    for score in cv_results:
        self.assertIn(
            score,
            automl.cv_results_['mean_test_score'],
            msg=str((automl.cv_results_['mean_test_score'], cv_results)),
        )

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct
    true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                              'true_targets_ensemble.npy')
    with open(true_targets_ensemble_path, 'rb') as fh:
        true_targets_ensemble = np.load(fh, allow_pickle=True)
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)
    probas = np.zeros((len(true_targets_ensemble), 2), dtype=float)

    for i, value in enumerate(true_targets_ensemble):
        probas[i, value] = 1.0
    dummy_predictions_path = os.path.join(
        tmp,
        '.auto-sklearn',
        'predictions_ensemble',
        'predictions_ensemble_0_999_0.0.npy',
    )
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 2), dtype=float)
    for i, value in enumerate(Y_test):
        probas_test[i, value - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    context = BackendContext(tmp, output, False, False, True)
    backend = Backend(context)
    model_path = backend.get_model_path(seed=0, idx=999, budget=0.0)
    backend.save_model(model=dummy, filepath=model_path)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=3,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
        metric=accuracy,
    )
    automl.fit_ensemble(Y_train,
                        task=BINARY_CLASSIFICATION,
                        precision='32',
                        dataset_name='breast_cancer',
                        ensemble_size=20,
                        ensemble_nbest=50,
                        )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    self.assertEqual(len(os.listdir(os.path.join(tmp, '.auto-sklearn',
                                                 'ensembles'))), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl[0]._task, BINARY_CLASSIFICATION)

    models = automl._automl[0].models_
    classifier_types = [type(c) for c in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)