def zeroconf_fit_ensemble(y):
    """Build an ensemble over the models already present in atsklrn_tempdir.

    Constructs a shared-mode AutoSklearnClassifier pointed at the shared
    tmp/output folder and combines the best runs into a small F1 ensemble.
    Returns the fitted classifier.
    """
    p("Building ensemble")
    seed = 1
    # Shared mode: reuse models produced by earlier parallel fitting runs in
    # the same tmp/output folder instead of searching from scratch.
    ensemble = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    # Combine 10 models drawn from the 15 best runs, optimizing binary F1.
    ensemble.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15,
    )
    sleep(20)
    p("Ensemble built")
    p("Show models")
    p(str(ensemble.show_models()))
    return ensemble
def zeroconf_fit_ensemble(y, atsklrn_tempdir):
    """Build an ensemble from the per-seed models stored in atsklrn_tempdir.

    Logs each step through the project logger; exceptions raised while
    fitting the ensemble are logged with a traceback and re-raised.
    Returns the fitted AutoSklearnClassifier.
    """
    lo = utl.get_logger(inspect.stack()[0][3])
    lo.info("Building ensemble")
    seed = 1
    ensemble = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    lo.info("Done AutoSklearnClassifier - seed:" + str(seed))
    try:
        lo.debug("Start ensemble.fit_ensemble - seed:" + str(seed))
        ensemble.fit_ensemble(
            task=BINARY_CLASSIFICATION,
            y=y,
            metric=autosklearn.metrics.f1,
            precision='32',
            dataset_name='foobar',
            ensemble_size=10,
            ensemble_nbest=15,
        )
    except Exception:
        # Re-acquire the logger before logging the traceback, then re-raise.
        lo = utl.get_logger(inspect.stack()[0][3])
        lo.exception("Error in ensemble.fit_ensemble - seed:" + str(seed))
        raise
    lo = utl.get_logger(inspect.stack()[0][3])
    lo.debug("Done ensemble.fit_ensemble - seed:" + str(seed))
    sleep(20)
    lo.info("Ensemble built - seed:" + str(seed))
    lo.info("Show models - seed:" + str(seed))
    # Log each model description on its own line for readability.
    for row in str(ensemble.show_models()).split("\n"):
        lo.info(row)
    return ensemble
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """Plain Python lists must arrive at the (mocked) AutoML calls as
    np.ndarray — fit and refit convert both X and y, fit_ensemble converts y."""
    automl = AutoSklearnClassifier()
    X = [[1], [2], [3]]
    y = [1, 2, 3]

    automl.fit(X, y)
    self.assertEqual(fit.call_count, 1)
    for arg in fit.call_args[0][:2]:
        self.assertIsInstance(arg, np.ndarray)

    automl.refit(X, y)
    self.assertEqual(refit.call_count, 1)
    for arg in refit.call_args[0][:2]:
        self.assertIsInstance(arg, np.ndarray)

    automl.fit_ensemble(y)
    self.assertEqual(fit_ensemble.call_count, 1)
    self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
def main():
    """Search for models with four parallel workers on breast_cancer, then
    build a single ensemble from the shared model folder and report accuracy."""
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    # Fan the search out over several processes
    # (set this at roughly half of your cores).
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    workers = []
    for i in range(4):
        worker = multiprocessing.Process(
            target=spawn_classifier,
            args=(i, 'breast_cancer'),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    # Both the ensemble_size and ensemble_nbest parameters can be changed now
    # if necessary.
    # NOTE(review): dataset_name 'digits' looks copied from the digits
    # example even though this script uses breast_cancer — confirm.
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
def multithread_tiny():
    """Run four parallel auto-sklearn searches on the Exp2 stage-2 data,
    build one ensemble from the shared folders, and persist it to disk."""
    stage2assistant = Exp2Assistant(stage=2)
    X_train = stage2assistant.train_data
    y_train = stage2assistant.train_label

    # Fan the search out over several processes
    # (set this at roughly half of your cores).
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    workers = []
    for i in range(4):
        worker = multiprocessing.Process(target=spawn_classifier,
                                         args=(i, 'label'))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=120,
        per_run_time_limit=120,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=300,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    # Both the ensemble_size and ensemble_nbest parameters can be changed now
    # if necessary.
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='label',
        ensemble_size=20,
        ensemble_nbest=60,
    )
    # Persist the fitted ensemble for later pipeline stages.
    joblib.dump(
        automl,
        path.join(path_service.get_resource(path.join("exp2", "model")),
                  "stage2_model.joblib"))
def main():
    """Search for models with four parallel workers on digits, then build a
    single ensemble from the shared model folder and report accuracy."""
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    # Fan the search out over several processes
    # (set this at roughly half of your cores).
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    workers = []
    for i in range(4):
        worker = multiprocessing.Process(target=spawn_classifier,
                                         args=(i, 'digits'))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    # Both the ensemble_size and ensemble_nbest parameters can be changed now
    # if necessary.
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
def test_classification_methods_returns_self(self):
    """fit, fit_ensemble and refit must all return the estimator itself."""
    X_train, y_train, X_test, y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   ensemble_size=0)

    self.assertIs(automl, automl.fit(X_train, y_train))
    self.assertIs(automl, automl.fit_ensemble(y_train, ensemble_size=5))
    self.assertIs(automl, automl.refit(X_train.copy(), y_train.copy()))
def test_autosklearn_classification_methods_returns_self(dask_client):
    """fit, fit_ensemble and refit must all return the estimator itself."""
    X_train, y_train, X_test, y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   per_run_time_limit=10,
                                   ensemble_size=0,
                                   dask_client=dask_client,
                                   exclude_preprocessors=['fast_ica'])

    assert automl is automl.fit(X_train, y_train)
    assert automl is automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is automl.refit(X_train.copy(), y_train.copy())
def test_fit_pSMAC(self):
    """Shared-mode (pSMAC) round trip: a second run must read the first
    run's results, and a planted near-perfect dummy model must end up in
    the final ensemble."""
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

    # test parallel Classifier to predict classes, not only indices
    Y_train += 1
    Y_test += 1

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)
    n_models_fit = len(automl.cv_results_['mean_test_score'])
    cv_results = automl.cv_results_['mean_test_score']

    # Second run with a different seed into the same shared folders.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)
    n_models_fit_2 = len(automl.cv_results_['mean_test_score'])

    # Check that the results from the first run were actually read by the
    # second run.
    self.assertGreater(n_models_fit_2, n_models_fit)
    for score in cv_results:
        self.assertIn(
            score,
            automl.cv_results_['mean_test_score'],
            msg=str((automl.cv_results_['mean_test_score'], cv_results)),
        )

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct.
    true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                              'true_targets_ensemble.npy')
    with open(true_targets_ensemble_path, 'rb') as fh:
        true_targets_ensemble = np.load(fh, allow_pickle=True)
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)

    # One-hot predictions that match the stored targets almost perfectly.
    probas = np.zeros((len(true_targets_ensemble), 2), dtype=float)
    for i, value in enumerate(true_targets_ensemble):
        probas[i, value] = 1.0
    dummy_predictions_path = os.path.join(
        tmp,
        '.auto-sklearn',
        'predictions_ensemble',
        'predictions_ensemble_0_999_0.0.npy',
    )
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 2), dtype=float)
    for i, value in enumerate(Y_test):
        probas_test[i, value - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    context = BackendContext(tmp, output, False, False, True)
    backend = Backend(context)
    model_path = backend.get_model_path(seed=0, idx=999, budget=0.0)
    backend.save_model(model=dummy, filepath=model_path)

    # Third estimator only builds the ensemble from what is on disk.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=3,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
        metric=accuracy,
    )
    automl.fit_ensemble(
        Y_train,
        task=BINARY_CLASSIFICATION,
        precision='32',
        dataset_name='breast_cancer',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    self.assertEqual(
        len(os.listdir(os.path.join(tmp, '.auto-sklearn', 'ensembles'))), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl[0]._task, BINARY_CLASSIFICATION)

    models = automl._automl[0].models_
    classifier_types = [type(c) for c in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)
# Combine the models the parallel workers wrote into the shared
# tmp/output folders into a single ensemble, then evaluate it.
print('Starting to build an ensemble!')
automl = AutoSklearnClassifier(
    time_left_for_this_task=15,
    per_run_time_limit=15,
    ml_memory_limit=1024,
    shared_mode=True,
    ensemble_size=50,
    ensemble_nbest=200,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    initial_configurations_via_metalearning=0,
    seed=1,
)
# Both the ensemble_size and ensemble_nbest parameters can be changed now if
# necessary.
automl.fit_ensemble(
    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,
    precision='32',
    dataset_name='digits',
    ensemble_size=20,
    ensemble_nbest=50,
)
predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
# Fragment: the opening of the enclosing `with ... Client(` statement lies
# before this chunk, so the construct is incomplete here; code kept verbatim.
# Fits an AutoSklearnClassifier through an externally constructed dask
# client, builds a 20-model ensemble, and reports test accuracy.
# NOTE(review): in the final loop the variable 'process' is unused and
# 'process_python_worker.join()' is called on every iteration — presumably
# it should be 'process.join()'; confirm against the original example.
address=cluster.scheduler_address) as client: automl = AutoSklearnClassifier( time_left_for_this_task=30, per_run_time_limit=10, memory_limit=1024, tmp_folder=tmp_folder, seed=777, # n_jobs is ignored internally as we pass a dask client. n_jobs=1, # Pass a dask client which connects to the previously constructed cluster. dask_client=client, ) automl.fit(X_train, y_train) automl.fit_ensemble( y_train, task=MULTICLASS_CLASSIFICATION, dataset_name='digits', ensemble_size=20, ensemble_nbest=50, ) predictions = automl.predict(X_test) print(automl.sprint_statistics()) print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions)) # Wait until all workers are closed for process in worker_processes: process_python_worker.join()
class MLClassifier(GenericClassifier):
    """Auto-sklearn backed classifier.

    Runs several auto-sklearn search processes in parallel against shared
    tmp/output folders, then builds one ensemble over everything they found.
    """

    def __init__(self, train, dataset_name, weight, num_processes=1):
        """Fit the classifier.

        train         -- training-set object exposing .X and .y
        dataset_name  -- name handed to auto-sklearn for bookkeeping
        weight        -- used only to derive unique shared folder names
        num_processes -- number of parallel search processes to spawn
        """
        super().__init__(train)
        # Init shared tmp folders for parallel automl; they must be removed
        # first so shared_mode does not pick up stale models from a
        # previous run.
        automl_tmp_folder = "/tmp/autosklearn_parallel_tmp_%.1f" % weight
        automl_output_folder = "/tmp/autosklearn_parallel_out_%.1f" % weight
        # Fix: loop variable renamed from 'dir' (shadowed the builtin) and
        # the unused 'as e' exception binding removed.
        for folder in (automl_tmp_folder, automl_output_folder):
            try:
                shutil.rmtree(folder)
            except OSError:
                pass  # folder did not exist yet - nothing to clean up
        # Parallel automl: each process runs its own search with a distinct
        # seed, all writing into the shared folders.
        processes = []
        spawn_classifier = MLClassifier.__get_spawn_classifier(
            train.X, train.y)
        for i in range(num_processes):
            p = multiprocessing.Process(
                target=spawn_classifier,
                args=(i, dataset_name, automl_tmp_folder,
                      automl_output_folder))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        # Ensemble-building pass: no further search (the fitting limits are
        # intentionally commented out), just combine the models on disk.
        self.__cls = AutoSklearnClassifier(
            # time_left_for_this_task=15,
            # per_run_time_limit=15,
            # ml_memory_limit=1024,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=200,
            tmp_folder=automl_tmp_folder,
            output_folder=automl_output_folder,
            initial_configurations_via_metalearning=0,
            seed=1,
        )
        self.__cls.fit_ensemble(
            train.y,
            task=MULTICLASS_CLASSIFICATION,
            metric=accuracy,
            precision='32',
            dataset_name=dataset_name,
            ensemble_size=20,
            ensemble_nbest=50,
        )

    @property
    def name(self):
        """Human-readable classifier name."""
        return "MALAISE"

    @property
    def cls(self):
        """The underlying fitted AutoSklearnClassifier."""
        return self.__cls

    def dump(self, pickle_file):
        """Pickle the fitted classifier to *pickle_file*."""
        # NOTE(review): show_models() returns a string whose value is
        # discarded here despite the original "print models" comment —
        # confirm whether print(...) was intended.
        self.__cls.show_models()
        # dump model to file
        with open(pickle_file, 'wb') as fio:
            pickle.dump(self.cls, fio)

    def predict(self, test):
        """Predict labels for a test-set object exposing .X."""
        return self.cls.predict(test.X)

    @staticmethod
    def __get_spawn_classifier(X_train, y_train):
        """Return a closure suitable as a multiprocessing target.

        The closure runs one auto-sklearn search. Seed 0 starts from
        meta-learning configurations; every other seed starts from a random
        incumbent so the parallel searches explore different regions.
        """
        def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                             automl_output_folder):
            if seed == 0:
                initial_configurations_via_metalearning = 25
                smac_scenario_args = {}
            else:
                initial_configurations_via_metalearning = 0
                smac_scenario_args = {'initial_incumbent': 'RANDOM'}
            automl = AutoSklearnClassifier(
                # time_left_for_this_task=60,  # sec., how long should this seed fit process run
                # per_run_time_limit=15,  # sec., each model may only take this long before it's killed
                # ml_memory_limit=1024,  # MB, memory limit imposed on each call to a ML algorithm
                shared_mode=True,  # tmp folder will be shared between seeds
                tmp_folder=automl_tmp_folder,
                output_folder=automl_output_folder,
                delete_tmp_folder_after_terminate=False,
                # ensembles will be built when all optimization runs are finished
                ensemble_size=0,
                initial_configurations_via_metalearning=(
                    initial_configurations_via_metalearning),
                seed=seed,
                smac_scenario_args=smac_scenario_args,
            )
            automl.fit(X_train, y_train, dataset_name=dataset_name)
        return spawn_classifier
def test_fit_pSMAC(self):
    """Shared-mode (pSMAC): a second run builds its ensemble from models
    written by the first run, including a planted near-perfect dummy."""
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

    # test parallel Classifier to predict classes, not only indexes
    Y_train += 1
    Y_test += 1

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct.
    true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                              'true_targets_ensemble.npy')
    with open(true_targets_ensemble_path, 'rb') as fh:
        true_targets_ensemble = np.load(fh)
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)

    # One-hot predictions matching the stored targets almost perfectly.
    probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)
    for i, value in enumerate(true_targets_ensemble):
        probas[i, value] = 1.0
    dummy_predictions_path = os.path.join(
        tmp,
        '.auto-sklearn',
        'predictions_ensemble',
        'predictions_ensemble_1_00030.npy',
    )
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 10), dtype=float)
    for i, value in enumerate(Y_test):
        probas_test[i, value - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    context = BackendContext(tmp, output, False, False, True)
    backend = Backend(context)
    backend.save_model(dummy, 30, 1)

    # Second estimator only builds the ensemble from what is on disk.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit_ensemble(
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='iris',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    self.assertEqual(
        len(os.listdir(os.path.join(tmp, '.auto-sklearn', 'ensembles'))), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

    models = automl._automl.models_
    classifier_types = [type(c) for c in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)
# Shuffle, then take the first 1000 samples for training and the rest
# for testing.
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
X_train = X[:1000]
y_train = y[:1000]
X_test = X[1000:]
y_test = y[1000:]

print('Starting to build an ensemble!')
automl = AutoSklearnClassifier(time_left_for_this_task=15,
                               per_run_time_limit=15,
                               ml_memory_limit=1024,
                               shared_mode=True,
                               ensemble_size=50,
                               ensemble_nbest=200,
                               tmp_folder=tmp_folder,
                               output_folder=output_folder,
                               initial_configurations_via_metalearning=0,
                               seed=1)
# Both the ensemble_size and ensemble_nbest parameters can be changed later.
automl.fit_ensemble(task=MULTICLASS_CLASSIFICATION,
                    metric=ACC_METRIC,
                    precision='32',
                    dataset_name='digits',
                    ensemble_size=10,
                    ensemble_nbest=10)
predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
def main(argv):
    """Run a parallel auto-sklearn experiment for one train/test CSV pair.

    argv -- command-line options: -a trainingSet -b testSet
            -t timeForEachWorker -n numWorkers

    Spawns n worker searches into shared folders, merges their CV result
    files, builds a full ensemble and a best-single-model "ensemble",
    writes hold-out probabilities and the elapsed time to ./tmp/results/,
    and finally removes the temporary model folder.
    """
    # Reading the command line.
    helpString = 'python python_script_JAD_paper -a <trainingSet> -b <testSet> -t <timeForEachWorker> -n <numWorkers>'
    try:
        opts, args = getopt.getopt(argv, "ha:b:t:n:")
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)

    # Collecting the arguments.
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt == '-a':
            training_set = arg
        elif opt == '-b':
            test_set = arg
        elif opt == '-t':
            time_left_for_this_task = int(arg)
        elif opt == '-n':
            n_processes = int(arg)

    # Starting counting the time.
    start_time = time.time()

    # Folders.
    tmp_folder = './tmp/autosklearn_tmp/' + training_set
    output_folder = './tmp/autosklearn_out/' + training_set

    # Ensuring the folders are empty before the run.
    for tmpDir in [tmp_folder, output_folder]:
        try:
            shutil.rmtree(tmpDir)
        except OSError:
            pass  # folder did not exist yet

    # Reading the training data.
    # Fix: axis passed by keyword - the positional form
    # DataFrame.drop('target', 1) is deprecated and removed in pandas 2.0.
    trainingData = pandas.read_csv(
        filepath_or_buffer='./tmp/data/' + training_set + '.csv',
        index_col=False)
    y_train = trainingData['target']
    X_train = trainingData.drop('target', axis=1)

    # Reading the test data.
    testData = pandas.read_csv(
        filepath_or_buffer='./tmp/data/' + test_set + '.csv',
        index_col=False)
    y_test = testData['target']
    X_test = testData.drop('target', axis=1)

    # Main block.
    try:
        # Creating the sub-process function.
        processes = []
        spawn_classifier = get_spawn_classifier(
            X_train, y_train, training_set, time_left_for_this_task,
            tmp_folder, output_folder)

        # Spawning the subprocesses.
        for i in range(small_constant, small_constant + n_processes):
            p = multiprocessing.Process(target=spawn_classifier, args=[i])
            p.start()
            processes.append(p)

        # Waiting until all processes are done.
        for p in processes:
            p.join()

        # Retrieving the cvRes files and concatenating into a single frame.
        # Fix: concatenate once instead of inside the loop - repeated
        # concat re-copies the accumulated frame on every iteration.
        csvFiles = glob.glob('./tmp/results/' + training_set + '/*.csv')
        cvRes = pandas.concat(
            [pandas.read_csv(filepath_or_buffer=f, index_col=0)
             for f in csvFiles],
            axis=0, sort=False)

        # Writing the cvRes on file.
        cvRes.to_csv('./tmp/results/' + training_set + '/cvRes.csv',
                     index=False)

        # Building the ensemble.
        automl_ensemble = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,  # sec., how long should this seed fit process run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            seed=12345,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=50,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_ensemble.fit_ensemble(y_train.copy(),
                                     task=BINARY_CLASSIFICATION,
                                     metric=autosklearn.metrics.roc_auc)

        # Building the best model (an "ensemble" of size 1).
        automl_bestModel = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,  # sec., how long should this seed fit process run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            shared_mode=True,
            ensemble_size=1,
            ensemble_nbest=1,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_bestModel.fit_ensemble(y_train.copy(),
                                      task=BINARY_CLASSIFICATION,
                                      metric=autosklearn.metrics.roc_auc)

        # Refitting on the whole dataset.
        automl_bestModel.refit(X_train.copy(), y_train.copy())
        automl_ensemble.refit(X_train.copy(), y_train.copy())

        # Extracting the performances on the test set.
        automl_bestModel.target_type = 'multilabel-indicator'
        automl_ensemble.target_type = 'multilabel-indicator'
        predictions_bestModel = automl_bestModel.predict_proba(X_test.copy())
        predictions_ensemble = automl_ensemble.predict_proba(X_test.copy())

        # Saving the results on file.
        toSave = pandas.DataFrame({'outcome': y_test})
        toSave['prob_ensemble'] = predictions_ensemble[:, 0]
        toSave['prob_bestModel'] = predictions_bestModel[:, 0]
        toSave.to_csv('./tmp/results/' + training_set + '/holdoutRes.csv')

        # Stopping counting the time and saving the total run time.
        # Fix: 'with' closes the handle even if the write fails (the
        # original left the file open on error).
        total_time = time.time() - start_time
        with open('./tmp/results/' + training_set + '/etime.txt',
                  "w+") as time_file:
            time_file.write('Total time in seconds: %d\n' % total_time)
    except Exception as e:
        # Best-effort script: report the failure and fall through to cleanup.
        print(e)
    finally:
        # Removing the tmp results folder.
        shutil.rmtree(tmp_folder + '/.auto-sklearn/models')
# Combine the models found by the parallel workers into one ensemble and
# evaluate it on the held-out split.
print("Starting to build an ensemble!")
automl = AutoSklearnClassifier(
    time_left_for_this_task=15,
    per_run_time_limit=15,
    ml_memory_limit=1024,
    shared_mode=True,
    ensemble_size=50,
    ensemble_nbest=200,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    initial_configurations_via_metalearning=0,
    seed=1,
)
# Both the ensemble_size and ensemble_nbest parameters can be changed now if
# necessary.
automl.fit_ensemble(
    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=ACC_METRIC,
    precision="32",
    dataset_name="digits",
    ensemble_size=20,
    ensemble_nbest=50,
)
predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
def test_fit_pSMAC(self):
    """Shared-mode (pSMAC): the second estimator must assemble its ensemble
    from models the first run wrote, including a planted dummy model."""
    tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

    # test parallel Classifier to predict classes, not only indexes
    Y_train += 1
    Y_test += 1

    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct.
    true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                              'true_targets_ensemble.npy')
    with open(true_targets_ensemble_path, 'rb') as fh:
        true_targets_ensemble = np.load(fh)
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
    true_targets_ensemble = true_targets_ensemble.astype(int)

    # One-hot predictions matching the stored targets almost perfectly.
    probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)
    for i, value in enumerate(true_targets_ensemble):
        probas[i, value] = 1.0
    dummy_predictions_path = os.path.join(
        tmp,
        '.auto-sklearn',
        'predictions_ensemble',
        'predictions_ensemble_1_00030.npy',
    )
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 10), dtype=float)
    for i, value in enumerate(Y_test):
        probas_test[i, value - 1] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    context = BackendContext(tmp, output, False, False, True)
    backend = Backend(context)
    backend.save_model(dummy, 30, 1)

    # Second estimator only builds the ensemble from what is on disk.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        output_folder=output,
        tmp_folder=tmp,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit_ensemble(
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='iris',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    score = sklearn.metrics.accuracy_score(Y_test, predictions)

    self.assertEqual(
        len(os.listdir(os.path.join(tmp, '.auto-sklearn', 'ensembles'))), 1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

    models = automl._automl.models_
    classifier_types = [type(c) for c in models.values()]
    self.assertIn(ArrayReturningDummyPredictor, classifier_types)

    del automl
    self._tearDown(tmp)
    self._tearDown(output)
# Fan the model search out over several worker processes
# (set this at roughly half of your cores).
for i in range(4):
    p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
    p.start()
    processes.append(p)
for p in processes:
    p.join()

print('Starting to build an ensemble!')
automl = AutoSklearnClassifier(time_left_for_this_task=15,
                               per_run_time_limit=15,
                               ml_memory_limit=1024,
                               shared_mode=True,
                               ensemble_size=50,
                               ensemble_nbest=200,
                               tmp_folder=tmp_folder,
                               output_folder=output_folder,
                               initial_configurations_via_metalearning=0,
                               seed=1)
# Both the ensemble_size and ensemble_nbest parameters can be changed later.
automl.fit_ensemble(task=MULTICLASS_CLASSIFICATION,
                    metric=ACC_METRIC,
                    precision='32',
                    dataset_name='digits',
                    ensemble_size=10,
                    ensemble_nbest=10)
predictions = automl.predict(X_test)
print(automl.show_models())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
# One auto-sklearn feature type per column, in column order.
feat_type = [col_dtype_dict[c] for c in X.columns]
p("starting autosklearn classifiers fiting")
train_multicore(X_train.values, y_train, feat_type, pool_size,
                per_run_time_limit)

p("Building ensemble")
seed = 1
# Shared mode: combine the models produced by the multicore training run.
c = AutoSklearnClassifier(
    time_left_for_this_task=300,
    per_run_time_limit=150,
    ml_memory_limit=20240,
    ensemble_size=50,
    ensemble_nbest=200,
    shared_mode=True,
    tmp_folder=atsklrn_tempdir,
    output_folder=atsklrn_tempdir,
    delete_tmp_folder_after_terminate=False,
    delete_output_folder_after_terminate=False,
    initial_configurations_via_metalearning=0,
    seed=seed,
)
c.fit_ensemble(
    task=BINARY_CLASSIFICATION,
    y=y_train,
    metric=F1_METRIC,
    precision='32',
    dataset_name='foobar',
    ensemble_size=10,
    ensemble_nbest=15,
)
sleep(20)
p("Ensemble built")
p("Show models")
print(c.show_models())

p("Predicting")
y_hat = c.predict(X_test.values)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
if df_unknown.shape[0] == 0:
    p("nothing to predict. Prediction dataset is empty.")
# Builds a shared-mode auto-sklearn ensemble over previously fitted models,
# then starts a prequential evaluation over batches B[24..26].
# NOTE(review): the for-loop body appears truncated in this chunk — the
# original likely continues past 'y_hat = cls.predict(X_test)'; code kept
# verbatim.
# Assumes B is a sequence of DataFrames with the label in the last column
# — TODO confirm against the surrounding file.
cls = AutoSklearnClassifier( time_left_for_this_task=1200, #1/3 of 1 hour spared for fitting ensemble ml_memory_limit=6144, shared_mode=True, tmp_folder=tmp_folder, output_folder=output_folder, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, initial_configurations_via_metalearning=0, seed=0, ) #Fit ensemble, change size and nbest if necessary cls.fit_ensemble( y_train, task=MULTICLASS_CLASSIFICATION, metric=accuracy, ) anytime_model = cls #Prequential evaluation for i in range(24, 27): #Test on next batch for accuracy X_test = B[i].iloc[:, 0:-1] y_test = B[i].iloc[:, -1] y_test = y_test.to_numpy() y_hat = cls.predict(X_test)