def zeroconf_fit_ensemble(y):
    """Fit an auto-sklearn ensemble over previously trained models.

    Runs an AutoSklearnClassifier in shared mode against the global
    ``atsklrn_tempdir`` so it picks up models written there by earlier
    fit runs, then builds an F1-optimised ensemble from them.
    """
    p("Building ensemble")

    seed = 1

    # Shared-mode classifier: reuses models already written to the
    # shared temp folder by earlier (possibly parallel) fit runs.
    clf = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )

    # Ensemble size/nbest may be overridden here, independently of the
    # constructor values above.
    clf.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15,
    )

    sleep(20)  # give the ensemble builder time to finish writing
    p("Ensemble built")

    p("Show models")
    p(str(clf.show_models()))
    return clf
Example #2
0
def zeroconf_fit_ensemble(y, atsklrn_tempdir):
    """Build an auto-sklearn ensemble from models in the shared temp folder.

    Runs an AutoSklearnClassifier in shared mode so it reuses the models
    that earlier (parallel) fit processes wrote to *atsklrn_tempdir*, then
    fits an F1-optimised ensemble over them.

    Parameters
    ----------
    y : array-like
        Training targets used for ensemble selection.
    atsklrn_tempdir : str
        Shared tmp/output folder used by all auto-sklearn processes.

    Returns
    -------
    AutoSklearnClassifier
        The classifier holding the fitted ensemble.
    """
    # One logger for the whole function; re-fetching it per step (as the
    # old code did) returned the same logger and was redundant.
    lo = utl.get_logger(inspect.stack()[0][3])

    lo.info("Building ensemble")

    seed = 1

    ensemble = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed)

    # Lazy %-style logging args: the message is only formatted when the
    # record is actually emitted.
    lo.info("Done AutoSklearnClassifier - seed:%s", seed)

    try:
        lo.debug("Start ensemble.fit_ensemble - seed:%s", seed)
        ensemble.fit_ensemble(task=BINARY_CLASSIFICATION,
                              y=y,
                              metric=autosklearn.metrics.f1,
                              precision='32',
                              dataset_name='foobar',
                              ensemble_size=10,
                              ensemble_nbest=15)
    except Exception:
        lo.exception("Error in ensemble.fit_ensemble - seed:%s", seed)
        raise

    lo.debug("Done ensemble.fit_ensemble - seed:%s", seed)

    # Give the ensemble builder time to finish writing to disk.
    sleep(20)
    lo.info("Ensemble built - seed:%s", seed)

    lo.info("Show models - seed:%s", seed)
    for row in str(ensemble.show_models()).split("\n"):
        lo.info(row)

    return ensemble
Example #3
0
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """Plain lists given to fit/refit/fit_ensemble must reach the mocks as ndarrays."""
     automl = AutoSklearnClassifier()
     X = [[1], [2], [3]]
     y = [1, 2, 3]

     automl.fit(X, y)
     self.assertEqual(fit.call_count, 1)
     for positional in fit.call_args[0][:2]:
         self.assertIsInstance(positional, np.ndarray)

     automl.refit(X, y)
     self.assertEqual(refit.call_count, 1)
     for positional in refit.call_args[0][:2]:
         self.assertIsInstance(positional, np.ndarray)

     automl.fit_ensemble(y)
     self.assertEqual(fit_ensemble.call_count, 1)
     self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
Example #4
0
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """fit/refit/fit_ensemble should coerce plain lists into np.ndarray."""
     def assert_ndarray_call(mock, n_positional):
         # The mock must have been hit exactly once, with ndarrays only.
         self.assertEqual(mock.call_count, 1)
         for idx in range(n_positional):
             self.assertIsInstance(mock.call_args[0][idx], np.ndarray)

     automl = AutoSklearnClassifier()
     X = [[1], [2], [3]]
     y = [1, 2, 3]

     automl.fit(X, y)
     assert_ndarray_call(fit, 2)

     automl.refit(X, y)
     assert_ndarray_call(refit, 2)

     automl.fit_ensemble(y)
     assert_ndarray_call(fit_ensemble, 1)
Example #5
0
def main():
    """Fit four auto-sklearn workers in parallel, then ensemble their models."""
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    spawn_classifier = get_spawn_classifier(X_train, y_train)

    # set this at roughly half of your cores
    workers = [
        multiprocessing.Process(
            target=spawn_classifier,
            args=(i, 'breast_cancer'),
        )
        for i in range(4)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    # Shared-mode classifier: builds the ensemble from models the worker
    # processes wrote into the shared tmp folder.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score",
          sklearn.metrics.accuracy_score(y_test, predictions))
Example #6
0
def multithread_tiny():
    """Fit four auto-sklearn workers in parallel, ensemble them, and dump the model."""
    stage2assistant = Exp2Assistant(stage=2)
    X_train = stage2assistant.train_data
    y_train = stage2assistant.train_label

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    for i in range(4):  # set this at roughly half of your cores
        p = multiprocessing.Process(target=spawn_classifier, args=(i, 'label'))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    # Shared-mode classifier: assembles the ensemble from models the
    # worker processes wrote into the shared tmp folder.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=120,
        per_run_time_limit=120,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=300,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='label',
        ensemble_size=20,
        ensemble_nbest=60,
    )

    # Persist the fitted ensemble for later stages of the experiment.
    joblib.dump(
        automl,
        path.join(path_service.get_resource(path.join("exp2", "model")),
                  "stage2_model.joblib"))
def main():
    """Parallel auto-sklearn demo on the digits dataset."""
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    spawn_classifier = get_spawn_classifier(X_train, y_train)

    # set this at roughly half of your cores
    workers = [
        multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
        for i in range(4)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    # Shared-mode classifier: builds the ensemble from models the worker
    # processes wrote into the shared tmp folder.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
Example #8
0
    def test_classification_methods_returns_self(self):
        """fit, fit_ensemble and refit must all return the estimator itself."""
        X_train, y_train, X_test, y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       ensemble_size=0)

        self.assertIs(automl, automl.fit(X_train, y_train))
        self.assertIs(automl, automl.fit_ensemble(y_train, ensemble_size=5))
        self.assertIs(automl, automl.refit(X_train.copy(), y_train.copy()))
Example #9
0
    def test_classification_methods_returns_self(self):
        """Each training entry point must return ``self`` (fluent API)."""
        X_train, y_train, X_test, y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       ensemble_size=0)

        returned = automl.fit(X_train, y_train)
        self.assertIs(automl, returned)

        returned = automl.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(automl, returned)

        returned = automl.refit(X_train.copy(), y_train.copy())
        self.assertIs(automl, returned)
Example #10
0
def test_autosklearn_classification_methods_returns_self(dask_client):
    """fit, fit_ensemble and refit must all return the estimator itself."""
    X_train, y_train, X_test, y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   per_run_time_limit=10,
                                   ensemble_size=0,
                                   dask_client=dask_client,
                                   exclude_preprocessors=['fast_ica'])

    assert automl.fit(X_train, y_train) is automl
    assert automl.fit_ensemble(y_train, ensemble_size=5) is automl
    assert automl.refit(X_train.copy(), y_train.copy()) is automl
Example #11
0
    def test_fit_pSMAC(self):
        # End-to-end check of parallel (pSMAC) shared-mode fitting:
        # several AutoSklearnClassifier instances with different seeds
        # share one tmp folder; later runs must see earlier runs' models.
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

        # test parallel Classifier to predict classes, not only indices
        Y_train += 1
        Y_test += 1

        # First run (seed=1): populate the shared tmp folder with models.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        n_models_fit = len(automl.cv_results_['mean_test_score'])
        cv_results = automl.cv_results_['mean_test_score']

        # Second run (seed=2) against the same shared folders.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        n_models_fit_2 = len(automl.cv_results_['mean_test_score'])

        # Check that the results from the first run were actually read by the
        # second run
        self.assertGreater(n_models_fit_2, n_models_fit)
        for score in cv_results:
            self.assertIn(
                score,
                automl.cv_results_['mean_test_score'],
                msg=str((automl.cv_results_['mean_test_score'], cv_results)),
            )

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correct
        true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        with open(true_targets_ensemble_path, 'rb') as fh:
            true_targets_ensemble = np.load(fh, allow_pickle=True)
        # Flip the last target so the dummy model is near- but not 100%-perfect.
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        true_targets_ensemble = true_targets_ensemble.astype(int)
        probas = np.zeros((len(true_targets_ensemble), 2), dtype=float)

        # One-hot "predictions" matching the (modified) ensemble targets.
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(
            tmp,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_0_999_0.0.npy',
        )
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        # Test-set predictions for the dummy; Y values are 1-based here
        # (shifted above), hence the `value - 1` column index.
        probas_test = np.zeros((len(Y_test), 2), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value - 1] = 1.0

        # Register the dummy as a saved model (seed=0, idx=999) via the backend.
        dummy = ArrayReturningDummyPredictor(probas_test)
        context = BackendContext(tmp, output, False, False, True)
        backend = Backend(context)
        model_path = backend.get_model_path(seed=0, idx=999, budget=0.0)
        backend.save_model(model=dummy, filepath=model_path)

        # Third run (seed=3): only build the ensemble from the shared models.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=3,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
            metric=accuracy,
        )
        automl.fit_ensemble(Y_train, task=BINARY_CLASSIFICATION,
                            precision='32',
                            dataset_name='breast_cancer',
                            ensemble_size=20,
                            ensemble_nbest=50,
                            )

        predictions = automl.predict(X_test)
        score = sklearn.metrics.accuracy_score(Y_test, predictions)

        # Exactly one ensemble must have been written to disk.
        self.assertEqual(len(os.listdir(os.path.join(tmp, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._automl[0]._task, BINARY_CLASSIFICATION)

        # The near-perfect dummy must have made it into the final ensemble.
        models = automl._automl[0].models_
        classifier_types = [type(c) for c in models.values()]
        self.assertIn(ArrayReturningDummyPredictor, classifier_types)

        del automl
        self._tearDown(tmp)
        self._tearDown(output)
Example #12
0
    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score",
          sklearn.metrics.accuracy_score(y_test, predictions))
Example #13
0
                address=cluster.scheduler_address) as client:
            automl = AutoSklearnClassifier(
                time_left_for_this_task=30,
                per_run_time_limit=10,
                memory_limit=1024,
                tmp_folder=tmp_folder,
                seed=777,
                # n_jobs is ignored internally as we pass a dask client.
                n_jobs=1,
                # Pass a dask client which connects to the previously constructed cluster.
                dask_client=client,
            )
            automl.fit(X_train, y_train)

            automl.fit_ensemble(
                y_train,
                task=MULTICLASS_CLASSIFICATION,
                dataset_name='digits',
                ensemble_size=20,
                ensemble_nbest=50,
            )

        predictions = automl.predict(X_test)
        print(automl.sprint_statistics())
        print("Accuracy score",
              sklearn.metrics.accuracy_score(y_test, predictions))

        # Wait until all workers are closed
        for process in worker_processes:
            process_python_worker.join()
Example #14
0
class MLClassifier(GenericClassifier):
    """Auto-sklearn based classifier ("MALAISE") trained via parallel pSMAC workers."""

    def __init__(self, train, dataset_name, weight, num_processes=1):
        """Spawn *num_processes* parallel auto-sklearn fits sharing one tmp
        folder, then build a single ensemble classifier from their models.

        Parameters
        ----------
        train : object
            Training data holder exposing ``.X`` and ``.y``.
        dataset_name : str
            Name passed through to auto-sklearn.
        weight : float
            Used only to derive unique shared tmp/output folder names.
        num_processes : int
            Number of parallel fitting worker processes.
        """
        super().__init__(train)
        # Init shared tmp folders for parallel automl; removal is
        # best-effort (folders may not exist yet).
        automl_tmp_folder = "/tmp/autosklearn_parallel_tmp_%.1f" % weight
        automl_output_folder = "/tmp/autosklearn_parallel_out_%.1f" % weight
        for folder in [automl_tmp_folder, automl_output_folder]:
            try:
                shutil.rmtree(folder)
            except OSError:
                pass

        # parallel automl
        processes = []
        spawn_classifier = MLClassifier.__get_spawn_classifier(
            train.X, train.y)
        for i in range(num_processes):
            p = multiprocessing.Process(target=spawn_classifier,
                                        args=(i, dataset_name,
                                              automl_tmp_folder,
                                              automl_output_folder))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        # Shared-mode classifier that assembles the ensemble from the
        # models written by the worker processes above.
        # NOTE: time/memory limits (time_left_for_this_task,
        # per_run_time_limit, ml_memory_limit) are left at library defaults.
        self.__cls = AutoSklearnClassifier(
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=200,
            tmp_folder=automl_tmp_folder,
            output_folder=automl_output_folder,
            initial_configurations_via_metalearning=0,
            seed=1,
        )
        self.__cls.fit_ensemble(
            train.y,
            task=MULTICLASS_CLASSIFICATION,
            metric=accuracy,
            precision='32',
            dataset_name=dataset_name,
            ensemble_size=20,
            ensemble_nbest=50,
        )

    @property
    def name(self):
        """Human-readable classifier name."""
        return "MALAISE"

    @property
    def cls(self):
        """The underlying fitted AutoSklearnClassifier."""
        return self.__cls

    def dump(self, pickle_file):
        """Print the ensemble's models and pickle the classifier to *pickle_file*."""
        # print models
        self.__cls.show_models()
        # dump model to file
        with open(pickle_file, 'wb') as fio:
            pickle.dump(self.cls, fio)

    def predict(self, test):
        """Predict labels for ``test.X``."""
        return self.cls.predict(test.X)

    @staticmethod
    def __get_spawn_classifier(X_train, y_train):
        """Return a worker function that runs one seeded auto-sklearn fit."""
        def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                             automl_output_folder):
            # Seed 0 keeps meta-learning warm starts; all other seeds start
            # from a random incumbent so the workers explore differently.
            if seed == 0:
                initial_configurations_via_metalearning = 25
                smac_scenario_args = {}
            else:
                initial_configurations_via_metalearning = 0
                smac_scenario_args = {'initial_incumbent': 'RANDOM'}

            # NOTE: time/memory limits are left at library defaults here too.
            automl = AutoSklearnClassifier(
                shared_mode=True,  # tmp folder will be shared between seeds
                tmp_folder=automl_tmp_folder,
                output_folder=automl_output_folder,
                delete_tmp_folder_after_terminate=False,
                # ensembles will be built when all optimization runs are finished
                ensemble_size=0,
                initial_configurations_via_metalearning=(
                    initial_configurations_via_metalearning),
                seed=seed,
                smac_scenario_args=smac_scenario_args,
            )
            automl.fit(X_train, y_train, dataset_name=dataset_name)

        return spawn_classifier
Example #15
0
    def test_fit_pSMAC(self):
        # Shared-mode (pSMAC) integration test: a second classifier with a
        # different seed must be able to build an ensemble from models a
        # first run wrote into the shared tmp folder.
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

        # test parallel Classifier to predict classes, not only indexes
        Y_train += 1
        Y_test += 1

        # First run (seed=1): populate the shared tmp folder with models.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correct
        true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        with open(true_targets_ensemble_path, 'rb') as fh:
            true_targets_ensemble = np.load(fh)
        # Flip the last target so the dummy model is near- but not 100%-perfect.
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        true_targets_ensemble = true_targets_ensemble.astype(int)
        probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)

        # One-hot "predictions" matching the (modified) ensemble targets.
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(
            tmp,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_1_00030.npy',
        )
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        # Test-set predictions for the dummy; Y values are 1-based here
        # (shifted above), hence the `value - 1` column index.
        probas_test = np.zeros((len(Y_test), 10), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value - 1] = 1.0

        # Register the dummy as saved model idx=30 of seed=1 via the backend.
        dummy = ArrayReturningDummyPredictor(probas_test)
        context = BackendContext(tmp, output, False, False, True)
        backend = Backend(context)
        backend.save_model(dummy, 30, 1)

        # Second run (seed=2): only build the ensemble from shared models.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit_ensemble(
            Y_train,
            task=MULTICLASS_CLASSIFICATION,
            metric=accuracy,
            precision='32',
            dataset_name='iris',
            ensemble_size=20,
            ensemble_nbest=50,
        )

        predictions = automl.predict(X_test)
        score = sklearn.metrics.accuracy_score(Y_test, predictions)

        # Exactly one ensemble must have been written to disk.
        self.assertEqual(
            len(os.listdir(os.path.join(tmp, '.auto-sklearn', 'ensembles'))),
            1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

        # The near-perfect dummy must have made it into the final ensemble.
        models = automl._automl.models_
        classifier_types = [type(c) for c in models.values()]
        self.assertIn(ArrayReturningDummyPredictor, classifier_types)

        del automl
        self._tearDown(tmp)
        self._tearDown(output)
Example #16
0
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    X_train = X[:1000]
    y_train = y[:1000]
    X_test = X[1000:]
    y_test = y[1000:]

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                   per_run_time_limit=15,
                                   ml_memory_limit=1024,
                                   shared_mode=True,
                                   ensemble_size=50,
                                   ensemble_nbest=200,
                                   tmp_folder=tmp_folder,
                                   output_folder=output_folder,
                                   initial_configurations_via_metalearning=0,
                                   seed=1)

    # Both the ensemble_size and ensemble_nbest parameters can be changed later
    automl.fit_ensemble(task=MULTICLASS_CLASSIFICATION,
                        metric=ACC_METRIC,
                        precision='32',
                        dataset_name='digits',
                        ensemble_size=10,
                        ensemble_nbest=10)

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
Example #17
0
def main(argv):
    """Train auto-sklearn in parallel on a CSV dataset and persist results.

    Command line: ``-a <trainingSet> -b <testSet> -t <timeForEachWorker>
    -n <numWorkers>``. Reads ``./tmp/data/<set>.csv`` files with a
    'target' column, spawns worker fit processes, builds both a full
    ensemble and a best-single-model "ensemble", and writes CV results,
    hold-out probabilities and the elapsed time under ``./tmp/results/``.
    """
    # reading the command line
    helpString = 'python python_script_JAD_paper -a <trainingSet> -b <testSet> -t <timeForEachWorker> -n <numWorkers>'
    try:
        opts, args = getopt.getopt(argv, "ha:b:t:n:")
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)

    # collecting the arguments
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt == '-a':
            training_set = arg
        elif opt == '-b':
            test_set = arg
        elif opt == '-t':
            time_left_for_this_task = int(arg)
        elif opt == '-n':
            n_processes = int(arg)

    # starting counting the time
    start_time = time.time()

    # folders
    tmp_folder = './tmp/autosklearn_tmp/' + training_set
    output_folder = './tmp/autosklearn_out/' + training_set

    # ensuring the folders are empty: best-effort removal, a missing
    # folder is fine
    for tmpDir in [tmp_folder, output_folder]:
        try:
            shutil.rmtree(tmpDir)
        except OSError:
            pass

    # reading the training data
    trainingData = pandas.read_csv(filepath_or_buffer='./tmp/data/' +
                                   training_set + '.csv',
                                   index_col=False)
    y_train = trainingData['target']
    # Pass axis as a keyword: the positional form drop('target', 1) was
    # deprecated and removed in pandas 2.0.
    X_train = trainingData.drop('target', axis=1)

    # reading the test data
    testData = pandas.read_csv(filepath_or_buffer='./tmp/data/' + test_set +
                               '.csv',
                               index_col=False)
    y_test = testData['target']
    X_test = testData.drop('target', axis=1)

    # main block
    try:

        # creating the sub-process function
        processes = []
        spawn_classifier = get_spawn_classifier(X_train, y_train, training_set,
                                                time_left_for_this_task,
                                                tmp_folder, output_folder)

        # spawning the subprocesses (seeds offset by the module-level
        # small_constant)
        for i in range(small_constant, small_constant + n_processes):
            p = multiprocessing.Process(target=spawn_classifier, args=[i])
            p.start()
            processes.append(p)

        # waiting until all processes are done
        for p in processes:
            p.join()

        # retrieving the cvRes files and concatenating in a single data frame
        csvFiles = glob.glob('./tmp/results/' + training_set + '/*.csv')
        cvRes = pandas.read_csv(filepath_or_buffer=csvFiles[0], index_col=0)
        for csvFile in csvFiles[1:]:
            cvRes_tmp = pandas.read_csv(filepath_or_buffer=csvFile,
                                        index_col=0)
            cvRes = pandas.concat([cvRes, cvRes_tmp], axis=0, sort=False)

        # writing the cvRes on file
        cvRes.to_csv('./tmp/results/' + training_set + '/cvRes.csv',
                     index=False)

        # building the ensemble from the models written by the workers
        automl_ensemble = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            seed=12345,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=50,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_ensemble.fit_ensemble(y_train.copy(),
                                     task=BINARY_CLASSIFICATION,
                                     metric=autosklearn.metrics.roc_auc)

        # building the best model (an "ensemble" of size 1)
        automl_bestModel = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            shared_mode=True,
            ensemble_size=1,
            ensemble_nbest=1,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_bestModel.fit_ensemble(y_train.copy(),
                                      task=BINARY_CLASSIFICATION,
                                      metric=autosklearn.metrics.roc_auc)

        # refitting on the whole dataset
        automl_bestModel.refit(X_train.copy(), y_train.copy())
        automl_ensemble.refit(X_train.copy(), y_train.copy())

        # extracting the performances on test set
        automl_bestModel.target_type = 'multilabel-indicator'
        automl_ensemble.target_type = 'multilabel-indicator'
        predictions_bestModel = automl_bestModel.predict_proba(X_test.copy())
        predictions_ensemble = automl_ensemble.predict_proba(X_test.copy())

        # saving the results on file
        toSave = pandas.DataFrame({'outcome': y_test})
        toSave['prob_ensemble'] = predictions_ensemble[:, 0]
        toSave['prob_bestModel'] = predictions_bestModel[:, 0]
        toSave.to_csv('./tmp/results/' + training_set + '/holdoutRes.csv')

        # stopping counting the time
        end_time = time.time()

        # saving total time; 'with' guarantees the handle is closed even
        # if the write fails
        total_time = end_time - start_time
        with open('./tmp/results/' + training_set + '/etime.txt',
                  "w+") as time_file:
            time_file.write('Total time in seconds: %d\n' % total_time)

    except Exception as e:
        # Deliberately best-effort: report the error and fall through to
        # the cleanup below.
        print(e)

    finally:

        # removing the tmp results folder
        shutil.rmtree(tmp_folder + '/.auto-sklearn/models')
Example #18
0
    print("Starting to build an ensemble!")
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=ACC_METRIC,
        precision="32",
        dataset_name="digits",
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
Example #19
0
    def test_fit_pSMAC(self):
        """Exercise shared-mode (pSMAC) fitting.

        A first classifier (seed 1) runs a search into a shared tmp/output
        folder pair; a hand-crafted dummy model with near-perfect predictions
        is then injected into that folder; a second classifier (seed 2,
        shared_mode=True) builds its ensemble from the shared folder and must
        pick up the dummy model.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

        # test parallel Classifier to predict classes, not only indexes
        Y_train += 1
        Y_test += 1

        # First run: search only — ensemble_size=0 defers ensemble building
        # to the second classifier below.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correct
        true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        with open(true_targets_ensemble_path, 'rb') as fh:
            true_targets_ensemble = np.load(fh)
        # Flip one target so the dummy is imperfect (<100% accuracy).
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        true_targets_ensemble = true_targets_ensemble.astype(int)
        # One-hot probabilities that reproduce the (flipped) targets exactly.
        probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)

        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        # File name encodes seed=1, model number 30 — must match the
        # backend.save_model(dummy, 30, 1) call below.
        dummy_predictions_path = os.path.join(
            tmp,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_1_00030.npy',
        )
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        # Test-set predictions for the dummy; classes are 1-based (see the
        # Y_train/Y_test += 1 above), hence the value - 1 column index.
        probas_test = np.zeros((len(Y_test), 10), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value - 1] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        context = BackendContext(tmp, output, False, False, True)
        backend = Backend(context)
        backend.save_model(dummy, 30, 1)

        # Second run: different seed, same shared folders; only builds the
        # ensemble from the models already on disk (incl. the dummy).
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit_ensemble(Y_train, task=MULTICLASS_CLASSIFICATION,
                            metric=accuracy,
                            precision='32',
                            dataset_name='iris',
                            ensemble_size=20,
                            ensemble_nbest=50,
                            )

        predictions = automl.predict(X_test)
        score = sklearn.metrics.accuracy_score(Y_test, predictions)

        # Exactly one ensemble must have been written, it must score well,
        # and the injected dummy must be part of it.
        self.assertEqual(len(os.listdir(os.path.join(tmp, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

        models = automl._automl.models_
        classifier_types = [type(c) for c in models.values()]
        self.assertIn(ArrayReturningDummyPredictor, classifier_types)

        del automl
        self._tearDown(tmp)
        self._tearDown(output)
Example #20
0
    # Launch worker processes that each run an auto-sklearn search into the
    # shared tmp_folder (spawn_classifier is defined elsewhere in the example;
    # `processes` is the list accumulated by the caller).
    for i in range(4): # set this at roughly half of your cores
        p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
        p.start()
        processes.append(p)
    # Block until every worker has finished writing its models.
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    # shared_mode=True makes this instance read the models produced by the
    # workers above instead of running its own search.
    automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                   per_run_time_limit=15,
                                   ml_memory_limit=1024,
                                   shared_mode=True,
                                   ensemble_size=50,
                                   ensemble_nbest=200,
                                   tmp_folder=tmp_folder,
                                   output_folder=output_folder,
                                   initial_configurations_via_metalearning=0,
                                   seed=1)

    # Both the ensemble_size and ensemble_nbest parameters can be changed later
    automl.fit_ensemble(task=MULTICLASS_CLASSIFICATION,
                        metric=ACC_METRIC,
                        precision='32',
                        dataset_name='digits',
                        ensemble_size=10,
                        ensemble_nbest=10)

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
Example #21
0
# Per-column feature types for auto-sklearn, looked up from the dtype map
# built earlier in the script.
feat_type= [col_dtype_dict[c] for c in X.columns]
p("starting autosklearn classifiers fiting")
# Run several searches in parallel; they all write into atsklrn_tempdir.
train_multicore(X_train.values, y_train, feat_type, pool_size, per_run_time_limit)

p("Building ensemble")
seed = 1
# shared_mode=True: reuse the models produced by train_multicore above.
c = AutoSklearnClassifier(
    time_left_for_this_task=300,per_run_time_limit=150,ml_memory_limit=20240,ensemble_size=50,ensemble_nbest=200,
    shared_mode=True, tmp_folder=atsklrn_tempdir, output_folder=atsklrn_tempdir,
    delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False,
    initial_configurations_via_metalearning=0,
    seed=seed)
# Build the ensemble only — no new search; size/nbest override the
# constructor values above.
c.fit_ensemble(
    task = BINARY_CLASSIFICATION
    ,y = y_train
    ,metric = F1_METRIC
    ,precision = '32'
    ,dataset_name = 'foobar' 
    ,ensemble_size=10
    ,ensemble_nbest=15)

# NOTE(review): presumably gives background ensemble jobs time to flush to
# disk before show_models/predict — confirm whether the sleep is required.
sleep(20)
p("Ensemble built")

p("Show models")
print(c.show_models())
p("Predicting")
y_hat = c.predict(X_test.values)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))

# df_unknown holds the rows to score; skip prediction when there are none.
if df_unknown.shape[0]==0:
    p("nothing to predict. Prediction dataset is empty.")
Example #22
0
# Ensemble-building classifier over models already present in the shared
# tmp_folder (shared_mode=True); folders are kept for later inspection.
cls = AutoSklearnClassifier(
    time_left_for_this_task=1200,  #1/3 of 1 hour spared for fitting ensemble
    ml_memory_limit=6144,
    shared_mode=True,
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    delete_tmp_folder_after_terminate=False,
    delete_output_folder_after_terminate=False,
    initial_configurations_via_metalearning=0,
    seed=0,
)

#Fit ensemble, change size and nbest if necessary
cls.fit_ensemble(
    y_train,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,
)

# Keep a handle to the current best model for anytime evaluation below.
anytime_model = cls

#Prequential evaluation

for i in range(24, 27):

    #Test on next batch for accuracy
    X_test = B[i].iloc[:, 0:-1]
    y_test = B[i].iloc[:, -1]
    y_test = y_test.to_numpy()
    y_hat = cls.predict(X_test)