Example #1
class AutoClassifier(Classifier):
    def __init__(self, time_left_for_this_task, per_run_time_limit, folds):
        now = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        self.automl = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            #tmp_folder='/tmp/autosklearn_switch_tmp',
            #output_folder='/tmp/autosklearn_switch_out',
            #delete_tmp_folder_after_terminate=False,
            #delete_output_folder_after_terminate=False,
            #shared_mode=True,
            resampling_strategy='cv',
            resampling_strategy_arguments={'folds': folds})

    def classify(self, X_train, y_train, X_test):
        # fit() changes the data in place, but refit needs the original data. We
        # therefore copy the data. In practice, one should reload the data
        self.automl.fit(X_train.copy(), y_train.copy())
        # During fit(), models are fit on individual cross-validation folds. To use
        # all available data, we call refit() which trains all models in the
        # final ensemble on the whole dataset.
        self.automl.refit(X_train.copy(), y_train.copy())

        predictions = self.automl.predict(X_test)

        return predictions


    def show_models(self):
        return self.automl.show_models()
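
The comments in classify() describe auto-sklearn's two-step pattern: fit() evaluates candidate pipelines on cross-validation folds, and refit() then retrains the chosen ensemble members on the full training set. A minimal self-contained sketch of the same pattern without the wrapper class (time budgets and fold count are illustrative, not taken from the example above):

import sklearn.datasets
import sklearn.model_selection
from autosklearn.classification import AutoSklearnClassifier

X, y = sklearn.datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

automl = AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
# fit() searches for pipelines, scoring each with 5-fold cross-validation
automl.fit(X_train.copy(), y_train.copy())
# refit() retrains the final ensemble members on the whole training set
automl.refit(X_train.copy(), y_train.copy())
print(automl.predict(X_test)[:10])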
Example #2
def zeroconf_fit_ensemble(y):
    p("Building ensemble")

    seed = 1

    ensemble = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed)

    ensemble.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15)
    
    sleep(20)
    p("Ensemble built")
    
    p("Show models")
    p(str(ensemble.show_models()))
    return ensemble
Example #3
def zeroconf_fit_ensemble(y, atsklrn_tempdir):
    lo = utl.get_logger(inspect.stack()[0][3])

    lo.info("Building ensemble")

    seed = 1

    ensemble = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed)

    lo.info("Done AutoSklearnClassifier - seed:" + str(seed))

    try:
        lo.debug("Start ensemble.fit_ensemble - seed:" + str(seed))
        ensemble.fit_ensemble(task=BINARY_CLASSIFICATION,
                              y=y,
                              metric=autosklearn.metrics.f1,
                              precision='32',
                              dataset_name='foobar',
                              ensemble_size=10,
                              ensemble_nbest=15)
    except Exception:
        lo = utl.get_logger(inspect.stack()[0][3])
        lo.exception("Error in ensemble.fit_ensemble - seed:" + str(seed))
        raise

    lo = utl.get_logger(inspect.stack()[0][3])
    lo.debug("Done ensemble.fit_ensemble - seed:" + str(seed))

    sleep(20)
    lo.info("Ensemble built - seed:" + str(seed))

    lo.info("Show models - seed:" + str(seed))
    txtList = str(ensemble.show_models()).split("\n")
    for row in txtList:
        lo.info(row)

    return ensemble
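
utl.get_logger is a helper from the surrounding project and is not shown here. A minimal stand-in (hypothetical; only the inspect.stack()[0][3] naming trick is taken from the example) could look like this:

import inspect
import logging

def get_logger(name):
    # Hypothetical replacement for utl.get_logger: a plain stdlib logger per caller
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger

# usage mirroring the example: name the logger after the enclosing function
lo = get_logger(inspect.stack()[0][3])
lo.info("logger ready")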
Example #4
def main():

    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    for i in range(4):  # set this at roughly half of your cores
        p = multiprocessing.Process(
            target=spawn_classifier,
            args=(i, 'breast_cancer'),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='breast_cancer',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score",
          sklearn.metrics.accuracy_score(y_test, predictions))
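
Examples #4 and #5 call get_spawn_classifier() without showing it. A sketch of what it plausibly looks like, modeled on the __get_spawn_classifier() method in Example #14 below (tmp_folder and output_folder are assumed to be module-level paths, as in main()):

def get_spawn_classifier(X_train, y_train):
    def spawn_classifier(seed, dataset_name):
        # Only the first seed uses meta-learning; the others start from a random incumbent
        if seed == 0:
            initial_configurations_via_metalearning = 25
            smac_scenario_args = {}
        else:
            initial_configurations_via_metalearning = 0
            smac_scenario_args = {'initial_incumbent': 'RANDOM'}

        automl = AutoSklearnClassifier(
            time_left_for_this_task=60,
            per_run_time_limit=15,
            ml_memory_limit=1024,
            shared_mode=True,  # all seeds write into the same tmp folder
            tmp_folder=tmp_folder,
            output_folder=output_folder,
            delete_tmp_folder_after_terminate=False,
            ensemble_size=0,  # the ensemble is built afterwards via fit_ensemble()
            initial_configurations_via_metalearning=initial_configurations_via_metalearning,
            seed=seed,
            smac_scenario_args=smac_scenario_args,
        )
        automl.fit(X_train, y_train, dataset_name=dataset_name)
    return spawn_classifier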
Example #5
def main():

    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    for i in range(4): # set this at roughly half of your cores
        p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
Example #6
    def test_fit(self):

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #7
    def test_fit(self):

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._automl._automl._task,
                         MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #8
    def test_fit(self):
        if self.travis:
            self.skipTest('This test does currently not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #9
    def test_fit(self):
        if self.travis:
            self.skipTest('This test does currently not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
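
_setUp() and _tearDown() in the test cases above are project helpers, not part of auto-sklearn. A plausible minimal version (hypothetical, not the projects' actual code) simply removes the temporary folder before and after each run:

import shutil
import unittest

class AutoMLTestCase(unittest.TestCase):  # hypothetical base class
    def _setUp(self, output):
        # start each test from a clean temporary directory
        shutil.rmtree(output, ignore_errors=True)

    def _tearDown(self, output):
        shutil.rmtree(output, ignore_errors=True)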
Example #10
    initial_configurations_via_metalearning=0,
    seed=seed)
c.fit_ensemble(
    task=BINARY_CLASSIFICATION,
    y=y_train,
    metric=F1_METRIC,
    precision='32',
    dataset_name='foobar',
    ensemble_size=10,
    ensemble_nbest=15)

sleep(20)
p("Ensemble built")

p("Show models")
print(c.show_models())
p("Predicting")
y_hat = c.predict(X_test.values)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))

if df_unknown.shape[0]==0:
    p("nothing to predict. Prediction dataset is empty.")
    exit()

p("Re-fitting on full known dataset. This can take long for a large set.")
try:
    c.refit(X.values, y)
except Exception as e:
    p("Refit failed, restarting")
    print(e)
    try:
Example #11
    print("Starting to build an ensemble!")
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=ACC_METRIC,
        precision="32",
        dataset_name="digits",
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
Example #12
import time

from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from autosklearn.classification import AutoSklearnClassifier

print('[INFO] Loading digits dataset.')
X, y = load_digits(return_X_y=True)

print('[INFO] Splitting.')
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    train_size=0.8)

print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')

print('[INFO] Finding best model...')
classifier = AutoSklearnClassifier(per_run_time_limit=360,
                                   ml_memory_limit=1024 * 6,
                                   time_left_for_this_task=7200)
start = time.time()

X_train = X_train.astype('float')
classifier.fit(X_train, y_train)
print(
    f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.')

predictions = classifier.predict(X_test)
print('--- CLASSIFICATION REPORT: ---')
print(classification_report(y_test, predictions))
print('\n\n--- MODELS: ---')
print(classifier.show_models())
print('\n\n--- STATISTICS: ---')
print(classifier.sprint_statistics())
Example #13
    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score",
          sklearn.metrics.accuracy_score(y_test, predictions))
Example #14
class MLClassifier(GenericClassifier):
    def __init__(self, train, dataset_name, weight, num_processes=1):
        super().__init__(train)
        # init shared tmp folders for parallel automl
        automl_tmp_folder = "/tmp/autosklearn_parallel_tmp_%.1f" % weight
        automl_output_folder = "/tmp/autosklearn_parallel_out_%.1f" % weight
        for dir in [automl_tmp_folder, automl_output_folder]:
            try:
                shutil.rmtree(dir)
            except OSError as e:
                pass

        # parallel automl
        processes = []
        spawn_classifier = MLClassifier.__get_spawn_classifier(
            train.X, train.y)
        for i in range(num_processes):
            p = multiprocessing.Process(target=spawn_classifier,
                                        args=(i, dataset_name,
                                              automl_tmp_folder,
                                              automl_output_folder))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        self.__cls = AutoSklearnClassifier(
            #            time_left_for_this_task=15,
            #            per_run_time_limit=15,
            #            ml_memory_limit=1024,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=200,
            tmp_folder=automl_tmp_folder,
            output_folder=automl_output_folder,
            initial_configurations_via_metalearning=0,
            seed=1,
        )
        self.__cls.fit_ensemble(
            train.y,
            task=MULTICLASS_CLASSIFICATION,
            metric=accuracy,
            precision='32',
            dataset_name=dataset_name,
            ensemble_size=20,
            ensemble_nbest=50,
        )

    @property
    def name(self):
        return "MALAISE"

    @property
    def cls(self):
        return self.__cls

    def dump(self, pickle_file):
        # print models
        self.__cls.show_models()
        # dump model to file
        with open(pickle_file, 'wb') as fio:
            pickle.dump(self.cls, fio)

    def predict(self, test):
        return self.cls.predict(test.X)

    @staticmethod
    def __get_spawn_classifier(X_train, y_train):
        def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                             automl_output_folder):
            if seed == 0:
                initial_configurations_via_metalearning = 25
                smac_scenario_args = {}
            else:
                initial_configurations_via_metalearning = 0
                smac_scenario_args = {'initial_incumbent': 'RANDOM'}

            automl = AutoSklearnClassifier(
                #            time_left_for_this_task=60, # sec., how long should this seed fit process run
                #            per_run_time_limit=15, # sec., each model may only take this long before it's killed
                #            ml_memory_limit=1024, # MB, memory limit imposed on each call to a ML algorithm
                shared_mode=True,  # tmp folder will be shared between seeds
                tmp_folder=automl_tmp_folder,
                output_folder=automl_output_folder,
                delete_tmp_folder_after_terminate=False,
                ensemble_size=0,  # ensembles will be built when all optimization runs are finished
                initial_configurations_via_metalearning=initial_configurations_via_metalearning,
                seed=seed,
                smac_scenario_args=smac_scenario_args,
            )
            automl.fit(X_train, y_train, dataset_name=dataset_name)

        return spawn_classifier
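
dump() above pickles the entire fitted estimator. The same round trip works with a plain AutoSklearnClassifier, so a saved model can be restored for prediction later (file name and time budgets below are illustrative):

import pickle

import sklearn.datasets
from autosklearn.classification import AutoSklearnClassifier

X, y = sklearn.datasets.load_iris(return_X_y=True)
automl = AutoSklearnClassifier(time_left_for_this_task=60, per_run_time_limit=15)
automl.fit(X, y)

# persist the fitted estimator, as MLClassifier.dump() does
with open('automl.pkl', 'wb') as fio:
    pickle.dump(automl, fio)

# restore it later and predict without re-fitting
with open('automl.pkl', 'rb') as fio:
    restored = pickle.load(fio)
print(restored.predict(X[:5]))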
Example #15
def evaluate_ml_algorithm(dataset,
                          algo,
                          run_id,
                          obj_metric,
                          time_limit=600,
                          seed=1,
                          task_type=None):
    if algo == 'lightgbm':
        _algo = ['LightGBM']
        add_classifier(LightGBM)
    elif algo == 'logistic_regression':
        _algo = ['Logistic_Regression']
        add_classifier(Logistic_Regression)
    else:
        _algo = [algo]
    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))
    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    print(set(train_data.data[1]))

    raw_data, test_raw_data = load_train_test_data(dataset,
                                                   task_type=MULTICLASS_CLS)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data
    feat_type = [
        'Categorical' if _type == CATEGORICAL else 'Numerical'
        for _type in raw_data.feature_types
    ]
    from autosklearn.metrics import balanced_accuracy as balanced_acc
    automl = AutoSklearnClassifier(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=180,
        n_jobs=1,
        include_estimators=_algo,
        initial_configurations_via_metalearning=0,
        ensemble_memory_limit=16384,
        ml_memory_limit=16384,
        # tmp_folder='/var/folders/0t/mjph32q55hd10x3qr_kdd2vw0000gn/T/autosklearn_tmp',
        ensemble_size=1,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    automl.fit(X.copy(), y.copy(), feat_type=feat_type, metric=balanced_acc)
    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    valid_results = automl.cv_results_['mean_test_score']
    print('Eval num: %d' % (len(valid_results)))

    validation_score = np.max(valid_results)

    # Test performance.
    automl.refit(X.copy(), y.copy())
    predictions = automl.predict(X_test)
    test_score = balanced_accuracy_score(y_test, predictions)

    # Print statistics about the auto-sklearn run such as number of
    # iterations, number of models failed with a time out.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy:', validation_score)
    print("Test Accuracy      :", test_score)

    save_path = save_dir + '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric,
                                                   run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, algo, validation_score, test_score, task_type],
                    f)
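
Example #15 restricts the search space to a single algorithm family via include_estimators (add_classifier registers custom components from that project). With only built-in components, the same restriction looks roughly like this (estimator name and budgets are illustrative):

import sklearn.datasets
from autosklearn.classification import AutoSklearnClassifier

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
automl = AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    include_estimators=['random_forest'],  # search only random forest pipelines
    initial_configurations_via_metalearning=0,
    ensemble_size=1,  # keep just the single best configuration
)
automl.fit(X, y)
print(automl.show_models())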
Example #16
                                  delete_output_folder_after_terminate=False,
                                  shared_mode=False)

    model.fit(data,
              target,
              metric='f1_metric',
              feat_type=None,
              dataset_name='numerai_20161021')

    try:
        report(model.grid_scores_)
    except Exception:
        pass

    with open('result.txt', 'w') as f:
        f.write(model.show_models())

    cv = StratifiedKFold(target, n_folds=3, shuffle=True, random_state=0)
    for train_idx, test_idx in list(cv)[:1]:
        model.refit(data.ix[train_idx, :], target[train_idx])
        ans = model.predict_proba(data.ix[test_idx, :])[:, 1]
        score = roc_auc_score(target[test_idx], ans)
        print('    score: %s' % score)
        print('    model thresh: %s, score: %s' %
              mcc_optimize(ans, target[test_idx]))

    # refit the selected models on the full dataset
    model.refit(data, target)
    del data
    gc.collect()

    try:
Example #17
    data = dataframe.values
    X, y = data[:, :-1], data[:, -1]

    # minimally prepare dataset
    X = X.astype('float32')
    y = LabelEncoder().fit_transform(y.astype('str'))

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

    # define search
    model = AutoSklearnClassifier(time_left_for_this_task=10*60, per_run_time_limit=45, n_jobs=6)

    # perform the search
    model.fit(X_train, y_train)
    # summarize
    print(model.sprint_statistics())

    # get model and weights
    model_weights = model.get_models_with_weights()
    for model_weight in model_weights:
        print(model_weight)

    print("Show models")
    models_def = model.show_models()
    print(models_def)

    # evaluate best model
    y_hat = model.predict(X_test)
    acc = accuracy_score(y_test, y_hat)
    print("Test Dataset Accuracy: %.3f" % acc)