def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train an auto-sklearn model on the training data and predict on ``X_test``.

    When ``m_type == 'classification'`` an ``AutoSklearnClassifier`` is
    optimised for ``f1_weighted`` and ``predict_proba(X_test)`` is returned.
    For any other ``m_type`` an ``AutoSklearnRegressor`` is optimised for
    ``mean_squared_error`` and ``predict(X_test)`` is returned.

    The per-feature types handed to ``fit`` are the ``TYPE`` values of every
    row of ``df_types`` whose ``NAME`` is not ``'target'``.  Extra positional
    arguments are accepted and ignored.
    """

    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    feature_rows = df_types[df_types.NAME != 'target']
    feature_types = feature_rows['TYPE'].values.ravel()

    is_classification = m_type == 'classification'
    if is_classification:
        estimator_cls = AutoSklearnClassifier
        objective = f1_weighted
    else:
        estimator_cls = AutoSklearnRegressor
        objective = mean_squared_error

    # Both estimator kinds share the same search configuration.
    search_options = dict(
        time_left_for_this_task=TIME_PER_TASK,
        per_run_time_limit=int(TIME_PER_TASK/8),
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        delete_tmp_folder_after_terminate=False,
    )
    automl = estimator_cls(**search_options)

    # Copies are passed so the caller's objects are never handed over directly.
    automl.fit(X_train.copy(),
               y_train.copy(),
               feat_type=feature_types,
               metric=objective)
    automl.refit(X_train.copy(), y_train.copy())

    if is_classification:
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
Example #2
0
    def test_classification_pandas_support(self):
        """Fit, score, refit and predict with pandas DataFrame/Series input."""
        X, y = sklearn.datasets.fetch_openml(
            data_id=2,  # cat/num dataset
            return_X_y=True,
            as_frame=True,
        )

        # Drop every column that contains NaN.  ``axis`` is given by keyword
        # because pandas 2.0 made the arguments of ``dropna`` keyword-only.
        X = X.dropna(axis='columns')

        # This test only makes sense if the input is a dataframe
        self.assertIsInstance(X, pd.DataFrame)
        self.assertIsInstance(y, pd.Series)
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            exclude_estimators=['libsvm_svc'],
            seed=5,
        )

        automl.fit(X, y)

        # Make sure that at least better than random.
        # We use same X_train==X_test to test code quality
        self.assertGreater(automl.score(X, y), 0.555)

        automl.refit(X, y)

        # Make sure that at least better than random.
        # accuracy in sklearn needs valid data
        # It should be 0.555 as the dataset is unbalanced.
        y = automl._automl[0].InputValidator.encode_target(y)
        prediction = automl._automl[0].InputValidator.encode_target(automl.predict(X))
        self.assertGreater(accuracy(y, prediction), 0.555)
def goldstone_autosklearn(
        data_path='/home/shoe/automl_scores/TR13a_Goldstone_Table_1_Full_problem_TRAIN/13-11-2019 01:54:44/splits/all.csv',
        time_left_for_this_task=60 * 5):
    """Fit auto-sklearn on the Goldstone data and print its accuracy.

    The CSV at ``data_path`` is loaded, the predictor columns are run through
    ``preprocess`` and an ``AutoSklearnClassifier`` is fitted and then refitted
    on the result.  The printed accuracy is computed on the same rows the
    model was trained on, so it is not a held-out estimate.

    Args:
        data_path: CSV file containing the predictor columns and the
            ``sftpcons`` target.  Defaults to the original hard-coded location.
        time_left_for_this_task: auto-sklearn search budget in seconds.
    """

    all_df = pd.read_csv(data_path)

    X = [
        "sftptv2a3", "sftptv2a4", "sftptv2a5", "sftptv2a2", "sftptv2a6",
        "logim", "maccat", "disp4cat", "stratidc"
    ]

    y = 'sftpcons'
    automl = AutoSklearnClassifier(
        time_left_for_this_task=time_left_for_this_task)

    # ``preprocess`` yields the transformed predictors and an object whose
    # ``transform`` is reused below on the same frame.
    stimulus, preprocessor = preprocess(
        all_df,
        {'problem': {
            "predictors": X,
            'targets': [y],
            'categorical': []
        }})
    automl.fit(stimulus, all_df[y])

    stimulus_all = preprocessor.transform(all_df)
    automl.refit(stimulus_all, all_df[y])

    print(accuracy_score(all_df[y], automl.predict(stimulus_all)))
Example #4
0
class AutoClassifier(Classifier):
    """Classifier backed by a cross-validated ``AutoSklearnClassifier``."""

    def __init__(self, time_left_for_this_task, per_run_time_limit, folds):
        """Create the underlying auto-sklearn estimator.

        Args:
            time_left_for_this_task: forwarded to ``AutoSklearnClassifier``.
            per_run_time_limit: forwarded to ``AutoSklearnClassifier``.
            folds: number of folds for the ``'cv'`` resampling strategy.
        """
        self.automl = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            resampling_strategy='cv',
            resampling_strategy_arguments={'folds': folds})

    def classify(self, X_train, y_train, X_test):
        """Fit on the training data and return predictions for ``X_test``."""
        # fit() changes the data in place, but refit needs the original data. We
        # therefore copy the data. In practice, one should reload the data
        self.automl.fit(X_train.copy(), y_train.copy())
        # During fit(), models are fit on individual cross-validation folds. To use
        # all available data, we call refit() which trains all models in the
        # final ensemble on the whole dataset.
        self.automl.refit(X_train.copy(), y_train.copy())

        return self.automl.predict(X_test)

    def show_models(self):
        """Return auto-sklearn's description of the models it selected."""
        return self.automl.show_models()
Example #5
0
 def fit_autosk_trial(self, trial, metric, **kwargs):
     """Fit an auto-sklearn classifier for ``trial`` and attach it as ``trial.clf``.

     The classifier is built from ``trial.clf_params`` and fitted on the
     training data held by ``self.storage``.  For every resampling strategy
     other than the two holdout variants it is additionally refitted on the
     same data, and its statistics are logged.  Extra keyword arguments are
     accepted and ignored.  Returns the same ``trial`` object.
     """
     trial_number = trial.number
     classifier = AutoSklearnClassifier(**trial.clf_params)
     # TODO metrics to trial
     classifier.fit(self.storage.X_train, self.storage.y_train, metric=metric)

     holdout_strategies = ('holdout', 'holdout-iterative-fit')
     needs_refit = classifier.resampling_strategy not in holdout_strategies
     if needs_refit:
         refit_notices = (
             'Predict is currently not implemented for resampling strategy, refit it.',
             'we call refit() which trains all models in the final ensemble on the whole dataset.',
         )
         for notice in refit_notices:
             self.logger.warning(notice)
         classifier.refit(self.storage.X_train, self.storage.y_train)
         statistics = classifier.sprint_statistics()
         self.logger.info('Trial#{0} info :{1}'.format(trial_number, statistics))

     trial.clf = classifier
     return trial
Example #6
0
 def fit_model(self, X, y, classifier_params=None, fit_params=None):
     """Fit an ``AutoSklearnClassifier`` on the train part of a split of ``X``/``y``.

     Only the training portion returned by ``train_test_split`` is used; the
     other portion is discarded.  ``classifier_params`` are passed to the
     constructor and ``fit_params`` to ``fit``; a falsy value for either is
     treated as an empty dict.  When the ``'cv'`` resampling strategy was
     requested and the ensemble size is non-zero, the model is also refitted
     on that same training portion.  Returns the fitted estimator.
     """
     if not classifier_params:
         classifier_params = {}
     if not fit_params:
         fit_params = {}

     train_X, _, train_y, _ = train_test_split(X, y)

     auto = AutoSklearnClassifier(**classifier_params)
     auto.fit(train_X, train_y, **fit_params)

     uses_cv = classifier_params.get("resampling_strategy") == "cv"
     if uses_cv and auto.ensemble_size != 0:
         auto.refit(train_X, train_y)
     return auto
Example #7
0
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """Plain lists given to the estimator must arrive as numpy arrays.

     ``fit``, ``refit`` and ``fit_ensemble`` are call-recording stand-ins
     (presumably injected by patch decorators outside this view).
     """
     features = [[1], [2], [3]]
     targets = [1, 2, 3]
     automl = AutoSklearnClassifier()

     automl.fit(features, targets)
     self.assertEqual(fit.call_count, 1)
     fit_positional = fit.call_args[0]
     self.assertIsInstance(fit_positional[0], np.ndarray)
     self.assertIsInstance(fit_positional[1], np.ndarray)

     automl.refit(features, targets)
     self.assertEqual(refit.call_count, 1)
     refit_positional = refit.call_args[0]
     self.assertIsInstance(refit_positional[0], np.ndarray)
     self.assertIsInstance(refit_positional[1], np.ndarray)

     automl.fit_ensemble(targets)
     self.assertEqual(fit_ensemble.call_count, 1)
     ensemble_positional = fit_ensemble.call_args[0]
     self.assertIsInstance(ensemble_positional[0], np.ndarray)
Example #8
0
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """Check that list inputs are converted to numpy arrays before dispatch.

     ``fit``, ``refit`` and ``fit_ensemble`` are call-recording stand-ins
     (presumably injected by patch decorators outside this view).
     """
     sample_X = [[1], [2], [3]]
     sample_y = [1, 2, 3]
     automl = AutoSklearnClassifier()

     automl.fit(sample_X, sample_y)
     self.assertEqual(fit.call_count, 1)
     for position in (0, 1):
         self.assertIsInstance(fit.call_args[0][position], np.ndarray)

     automl.refit(sample_X, sample_y)
     self.assertEqual(refit.call_count, 1)
     for position in (0, 1):
         self.assertIsInstance(refit.call_args[0][position], np.ndarray)

     automl.fit_ensemble(sample_y)
     self.assertEqual(fit_ensemble.call_count, 1)
     self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score, refit and predict with pandas DataFrame/Series input.

    ``tmp_dir``, ``output_dir`` and ``dask_client`` are passed straight to the
    estimator; the AutoML log found in ``tmp_dir`` supplies the failure message
    for the first score assertion.
    """

    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop every column that contains NaN.  ``axis`` is given by keyword
    # because pandas 2.0 made the arguments of ``dropna`` keyword-only.
    X = X.dropna(axis='columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, extract_msg_from_log(log_file_path)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
Example #10
0
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score, refit and predict with pandas DataFrame/Series input.

    ``tmp_dir``, ``output_dir`` and ``dask_client`` are passed straight to the
    estimator.  Besides the accuracy checks, the test verifies the recorded
    ``cv_results_`` and ``performance_over_time_`` via the suite's helpers.
    """

    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop every column that contains NaN.  ``axis`` is given by keyword
    # because pandas 2.0 made the arguments of ``dropna`` keyword-only.
    X = X.dropna(axis='columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
def gelpi_avdan_autosklearn(
        train_path='/home/shoe/automl_scores/TR11_Gelpi_Avdan_problem_TRAIN/11-11-2019 01:56:40/splits/train.csv',
        test_path='/home/shoe/automl_scores/TR11_Gelpi_Avdan_problem_TRAIN/11-11-2019 01:56:40/splits/test.csv',
        time_left_for_this_task=60 * 10):
    """Fit auto-sklearn on the Gelpi/Avdan train split and print the test ROC AUC.

    The classifier is fitted and refitted on the preprocessed training frame,
    then applied to the preprocessed test frame.  As a side effect the module
    globals ``predictions`` (output of ``predict_proba``) and ``pred_raw``
    (output of ``predict``) are overwritten.

    Args:
        train_path: CSV with the training rows.  Defaults to the original
            hard-coded location.
        test_path: CSV with the test rows.  Defaults to the original
            hard-coded location.
        time_left_for_this_task: auto-sklearn search budget in seconds.
    """

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    X = [
        "polity2b", "polity2borigin", "loggdptarget", "logpop", "majpowhome",
        "majpoworigin", "coloniallink", "ethnictie", "ethnicPCW",
        "ethnicany911", "dyadalliance", "dyadalliancePCW", "rivalrydummy",
        "postCW", "post911", "lndyaddist", "dyadpcyear1", "dyadpcyear2",
        "dyadpcyear3", "dyadpcyear4", "year"
    ]

    y = 'incident'
    automl = AutoSklearnClassifier(
        time_left_for_this_task=time_left_for_this_task)

    # ``preprocess`` yields the transformed predictors and an object whose
    # ``transform`` is reused below on the test frame.
    stimulus, preprocessor = preprocess(
        train_df,
        {'problem': {
            "predictors": X,
            'targets': [y],
            'categorical': []
        }})
    automl.fit(stimulus, train_df[y])
    automl.refit(stimulus, train_df[y])

    stimulus_test = preprocessor.transform(test_df)

    # Kept as module globals so existing callers can still inspect them.
    global predictions
    predictions = automl.predict_proba(stimulus_test)

    global pred_raw
    pred_raw = automl.predict(stimulus_test)

    print(predictions)
    # The AUC is computed from the second probability column.
    print(roc_auc_score(test_df[y], predictions[:, 1]))
Example #12
0
    def test_classification_methods_returns_self(self):
        """``fit``, ``fit_ensemble`` and ``refit`` must each return the estimator."""
        X_train, y_train, _, _ = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            ensemble_size=0,
        )

        fit_result = automl.fit(X_train, y_train)
        self.assertIs(automl, fit_result)

        ensemble_result = automl.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(automl, ensemble_result)

        refit_result = automl.refit(X_train.copy(), y_train.copy())
        self.assertIs(automl, refit_result)
Example #13
0
    def test_classification_methods_returns_self(self):
        """Every fitting method of the classifier should hand back the same object."""
        dataset = putil.get_dataset('iris')
        X_train, y_train = dataset[0], dataset[1]
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            ensemble_size=0,
        )

        self.assertIs(automl, automl.fit(X_train, y_train))
        self.assertIs(automl, automl.fit_ensemble(y_train, ensemble_size=5))
        self.assertIs(automl, automl.refit(X_train.copy(), y_train.copy()))
def gleditsch_ward_autosklearn(
        train_path='/home/shoe/automl_scores/TR12c_Gleditsch_Ward_Combined_problem_TRAIN/13-11-2019 01:16:06/splits/train.csv',
        test_path='/home/shoe/automl_scores/TR12c_Gleditsch_Ward_Combined_problem_TRAIN/13-11-2019 01:16:06/splits/test.csv',
        time_left_for_this_task=60 * 5):
    """Fit auto-sklearn on the Gleditsch/Ward train split and print the test ROC AUC.

    The classifier is fitted and refitted on the preprocessed training frame,
    then applied to the preprocessed test frame.  As a side effect the module
    globals ``predictions`` (output of ``predict_proba``) and ``pred_raw``
    (output of ``predict``) are overwritten.

    Args:
        train_path: CSV with the training rows.  Defaults to the original
            hard-coded location.
        test_path: CSV with the test rows.  Defaults to the original
            hard-coded location.
        time_left_for_this_task: auto-sklearn search budget in seconds.
    """

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    X = [
        "pmid", "py", "py2", "py3", "terriss", "riveriss", "mariss", "terrAtt",
        "rivAtt", "marAtt", "minpol", "rbal", "lnkmdist"
    ]

    y = 'mido'
    automl = AutoSklearnClassifier(
        time_left_for_this_task=time_left_for_this_task)

    # ``preprocess`` yields the transformed predictors and an object whose
    # ``transform`` is reused below on the test frame.
    stimulus, preprocessor = preprocess(
        train_df,
        {'problem': {
            "predictors": X,
            'targets': [y],
            'categorical': []
        }})
    automl.fit(stimulus, train_df[y])
    automl.refit(stimulus, train_df[y])

    stimulus_test = preprocessor.transform(test_df)

    # Kept as module globals so existing callers can still inspect them.
    global predictions
    predictions = automl.predict_proba(stimulus_test)

    global pred_raw
    pred_raw = automl.predict(stimulus_test)

    print(predictions)
    # The AUC is computed from the second probability column.
    print(roc_auc_score(test_df[y], predictions[:, 1]))
Example #15
0
def test_autosklearn_classification_methods_returns_self(dask_client):
    """``fit``, ``fit_ensemble`` and ``refit`` must each return the estimator itself."""
    X_train, y_train, _, _ = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        ensemble_size=0,
        dask_client=dask_client,
        exclude_preprocessors=['fast_ica'],
    )

    fit_result = automl.fit(X_train, y_train)
    assert automl is fit_result

    ensemble_result = automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is ensemble_result

    refit_result = automl.refit(X_train.copy(), y_train.copy())
    assert automl is refit_result
Example #16
0
              target,
              metric='f1_metric',
              feat_type=None,
              dataset_name='numerai_20161021')

    try:
        report(model.grid_scores_)
    except:
        pass

    with open('result.txt', 'w') as f:
        f.write(model.show_models())

    cv = StratifiedKFold(target, n_folds=3, shuffle=True, random_state=0)
    for train_idx, test_idx in list(cv)[:1]:
        model.refit(data.ix[train_idx, :], target[train_idx])
        ans = model.predict_proba(data.ix[test_idx, :])[:, 1]
        score = roc_auc_score(target[test_idx], ans)
        print('    score: %s' % score)
        print('    model thresh: %s, score: %s' %
              mcc_optimize(ans, target[test_idx]))

    model.refit(data.ix, target)
    del data
    gc.collect()

    try:
        with open('tmp_model.pkl', 'wb') as f:
            pickle.dump(model, f, -1)
    except:
        pass
Example #17
0
# Fixed 20 s pause before reporting that the ensemble is built
# (presumably waiting on background work started earlier; confirm).
sleep(20)
p("Ensemble built")

p("Show models")
print(c.show_models())
p("Predicting")
y_hat = c.predict(X_test.values)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))

if df_unknown.shape[0]==0:
    p("nothing to predict. Prediction dataset is empty.")
    # Not a failure: finish with a success status.  SystemExit is raised
    # directly so the script does not depend on the site-provided ``exit``.
    raise SystemExit(0)

p("Re-fitting on full known dataset. This can take long for a large set.")
try:
    c.refit(X.values, y)
except Exception as refit_error:
    p("Refit failed, restarting")
    print(refit_error)
    try:
        # Retry once with the rows in a random order.
        X=X.values
        indices = np.arange(X.shape[0])
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]
        c.refit(X, y)
    except Exception as retry_error:
        p("Second refit failed, exiting")
        print(retry_error)
        # Signal the failure to the caller with a non-zero exit status.
        raise SystemExit(1)
Example #18
0
def main(argv):
    """Run a multi-process auto-sklearn experiment and write its results to disk.

    Steps, in order:
      1. Parse ``-a <trainingSet> -b <testSet> -t <timeForEachWorker>
         -n <numWorkers>`` from ``argv``.  All four options are required; on a
         parsing error or a missing option the usage string is printed and the
         process exits with status 2.  ``-h`` prints the usage and exits.
      2. Empty the per-training-set tmp/output folders and load
         ``./tmp/data/<set>.csv`` for both sets (column ``target`` is the label).
      3. Start ``n_processes`` worker processes built by
         ``get_spawn_classifier`` and wait for all of them.
      4. Concatenate the per-worker CSVs under ``./tmp/results/<trainingSet>/``
         into ``cvRes.csv``.
      5. Build a 50-model ensemble and a single-best-model estimator from the
         shared folders, refit both on the training data, and store the first
         ``predict_proba`` column of each in ``holdoutRes.csv``.
      6. Write the elapsed wall-clock time to ``etime.txt``.

    Any exception raised in steps 3-6 is printed rather than propagated, and
    the models directory inside the tmp folder is removed in every case.
    """

    # reading the command line
    helpString = 'python python_script_JAD_paper -a <trainingSet> -b <testSet> -t <timeForEachWorker> -n <numWorkers>'
    try:
        opts, _ = getopt.getopt(argv, "ha:b:t:n:")
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)

    # collecting the arguments
    training_set = None
    test_set = None
    time_left_for_this_task = None
    n_processes = None
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt == '-a':
            training_set = arg
        elif opt == '-b':
            test_set = arg
        elif opt == '-t':
            time_left_for_this_task = int(arg)
        elif opt == '-n':
            n_processes = int(arg)

    # Every option is required; fail with the usage text instead of hitting an
    # unbound local further down.
    if None in (training_set, test_set, time_left_for_this_task, n_processes):
        print(helpString)
        sys.exit(2)

    # starting counting the time
    start_time = time.time()

    # folders
    tmp_folder = './tmp/autosklearn_tmp/' + training_set
    output_folder = './tmp/autosklearn_out/' + training_set

    # ensuring the folders are empty; a folder that cannot be removed
    # (typically because it does not exist yet) is ignored
    for tmpDir in [tmp_folder, output_folder]:
        shutil.rmtree(tmpDir, ignore_errors=True)

    # reading the training data
    trainingData = pandas.read_csv(filepath_or_buffer='./tmp/data/' +
                                   training_set + '.csv',
                                   index_col=False)
    y_train = trainingData['target']
    # ``columns=`` rather than a positional axis, which pandas 2.0 removed.
    X_train = trainingData.drop(columns='target')

    # reading the test data
    testData = pandas.read_csv(filepath_or_buffer='./tmp/data/' + test_set +
                               '.csv',
                               index_col=False)
    y_test = testData['target']
    X_test = testData.drop(columns='target')

    # main block
    try:

        # creating the sub-process function
        processes = []
        spawn_classifier = get_spawn_classifier(X_train, y_train, training_set,
                                                time_left_for_this_task,
                                                tmp_folder, output_folder)

        # spawning the subprocesses; each worker receives a distinct integer
        # starting at ``small_constant``
        for i in range(small_constant, small_constant + n_processes):
            p = multiprocessing.Process(target=spawn_classifier, args=[i])
            p.start()
            processes.append(p)

        # waiting until all processes are done
        for p in processes:
            p.join()

        # retrieving the csRes and concatenating in a single data frame
        csvFiles = glob.glob('./tmp/results/' + training_set + '/*.csv')
        cvRes = pandas.read_csv(filepath_or_buffer=csvFiles[0], index_col=0)
        for csvFile in csvFiles[1:]:
            cvRes_tmp = pandas.read_csv(filepath_or_buffer=csvFile,
                                        index_col=0)
            cvRes = pandas.concat([cvRes, cvRes_tmp], axis=0, sort=False)

        # writing the cvRes on file
        cvRes.to_csv('./tmp/results/' + training_set + '/cvRes.csv',
                     index=False)

        # building the ensemble
        automl_ensemble = AutoSklearnClassifier(
            time_left_for_this_task=
            time_left_for_this_task,  # sec., how long should this seed fit process run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            seed=12345,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=50,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_ensemble.fit_ensemble(y_train.copy(),
                                     task=BINARY_CLASSIFICATION,
                                     metric=autosklearn.metrics.roc_auc)

        # building the best model
        automl_bestModel = AutoSklearnClassifier(
            time_left_for_this_task=
            time_left_for_this_task,  # sec., how long should this seed fit process run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            shared_mode=True,
            ensemble_size=1,
            ensemble_nbest=1,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_bestModel.fit_ensemble(y_train.copy(),
                                      task=BINARY_CLASSIFICATION,
                                      metric=autosklearn.metrics.roc_auc)

        # refitting on the whole dataset
        automl_bestModel.refit(X_train.copy(), y_train.copy())
        automl_ensemble.refit(X_train.copy(), y_train.copy())

        # extracting the performances on test set
        # NOTE(review): target_type is overridden by hand before predicting;
        # this looks like a workaround -- confirm it is still required.
        automl_bestModel.target_type = 'multilabel-indicator'
        automl_ensemble.target_type = 'multilabel-indicator'
        predictions_bestModel = automl_bestModel.predict_proba(X_test.copy())
        predictions_ensemble = automl_ensemble.predict_proba(X_test.copy())

        # saving the results on file (first probability column of each model)
        toSave = pandas.DataFrame({'outcome': y_test})
        toSave['prob_ensemble'] = predictions_ensemble[:, 0]
        toSave['prob_bestModel'] = predictions_bestModel[:, 0]
        toSave.to_csv('./tmp/results/' + training_set + '/holdoutRes.csv')

        # stopping counting the time
        end_time = time.time()

        # saving total time
        total_time = end_time - start_time
        with open('./tmp/results/' + training_set + '/etime.txt',
                  "w+") as time_file:
            time_file.write('Total time in seconds: %d\n' % total_time)

    except Exception as e:
        print(e)

    finally:

        # removing the tmp results folder; a missing folder must not raise
        # here and hide the outcome of the main block
        shutil.rmtree(tmp_folder + '/.auto-sklearn/models',
                      ignore_errors=True)
Example #19
0
def evaluate_ml_algorithm(dataset,
                          algo,
                          run_id,
                          obj_metric,
                          time_limit=600,
                          seed=1,
                          task_type=None):
    """Evaluate a single algorithm with auto-sklearn and pickle the scores.

    ``algo`` is the only estimator auto-sklearn may use; ``'lightgbm'`` and
    ``'logistic_regression'`` are first registered through ``add_classifier``.
    The search uses a holdout split with ``train_size`` 0.67 and optimises
    balanced accuracy.  The best ``mean_test_score`` of the search and the
    balanced accuracy on the test data after a refit are printed, then
    ``[dataset, algo, validation_score, test_score, task_type]`` is pickled
    into a file under ``save_dir`` named from the dataset, algorithm, metric,
    run id and time limit.
    """
    if algo == 'lightgbm':
        add_classifier(LightGBM)
        estimator_names = ['LightGBM']
    elif algo == 'logistic_regression':
        add_classifier(Logistic_Regression)
        estimator_names = ['Logistic_Regression']
    else:
        estimator_names = [algo]

    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))

    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        # Binary when exactly two distinct labels are present.
        if len(set(train_data.data[1])) == 2:
            task_type = BINARY_CLS
        else:
            task_type = MULTICLASS_CLS
    print(set(train_data.data[1]))

    # The data actually used for the search is loaded a second time with the
    # multiclass task type.
    raw_data, test_raw_data = load_train_test_data(dataset,
                                                   task_type=MULTICLASS_CLS)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data

    feat_type = []
    for column_type in raw_data.feature_types:
        if column_type == CATEGORICAL:
            feat_type.append('Categorical')
        else:
            feat_type.append('Numerical')

    from autosklearn.metrics import balanced_accuracy as balanced_acc
    search_options = dict(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=180,
        n_jobs=1,
        include_estimators=estimator_names,
        initial_configurations_via_metalearning=0,
        ensemble_memory_limit=16384,
        ml_memory_limit=16384,
        ensemble_size=1,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67},
    )
    automl = AutoSklearnClassifier(**search_options)
    automl.fit(X.copy(), y.copy(), feat_type=feat_type, metric=balanced_acc)

    # Capture the search summary before the refit below.
    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    valid_results = automl.cv_results_['mean_test_score']
    print('Eval num: %d' % (len(valid_results)))

    validation_score = np.max(valid_results)

    # Test performance.
    automl.refit(X.copy(), y.copy())
    predictions = automl.predict(X_test)
    test_score = balanced_accuracy_score(y_test, predictions)

    # Print statistics about the auto-sklearn run such as number of
    # iterations, number of models failed with a time out.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy:', validation_score)
    print("Test Accuracy      :", test_score)

    file_name = '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric, run_id,
                                        time_limit)
    save_path = save_dir + file_name
    record = [dataset, algo, validation_score, test_score, task_type]
    with open(save_path, 'wb') as f:
        pickle.dump(record, f)