Example #1
class AutoSklearnBaselineModel(Model):
    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = AutoSklearnClassifier(**model_params)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        self._model.fit(X, y)

    def save(self, path: str) -> None:
        with open(path, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as file:
            model = pickle.load(file)
            return cast(AutoSklearnBaselineModel, model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict_proba(X)
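A minimal usage sketch for the wrapper above (not part of the original source): the Model base class and the exact model_params keys are assumptions, while time_left_for_this_task and per_run_time_limit are standard AutoSklearnClassifier arguments.

import numpy as np

# Hypothetical usage of AutoSklearnBaselineModel; model_params is forwarded
# verbatim to AutoSklearnClassifier.
params = {'time_left_for_this_task': 60, 'per_run_time_limit': 10}
model = AutoSklearnBaselineModel(name='baseline', model_params=params)

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)
model.fit(X, y)

model.save('baseline.pkl')                               # pickles the whole wrapper
restored = AutoSklearnBaselineModel.load('baseline.pkl')
print(restored.predict_proba(X[:3]))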
Example #2
class AutoSklearnClassifierEnsemble(AutoSklearnModel, Ensemble):
    """
    Wrapper around an autosklearn model.
    """
    _kind: ModelType = 'classifier'

    def __init__(self, **kwargs) -> None:
        Ensemble.__init__(self)
        client = Client(
            processes=False,
            n_workers=kwargs['n_jobs'],
            threads_per_worker=1,
            dashboard_address=None,
        )
        self.model = AutoSklearnClassifier(**kwargs, dask_client=client)

    def autosklearn_model(self) -> AutoSklearnClassifier:
        return self.model

    def predict(self, X: np.ndarray) -> np.ndarray:
        """ Get the models prediction """
        return self.model.predict_proba(X)

    def model_predictions(self, X: np.ndarray) -> np.ndarray:
        """ Get the models probability predicitons """
        return np.asarray([m.predict_proba(X) for m in self.models()])

    @classmethod
    def kind(cls) -> ModelType:
        return cls._kind
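A hedged construction sketch for the ensemble wrapper above. It assumes Client is dask.distributed.Client and that n_jobs must appear in the keyword arguments, because __init__ reads kwargs['n_jobs'] to size the Dask cluster before forwarding everything to AutoSklearnClassifier.

# Hypothetical usage; X_train, y_train, X_test are assumed numpy arrays defined elsewhere.
ensemble = AutoSklearnClassifierEnsemble(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    n_jobs=4,          # required by __init__ and also passed on to AutoSklearnClassifier
)
ensemble.model.fit(X_train, y_train)       # fit the wrapped AutoSklearnClassifier
probs = ensemble.predict(X_test)           # note: predict() returns predict_proba output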
Example #3
def test_multilabel(tmp_dir, output_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_multilabel=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, 3), print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True

    score = f1_macro(Y_test, predictions)
    assert score >= 0.9, print_debug_information(automl)

    probs = automl.predict_proba(X_train)
    assert np.mean(probs) == pytest.approx(0.33, rel=1e-1)
Example #4
def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Function that trains and tests data using auto-sklearn"""

    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    categ_cols = df_types[df_types.NAME != 'target']['TYPE'].values.ravel()

    if m_type == 'classification':
        automl = AutoSklearnClassifier(time_left_for_this_task=TIME_PER_TASK,
                                       per_run_time_limit=int(TIME_PER_TASK/8),
                                       seed=seed,
                                       resampling_strategy='cv',
                                       resampling_strategy_arguments={'folds': 5},
                                       delete_tmp_folder_after_terminate=False)
    else:
        automl = AutoSklearnRegressor(time_left_for_this_task=TIME_PER_TASK,
                                      per_run_time_limit=int(TIME_PER_TASK/8),
                                      seed=seed,
                                      resampling_strategy='cv',
                                      resampling_strategy_arguments={'folds': 5},
                                      delete_tmp_folder_after_terminate=False)
    
    automl.fit(X_train.copy(),
               y_train.copy(),
               feat_type=categ_cols,
               metric=f1_weighted if m_type == 'classification' else mean_squared_error)
    automl.refit(X_train.copy(), y_train.copy())

    return (automl.predict_proba(X_test) if m_type == 'classification' else 
            automl.predict(X_test))
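A hedged call sketch for process_auto_sklearn above. TIME_PER_TASK is a module-level constant that is not shown in the example, and df_types is assumed to hold one row per column with NAME and TYPE, where TYPE uses auto-sklearn's feat_type vocabulary ('Numerical' / 'Categorical').

import numpy as np
import pandas as pd

TIME_PER_TASK = 300          # assumed time budget in seconds

# One row per feature plus the target row; the TYPE values feed feat_type in fit().
df_types = pd.DataFrame({
    'NAME': ['age', 'color', 'target'],
    'TYPE': ['Numerical', 'Categorical', 'Categorical'],
})

X_train = np.column_stack([np.random.rand(80), np.random.randint(0, 3, size=80)])
X_test = np.column_stack([np.random.rand(20), np.random.randint(0, 3, size=20)])
y_train = np.random.randint(0, 2, size=80)

probs = process_auto_sklearn(X_train, X_test, y_train, df_types,
                             m_type='classification', seed=1)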
Example #5
class AutoSklearnSelectorModel(SelectorModel):
    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
        classifier_paths: Iterable[Tuple[str, str]],
    ) -> None:
        super().__init__(name, model_params, classifier_paths)
        self.selector = AutoSklearnClassifier(**model_params)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        self.selector.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        # TODO: Can optimize and make cleaner
        return np.asarray([
            self.classifiers[i].predict(instance.reshape(1, -1))
            for i, instance in zip(self.selections(X), X)
        ])

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        # TODO: Can optimize and make cleaner
        return np.asarray([
            self.classifiers[i].predict_proba(instance.reshape(1, -1))
            for i, instance in zip(self.selections(X), X)
        ])

    def selections(self, X: np.ndarray) -> np.ndarray:
        competences = self.competences(X)
        return np.argmax(competences, axis=1)

    def competences(self, X: np.ndarray) -> np.ndarray:
        return self.selector.predict_proba(X)

    def save(self, path: str) -> None:
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def ensemble_selector(cls) -> bool:
        return False

    @classmethod
    def load(cls, path: str):
        # Inherits typing from parent
        with open(path, 'rb') as file:
            return cast(AutoSklearnSelectorModel, pickle.load(file))
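The per-instance dispatch in AutoSklearnSelectorModel treats the selector's predict_proba output as a competence matrix and takes the argmax over the classifier axis, yielding one classifier index per instance. A small standalone illustration of that step with made-up numbers (independent of the SelectorModel base class):

import numpy as np

# Hypothetical competences: 4 instances scored against 3 candidate classifiers.
competences = np.array([
    [0.2, 0.7, 0.1],
    [0.5, 0.3, 0.2],
    [0.1, 0.1, 0.8],
    [0.4, 0.4, 0.2],
])
selections = np.argmax(competences, axis=1)   # array([1, 0, 2, 0])
# Each instance is then routed to self.classifiers[selections[i]].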
Example #6
    def test_multilabel(self):
        output = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris', make_multilabel=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, 3))
        score = f1_macro(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)
        probs = automl.predict_proba(X_train)
        self.assertAlmostEqual(np.mean(probs), 0.33333333333333331)
Example #7
    def test_multilabel(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
        output = os.path.join(self.test_dir, '..', '.out_multilabel_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris', make_multilabel=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, 3))
        score = f1_macro(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)
        probs = automl.predict_proba(X_train)
        self.assertAlmostEqual(np.mean(probs), 0.33, places=1)
Example #8
def gelpi_avdan_autosklearn():

    train_df = pd.read_csv(
        '/home/shoe/automl_scores/TR11_Gelpi_Avdan_problem_TRAIN/11-11-2019 01:56:40/splits/train.csv'
    )
    test_df = pd.read_csv(
        '/home/shoe/automl_scores/TR11_Gelpi_Avdan_problem_TRAIN/11-11-2019 01:56:40/splits/test.csv'
    )

    X = [
        "polity2b", "polity2borigin", "loggdptarget", "logpop", "majpowhome",
        "majpoworigin", "coloniallink", "ethnictie", "ethnicPCW",
        "ethnicany911", "dyadalliance", "dyadalliancePCW", "rivalrydummy",
        "postCW", "post911", "lndyaddist", "dyadpcyear1", "dyadpcyear2",
        "dyadpcyear3", "dyadpcyear4", "year"
    ]

    y = 'incident'
    automl = AutoSklearnClassifier(time_left_for_this_task=60 * 10)

    stimulus, preprocessor = preprocess(
        train_df,
        {'problem': {
            "predictors": X,
            'targets': [y],
            'categorical': []
        }})
    automl.fit(stimulus, train_df[y])
    automl.refit(stimulus, train_df[y])

    stimulus_test = preprocessor.transform(test_df)

    global predictions
    predictions = automl.predict_proba(stimulus_test)

    global pred_raw
    pred_raw = automl.predict(stimulus_test)

    print(predictions)
    print(roc_auc_score(test_df[y], predictions[:, 1]))
Example #9
def test_multilabel(tmp_dir, output_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_multilabel=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)

    automl.fit(X_train, Y_train)
    # Log file path
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]
    predictions = automl.predict(X_test)
    assert predictions.shape == (50, 3), extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)

    score = f1_macro(Y_test, predictions)
    assert score >= 0.9, extract_msg_from_log(log_file_path)

    probs = automl.predict_proba(X_train)
    assert np.mean(probs) == pytest.approx(0.33, rel=1e-1)
Example #10
def gleditsch_ward_autosklearn():

    train_df = pd.read_csv(
        '/home/shoe/automl_scores/TR12c_Gleditsch_Ward_Combined_problem_TRAIN/13-11-2019 01:16:06/splits/train.csv'
    )
    test_df = pd.read_csv(
        '/home/shoe/automl_scores/TR12c_Gleditsch_Ward_Combined_problem_TRAIN/13-11-2019 01:16:06/splits/test.csv'
    )

    X = [
        "pmid", "py", "py2", "py3", "terriss", "riveriss", "mariss", "terrAtt",
        "rivAtt", "marAtt", "minpol", "rbal", "lnkmdist"
    ]

    y = 'mido'
    automl = AutoSklearnClassifier(time_left_for_this_task=60 * 5)

    stimulus, preprocessor = preprocess(
        train_df,
        {'problem': {
            "predictors": X,
            'targets': [y],
            'categorical': []
        }})
    automl.fit(stimulus, train_df[y])
    automl.refit(stimulus, train_df[y])

    stimulus_test = preprocessor.transform(test_df)

    global predictions
    predictions = automl.predict_proba(stimulus_test)

    global pred_raw
    pred_raw = automl.predict(stimulus_test)

    print(predictions)
    print(roc_auc_score(test_df[y], predictions[:, 1]))
        raise ValueError("Wrong set type, should be `train` or `test`!")
    # when the task is binary.classification or regression, transform it to multilabel
    if task == 'regression':
        labels = regression_to_multilabel(labels)
    elif task == 'binary.classification':
        labels = binary_to_multilabel(labels)
    return features, labels


if __name__ == '__main__':
    input_dir = '../../../autodl-contrib/raw_datasets/automl'
    output_dir = '../'
    for dataset_name in ['dorothea', 'adult']:
        D = DataManager(dataset_name,
                        input_dir,
                        replace_missing=False,
                        verbose=verbose)
        X_test, Y_test = _prepare_metadata_features_and_labels(D,
                                                               set_type='test')
        X_train, Y_train = _prepare_metadata_features_and_labels(
            D, set_type='train')
        print(Y_test.shape)
        time_budget = 7200
        model = AutoSklearnClassifier(time_left_for_this_task=time_budget,
                                      per_run_time_limit=time_budget // 10)
        model.fit(X_train, Y_train)
        predict_path = os.path.join(output_dir, dataset_name + '.predict')
        Y_hat_test = model.predict_proba(X_test)
        print(Y_hat_test.shape)
        data_io.write(predict_path, Y_hat_test)
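The helpers used above (regression_to_multilabel, binary_to_multilabel, DataManager, data_io) come from the surrounding project and are not shown. Purely as an illustration of the label reshaping that binary_to_multilabel is expected to perform, a 0/1 vector can be expanded into a two-column indicator matrix like this (a hypothetical stand-in, not the project's actual helper):

import numpy as np

def binary_to_multilabel(labels):
    # Hypothetical: map a 0/1 label vector to an (n_samples, 2) indicator matrix.
    labels = np.asarray(labels).astype(int).ravel()
    out = np.zeros((labels.shape[0], 2), dtype=int)
    out[np.arange(labels.shape[0]), labels] = 1
    return out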
Example #12
              metric='f1_metric',
              feat_type=None,
              dataset_name='numerai_20161021')

    try:
        report(model.grid_scores_)
    except:
        pass

    with open('result.txt', 'w') as f:
        f.write(model.show_models())

    cv = StratifiedKFold(target, n_folds=3, shuffle=True, random_state=0)
    for train_idx, test_idx in list(cv)[:1]:
        model.refit(data.ix[train_idx, :], target[train_idx])
        ans = model.predict_proba(data.ix[test_idx, :])[:, 1]
        score = roc_auc_score(target[test_idx], ans)
        print('    score: %s' % score)
        print('    model thresh: %s, score: %s' %
              mcc_optimize(ans, target[test_idx]))

    model.refit(data, target)
    del data
    gc.collect()

    try:
        with open('tmp_model.pkl', 'wb') as f:
            pickle.dump(model, f, -1)
    except:
        pass
    p = Pool()
Example #13
def main(argv):

    # reading the command line
    helpString = 'python python_script_JAD_paper -a <trainingSet> -b <testSet> -t <timeForEachWorker> -n <numWorkers>'
    try:
        opts, args = getopt.getopt(argv, "ha:b:t:n:")
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)

    # collecting the arguments
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt == '-a':
            training_set = arg
        elif opt == '-b':
            test_set = arg
        elif opt == '-t':
            time_left_for_this_task = int(arg)
        elif opt == '-n':
            n_processes = int(arg)

    # start the timer
    start_time = time.time()

    # folders
    tmp_folder = './tmp/autosklearn_tmp/' + training_set
    output_folder = './tmp/autosklearn_out/' + training_set

    # removing the folders if they already exist so each run starts clean
    for tmpDir in [tmp_folder, output_folder]:
        try:
            shutil.rmtree(tmpDir)
        except OSError as e:
            pass

    # reading the training data
    trainingData = pandas.read_csv(filepath_or_buffer='./tmp/data/' +
                                   training_set + '.csv',
                                   index_col=False)
    y_train = trainingData['target']
    X_train = trainingData.drop('target', axis=1)

    # reading the test data
    testData = pandas.read_csv(filepath_or_buffer='./tmp/data/' + test_set +
                               '.csv',
                               index_col=False)
    y_test = testData['target']
    X_test = testData.drop('target', axis=1)

    # main block
    try:

        # creating the sub-process function
        processes = []
        spawn_classifier = get_spawn_classifier(X_train, y_train, training_set,
                                                time_left_for_this_task,
                                                tmp_folder, output_folder)

        # spawning the subprocesses
        for i in range(small_constant, small_constant + n_processes):
            p = multiprocessing.Process(target=spawn_classifier, args=[i])
            p.start()
            processes.append(p)

        # waiting until all processes are done
        for p in processes:
            p.join()

        # retrieving the cvRes files and concatenating them into a single data frame
        csvFiles = glob.glob('./tmp/results/' + training_set + '/*.csv')
        cvRes = pandas.read_csv(filepath_or_buffer=csvFiles[0], index_col=0)
        for csvFile in csvFiles[1:]:
            cvRes_tmp = pandas.read_csv(filepath_or_buffer=csvFile,
                                        index_col=0)
            cvRes = pandas.concat([cvRes, cvRes_tmp], axis=0, sort=False)

        # writing the cvRes on file
        cvRes.to_csv('./tmp/results/' + training_set + '/cvRes.csv',
                     index=False)

        # building the ensemble
        automl_ensemble = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,  # sec., how long this seed's fit process should run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            seed=12345,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=50,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_ensemble.fit_ensemble(y_train.copy(),
                                     task=BINARY_CLASSIFICATION,
                                     metric=autosklearn.metrics.roc_auc)

        # building the best model
        automl_bestModel = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,  # sec., how long this seed's fit process should run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            shared_mode=True,
            ensemble_size=1,
            ensemble_nbest=1,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_bestModel.fit_ensemble(y_train.copy(),
                                      task=BINARY_CLASSIFICATION,
                                      metric=autosklearn.metrics.roc_auc)

        # refitting on the whole dataset
        automl_bestModel.refit(X_train.copy(), y_train.copy())
        automl_ensemble.refit(X_train.copy(), y_train.copy())

        # extracting the performances on test set
        automl_bestModel.target_type = 'multilabel-indicator'
        automl_ensemble.target_type = 'multilabel-indicator'
        predictions_bestModel = automl_bestModel.predict_proba(X_test.copy())
        predictions_ensemble = automl_ensemble.predict_proba(X_test.copy())

        # saving the results on file
        toSave = pandas.DataFrame({'outcome': y_test})
        toSave['prob_ensemble'] = predictions_ensemble[:, 0]
        toSave['prob_bestModel'] = predictions_bestModel[:, 0]
        toSave.to_csv('./tmp/results/' + training_set + '/holdoutRes.csv')

        # stop the timer
        end_time = time.time()

        # saving total time
        total_time = end_time - start_time
        with open('./tmp/results/' + training_set + '/etime.txt', 'w') as time_file:
            time_file.write('Total time in seconds: %d\n' % total_time)

    except Exception as e:
        print(e)

    finally:

        # removing the tmp results folder
        shutil.rmtree(tmp_folder + '/.auto-sklearn/models')
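Example #13 assumes a helper get_spawn_classifier that returns a closure fitting one shared-mode auto-sklearn instance per process into the common tmp/output folders. The actual helper is not shown; the sketch below is a guess at its shape, modeled on the shared_mode arguments the script itself uses (old, pre-0.8 auto-sklearn API).

def get_spawn_classifier(X_train, y_train, dataset_name,
                         time_left_for_this_task, tmp_folder, output_folder):
    def spawn_classifier(seed):
        # Each process fits models into the shared folders; ensemble_size=0
        # because the ensemble is assembled afterwards with fit_ensemble().
        automl = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            shared_mode=True,
            tmp_folder=tmp_folder,
            output_folder=output_folder,
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            ensemble_size=0,
            seed=seed,
        )
        automl.fit(X_train, y_train, dataset_name=dataset_name)
    return spawn_classifier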