Example #1
import pickle

# Note: FEPipeline, Evaluator, load_data, and proj_dir are assumed to be
# imported/defined elsewhere in the project.
def evaluate_evaluation_based_fe(dataset, time_limit, seed=1):
    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    # Score feature-engineering candidates with the default random forest config.
    evaluator = Evaluator(cs.get_default_configuration(), name='fe', seed=seed)

    raw_data = load_data(dataset, datanode_returned=True)

    # Run evaluation-based feature engineering within the time budget.
    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id='random_forest',
                          time_limit_per_trans=300)
    train_data = pipeline.fit_transform(raw_data)

    # Score the engineered features with the evaluator.
    score = evaluator(None, data_node=train_data)
    print('==> Base validation score', score)

    save_path = proj_dir + 'data/fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, score], f)
    return score
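
A minimal driver for this function, assuming proj_dir and the helpers above are in place (the dataset name 'spambase' and the 600-second budget are illustrative):

val_score = evaluate_evaluation_based_fe('spambase', 600)
# The score is also pickled under proj_dir; reload it to double-check.
with open(proj_dir + 'data/fe_spambase_600.pkl', 'rb') as f:
    _, stored_score = pickle.load(f)
assert stored_score == val_score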
Example #2
# Note: FEPipeline is assumed to be imported elsewhere in the module.
def evaluate_fe_pipeline():
    from automlToolkit.utils.data_manager import DataManager
    dm = DataManager()
    # file_path = "data/proprocess_data.csv"
    file_path = 'data/a9a/dataset_183_adult.csv'
    dm.load_train_csv(file_path)

    # Fit the FE pipeline on the loaded data, then transform it.
    pipeline = FEPipeline(fe_enabled=True).fit(dm)
    train_data = pipeline.transform(dm)
    print(train_data)
    print(train_data.data)
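
The function takes no arguments (the CSV path is hard-coded above), so exercising it is a single call:

evaluate_fe_pipeline()  # prints the resulting DataNode and its underlying data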
Example #3
import numpy as np

# Note: _classifiers, ClassificationEvaluator, FEPipeline, load_train_test_data,
# UnParametrizedHyperparameter, and per_run_time_limit are assumed to be
# imported/defined elsewhere in the project.
def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    algorithms = [
        'lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd', 'adaboost',
        'random_forest', 'extra_trees', 'decision_tree'
    ]
    # Pick one algorithm at random for this run.
    algo_id = np.random.choice(algorithms, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Prepare the default configuration for the randomly chosen classifier.
    clf_class = _classifiers[algo_id]
    cs = clf_class.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", algo_id)
    cs.add_hyperparameter(clf_hp)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,
                          task_id=task_id)

    raw_data, test_raw_data = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_data.copy_())
    test_data = pipeline.transform(test_raw_data.copy_())
    train_data_new = pipeline.transform(raw_data.copy_())

    # Transforming the same training data twice must be deterministic.
    assert (train_data.data[0] == train_data_new.data[0]).all()
    assert (train_data.data[1] == train_data_new.data[1]).all()
    assert train_data_new == train_data

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
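
A sketch of a driver loop, assuming per_run_time_limit is set at module level as the function expects (all values illustrative):

per_run_time_limit = 300
for run_id in range(3):
    evaluate_fe_bugs('spambase', run_id, time_limit=600, seed=run_id)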
Example #4
# Note: DataManager and FEPipeline are assumed to be imported elsewhere.
def load_data(dataset, data_dir='./', datanode_returned=False):
    dm = DataManager()
    data_path = data_dir + 'data/datasets/%s.csv' % dataset

    if dataset in ['credit_default']:
        data_path = data_dir + 'data/datasets/%s.xls' % dataset

    # Per-dataset loading quirks: label column, header row, and separator.
    if dataset in ['higgs', 'amazon_employee', 'spectf', 'usps']:
        label_column = 0
    else:
        label_column = -1

    if dataset in ['spambase', 'messidor_features']:
        header = None
    else:
        header = 'infer'

    if dataset in ['winequality_white', 'winequality_red']:
        sep = ';'
    else:
        sep = ','

    train_data_node = dm.load_train_csv(data_path,
                                        label_col=label_column,
                                        header=header,
                                        sep=sep)

    # With fe_enabled=False, the pipeline only applies basic preprocessing.
    pipeline = FEPipeline(fe_enabled=False, metric='acc')
    train_data = pipeline.fit_transform(train_data_node)
    if datanode_returned:
        return train_data
    else:
        X, y = train_data.data
        feature_types = train_data.feature_types
        return X, y, feature_types
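
Typical calls, using one of the dataset names the function special-cases:

X, y, feature_types = load_data('spambase')
node = load_data('spambase', datanode_returned=True)  # DataNode instead of arrays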
Example #5
import os

# Note: DataManager, FEPipeline, Regressor, ensemble_method, eval_type,
# time_limit, and n_jobs are assumed to be defined earlier in the script.
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

# Load the training and test CSVs into data nodes.
dm = DataManager()
train_node = dm.load_train_csv("train_dataset.csv",
                               label_col=-1,
                               header='infer',
                               na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv",
                             header='infer',
                             has_label=True)
from automlToolkit.components.utils.constants import REGRESSION

pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

save_dir = './data/eval_exps/automl-toolkit'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)
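
The snippet ends before the regressor is trained; a plausible continuation, assuming the usual fit/predict interface on data nodes (not shown in the original):

rgs.fit(train_data)                   # model search within the time budget
predictions = rgs.predict(test_data)  # predict on the transformed test set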
Example #6
# Note: task_id, regressor_id, train_size, data_dir, fetch_data, smape,
# LightGBMRegressor, CatBoostRegressor, RegressionEvaluator, make_scorer,
# FEPipeline, and numpy (np) are assumed to be imported/defined elsewhere.
def evaluation_based_feature_engineering(time_limit, seed=1):
    # Hand-tuned hyperparameters per (task_id, regressor_id) pair.
    if task_id == 3 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.556390018826356,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.027650212980431577,
                  'min_child_weight': 4,
                  'n_estimators': 1000,  # 2493,
                  'num_leaves': 818,
                  'reg_alpha': 0.00012695064964599962,
                  'reg_lambda': 0.0006320421481400761,
                  'subsample': 0.5611631795995178}
    elif task_id == 1 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.5836692544286752,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.025011125056624308,
                  'min_child_weight': 3,
                  'n_estimators': 1000,  # 2000,
                  'num_leaves': 958,
                  'reg_alpha': 0.00025307513851761005,
                  'reg_lambda': 0.01911305077512719,
                  'subsample': 0.7850946965061745
                  }
    elif task_id == 3 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.07215105304885769,
                  'n_estimators': 10000,
                  'min_child_samples': 7,
                  'max_depth': 8,
                  'reg_lambda': 4.084654778260157e-06,
                  'subsample': 0.9998568450178255
                  }
    elif task_id == 1 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.030167431274216235,
                  'n_estimators': 10000,
                  'min_child_samples': 2,
                  'max_depth': 11,
                  'reg_lambda': 0.00010924008880152775,
                  'subsample': 0.9996005646983249
                  }
    else:
        raise ValueError("Hyperparameters not available!")

    # 'estimator' is bookkeeping metadata, not a model kwarg; drop it.
    config.pop('estimator', None)
    if regressor_id == 'lightgbm':
        estimator = LightGBMRegressor(**config)
    elif 'catboost' in regressor_id:
        estimator = CatBoostRegressor(**config)
    # SMAPE is an error metric, so lower is better.
    scorer = make_scorer(smape, greater_is_better=False)
    evaluator = RegressionEvaluator(None, scorer, name='fe', seed=seed, estimator=estimator)
    train_data, test_data = fetch_data(task_id)

    X, y = train_data.data
    # Subsample the training set to a train_size fraction to speed up FE search.
    idxs = np.arange(X.shape[0])
    np.random.shuffle(idxs)
    sample_size = int(X.shape[0] * train_size)
    subset_ids = idxs[:sample_size]
    X, y = X.iloc[subset_ids, :], y[subset_ids]
    train_data.data = [X, y]
    print(train_data)
    """
    nystronem_sampler: 15 bad
    kitchen_sinks: 13 bad
    random_trees_embedding: 18 bad
    feature_agglomeration_decomposer: 11 timeout.
    """
    # TODO: fast_ica, kernel_pca, and polynomial_features.
    # trans_used = [0, 3, 4, 5, 12, 16, 19, 30, 31, 32]
    # trans_used = [0, 3, 4, 5, 10, 11, 12, 16, 17, 19]
    # trans_used = [17, 30, 31]
    # trans_used = [30]
    pipeline = FEPipeline(task_type='regression', task_id='anti_plague',
                          fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id='lightgbm',
                          time_limit_per_trans=900,
                          trans_set=None
                          )
    transformed_train_data = pipeline.fit_transform(train_data)
    print(pipeline.optimizer.get_incumbent_path())
    print('final train data shape & score', transformed_train_data.shape, transformed_train_data.score)
    transformed_test_datanode = pipeline.transform(test_data)
    print('final test data shape', transformed_test_datanode.shape)

    # Save results. np.save writes binary .npy data and appends a '.npy'
    # suffix, so the '.csv' extensions below are misleading but kept as-is.
    np.save(data_dir + 'data/transformed_train_x-%d.csv' % task_id, transformed_train_data.data[0])
    np.save(data_dir + 'data/transformed_train_y-%d.csv' % task_id, transformed_train_data.data[1])
    np.save(data_dir + 'data/transformed_test-%d.csv' % task_id, transformed_test_datanode.data[0])
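
A hypothetical driver, assuming the module-level globals the function reads are set first (all values illustrative):

task_id = 3
regressor_id = 'lightgbm'
train_size = 0.8
data_dir = './'
evaluation_based_feature_engineering(time_limit=3600)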