def main():
    """Fit an AutoML classifier on an iris split and print its test accuracy.

    The search runs with ``time_limit=120``, holdout evaluation and no
    ensemble method; its output directory is ``./data/eval_exps/soln-ml``,
    which is created when it does not exist yet.
    """
    output_root = './data/eval_exps/soln-ml'
    if not os.path.exists(output_root):
        os.makedirs(output_root)

    budget = 120
    print('==> Start to evaluate with Budget %d' % budget)

    # Stratified split with a fixed seed: one third of the rows is held out.
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=1, stratify=labels)

    # Both data nodes are produced by the manager built on the training part.
    manager = DataManager(X_train, y_train)
    train_node = manager.get_data_node(X_train, y_train)
    test_node = manager.get_data_node(X_test, y_test)

    model = Classifier(
        time_limit=budget,
        output_dir=output_root,
        ensemble_method=None,
        evaluation='holdout',
        metric='acc',
    )
    model.fit(train_node)
    predictions = model.predict(test_node)
    print(accuracy_score(test_node.data[1], predictions))
Beispiel #2
0
def main():
    """Search over a single user-defined classifier on iris and print accuracy.

    ``UserDefinedDecisionTree`` is registered through ``add_classifier`` and
    is the only entry in ``include_algorithms``; meta algorithm selection and
    ensembling are switched off. The search output directory is
    ``./data/eval_exps/soln-ml``.
    """
    time_limit = 120
    print('==> Start to evaluate with Budget %d' % time_limit)

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    # The custom estimator has to be registered before it can be referenced
    # by name in include_algorithms.
    add_classifier(UserDefinedDecisionTree)
    clf = Classifier(time_limit=time_limit,
                     output_dir=save_dir,
                     enable_meta_algorithm_selection=False,
                     include_algorithms=['UserDefinedDecisionTree'],
                     ensemble_method=None,
                     metric='acc')

    clf.fit(train_data)
    pred = clf.predict(test_data)

    print(accuracy_score(test_data.data[1], pred))
Beispiel #3
0
def model_fit(_id, obj, paramsj, X_trainj, y_trainj):
    """Train an AutoML model from JSON-encoded inputs and record the outcome.

    The outcome is written to ``./models_information/<_id>_information``: a
    JSON object describing the best configuration found on success, or the
    text ``Model training failed!`` on failure. The ``models_information``
    directory must already exist.

    Args:
        _id: Model identifier (string); used to build the information file name.
        obj: ``'clf'`` to train a Classifier, ``'reg'`` to train a Regressor.
        paramsj: JSON object string providing ``time_limit``,
            ``ensemble_method``, ``evaluation`` and ``metric``.
        X_trainj: JSON string with the training features; it is decoded and
            passed through ``pandas.DataFrame``.
        y_trainj: JSON string with the training targets; only the first
            column of the decoded frame is used.

    Returns:
        0 when training succeeded, -1 when it failed.
    """
    import traceback

    info_path = './models_information/' + _id + '_information'
    # The context manager closes the file on every exit path, including an
    # error raised while the result summary below is assembled.
    with open(info_path, 'w') as info_file:
        print('Model training begins!')
        try:
            # read data
            X_train = np.array(pd.DataFrame(json.loads(X_trainj)))
            y_train = np.array(pd.DataFrame(json.loads(y_trainj)))[:, 0]
            params = json.loads(paramsj)

            dm = DataManager(X_train, y_train)
            train_data = dm.get_data_node(X_train, y_train)
            save_dir = '../data/eval_exps/soln-ml'
            os.makedirs(save_dir, exist_ok=True)

            # train mode
            if obj == 'clf':
                mdl = Classifier(time_limit=params['time_limit'],
                                 output_dir=save_dir,
                                 ensemble_method=params['ensemble_method'],
                                 evaluation=params['evaluation'],
                                 metric=params['metric'],
                                 n_jobs=4)
            elif obj == 'reg':
                # n_jobs mirrors the classifier branch; the previous
                # ``n_jobs=n_jobs`` referenced a name that is not defined in
                # this function.
                mdl = Regressor(metric=params['metric'],
                                ensemble_method=params['ensemble_method'],
                                evaluation=params['evaluation'],
                                time_limit=params['time_limit'],
                                output_dir=save_dir,
                                random_state=1,
                                n_jobs=4)
            else:
                # Previously this case only surfaced as an unbound ``mdl``.
                raise ValueError('Unsupported objective: %r' % (obj,))

            mdl.fit(train_data)
        except Exception:
            # Report the cause instead of discarding it; the file content and
            # the -1 return value that callers rely on are unchanged.
            traceback.print_exc()
            print('Model training failed!')
            info_file.write('Model training failed!')
            return -1

        result = {
            'best_algo_id': str(mdl.best_algo_id),
            'best_hpo_config': str(mdl.best_hpo_config),
            'nbest_algo_id': str(mdl.nbest_algo_id),
            'best_perf': str(mdl.best_perf),
            'best_fe_config': str(mdl.best_fe_config),
            # get_ens_model_info is not realized in this version yet
            'get_ens_model_info': str(mdl.get_ens_model_info),
        }
        info_file.write(json.dumps(result))
    print('Model training finished!')
    return 0
Beispiel #4
0
def test_cls_without_ensemble():
    """Smoke-test a cross-validated classifier search without ensembling.

    Fits on an iris training split with ``time_limit=60``, prints the search
    summary, refits, prints the accuracy on the held-out split and finally
    removes the output directory ``./data/eval_exps/soln-ml``.
    """
    workdir = './data/eval_exps/soln-ml'
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    budget = 60
    print('==> Start to evaluate with Budget %d' % budget)

    # Stratified split with a fixed seed: one third of the rows is held out.
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=1, stratify=labels)

    manager = DataManager(X_train, y_train)
    train_node = manager.get_data_node(X_train, y_train)
    test_node = manager.get_data_node(X_test, y_test)

    model = Classifier(
        time_limit=budget,
        output_dir=workdir,
        ensemble_method=None,
        enable_meta_algorithm_selection=False,
        evaluation='cv',
        metric='acc',
    )
    model.fit(train_node)
    print(model.summary())
    model.refit()

    predictions = model.predict(test_node)
    print(accuracy_score(test_node.data[1], predictions))

    # Clean up everything the search wrote.
    shutil.rmtree(workdir)
def evaluate():
    """Run one fit/refit/predict/score cycle on iris and report success.

    Reads ``save_dir``, ``ensemble_method`` and ``eval_type`` from the
    enclosing (module) scope.

    Returns:
        True when the whole cycle completed, False when any step inside it
        raised an exception. The traceback is printed so the failure is not
        silently lost.
    """
    import traceback

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1)
    try:
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)
        test_data = dm.get_data_node(X_test, y_test)

        clf = Classifier(dataset_name='iris',
                         time_limit=150,
                         output_dir=save_dir,
                         ensemble_method=ensemble_method,
                         evaluation=eval_type,
                         metric='acc')
        clf.fit(train_data)
        clf.refit()
        # The predictions themselves are not needed; the call is kept so the
        # prediction path is still exercised.
        clf.predict(test_data)
        print('final score', clf.score(test_data))
    except Exception:
        # Keep the boolean contract, but say why the run failed.
        traceback.print_exc()
        return False
    return True
Beispiel #6
0
def evaluate_hmab(algorithms, run_id,
                  time_limit=600,
                  dataset='credit',
                  eval_type='holdout',
                  enable_ens=True, seed=1):
    """Run one classifier search on ``dataset`` and pickle its scores.

    Reads ``save_dir`` from the enclosing (module) scope, both as the
    estimator output directory and as the prefix of the result file name
    (plain string concatenation, no separator is inserted).

    Args:
        algorithms: Only its length is used, as part of the task id.
        run_id: Integer run index; part of the result file name.
        time_limit: Passed to the Classifier as ``time_limit``.
        dataset: Dataset name handed to ``load_train_test_data``.
        eval_type: Passed to the Classifier as ``evaluation``.
        enable_ens: Ensemble selection is enabled only when this is exactly
            ``True``; any other value disables ensembling.
        seed: Accepted but not used by this function.

    Returns:
        None. ``[validation_score, test_score, [timestamps, perfs]]`` is
        pickled to ``save_dir + '<task_id>-<run_id>.pkl'``.
    """
    task_id = '[hmab][%s-%d-%d]' % (dataset, len(algorithms), time_limit)
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    ensemble_method = 'ensemble_selection' if enable_ens is True else None

    clf = Classifier(time_limit=time_limit,
                     amount_of_resource=None,
                     output_dir=save_dir,
                     ensemble_method=ensemble_method,
                     evaluation=eval_type,
                     metric='bal_acc',
                     n_jobs=1)
    clf.fit(train_data)
    clf.refit()
    pred = clf.predict(test_data)
    test_score = balanced_accuracy_score(test_data.data[1], pred)
    timestamps, perfs = clf.get_val_stats()
    # The best validation performance seen during the search.
    validation_score = np.max(perfs)
    print('Dataset          : %s' % dataset)
    print('Validation/Test score : %f - %f' % (validation_score, test_score))

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        stats = [timestamps, perfs]
        pickle.dump([validation_score, test_score, stats], f)
def evaluate_sys(run_id, task_type, mth, dataset, ens_method, enable_meta,
                 eval_type='holdout', time_limit=1200, seed=1):
    """Run one random-forest-only search and pickle validation/test scores.

    Reads ``save_folder`` from the enclosing (module) scope, both as the
    estimator output directory and as the prefix of the result file name.

    Args:
        run_id: Integer run index; printed and part of the result file name.
        task_type: ``'cls'`` selects a Classifier scored with balanced
            accuracy; any other value selects a Regressor scored with MSE.
        mth: Passed to ``fit`` as ``opt_strategy``.
        dataset: Dataset name; also passed to ``fit`` as ``dataset_id``.
        ens_method: Passed to the estimator as ``ensemble_method``.
        enable_meta: Meta algorithm selection is enabled only when this is
            the string ``'true'``.
        eval_type: Passed to the estimator as ``evaluation``.
        time_limit: Passed to the estimator as ``time_limit``.
        seed: Accepted but not used by this function.

    Returns:
        None. ``[dataset, validation_score, test_score, start_time,
        eval_dict]`` is pickled, then the estimator's output directory is
        removed.
    """
    is_classification = task_type == 'cls'
    train_data, test_data = load_train_test_data(
        dataset, task_type=MULTICLASS_CLS if is_classification else REGRESSION)
    use_meta = enable_meta == 'true'

    # Only the estimator class, the metric and two preprocessor names differ
    # between the classification and the regression setup.
    if is_classification:
        from solnml.estimators import Classifier as estimator_cls
        metric_name = 'bal_acc'
        preprocessors = ['extra_trees_based_selector',
                         'generic_univariate_selector',
                         'liblinear_based_selector',
                         'percentile_selector']
    else:
        from solnml.estimators import Regressor as estimator_cls
        metric_name = 'mse'
        preprocessors = ['extra_trees_based_selector_regression',
                         'generic_univariate_selector',
                         'liblinear_based_selector',
                         'percentile_selector_regression']

    estimator = estimator_cls(time_limit=time_limit,
                              per_run_time_limit=300,
                              output_dir=save_folder,
                              ensemble_method=ens_method,
                              enable_meta_algorithm_selection=use_meta,
                              evaluation=eval_type,
                              metric=metric_name,
                              include_algorithms=['random_forest'],
                              include_preprocessors=preprocessors,
                              n_jobs=1)

    start_time = time.time()
    estimator.fit(train_data, opt_strategy=mth, dataset_id=dataset)
    predictions = estimator.predict(test_data)
    truth = test_data.data[1]
    if is_classification:
        test_score = balanced_accuracy_score(truth, predictions)
    else:
        test_score = mean_squared_error(truth, predictions)

    solver = estimator._ml_engine.solver
    validation_score = solver.incumbent_perf
    eval_dict = solver.get_eval_dict()
    print('Run ID         : %d' % run_id)
    print('Dataset        : %s' % dataset)
    print('Val/Test score : %f - %f' % (validation_score, test_score))

    file_name = 'extremely_small_%s_%s_%s_%s_%d_%d_%d.pkl' % (
        task_type, mth, dataset, enable_meta, time_limit, (ens_method is None), run_id)
    with open(save_folder + file_name, 'wb') as handle:
        pickle.dump([dataset, validation_score, test_score, start_time, eval_dict], handle)

    # Delete output dir
    output_dir = os.path.join(estimator.get_output_dir())
    shutil.rmtree(output_dir)
Beispiel #8
0
def evaluate_2rd_hmab(run_id, mth, dataset, algo,
                      eval_type='holdout', time_limit=1200, seed=1):
    """Search a single algorithm on ``dataset`` and pickle its scores.

    Reads ``save_folder`` from the enclosing (module) scope, both as the
    estimator output directory and as the prefix of the result file name.

    Args:
        run_id: Integer run index; printed and part of the result file name.
        mth: Passed to ``fit`` as ``opt_strategy``.
        dataset: Dataset name handed to ``load_train_test_data``.
        algo: The only entry of ``include_algorithms``.
        eval_type: Passed to the Classifier as ``evaluation``.
        time_limit: Passed to the Classifier as ``time_limit``.
        seed: Accepted but not used by this function.

    Returns:
        None. ``[dataset, validation_score, test_score]`` is pickled.
    """
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)

    from solnml.estimators import Classifier
    model = Classifier(
        time_limit=time_limit,
        per_run_time_limit=300,
        output_dir=save_folder,
        ensemble_method=None,
        evaluation=eval_type,
        enable_meta_algorithm_selection=False,
        metric='bal_acc',
        include_algorithms=[algo],
        n_jobs=1,
    )

    model.fit(train_data, opt_strategy=mth)
    predictions = model.predict(test_data)
    test_score = balanced_accuracy_score(test_data.data[1], predictions)

    # Only the performance trace is needed; the timestamps are discarded.
    _, perfs = model.get_val_stats()
    validation_score = np.max(perfs)
    print('Evaluation Num : %d' % len(perfs))
    print('Run ID         : %d' % run_id)
    print('Dataset        : %s' % dataset)
    print('Val/Test score : %f - %f' % (validation_score, test_score))

    file_name = '%s_%s_%d_%d_%s.pkl' % (mth, dataset, time_limit, run_id, algo)
    with open(save_folder + file_name, 'wb') as handle:
        pickle.dump([dataset, validation_score, test_score], handle)
    def post(self):
        """Build a model from uploaded CSV files, train and register it.

        Reads ``model_name`` and ``objective`` from the request form and the
        ``data_file_X`` / ``data_file_y`` uploads. A Classifier is built for
        ``'clf'`` (also used when the objective is empty) and a Regressor for
        ``'reg'``; the estimator is handed to ``model_fit`` and then added to
        ``self.model_factory`` under the model name.

        Returns:
            A JSON string with ``trainTime`` (elapsed seconds) and
            ``trainShape`` (shape of the feature matrix).

        Raises:
            ValueError: If the objective is neither ``'clf'`` nor ``'reg'``.
        """
        start_time = time.time()
        # The parsed arguments are not used below; the call is kept in case
        # the parser validates the request — TODO confirm.
        self.parser.parse_args()
        _id = request.form['model_name']
        X_file = request.files['data_file_X']
        y_file = request.files['data_file_y']
        obj = request.form['objective']
        # read data
        X_train = np.array(pd.read_csv(X_file))
        # Only the first column of the target file is used.
        y_train = np.array(pd.read_csv(y_file))[:, 0]
        print(y_train)
        if not obj:
            obj = 'clf'
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)

        save_dir = './data/eval_exps/soln-ml'
        os.makedirs(save_dir, exist_ok=True)

        # train mode
        if obj == 'clf':
            mdl = Classifier(time_limit=100,
                             output_dir=save_dir,
                             ensemble_method='bagging',
                             evaluation='holdout',
                             metric='acc',
                             n_jobs=4)
        elif obj == 'reg':
            # NOTE(review): ensemble_method, eval_type, time_limit and n_jobs
            # are not defined in this method; this branch only works if they
            # exist at module level — confirm.
            mdl = Regressor(metric='mse',
                            ensemble_method=ensemble_method,
                            evaluation=eval_type,
                            time_limit=time_limit,
                            output_dir=save_dir,
                            random_state=1,
                            n_jobs=n_jobs)
        else:
            # Previously an unknown objective only surfaced later as an
            # unbound ``mdl``.
            raise ValueError('Unsupported objective: %r' % (obj,))
        model_fit(_id, mdl, train_data)
        self.model_factory.add_pipeline(mdl, train_data, _id)
        print(self.model_factory)
        result = {
            'trainTime': time.time() - start_time,
            'trainShape': X_train.shape
        }
        # The original indexed with ``params['pipeline_id']``, but ``params``
        # is never defined in this method. Assumes add_pipeline registers the
        # entry under ``_id`` — TODO confirm.
        self.model_factory[_id]['stats'] = result
        return json.dumps(result)
Beispiel #10
0
def evaluate_package():
    """Fit a default-configured Classifier on the 'pc4' training data.

    The data is loaded with ``load_train_test_data('pc4', data_dir='./')``;
    the test part is loaded but not used.
    """
    train_data, _ = load_train_test_data('pc4', data_dir='./')
    model = Classifier()
    model.fit(train_data)
Beispiel #11
0
# The string 'none' stands for "no ensemble method".
ensemble_method = None if args.ens_method == 'none' else args.ens_method

# Output directory of the search; created on first use.
save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

print('==> Start to evaluate with Budget %d' % time_limit)

# Fixed-seed (non-stratified) split: one third of the rows is held out.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

# Both data nodes are produced by the manager built on the training part.
dm = DataManager(X_train, y_train)
train_data = dm.get_data_node(X_train, y_train)
test_data = dm.get_data_node(X_test, y_test)

# time_limit, eval_type and n_jobs are defined earlier in the script.
clf = Classifier(
    time_limit=time_limit,
    output_dir=save_dir,
    ensemble_method=ensemble_method,
    evaluation=eval_type,
    metric='acc',
    n_jobs=n_jobs,
)
clf.fit(train_data)
pred = clf.predict(test_data)
print(pred)
# The search optimizes 'acc'; the report below uses balanced accuracy.
print(balanced_accuracy_score(test_data.data[1], pred))


#save and load example

#saveloadmodel.save_model(clf,'./data/model_clf9')
#ens = saveloadmodel.load_model('./data/model_clf9')
#print(ens.predict_proba(X_test))

print('==> Start to evaluate with Budget %d' % time_limit)

# Fixed-seed (non-stratified) split: one third of the rows is held out.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

# Both data nodes are produced by the manager built on the training part.
dm = DataManager(X_train, y_train)
train_data = dm.get_data_node(X_train, y_train)
test_data = dm.get_data_node(X_test, y_test)

# Output directory of the search; created on first use.
save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Register the custom estimator, then restrict the search to it by name.
add_classifier(UserDefinedDecisionTree)
clf = Classifier(
    time_limit=time_limit,
    output_dir=save_dir,
    include_algorithms=['UserDefinedDecisionTree'],
    random_state=1,
    metric='acc',
    n_jobs=1,
)
# Neither of these two values is read again in the visible part of the script.
_start_time = time.time()
_iter_id = 0

clf.fit(train_data)
pred = clf.predict(test_data)

# The search optimizes 'acc'; the report below uses balanced accuracy.
print(balanced_accuracy_score(test_data.data[1], pred))
Beispiel #13
0
import numpy as np
import os
import sys

sys.path.append(os.getcwd())

from solnml.components.feature_engineering.transformations.preprocessor.text2vector import \
    Text2VectorTransformation
from solnml.components.feature_engineering.transformation_graph import DataNode
from solnml.components.utils.constants import *
from solnml.estimators import Classifier

# Four samples with one numeric, two text and one discrete column each.
# NOTE: np.array coerces rows that mix numbers and strings to a string dtype,
# so the numeric entries are stored as text.
x = np.array([
    [1, 'I am good', 'I am right', 3],
    [2, 'He is good', 'He is ok', 4],
    [2.5, 'Everyone is good', 'Everyone is ok', 7],
    [1.3333, 'well', 'what', 5],
])
y = np.array([0, 1, 0, 1])

# Instantiated but not used further in this script.
t2v = Text2VectorTransformation()

# One feature-type tag per column of x, in column order.
feature_type = [NUMERICAL, TEXT, TEXT, DISCRETE]
data = (x, y)
datanode = DataNode(data, feature_type)

clf = Classifier(
    time_limit=20,
    enable_meta_algorithm_selection=False,
    include_algorithms=['random_forest'],
)

# The same data node is used for fitting and for prediction.
clf.fit(datanode, opt_strategy='combined')
print(clf.predict(datanode))