Example #1

import sys
from math import sqrt

from sklearn.datasets import load_svmlight_file
from sklearn.metrics import (accuracy_score, classification_report,
                             mean_squared_error, r2_score)
from sklearn.model_selection import train_test_split

# EnsembleSelectionClassifier, EnsembleSelectionRegressor,
# build_model_library and db_cleanup are provided by this
# project's own modules.


def trainMan(res):
    X_train, y_train = load_svmlight_file(res.data_file)
    X_train = X_train.toarray()

    # carve off a held-out test set when test_size > 0.0
    if res.test_size > 0.0:
        do_test = True
        splits = train_test_split(X_train, y_train,
                                  test_size=res.test_size,
                                  random_state=res.seed)

        X_train, X_test, y_train, y_test = splits
        # For speed, the splits could be converted to sparse matrices
        # here (requires `from scipy import sparse`):
        # X_train = sparse.csc_matrix(X_train)
        # y_train = sparse.csc_matrix(y_train)
        # X_test = sparse.csr_matrix(X_test)
        # y_test = sparse.csr_matrix(y_test)

        print('Train/hillclimbing set size: %d' % len(X_train))
        print('              Test set size: %d\n' % len(X_test))
    else:
        do_test = False
        print('Train/hillclimbing set size: %d' % len(X_train))

    # build the library of candidate models
    models = build_model_library(res.model_types, res.seed)
    print('built %d models\n' % len(models))

    param_dict = {
        'models': models,
        'db_file': res.db_file,
        'n_best': res.n_best,
        'n_folds': res.n_folds,
        'n_bags': res.n_bags,
        'bag_fraction': res.bag_fraction,
        'prune_fraction': res.prune_fraction,
        'score_metric': res.score_metric,
        'verbose': res.verbose,
        'epsilon': res.epsilon,
        'use_epsilon': res.use_epsilon,
        'use_bootstrap': res.use_bootstrap,
        'max_models': res.max_models,
        'random_state': res.seed,
        'meth': res.meth,
        'sweight': res.sweight,
    }
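    # Added note: these keyword arguments parameterize Caruana-style
    # ensemble selection; judging by the names, n_bags/bag_fraction
    # control the bagged selection runs, prune_fraction drops the
    # worst-scoring models before hillclimbing, and epsilon sets an
    # improvement threshold (interpretation from the parameter names,
    # not from the EnsembleSelection* source).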
    print(str(res.meth))
    try:
        if res.meth[0] == 'Classification':
            ens = EnsembleSelectionClassifier(**param_dict)
            print('fitting ensemble:\n%s\n' % ens)
        elif res.meth[0] == 'Regression':
            ens = EnsembleSelectionRegressor(**param_dict)
            print('fitting ensemble:\n%s\n' % ens)
        else:
            msg = ("Invalid method passed (-T must be one of "
                   "['Regression', 'Classification'])")
            raise ValueError(msg)
    except ValueError as e:
        print('ERROR: %s' % e)
        sys.exit(1)

    # fit models, score, build ensemble
    ens.fit(X_train, y_train)

    list_of_results = {}  # dict mapping metric name -> value, despite the name

    preds = ens.best_model_predict(X_train)
    if res.meth[0] == 'Classification':
        score = accuracy_score(y_train, preds)
        list_of_results['best_train_score'] = score
    elif res.meth[0] == 'Regression':
        score = r2_score(y_train, preds)
        rmse = sqrt(mean_squared_error(y_train, preds))
        print('Train set RMSE from best model: %.5f' % rmse)
        list_of_results['best_train_score'] = score
        list_of_results['best_train_rmse'] = rmse
    print('Train set score from best model: %.5f' % score)

    preds = ens.predict(X_train)
    if res.meth[0] == 'Classification':
        score = accuracy_score(y_train, preds)
        list_of_results['ens_train_score'] = score
    elif res.meth[0] == 'Regression':
        score = r2_score(y_train, preds)
        rmse = sqrt(mean_squared_error(y_train, preds))
        list_of_results['ens_train_score'] = score
        list_of_results['ens_train_rmse'] = rmse
        print('Train set RMSE from final ensemble: %.5f' % rmse)
    print('Train set score from final ensemble: %.5f' % score)

    if do_test:
        preds = ens.best_model_predict(X_test)
        if res.meth[0] == 'Classification':
            score = accuracy_score(y_test, preds)
            list_of_results['best_test_score'] = score
            fmt = '\n Test set classification report for best model:\n%s'
            report = classification_report(y_test, preds)
            print(fmt % report)
        elif res.meth[0] == 'Regression':
            score = r2_score(y_test, preds)
            rmse = sqrt(mean_squared_error(y_test, preds))
            list_of_results['best_test_score'] = score
            list_of_results['best_test_rmse'] = rmse
            print('Test set RMSE from best model: %.5f' % rmse)
        print('\n Test set score from best model: %.5f' % score)

        preds = ens.predict(X_test)

        if res.meth[0] == 'Classification':
            score = accuracy_score(y_test, preds)
            list_of_results['ens_test_score'] = score
        elif res.meth[0] == 'Regression':
            score = r2_score(y_test, preds)
            rmse = sqrt(mean_squared_error(y_test, preds))
            list_of_results['ens_test_score'] = score
            list_of_results['ens_test_rmse'] = rmse
            print('Test set RMSE from final ensemble: %.5f' % rmse)
        print(' Test set score from final ensemble: %.5f' % score)

        if res.meth[0] == 'Classification':
            fmt = '\n Test set classification report for final ensemble:\n%s'
            report = classification_report(y_test, preds)
            print(fmt % report)

        if res.retrain:
            X_train, y_train = load_svmlight_file(res.data_file)
            X_train = X_train.toarray()
            print('Retraining models comprising ensemble on full training set!')
            ens.retrain_ensemble(X_train, y_train)

        if res.removal:
            try:
                print("Removing unwanted models...")
                db_cleanup(res.db_file)
            except Exception as e:
                print("Error pruning db_file: %s" % e)
    metric_keys = ['best_train_score', 'best_train_rmse', 'ens_train_score', 'ens_train_rmse',
                   'best_test_score', 'best_test_rmse', 'ens_test_score', 'ens_test_rmse']
    if set(list_of_results.keys()) != set(metric_keys):
        for keynm in set(metric_keys) - set(list_of_results.keys()):
            list_of_results[keynm] = 0.0

    return list_of_results
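
A minimal sketch of driving trainMan() end to end. The real script presumably builds res with argparse; every attribute value below is an illustrative placeholder (the attribute names are exactly those trainMan() reads), and 'svc' merely stands in for whatever model-type keys build_model_library actually accepts.

from argparse import Namespace

# Hypothetical invocation -- all values are made-up placeholders.
res = Namespace(
    data_file='train.svmlight', test_size=0.25, seed=42,
    model_types=['svc'], db_file='models.db',
    n_best=5, n_folds=3, n_bags=20, bag_fraction=0.25,
    prune_fraction=0.8, score_metric='accuracy', verbose=False,
    epsilon=0.01, use_epsilon=False, use_bootstrap=False,
    max_models=10000, meth=['Classification'], sweight=None,
    retrain=False, removal=False,
)
results = trainMan(res)
print(results['ens_test_score'])
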
Example #2

import sys

from model_library import build_model_library

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python extract_doc_rankings.py doc_rankings_dir dataset_path out_dir")
    else:
        build_model_library(sys.argv[1], sys.argv[2], sys.argv[3])
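
From a shell this would be run as, e.g., python extract_doc_rankings.py doc_rankings_dir dataset_path out_dir, where the three argument names are taken from the usage string above. Note that this project's build_model_library takes three path arguments, unlike the (model_types, seed) signature used in Example #1.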