Example #1
import sys
import warnings
from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score,
                             roc_auc_score)
from sklearn.model_selection import (RandomizedSearchCV, StratifiedKFold,
                                     cross_val_predict)
from sklearn.pipeline import Pipeline

# read_file, feature_importance and roc are helpers defined elsewhere in this
# project's module; they are not part of scikit-learn.


def evaluate_model(dataset, save_file, random_state, pipeline_components,
                   pipeline_parameters, n_combos, label):

    features, labels, feature_names = read_file(dataset, label)
    # pipelines = [dict(zip(pipeline_parameters.keys(), list(parameter_combination)))
    #              for parameter_combination in itertools.product(*pipeline_parameters.values())]

    # Create a temporary folder to store the transformers of the pipeline
    cachedir = mkdtemp()
    memory = Memory(location=cachedir, verbose=0)

    # print ( pipeline_components)
    # print(pipeline_parameters)
    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')
        cv = StratifiedKFold(n_splits=10,
                             shuffle=True,
                             random_state=random_state)
        hyperparameters = {}
        for k, v in pipeline_parameters.items():
            for param, pvals in v.items():
                hyperparameters.update({k + '__' + param: pvals})
        pipeline = Pipeline(pipeline_components, memory=memory)

        # run Randomized Search CV to tune the hyperparameter settings
        est = RandomizedSearchCV(estimator=pipeline,
                                 param_distributions=hyperparameters,
                                 n_iter=n_combos,
                                 cv=cv,
                                 random_state=random_state,
                                 refit=True,
                                 error_score=0.0)
        est.fit(features, labels)
        best_est = est.best_estimator_
        # generate cross-validated predictions for each data point using the best estimator
        cv_predictions = cross_val_predict(estimator=best_est,
                                           X=features,
                                           y=labels,
                                           cv=cv)

        # get cv probabilities
        skip = False
        if getattr(best_est, "predict_proba", None):
            method = "predict_proba"
        elif getattr(best_est, "decision_function", None):
            method = "decision_function"
        else:
            skip = True

        if not skip:
            cv_probabilities = cross_val_predict(estimator=best_est,
                                                 X=features,
                                                 y=labels,
                                                 method=method,
                                                 cv=cv)
            if method == "predict_proba":
                cv_probabilities = cv_probabilities[:, 1]

        accuracy = accuracy_score(labels, cv_predictions)
        macro_f1 = f1_score(labels, cv_predictions, average='macro')
        balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)
        if skip:
            # No probability estimates or decision scores are available,
            # so ROC AUC cannot be computed for this estimator
            roc_auc = -1
        else:
            try:
                roc_auc = roc_auc_score(labels, cv_probabilities)
            except ValueError as ve:
                print("roc_auc_score: %s" % (str(ve)))
                roc_auc = -1

        preprocessor_classes = [p[0] for p in pipeline_components[:-1]]

        preprocessor_param_string = 'default'
        for preprocessor_class in preprocessor_classes:
            if preprocessor_class in pipeline_parameters.keys():
                preprocessor_param_string = ','.join([
                    '{}={}'.format(
                        parameter,
                        '|'.join([x.strip() for x in str(value).split(',')]))
                    for parameter, value in
                    pipeline_parameters[preprocessor_class].items()
                ])

        classifier_class = pipeline_components[-1][0]
        param_string = ','.join(
            ['{}={}'.format(p, v) for p, v in est.best_params_.items()])
        # for parameter, value in pipeline_parameters[classifier_class].items()])

        out_text = '\t'.join([
            dataset.split('/')[-1].split('.')[0],
            ','.join(preprocessor_classes), preprocessor_param_string,
            classifier_class, param_string,
            str(random_state),
            str(accuracy),
            str(macro_f1),
            str(balanced_accuracy),
            str(roc_auc)
        ])
        print(out_text)
        with open(save_file, 'a') as out:
            out.write(out_text + '\n')
        sys.stdout.flush()

        # write feature importances
        est_name = classifier_class
        feature_importance(save_file, best_est, est_name, feature_names,
                           features, labels, random_state,
                           ','.join(preprocessor_classes),
                           preprocessor_param_string, classifier_class,
                           param_string)
        # write roc curves
        if not skip:
            roc(save_file, best_est, labels, cv_probabilities, random_state,
                ','.join(preprocessor_classes), preprocessor_param_string,
                classifier_class, param_string)
    # Delete the temporary cache before exiting
    rmtree(cachedir)
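
A minimal invocation sketch for evaluate_model, assuming the project-local helpers (read_file, feature_importance, roc) are importable; the dataset path, output path, and pipeline/parameter choices below are hypothetical and only illustrate the expected shapes of pipeline_components and pipeline_parameters.

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, f_classif

# Hypothetical pipeline: one preprocessor step followed by a classifier.
# Step names must match the keys of pipeline_parameters so that the
# '<step>__<param>' search space is built correctly.
pipeline_components = [
    ('SelectPercentile', SelectPercentile(score_func=f_classif)),
    ('RandomForestClassifier', RandomForestClassifier()),
]
pipeline_parameters = {
    'SelectPercentile': {'percentile': [10, 25, 50, 75, 100]},
    'RandomForestClassifier': {'n_estimators': [100, 500],
                               'max_features': ['sqrt', 'log2']},
}

evaluate_model(dataset='data/example_dataset.tsv',       # hypothetical path
               save_file='results/example_dataset.tsv',  # hypothetical path
               random_state=42,
               pipeline_components=pipeline_components,
               pipeline_parameters=pipeline_parameters,
               n_combos=10,
               label='class')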
Example #2
                         y=y_train,
                         bounds=bounds,
                         metric=single_balanced_accuracy_score,
                         init_points=10,
                         n_iter=15,
                         groups=uuid_groups)

    nb_clf = GaussianNB()
    clf = FlexOneVsRestClassifier(nb_clf, n_estimators=y_train.shape[1])

    clf.fit(X_train_clean, y_train)
    y_pred = clf.predict(X_test_clean)
    print(
        "Balanced accuracy NB: ",
        balanced_accuracy_score(y_test.T,
                                y_pred,
                                average="macro",
                                zero_default=0))

    lr_clf = LogisticRegression(solver="lbfgs", tol=1e-3, max_iter=500)
    bounds = {"C": (0.0001, 1)}
    clf = FlexOneVsRestClassifier(lr_clf, n_estimators=y_train.shape[1])

    clf.tune_hyperparams(X=X_train_clean,
                         y=y_train,
                         bounds=bounds,
                         metric=single_balanced_accuracy_score,
                         init_points=6,
                         n_iter=9,
                         groups=uuid_groups)

    dump(clf.get_params(), "params_separated_lr.joblib")
Example #3
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from metrics import balanced_accuracy_score  # project-local metric, not sklearn's
import numpy as np
import pandas as pd

dataset = pd.read_csv('d_heart.csv')
X = StandardScaler().fit_transform(dataset.drop('class', axis=1))
y = dataset['class']

X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.25, shuffle=False)

clf = DecisionTreeClassifier(max_depth=4, criterion='gini').fit(X_t, y_t)

print('train score:', balanced_accuracy_score(y_t, clf.predict(X_t)))
print('test score:', balanced_accuracy_score(y_v, clf.predict(X_v)))
print('feature importances:', clf.feature_importances_)
print('argsort: ', np.argsort(clf.feature_importances_))
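
The argsort output is easier to read when mapped back to column names; a small follow-up sketch, assuming the same dataset DataFrame loaded above:

# Indices sorted by descending importance, translated to column names
feature_names = dataset.drop('class', axis=1).columns
print('features by importance:',
      feature_names[np.argsort(clf.feature_importances_)[::-1]].tolist())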
Example #4
                         int_params=int_params)

    dump(clf.get_params(), "params_separated.joblib")
    param_dict = load("params_separated.joblib")
    print(param_dict)

    preprocess = time.time()
    print("Preprocess: {}".format(preprocess - start))

    clf.fit(X_train, y_train, ignore_nan=True)

    fit_time = time.time()
    print("Fit time: {}".format(fit_time - preprocess))

    y_pred = clf.predict(X_test)
    y_pred_bias = clf.predict(X_train)

    pred_time = time.time()
    print("Prediction time: {}".format(pred_time - fit_time))
    print("Balanced accuracy: ", balanced_accuracy_score(y_test,
                                                         y_pred,
                                                         average="macro",
                                                         zero_default=0))

    print("Balanced accuracy bias:", balanced_accuracy_score(y_train,
                                                             y_pred_bias,
                                                             average="macro",
                                                             zero_default=0))

    score_time = time.time()
    print("Score time: {}".format(score_time - pred_time))