Example #1
def main():
    "Main function"
    optmgr  = OptionParser(SCORERS.keys())
    opts, _ = optmgr.get_opt()

    predictions = loader(opts.fpred, opts.fpred_target, opts.fpred_sep)
    real_values = loader(opts.fin, opts.fin_target, opts.fin_sep)
    checker(predictions, real_values, opts.scorer, opts.verbose)
Example #2
def main():
    "Main function"
    optmgr  = OptionParser(SCORERS.keys())
    opts, _ = optmgr.get_opt()
    predictions = loader(opts.fpred, opts.fpred_target, opts.fpred_sep, opts.threshold)
    real_values = loader(opts.fin, opts.fin_target, opts.fin_sep, None)
    probabilities = None
    if  opts.threshold:
        probabilities = loader(opts.fpred, opts.fpred_target, opts.fpred_sep, None)
    if  len(predictions) != len(real_values):
        print("Error: input file and prediction file lengths are different: %s vs %s" % (len(predictions), len(real_values)))
        sys.exit(1)
    if  opts.tiers_break:
        checker_with_tiers(predictions, real_values, probabilities, opts.fin, opts.scorer, opts.tiers_col, opts.tiers_map, opts.tiers_map_kval, opts.plainout, opts.verbose)
    else:
        checker(predictions, real_values, probabilities, opts.scorer, opts.verbose, opts.plainout)
Example #3
def main():
    "Main function"
    optmgr = OptionParser(learners().keys(), SCORERS.keys())
    opts, _ = optmgr.options()
    if  opts.learner_help:
        obj = learners()[opts.learner_help]
        print(obj)
        print(obj.__doc__)
        sys.exit(0)
    ofile = opts.predict
    if  not ofile:
        ofile = "%s.predictions" % opts.learner
    model2run = 'model'
    if  opts.train.find(',') != -1: # list of files
        train_files = opts.train.split(',')
        model2run = 'model_iter'
    elif os.path.isdir(opts.train): # we were given a directory name
        for ext in ['.csv.gz', '.csv']:
            train_files = [f for f in files(opts.train, ext)]
            model2run = 'model_iter'
            if  len(train_files):
                break

#    random.seed(12345)
    if  model2run == 'model_iter':
        model_iter(train_file_list=train_files, newdata_file=opts.newdata,
                idcol=opts.idcol, tcol=opts.target,
                learner=opts.learner, lparams=opts.lparams,
                drops=opts.drops, split=opts.split,
                scaler=opts.scaler, ofile=ofile, seed=opts.seed, verbose=opts.verbose)
    else:
        model(train_file=opts.train, newdata_file=opts.newdata,
                idcol=opts.idcol, tcol=opts.target,
                learner=opts.learner, lparams=opts.lparams,
                drops=opts.drops, split=opts.split,
                scorer=opts.scorer, scaler=opts.scaler, ofile=ofile,
                idx=opts.idx, limit=opts.limit, gsearch=opts.gsearch,
                crossval=opts.cv, seed=opts.seed, verbose=opts.verbose,
                timeout=opts.timeout, proba=opts.proba)
Example #4
explicit_accuracy = cross_val_score(SVC(), digits.data, digits.target == 9,
                                    scoring="accuracy")
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
                          scoring="roc_auc")
print("AUC scoring: {}".format(roc_auc))

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)
# we provide a somewhat bad grid to illustrate the point:
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
# using the default scoring of accuracy:
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("Grid-Search with accuracy")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (accuracy): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))

# using AUC scoring instead:
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)
print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))

from sklearn.metrics.scorer import SCORERS
print("Available scorers:\n{}".format(sorted(SCORERS.keys())))
Example #5
grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
grid_clf_auc.fit(X_train, y_train)
y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) 

print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)


# #### Evaluation metrics supported for model selection

# In[ ]:

from sklearn.metrics.scorer import SCORERS

print(sorted(list(SCORERS.keys())))


# ### Two-feature classification example using the digits dataset

# #### Optimizing a classifier using different evaluation metrics

# In[ ]:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

Example #6
# We can use ROC AUC in ``cross_val_score`` simply by specifying ``scoring="roc_auc"``:

# %%
from sklearn.model_selection import cross_val_score
cross_val_score(SVC(gamma='auto'), X, y, scoring="roc_auc", cv=5)

# %% [markdown]
# Built-In and custom scoring functions
# =======================================

# %% [markdown]
# There are many more scoring methods available, which are useful for different kinds of tasks. You can find them in the ``SCORERS`` dictionary. The online documentation explains all of them.

# %%
from sklearn.metrics.scorer import SCORERS
print(SCORERS.keys())


# %% [markdown]
# It is also possible to define your own scoring metric. Instead of a string, you can provide a callable as the ``scoring`` parameter, that is, an object with a ``__call__`` method or a function.
# It needs to take a model, test-set features ``X_test``, and test-set labels ``y_test``, and return a float. Higher values are taken to mean better models.
#
# Let's reimplement the standard accuracy score:

# %%
import numpy as np

def my_accuracy_scoring(est, X, y):
    # fraction of samples in X that the fitted estimator predicts correctly
    return np.mean(est.predict(X) == y)

cross_val_score(SVC(), X, y, scoring=my_accuracy_scoring)
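
# %% [markdown]
# As a related sketch (an addition, not part of the original notebook):
# ``sklearn.metrics.make_scorer`` wraps a plain ``metric(y_true, y_pred)``
# function into the ``(estimator, X, y)`` callable that ``scoring`` expects.

# %%
from sklearn.metrics import make_scorer

def fraction_correct(y_true, y_pred):
    # plain label-based metric; no estimator involved
    return np.mean(y_true == y_pred)

cross_val_score(SVC(), X, y, scoring=make_scorer(fraction_correct))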

# %%
explicit_accuracy =  cross_val_score(SVC(), digits.data, digits.target == 9,
                                     scoring="accuracy", cv=5)
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc =  cross_val_score(SVC(), digits.data, digits.target == 9,
                           scoring="roc_auc", cv=5)
print("AUC scoring: {}".format(roc_auc))


from sklearn.model_selection import cross_validate
import pandas as pd

res = cross_validate(SVC(), digits.data, digits.target == 9,
                     scoring=["accuracy", "roc_auc", "recall_macro"],
                     return_train_score=True, cv=5)
display(pd.DataFrame(res))

from sklearn.metrics.scorer import SCORERS
print("Available scorers:")
print(sorted(SCORERS.keys()))

Example #8
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print('Grid-Search with accuracy')
print('Best parameter:', grid.best_params_)
print('Best cross-validation score(accuracy):{:.3f}'.format(grid.best_score_))
print('Test set AUC:{:.3f}'.format(
    roc_auc_score(y_test, grid.decision_function(X_test))
))
print('Test set accuracy:{:.3f}'.format(grid.score(X_test, y_test)))
# Grid-Search with accuracy
# Best parameter: {'gamma': 0.0001}
# Best cross-validation score(accuracy):0.976
# Test set AUC:0.992
# Test set accuracy:0.973

# Use AUC scoring instead
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring='roc_auc')
grid.fit(X_train, y_train)
print('\nGrid-Search with AUC')
print('Best parameters:', grid.best_params_)
print('Best cross-validation score(AUC):{:.3f}'.format(grid.best_score_))
print('Test set AUC:{:.3f}'.format(
    roc_auc_score(y_test, grid.decision_function(X_test))
))
print('Test set accuracy:{:.3f}'.format(grid.score(X_test, y_test)))
# Grid-Search with AUC
# Best parameters: {'gamma': 0.01}
# Best cross-validation score(AUC):0.998
# Test set AUC:1.000
# Test set accuracy:1.000

# Show all available values for scoring
print('Available scores:\n{}'.format(sorted(SCORERS.keys())))

Example #9
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
                          scoring="roc_auc")
print("auc score : {}".format(roc_auc))  # the default scoring is accuracy; here "roc_auc" is given as a string

# Grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    random_state=0)
param_grid = {"gamma": [0.0001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("grid search with accuracy scoring")
print("best param : ", grid.best_params_)
print("best cross_val score : {:.3f}".format(grid.best_score_))
print("test set auc : {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("test set score : {:.3f}".format(grid.score(X_test, y_test)))

grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)  # An error occurs here. After searching, it seems y needs to be encoded.
print("grid search with roc_auc scoring")
print("best param : ", grid.best_params_)
print("best cross_val score : {:.3f}".format(grid.best_score_))
print("test set auc : {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("test set score : {:.3f}".format(grid.score(X_test, y_test)))
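
# A possible fix (an addition, not part of the original example): "roc_auc"
# expects a binary or properly encoded target, so recast the problem as
# "digit 9 vs. the rest", as the earlier examples in this file do.
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target == 9,
                                                    random_state=0)
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)
print("best cross_val score (roc_auc, binary target) : {:.3f}".format(grid.best_score_))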

from sklearn.metrics.scorer import SCORERS
print("Available evaluation methods:\n{}".format(sorted(SCORERS.keys())))