Example #1
def main():
    "Main function"
    optmgr  = OptionParser(SCORERS.keys())
    opts, _ = optmgr.get_opt()

    predictions = loader(opts.fpred, opts.fpred_target, opts.fpred_sep)
    real_values = loader(opts.fin, opts.fin_target, opts.fin_sep)
    checker(predictions, real_values, opts.scorer, opts.verbose)
Example #2
def model_selection_with_score():
    from sklearn.datasets import load_digits

    # digit images
    digits = load_digits()
    # print(np.bincount(digits.target))
    # [178 182 177 183 181 182 181 179 174 180]
    # print(np.unique(digits.target))
    # [0 1 2 3 4 5 6 7 8 9]

    # Split the data into training and test sets so the test set can measure the model's generalization ability.
    from sklearn.model_selection import train_test_split
    y = (digits.target == 9)
    X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=seed)

    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC
    cross_val = cross_val_score(SVC(gamma='auto'), digits.data, y, scoring='accuracy', cv=5)
    show_title("Default cross-validation scoring metric: accuracy")
    print("Accuracy scoring:", cross_val)
    cross_val = cross_val_score(SVC(gamma='auto'), digits.data, y, scoring='roc_auc', cv=5)
    show_subtitle("Cross-validation scoring metric set to: roc_auc")
    print("AUC scoring: ", cross_val)

    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import roc_auc_score
    param_grid = {'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]}

    np.set_printoptions(precision=5, suppress=True, threshold=np.inf, linewidth=200)

    # use accuracy scoring
    grid_search = GridSearchCV(SVC(gamma='auto'), param_grid=param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)
    show_title(f"Grid search scoring metric: {grid_search.scoring}")
    print("Best parameters:", grid_search.best_params_)
    print("Best estimator:", grid_search.best_estimator_)
    print("Best cross-validation score (accuracy): {:.5f}".format(grid_search.best_score_))
    print("Test set AUC: {:.5f}".format(roc_auc_score(y_test, grid_search.decision_function(X_test))))
    print("Test set score (accuracy): {:.5f}".format(grid_search.score(X_test, y_test)))
    print("Best estimator's accuracy on the test set: {:.5f}".format(grid_search.best_estimator_.score(X_test, y_test)))

    # use AUC scoring
    grid_search = GridSearchCV(SVC(gamma='auto'), param_grid=param_grid, scoring='roc_auc', cv=5)
    grid_search.fit(X_train, y_train)
    show_subtitle(f"Grid search scoring metric: {grid_search.scoring}")
    print("Best parameters:", grid_search.best_params_)
    print("Best estimator:", grid_search.best_estimator_)
    print("Best cross-validation score (AUC): {:.5f}".format(grid_search.best_score_))
    print("Test set AUC: {:.5f}".format(roc_auc_score(y_test, grid_search.decision_function(X_test))))
    print("Test set score (AUC): {:.5f}".format(grid_search.score(X_test, y_test)))
    print("Best estimator's accuracy on the test set: {:.5f}".format(grid_search.best_estimator_.score(X_test, y_test)))

    from sklearn.metrics.scorer import SCORERS
    show_title("Built-in scorers provided by scikit-learn")
    print("Available scorers:")
    print(sorted(SCORERS.keys()))
Example #3
def main():
    "Main function"
    optmgr  = OptionParser(SCORERS.keys())
    opts, _ = optmgr.get_opt()
    predictions = loader(opts.fpred, opts.fpred_target, opts.fpred_sep, opts.threshold)
    real_values = loader(opts.fin, opts.fin_target, opts.fin_sep, None)
    probabilities = None
    if  opts.threshold:
        probabilities = loader(opts.fpred, opts.fpred_target, opts.fpred_sep, None)
    if  len(predictions) != len(real_values):
        print("Error: input file and prediction file lengths are different: %s vs %s" % (len(predictions), len(real_values)))
        sys.exit(1)
    if  opts.tiers_break:
        checker_with_tiers(predictions, real_values, probabilities, opts.fin, opts.scorer, opts.tiers_col, opts.tiers_map, opts.tiers_map_kval, opts.plainout, opts.verbose)
    else:
        checker(predictions, real_values, probabilities, opts.scorer, opts.verbose, opts.plainout)
Example #4
def main():
    "Main function"
    optmgr = OptionParser(learners().keys(), SCORERS.keys())
    opts, _ = optmgr.options()
    if  opts.learner_help:
        obj = learners()[opts.learner_help]
        print(obj)
        print(obj.__doc__)
        sys.exit(0)
    ofile = opts.predict
    if  not ofile:
        ofile = "%s.predictions" % opts.learner
    model2run = 'model'
    if  opts.train.find(',') != -1: # list of files
        train_files = opts.train.split(',')
        model2run = 'model_iter'
    elif os.path.isdir(opts.train): # we got directory name
        for ext in ['.csv.gz', '.csv']:
            train_files = [f for f in files(opts.train, ext)]
            model2run = 'model_iter'
            if  len(train_files):
                break

#    random.seed(12345)
    if  model2run == 'model_iter':
        model_iter(train_file_list=train_files, newdata_file=opts.newdata,
                idcol=opts.idcol, tcol=opts.target,
                learner=opts.learner, lparams=opts.lparams,
                drops=opts.drops, split=opts.split,
                scaler=opts.scaler, ofile=ofile, seed=opts.seed, verbose=opts.verbose)
    else:
        model(train_file=opts.train, newdata_file=opts.newdata,
                idcol=opts.idcol, tcol=opts.target,
                learner=opts.learner, lparams=opts.lparams,
                drops=opts.drops, split=opts.split,
                scorer=opts.scorer, scaler=opts.scaler, ofile=ofile,
                idx=opts.idx, limit=opts.limit, gsearch=opts.gsearch,
                crossval=opts.cv, seed=opts.seed, verbose=opts.verbose,
                timeout=opts.timeout, proba=opts.proba)
Example #5
grid_clf_auc = GridSearchCV(clf, param_grid=grid_values, scoring='roc_auc')
grid_clf_auc.fit(X_train, y_train)
y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) 

print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)


# #### Evaluation metrics supported for model selection

# In[28]:

from sklearn.metrics.scorer import SCORERS

print(sorted(list(SCORERS.keys())))


# ### Two-feature classification example using the digits dataset

# #### Optimizing a classifier using different evaluation metrics

# In[29]:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
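
# The sketch below is a minimal stand-in for this section (assumptions: two arbitrary pixel
# columns of the digits data and a binary "digit 9 vs rest" target); it shows how the choice
# of ``scoring`` changes the gamma that grid search selects.

dataset = load_digits()
X_two_features = dataset.data[:, [20, 59]]   # two arbitrary pixel features (assumption)
y_binary = dataset.target == 9               # binary target: digit 9 vs the rest
X_tr, X_te, y_tr, y_te = train_test_split(X_two_features, y_binary, random_state=0)

grid_values = {'gamma': [0.01, 0.1, 1, 10]}
for eval_metric in ('accuracy', 'roc_auc', 'recall'):
    grid_clf = GridSearchCV(SVC(kernel='rbf'), param_grid=grid_values, scoring=eval_metric)
    grid_clf.fit(X_tr, y_tr)
    print('Scoring:', eval_metric,
          '| best gamma:', grid_clf.best_params_,
          '| best CV score: {:.3f}'.format(grid_clf.best_score_))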

Example #6
# change the cross_val_score scoring method to roc_auc
roc_auc = cross_val_score(SVC(),
                          digits.data,
                          digits.target == 9,
                          scoring="roc_auc")
print("AUC score: {}".format(roc_auc))

# take the decision_function from the GridSearchCV result and compute roc_auc_score
xtrain, xtest, ytrain, ytest = train_test_split(digits.data,
                                                digits.target == 9,
                                                random_state=0)
param_grid = {'gamma': [0.0001, 0.001, 0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(xtrain, ytrain)
print("Best parameters:", grid.best_params_)
print("Test set AUC: {:.3f}".format(
    roc_auc_score(ytest, grid.decision_function(xtest))))
print("Test set accuracy: {:.3f}".format(grid.score(xtest, ytest)))

# change the GridSearchCV scoring metric to roc_auc
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(xtrain, ytrain)
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (auc):{:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(grid.score(xtest, ytest)))

# list the available scorers
from sklearn.metrics.scorer import SCORERS

print("Available scores \n{}".format(sorted(SCORERS.keys())))

explicit_accuracy = cross_val_score(SVC(), digits.data, digits.target == 9,
                                    scoring="accuracy")
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9, scoring="roc_auc")
print("AUC scoring: {}".format(roc_auc))

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)
# we provide a somewhat bad grid to illustrate the point:
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
# using the default scoring of accuracy:
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("Grid-Search with accuracy")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (accuracy): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))

# using AUC scoring instead:
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)
print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))

from sklearn.metrics.scorer import SCORERS
print("Available scorers:\n{}".format(sorted(SCORERS.keys())))
print 'Grid Search with accuracy'
print 'Best parameters: {}'.format(grid.best_params_)
print 'Best cross-validation score (accuracy): {:.3f}'.format(grid.best_score_)
print 'Test set AUC: {:.3f}'.format(
    roc_auc_score(y_test, grid.decision_function(X_test)))
print 'Test set accuracy: {:.3f}'.format(grid.score(X_test, y_test))
# Grid Search with accuracy
# Best parameters: {'gamma': 0.0001}
# Best cross-validation score (accuracy): 0.970
# Test set AUC: 0.992
# Test set accuracy: 0.973

grid = GridSearchCV(SVC(), param_grid=param_grid, scoring='roc_auc')
grid.fit(X_train, y_train)
print 'Grid Search with AUC'
print 'Best parameters: {}'.format(grid.best_params_)
print 'Best cross-validation score (AUC): {:.3f}'.format(grid.best_score_)
print 'Test set AUC: {:.3f}'.format(
    roc_auc_score(y_test, grid.decision_function(X_test)))
print 'Test set accuracy: {:.3f}'.format(grid.score(X_test, y_test))
# Grid Search with AUC
# Best parameters: {'gamma': 0.01}
# Best cross-validation score (AUC): 0.997
# Test set AUC: 1.000
# Test set accuracy: 1.000
# Here we see that using AUC scoring on this imbalanced data led to a better AUC score
# and even a better accuracy score

print 'Available scores:\n{}'.format(sorted(SCORERS.keys()))
# Different scoring metrics available
# We can simply use this in ``cross_val_score`` by specifying ``scoring="roc_auc"``:

# %%
from sklearn.model_selection import cross_val_score
cross_val_score(SVC(gamma='auto'), X, y, scoring="roc_auc", cv=5)

# %% [markdown]
# Built-In and custom scoring functions
# =======================================

# %% [markdown]
# There are many more scoring methods available, which are useful for different kinds of tasks. You can find them in the ``SCORERS`` dictionary. The online documentation explains all of them.

# %%
from sklearn.metrics.scorer import SCORERS
print(SCORERS.keys())
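
# %% [markdown]
# Side note (an assumption about newer scikit-learn, not from the original text): the
# ``sklearn.metrics.scorer`` module used above was removed in later releases. From
# scikit-learn 1.0 on, ``get_scorer_names`` lists the built-in scorers and ``get_scorer``
# fetches one by name.

# %%
from sklearn.metrics import get_scorer, get_scorer_names  # scikit-learn >= 1.0

print(sorted(get_scorer_names()))  # same names as SCORERS.keys()
print(get_scorer("roc_auc"))       # the scorer object behind the "roc_auc" string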


# %% [markdown]
# It is also possible to define your own scoring metric. Instead of a string, you can provide a callable as the ``scoring`` parameter, that is, an object with a ``__call__`` method or a function.
# It needs to take a model, the test-set features ``X_test``, and the test-set labels ``y_test``, and return a float. Higher values are taken to mean better models.
#
# Let's reimplement the standard accuracy score:

# %%
import numpy as np

def my_accuracy_scoring(est, X, y):
    return np.mean(est.predict(X) == y)

cross_val_score(SVC(), X, y, scoring=my_accuracy_scoring)
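
# %% [markdown]
# A small additional sketch (not part of the original example): ``make_scorer`` from
# ``sklearn.metrics`` builds such a callable from a plain metric function, here
# ``fbeta_score`` with ``beta=2``:

# %%
from sklearn.metrics import fbeta_score, make_scorer

ftwo_scorer = make_scorer(fbeta_score, beta=2)  # wraps metric(y_true, y_pred) into scorer(est, X, y)
cross_val_score(SVC(), X, y, scoring=ftwo_scorer)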

# %%
explicit_accuracy =  cross_val_score(SVC(), digits.data, digits.target == 9,
                                     scoring="accuracy", cv=5)
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc =  cross_val_score(SVC(), digits.data, digits.target == 9,
                           scoring="roc_auc", cv=5)
print("AUC scoring: {}".format(roc_auc))


from sklearn.model_selection import cross_validate
import pandas as pd

res = cross_validate(SVC(), digits.data, digits.target == 9,
                     scoring=["accuracy", "roc_auc", "recall_macro"],
                     return_train_score=True, cv=5)
display(pd.DataFrame(res))  # display() comes from IPython / notebook environments

from sklearn.metrics.scorer import SCORERS
print("Available scorers:")
print(sorted(SCORERS.keys()))

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC

digits = load_digits()

roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
                          scoring="roc_auc")
print("AUC score : {}".format(roc_auc))  # default scorer is accuracy; pass the string "roc_auc" to change it

# grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    random_state=0)
param_grid = {"gamma": [0.0001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("Grid search with accuracy scoring")
print("best param : ", grid.best_params_)
print("best cross_val score : {:.3f}".format(grid.best_score_))
print("test set auc : {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("test set score : {:.3f}".format(grid.score(X_test, y_test)))

grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)  # raises an error: it seems y needs to be encoded (the "roc_auc" scorer expects a binary target here; see the sketch below)
print("Grid search with roc_auc scoring")
print("best param : ", grid.best_params_)
print("best cross_val score : {:.3f}".format(grid.best_score_))
print("test set auc : {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("test set score : {:.3f}".format(grid.score(X_test, y_test)))
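
# A minimal sketch (an addition, not from the example above) of two ways to make the
# roc_auc grid search work: use a binary target, or keep the multiclass target and use a
# one-vs-rest AUC scorer ("roc_auc_ovr", scikit-learn >= 0.22, needs probability estimates).
y_binary = digits.target == 9
Xb_train, Xb_test, yb_train, yb_test = train_test_split(digits.data, y_binary,
                                                        random_state=0)
grid_bin = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid_bin.fit(Xb_train, yb_train)  # binary task: roc_auc scoring works
print("binary roc_auc best score : {:.3f}".format(grid_bin.best_score_))

grid_ovr = GridSearchCV(SVC(probability=True), param_grid=param_grid,
                        scoring="roc_auc_ovr")
grid_ovr.fit(X_train, y_train)  # probability=True enables predict_proba, which roc_auc_ovr needs
print("multiclass roc_auc_ovr best score : {:.3f}".format(grid_ovr.best_score_))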

from sklearn.metrics.scorer import SCORERS
print("Available scorers:\n{}".format(sorted(SCORERS.keys())))