def main(): "Main function" optmgr = OptionParser(SCORERS.keys()) opts, _ = optmgr.get_opt() predictions = loader(opts.fpred, opts.fpred_target, opts.fpred_sep) real_values = loader(opts.fin, opts.fin_target, opts.fin_sep) checker(predictions, real_values, opts.scorer, opts.verbose)
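# The loader helper called above is not defined in this snippet. Below is a
# minimal, hypothetical sketch of what it might look like, assuming it reads
# a delimited file and returns the values of the named target column (the
# signature mirrors the call sites above; it is not a known API):
import pandas as pd

def loader(fname, target, sep=','):
    "Hypothetical sketch: read a delimited file and return the target column."
    return pd.read_csv(fname, sep=sep)[target].values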
def model_selection_with_score():
    from sklearn.datasets import load_digits  # images of handwritten digits
    digits = load_digits()
    # print(np.bincount(digits.target))  # [178 182 177 183 181 182 181 179 174 180]
    # print(np.unique(digits.target))  # [0 1 2 3 4 5 6 7 8 9]
    # Split the data into training and test sets so the test set can measure
    # the model's generalization ability.
    from sklearn.model_selection import train_test_split
    y = (digits.target == 9)
    # `np` and `seed` are assumed to be defined at module level
    X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=seed)

    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC
    cross_val = cross_val_score(SVC(gamma='auto'), digits.data, y, scoring='accuracy', cv=5)
    show_title("The default scoring metric for cross-validation is: accuracy")
    print("Accuracy scoring:", cross_val)
    cross_val = cross_val_score(SVC(gamma='auto'), digits.data, y, scoring='roc_auc', cv=5)
    show_subtitle("Cross-validation scoring metric set to: roc_auc")
    print("AUC scoring: ", cross_val)

    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import roc_auc_score
    param_grid = {'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]}
    np.set_printoptions(precision=5, suppress=True, threshold=np.inf, linewidth=200)

    # score with accuracy
    grid_search = GridSearchCV(SVC(gamma='auto'), param_grid=param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)
    show_title(f"Grid-search scoring metric: {grid_search.scoring}")
    print("Best parameters:", grid_search.best_params_)
    print("Best estimator:", grid_search.best_estimator_)
    print("Best cross-validation score (accuracy): {:.5f}".format(grid_search.best_score_))
    print("Test set AUC: {:.5f}".format(roc_auc_score(y_test, grid_search.decision_function(X_test))))
    print("Test set score (accuracy): {:.5f}".format(grid_search.score(X_test, y_test)))
    print("Best estimator's test-set accuracy: {:.5f}".format(grid_search.best_estimator_.score(X_test, y_test)))

    # score with AUC
    grid_search = GridSearchCV(SVC(gamma='auto'), param_grid=param_grid, scoring='roc_auc', cv=5)
    grid_search.fit(X_train, y_train)
    show_subtitle(f"Grid-search scoring metric: {grid_search.scoring}")
    print("Best parameters:", grid_search.best_params_)
    print("Best estimator:", grid_search.best_estimator_)
    print("Best cross-validation score (AUC): {:.5f}".format(grid_search.best_score_))
    print("Test set AUC: {:.5f}".format(roc_auc_score(y_test, grid_search.decision_function(X_test))))
    print("Test set score (AUC): {:.5f}".format(grid_search.score(X_test, y_test)))
    print("Best estimator's test-set accuracy: {:.5f}".format(grid_search.best_estimator_.score(X_test, y_test)))

    from sklearn.metrics.scorer import SCORERS
    show_title("Scoring metrics provided by the library")
    print("Available scorers:")
    print(sorted(SCORERS.keys()))
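# show_title and show_subtitle are assumed helpers not defined in the snippet
# above; a minimal sketch of banner-printing implementations (an assumption,
# any similar formatting would do):
def show_title(msg, width=60):
    "Print a message framed by a heavy rule."
    print('=' * width)
    print(msg)
    print('=' * width)

def show_subtitle(msg, width=60):
    "Print a message under a lighter rule."
    print('-' * width)
    print(msg)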
def main(): "Main function" optmgr = OptionParser(SCORERS.keys()) opts, _ = optmgr.get_opt() predictions = loader(opts.fpred, opts.fpred_target, opts.fpred_sep, opts.threshold) real_values = loader(opts.fin, opts.fin_target, opts.fin_sep, None) probabilities = None if opts.threshold: probabilities = loader(opts.fpred, opts.fpred_target, opts.fpred_sep, None) if len(predictions) != len(real_values): print("Error: input file and prediction file lengths are different: %s vs %s" % (len(predictions), len(real_values))) sys.exit(1) if opts.tiers_break: checker_with_tiers(predictions, real_values, probabilities, opts.fin, opts.scorer, opts.tiers_col, opts.tiers_map, opts.tiers_map_kval, opts.plainout, opts.verbose) else: checker(predictions, real_values, probabilities, opts.scorer, opts.verbose, opts.plainout)
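# A sketch of the binarization the threshold option implies (an assumption:
# the prediction file holds probabilities that are cut at opts.threshold):
import numpy as np

probabilities = np.array([0.10, 0.72, 0.41, 0.93])
threshold = 0.5
predictions = (probabilities >= threshold).astype(int)
print(predictions)  # [0 1 0 1]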
def main(): "Main function" optmgr = OptionParser(learners().keys(), SCORERS.keys()) opts, _ = optmgr.options() if opts.learner_help: obj = learners()[opts.learner_help] print(obj) print(obj.__doc__) sys.exit(0) ofile = opts.predict if not ofile: ofile = "%s.predictions" % opts.learner model2run = 'model' if opts.train.find(',') != -1: # list of files train_files = opts.train.split(',') model2run = 'model_iter' elif os.path.isdir(opts.train): # we got directory name for ext in ['.csv.gz', '.csv']: train_files = [f for f in files(opts.train, ext)] model2run = 'model_iter' if len(train_files): break # random.seed(12345) if model2run == 'model_iter': model_iter(train_file_list=train_files, newdata_file=opts.newdata, idcol=opts.idcol, tcol=opts.target, learner=opts.learner, lparams=opts.lparams, drops=opts.drops, split=opts.split, scaler=opts.scaler, ofile=ofile, seed=opts.seed, verbose=opts.verbose) else: model(train_file=opts.train, newdata_file=opts.newdata, idcol=opts.idcol, tcol=opts.target, learner=opts.learner, lparams=opts.lparams, drops=opts.drops, split=opts.split, scorer=opts.scorer, scaler=opts.scaler, ofile=ofile, idx=opts.idx, limit=opts.limit, gsearch=opts.gsearch, crossval=opts.cv, seed=opts.seed, verbose=opts.verbose, timeout=opts.timeout, proba=opts.proba)
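# The files() helper used in the directory branch is not shown; a
# hypothetical sketch that yields paths ending with a given extension
# (name and signature mirror the call site above, not a known API):
import os

def files(dirname, ext):
    "Hypothetical sketch: yield paths in dirname whose names end with ext."
    for fname in sorted(os.listdir(dirname)):
        if fname.endswith(ext):
            yield os.path.join(dirname, fname)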
grid_clf_auc = GridSearchCV(clf, param_grid=grid_values, scoring='roc_auc')
grid_clf_auc.fit(X_train, y_train)
y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test)

print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)


# #### Evaluation metrics supported for model selection

# In[28]:

from sklearn.metrics.scorer import SCORERS

print(sorted(list(SCORERS.keys())))


# ### Two-feature classification example using the digits dataset
# #### Optimizing a classifier using different evaluation metrics

# In[29]:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
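# A single scorer can also be fetched by name; a self-contained sketch using
# sklearn.metrics.get_scorer (scorer objects are called as scorer(est, X, y)):
from sklearn.datasets import load_digits
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)
auc_scorer = get_scorer('roc_auc')
svm = SVC(gamma='auto').fit(X_train, y_train)
print(auc_scorer(svm, X_test, y_test))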
# Change the scoring method of cross_val_score to roc_auc
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9, scoring="roc_auc")
print("AUC score: {}".format(roc_auc))

# Take the decision_function from the GridSearchCV result and compute roc_auc_score
xtrain, xtest, ytrain, ytest = train_test_split(digits.data, digits.target == 9, random_state=0)
param_grid = {'gamma': [0.0001, 0.001, 0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(xtrain, ytrain)
print("Best parameters:", grid.best_params_)
print("Test set AUC: {:.3f}".format(
    roc_auc_score(ytest, grid.decision_function(xtest))))
print("Test set accuracy: {:.3f}".format(grid.score(xtest, ytest)))

# Change the GridSearchCV selection metric to the roc_auc score
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(xtrain, ytrain)
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (auc): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(grid.score(xtest, ytest)))

# List the available scoring names
from sklearn.metrics.scorer import SCORERS
print("Available scores\n{}".format(sorted(SCORERS.keys())))
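# With scoring="roc_auc", grid.score returns AUC rather than accuracy; a
# self-contained sketch verifying that it matches roc_auc_score computed on
# the decision function:
from sklearn.datasets import load_digits
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

digits = load_digits()
xtr, xte, ytr, yte = train_test_split(digits.data, digits.target == 9,
                                      random_state=0)
grid = GridSearchCV(SVC(), param_grid={'gamma': [0.001, 0.01]},
                    scoring="roc_auc")
grid.fit(xtr, ytr)
print(grid.score(xte, yte))
print(roc_auc_score(yte, grid.decision_function(xte)))  # same value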
print("Explicit accuracy scoring: {}".format(explicit_accuracy)) roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,scoring="roc_auc") print("AUC scoring: {}".format(roc_auc)) X_train, X_test, y_train, y_test = train_test_split( digits.data, digits.target == 9, random_state=0) # we provide a somewhat bad grid to illustrate the point: param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]} # using the default scoring of accuracy: grid = GridSearchCV(SVC(), param_grid=param_grid) grid.fit(X_train, y_train) print("Grid-Search with accuracy") print("Best parameters:", grid.best_params_) print("Best cross-validation score (accuracy)): {:.3f}".format(grid.best_score_)) print("Test set AUC: {:.3f}".format( roc_auc_score(y_test, grid.decision_function(X_test)))) print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test))) # using AUC scoring instead: grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc") grid.fit(X_train, y_train) print("\nGrid-Search with AUC") print("Best parameters:", grid.best_params_) print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_)) print("Test set AUC: {:.3f}".format( roc_auc_score(y_test, grid.decision_function(X_test)))) print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test))) from sklearn.metrics.scorer import SCORERS print("Available scorers:\n{}".format(sorted(SCORERS.keys())))
print 'Grid Search with accuracy'
print 'Best parameters: {}'.format(grid.best_params_)
print 'Best cross-validation score (accuracy): {:.3f}'.format(grid.best_score_)
print 'Test set AUC: {:.3f}'.format(
    roc_auc_score(y_test, grid.decision_function(X_test)))
print 'Test set accuracy: {:.3f}'.format(grid.score(X_test, y_test))
# Grid Search with accuracy
# Best parameters: {'gamma': 0.0001}
# Best cross-validation score (accuracy): 0.970
# Test set AUC: 0.992
# Test set accuracy: 0.973

grid = GridSearchCV(SVC(), param_grid=param_grid, scoring='roc_auc')
grid.fit(X_train, y_train)
print 'Grid Search with AUC'
print 'Best parameters: {}'.format(grid.best_params_)
print 'Best cross-validation score (AUC): {:.3f}'.format(grid.best_score_)
print 'Test set AUC: {:.3f}'.format(
    roc_auc_score(y_test, grid.decision_function(X_test)))
print 'Test set accuracy: {:.3f}'.format(grid.score(X_test, y_test))
# Grid Search with AUC
# Best parameters: {'gamma': 0.01}
# Best cross-validation score (AUC): 0.997
# Test set AUC: 1.000
# Test set accuracy: 1.000

# Here we see that using AUC on imbalanced data led to a better AUC score
# and even a better accuracy score
print 'Available scores:\n{}'.format(sorted(SCORERS.keys()))
# Different scoring metrics available
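# To see why the two metrics pick different gamma values, inspect the
# per-candidate cross-validation means in cv_results_; a self-contained
# sketch (Python 3 syntax, unlike the snippet above):
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
for scoring in ['accuracy', 'roc_auc']:
    grid = GridSearchCV(SVC(), param_grid=param_grid, scoring=scoring, cv=5)
    grid.fit(X_train, y_train)
    print(scoring)
    print(pd.DataFrame(grid.cv_results_)[['param_gamma', 'mean_test_score']])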
# We can simply use this in ``cross_val_score`` by specifying ``scoring="roc_auc"``:

# %%
from sklearn.model_selection import cross_val_score
cross_val_score(SVC(gamma='auto'), X, y, scoring="roc_auc", cv=5)

# %% [markdown]
# Built-In and custom scoring functions
# =======================================

# %% [markdown]
# There are many more scoring methods available, which are useful for different kinds of tasks. You can find them in the ``SCORERS`` dictionary; its documentation explains all of them.

# %%
from sklearn.metrics.scorer import SCORERS
print(SCORERS.keys())

# %% [markdown]
# It is also possible to define your own scoring metric. Instead of a string, you can provide a callable as the ``scoring`` parameter, that is, an object with a ``__call__`` method or a function.
# It needs to take a model, test-set features ``X_test``, and test-set labels ``y_test``, and return a float. Higher floats are taken to mean better models.
#
# Let's reimplement the standard accuracy score:

# %%
def my_accuracy_scoring(est, X, y):
    return np.mean(est.predict(X) == y)

cross_val_score(SVC(), X, y, scoring=my_accuracy_scoring)

# %% [markdown]
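# ``make_scorer`` offers a shortcut for this pattern: it wraps a plain metric
# with the signature ``(y_true, y_pred)`` into the scorer interface, so you
# don't have to handle the estimator yourself. A self-contained sketch using
# the public ``sklearn.metrics.make_scorer`` (the metric function here is our
# own illustration, not a library API):

# %%
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def fraction_correct(y_true, y_pred):
    # plain metric: compare labels, return a float (higher is better)
    return np.mean(y_true == y_pred)

digits = load_digits()
print(cross_val_score(SVC(gamma='auto'), digits.data, digits.target == 9,
                      scoring=make_scorer(fraction_correct), cv=5))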
explicit_accuracy = cross_val_score(SVC(), digits.data, digits.target == 9,
                                    scoring="accuracy", cv=5)
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
                          scoring="roc_auc", cv=5)
print("AUC scoring: {}".format(roc_auc))

res = cross_validate(SVC(), digits.data, digits.target == 9,
                     scoring=["accuracy", "roc_auc", "recall_macro"],
                     return_train_score=True, cv=5)
display(pd.DataFrame(res))

from sklearn.metrics.scorer import SCORERS
print("Available scorers:")
print(sorted(SCORERS.keys()))
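# cross_validate also accepts a dict mapping custom result names to scorer
# strings or callables; a self-contained sketch on the same data (result
# keys become "test_<name>"):
from sklearn.datasets import load_digits
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

digits = load_digits()
res = cross_validate(SVC(), digits.data, digits.target == 9,
                     scoring={"acc": "accuracy",
                              "recall": make_scorer(recall_score)},
                     cv=5)
print(res["test_acc"], res["test_recall"])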
scoring="roc_auc") print("auc index : {}".format(roc_auc)) #기본값이 accuracy, 문자열로 roc_auc 지정 #그리드 서치 from sklearn.model_selection import GridSearchCV from sklearn.metrics import roc_auc_score X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=0) param_grid = {"gamma": [0.0001, 0.01, 0.1, 1, 10]} grid = GridSearchCV(SVC(), param_grid=param_grid) grid.fit(X_train, y_train) print("grid search accuracy index") print("best param : ", grid.best_params_) print("best cross_val score : {:.3f}".format(grid.best_score_)) print("test set auc : {:.3f}".format( roc_auc_score(y_test, grid.decision_function(X_test)))) print("test set score : {:.3f}".format(grid.score(X_test, y_test))) grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc") grid.fit(X_train, y_train) #오류 발생. 검색해보니 y에 인코딩이 필요한듯. print("grid search roc_auc index") print("best param : ", grid.best_params_) print("best cross_val score : {:.3f}".format(grid.best_score_)) print("test set auc : {:.3f}".format( roc_auc_score(y_test, grid.decision_function(X_test)))) print("test set score : {:.3f}".format(grid.score(X_test, y_test))) from sklearn.metrics.scorer import SCORERS print("가능한 평가 방식 : \n{}".format(sorted(SCORERS.keys())))