def __init__(self, shuffle_seed=0):
    self.base_model = MLPClassifier
    self.fixed_params = mlp_gen_fixed
    self.space_x = JointSpace(mlp_gen_cfg)
    bounds = self.space_x.get_bounds()
    self.lb = bounds[:, 0]  # In warped space
    self.ub = bounds[:, 1]  # In warped space
    self.dim = bounds.shape[0]

    metric = "accuracy"
    self.scorer = get_scorer(metric)

    # Now set up the data set
    data, target = datasets.load_digits(return_X_y=True)
    # data = pickle.load(open("cifar10_X.pkl", "rb"))
    # target = pickle.load(open("cifar10_y.pkl", "rb"))
    # data, target = data[:10000, :], target[:10000]

    # Do some validation on the loaded data
    assert isinstance(data, np.ndarray)
    assert isinstance(target, np.ndarray)
    assert data.ndim == 2 and target.ndim == 1
    assert data.shape[0] == target.shape[0]
    assert data.size > 0
    assert data.dtype == np.float_
    assert np.all(np.isfinite(data))  # also catches NaN
    assert target.dtype == np.int_
    assert np.all(np.isfinite(target))  # also catches NaN

    # Always shuffle your data to be safe. Use a fixed seed for reproducibility.
    self.data_X, self.data_y = shuffle(data, target, random_state=shuffle_seed)
def pred(est_type, task, ests, X, y, scoring=None, thresh=None):
    pred_frames = []
    pred_scores = []
    perm_imps = []
    for est in ests:
        if scoring:
            scorer = get_scorer(scoring)
            ps = scorer(est, X, y)
        else:
            ps = est.score(X, y)
        pred_scores.append(round(ps, 3))
        pred_frames.append(
            pd.DataFrame(index=y.index.tolist(),
                         data={'YBOCS_pred': est.predict(X),
                               'YBOCS_target': y}))
        # if ps > thresh:
        #     perm_imp_test(task, est, ps, X, y, 1, scoring)
        # if task is gbl.clf:
        #     if est_type is gbl.linear_:
        #         pred_frames[i].insert(1, 'Confidence', est.decision_function(X))
        #     elif est_type is gbl.non_linear_:
        #         pred_frames[i].insert(1, 'Confidence', est.predict_proba(X))
        # perm_imps.append(perm_imp_test(est=est, base_score=ps, X=X, y=y,
        #                                n_iter=3, scoring=scoring))
    return pred_scores, pred_frames  # , perm_imps
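# A minimal usage sketch for `pred`, assuming two fitted estimators and a
# pandas feature frame. The data here is synthetic and illustrative; in the
# original module X/y come from a clinical dataset, and the `est_type`/`task`
# arguments are only consumed by the commented-out branches, so placeholder
# strings suffice.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_arr, y_arr = make_classification(n_samples=100, n_features=5, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(5)])
y = pd.Series(y_arr)

ests = [LogisticRegression(max_iter=1000).fit(X, y), SVC().fit(X, y)]
scores, frames = pred("linear", "clf", ests, X, y, scoring="accuracy")
print(scores)            # one rounded score per estimator
print(frames[0].head())  # YBOCS_pred vs YBOCS_target per sample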
def __init__(self, estimator, scorer, cv=3):
    # Coerce cv to an int fold count if possible; otherwise keep it as a
    # CV splitter / iterable of splits.
    try:
        cv = int(cv)
    except (TypeError, ValueError):
        pass
    self.__est = estimator
    self.__cv = cv
    self.__scorer = get_scorer(scorer)
def single_split(data, estimator, scoring):
    attrs, classes = utils.horizontal_split(data)
    X_train, X_test, y_train, y_test = train_test_split(attrs, classes,
                                                        test_size=0.4)
    estimator.fit(X_train, y_train)
    scorer = get_scorer(scoring)
    return scorer(estimator, X_test, y_test)
def __init__(self, model, dataset, metric, shuffle_seed=0, data_root=None):
    """Build class that wraps a sklearn classifier/regressor CV score for use
    as an objective function.

    Parameters
    ----------
    model : str
        Which classifier to use; must be a key in the `MODELS_CLF` or
        `MODELS_REG` dict, depending on whether the dataset is classification
        or regression.
    dataset : str
        Which data set to use; must be a key in the `DATA_LOADERS` dict, or
        the name of a custom csv file.
    metric : str
        Which sklearn scoring metric to use, in the `SCORERS_CLF` list or
        `SCORERS_REG` dict, depending on whether the dataset is classification
        or regression.
    shuffle_seed : int
        Random seed to use when splitting the data into train and validation
        in the cross-validation splits. This is needed in order to keep the
        split constant across calls. Otherwise, there would be extra noise in
        the objective function for varying splits.
    data_root : str
        Root directory to look for all custom csv files.
    """
    TestFunction.__init__(self)

    data, target, problem_type = load_data(dataset, data_root=data_root)
    assert problem_type in (ProblemType.clf, ProblemType.reg)
    self.is_classifier = problem_type == ProblemType.clf

    # Do some validation on the loaded data
    assert isinstance(data, np.ndarray)
    assert isinstance(target, np.ndarray)
    assert data.ndim == 2 and target.ndim == 1
    assert data.shape[0] == target.shape[0]
    assert data.size > 0
    assert data.dtype == np.float_
    assert np.all(np.isfinite(data))  # also catches NaN
    assert target.dtype == (np.int_ if self.is_classifier else np.float_)
    assert np.all(np.isfinite(target))  # also catches NaN

    model_lookup = MODELS_CLF if self.is_classifier else MODELS_REG
    base_model, fixed_params, api_config = model_lookup[model]

    # New members for the model
    self.base_model = base_model
    self.fixed_params = fixed_params
    self.api_config = api_config

    # Always shuffle your data to be safe. Use a fixed seed for reproducibility.
    self.data_X, self.data_y = shuffle(data, target, random_state=shuffle_seed)

    assert metric in METRICS, "Unknown metric %s" % metric
    assert metric in METRICS_LOOKUP[problem_type], \
        "Incompatible metric %s with problem type %s" % (metric, problem_type)
    self.scorer = get_scorer(SklearnModel._METRIC_MAP[metric])
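# A hedged instantiation sketch: the reference to `SklearnModel._METRIC_MAP`
# suggests this __init__ belongs to a class named SklearnModel. The key names
# below ("DT", "digits", "acc") are assumptions about the MODELS_CLF,
# DATA_LOADERS, and METRICS registries, not confirmed by the snippet.
f = SklearnModel(model="DT", dataset="digits", metric="acc", shuffle_seed=0)
# The resulting object is then evaluated as an objective on hyperparameter
# dicts drawn from f.api_config, e.g. inside a Bayesian-optimization loop.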
def __init__(self, clf, metric, cv, n_jobs=1, verbose=0,
             pre_dispatch='2*n_jobs'):
    # Coerce cv to an int fold count if possible; otherwise keep the
    # CV splitter object as-is.
    try:
        cv = int(cv)
    except (TypeError, ValueError):
        pass
    self.clf = clf
    self.metric = get_scorer(metric)
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.pre_dispatch = pre_dispatch
def __init__(self, clf, metric, cv, n_jobs=1, verbose=0,
             pre_dispatch='2*n_jobs'):
    # Coerce cv to an int fold count if possible; otherwise keep the
    # CV splitter object as-is.
    try:
        cv = int(cv)
    except (TypeError, ValueError):
        pass
    self.clf = clf
    self.metric = get_scorer(metric[0])  # only the first metric in the list is used
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.pre_dispatch = pre_dispatch
def fit_score(self, X, y):
    if isinstance(self.__cv, int):
        cross_valid = KFold(n_splits=self.__cv).split(X)
    else:
        cross_valid = self.__cv
    scorer = self.__scorer
    weight = self.__weight
    scores = []
    for train_index, test_index in cross_valid:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        est = clone(self.__est)
        est.fit(X_train, y_train)
        # Weighted sum of the individual scorers on this fold
        k_score = 0
        for i in range(len(scorer)):
            k_score += get_scorer(scorer[i])(est, X_test, y_test) * weight[i]
        scores.append(k_score)
    return np.mean(scores), np.std(scores)
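# The same weighted multi-scorer CV idea as a standalone sketch, assuming
# parallel sequences of sklearn scorer names and weights (here `__scorer` and
# `__weight` become explicit arguments; the data is synthetic):
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import KFold

def weighted_cv_score(est, X, y, scorers=("accuracy", "f1"),
                      weights=(0.5, 0.5), cv=3):
    """Mean/std of a weighted combination of sklearn scorers over KFold CV."""
    scores = []
    for train_idx, test_idx in KFold(n_splits=cv).split(X):
        model = clone(est).fit(X[train_idx], y[train_idx])
        fold = sum(get_scorer(name)(model, X[test_idx], y[test_idx]) * w
                   for name, w in zip(scorers, weights))
        scores.append(fold)
    return np.mean(scores), np.std(scores)

X, y = make_classification(n_samples=120, random_state=0)
print(weighted_cv_score(LogisticRegression(max_iter=1000), X, y))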
def perm_imp_test(task, est, base_score, X, y, n_iter=1, scoring=None):
    feats = [c for c in X.columns.tolist() if c not in gbl.clin_demog_feats]
    for f in feats:
        X_col = deepcopy(X.loc[:, f])
        score_diff = 0.0
        for _ in np.arange(n_iter):
            X.loc[:, f] = np.random.permutation(X.loc[:, f])
            if scoring:
                scorer = get_scorer(scoring)
                score_diff += base_score - scorer(est, X, y)
            else:
                score_diff += base_score - est.score(X, y)
        X.loc[:, f] = X_col
        if task is gbl.clf:
            gbl.fpi_clf.setdefault(f, []).append(score_diff / n_iter)
        elif task is gbl.reg:
            gbl.fpi_reg.setdefault(f, []).append(score_diff / n_iter)
    return
def compute_feat_perm_imp(est, base_score, X, y, fpis_dict, n_iter=3,
                          scoring=None):
    feats = [c for c in X.columns.tolist() if c not in gbl.clin_demog_feats]
    for f in feats:
        X_col = deepcopy(X.loc[:, f])
        score_diff = 0.0
        for _ in np.arange(n_iter):
            X.loc[:, f] = np.random.permutation(X.loc[:, f])
            if scoring:
                scorer = get_scorer(scoring)
                score_diff += base_score - scorer(est, X, y)
            else:
                score_diff += base_score - est.score(X, y)
        X.loc[:, f] = X_col
        fpis_dict.setdefault(f, []).append(score_diff / n_iter)
    return fpis_dict
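# Both functions above implement permutation feature importance: shuffle one
# column, re-score, and record the drop relative to the baseline score. A
# self-contained sketch of that idea (all names here are illustrative; for
# production use, sklearn.inspection.permutation_importance does the same job):
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer

def permutation_importance_sketch(est, X, y, scoring="accuracy", n_iter=3,
                                  seed=None):
    """Mean score drop per feature when that column is shuffled."""
    rng = np.random.default_rng(seed)
    scorer = get_scorer(scoring)
    base = scorer(est, X, y)
    drops = {}
    for col in X.columns:
        saved = X[col].copy()
        diff = 0.0
        for _ in range(n_iter):
            X[col] = rng.permutation(X[col].to_numpy())
            diff += base - scorer(est, X, y)
        X[col] = saved  # restore the original column
        drops[col] = diff / n_iter
    return drops

X_arr, y = make_classification(n_samples=150, n_features=4, random_state=0)
X = pd.DataFrame(X_arr, columns=list("abcd"))
est = LogisticRegression(max_iter=1000).fit(X, y)
print(permutation_importance_sketch(est, X, y, seed=0))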
# lin_svc = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#                   decision_function_shape=None, degree=3, gamma='auto',
#                   kernel='linear', max_iter=-1, probability=False,
#                   random_state=None, shrinking=True, tol=0.001, verbose=False)
rbf_svc = svm.SVC(C=1.0, kernel='rbf', gamma=0.7)
# poly_svc = svm.SVC(C=1.0, kernel='poly', degree=3)
# palette = itertools.cycle(seaborn.color_palette(n_colors=10))

scores_lin = []
scores_rbf = []
scores_poly = []
lin_roc_auc_scorer = []
rbf_roc_auc_scorer = []
poly_roc_auc_scorer = []
roc_auc_scorer = get_scorer("roc_auc")

for C in C_2d_range:
    for gamma in gamma_2d_range:
        rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma)
        rbf_roc_auc_scorer = []
        # The original used the pre-0.18 API KFold(n=len(X), n_folds=10,
        # random_state=42); the modern equivalent below shuffles explicitly
        # so the seed actually takes effect.
        for train, test in KFold(n_splits=10, shuffle=True,
                                 random_state=42).split(X):
            X_train, y_train = X[train], y[train]
            X_test, y_test = X[test], y[test]
            # lin_clf = lin_svc.fit(X_train, y_train)
            rbf_clf = rbf_svc.fit(X_train, y_train)
            # poly_clf = poly_svc.fit(X_train, y_train)
            # scores_lin.append(zero_one_loss(y_test, lin_clf.predict(X_test)))
            scores_rbf.append(zero_one_loss(y_test, rbf_clf.predict(X_test)))
            # scores_poly.append(zero_one_loss(y_test, poly_clf.predict(X_test)))
            # lin_roc_auc_scorer.append(roc_auc_scorer(lin_clf, X_test, y_test))
            rbf_roc_auc_scorer.append(roc_auc_scorer(rbf_clf, X_test, y_test))
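# The same C/gamma sweep can be written more compactly with the modern sklearn
# API; a sketch assuming X, y, C_2d_range, and gamma_2d_range are defined as
# above (ROC AUC is computed from SVC's decision_function):
from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {"C": C_2d_range, "gamma": gamma_2d_range}
search = GridSearchCV(svm.SVC(kernel="rbf"), param_grid,
                      scoring="roc_auc", cv=10)
search.fit(X, y)
print(search.best_params_, search.best_score_)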
def get_scoring():
    scoring = {}
    scoring_proba = {}
    scores_names = [
        'accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
        'average_precision', 'completeness_score', 'f1', 'f1_macro',
        'f1_micro', 'f1_weighted', 'fowlkes_mallows_score',
        'homogeneity_score', 'mutual_info_score', 'neg_log_loss',
        'normalized_mutual_info_score', 'precision', 'precision_macro',
        'precision_micro', 'precision_weighted', 'recall', 'recall_macro',
        'recall_micro', 'recall_weighted', 'roc_auc', 'v_measure_score'
    ]
    metrics_functions = [
        metrics.cohen_kappa_score, metrics.hinge_loss,
        metrics.matthews_corrcoef, metrics.accuracy_score, metrics.f1_score,
        metrics.hamming_loss, metrics.log_loss, metrics.precision_score,
        metrics.recall_score, metrics.zero_one_loss,
        metrics.average_precision_score, metrics.roc_auc_score
    ]
    pr_auc_scorer = metrics.make_scorer(pr_auc_score, greater_is_better=True,
                                        needs_proba=True)
    scoring = {x: get_scorer(x) for x in scores_names}
    scoring.update({x.__name__: metrics.make_scorer(x)
                    for x in metrics_functions})
    scoring["pr_auc"] = pr_auc_scorer
    scoring.update({
        'tp': metrics.make_scorer(tp),
        'tn': metrics.make_scorer(tn),
        'fp': metrics.make_scorer(fp),
        'fn': metrics.make_scorer(fn)
    })
    scoring.update({
        "cost_{0}_{1}".format(*x): metrics.make_scorer(cost, fp_cost=x[0],
                                                       fn_cost=x[1])
        for x in product(range(1, 4), range(1, 4))
    })
    scoring.update({
        "mse_cost_{0}_{1}".format(*x): metrics.make_scorer(
            mse_cost, fp_cost=x[0], fn_cost=x[1], needs_proba=True)
        for x in product(range(1, 4), range(1, 4))
    })
    scoring.update({
        "mse1_cost_{0}_{1}".format(*x): metrics.make_scorer(
            mse_cost1, fp_cost=x[0], fn_cost=x[1], needs_proba=True)
        for x in product(range(1, 4), range(1, 4))
    })
    scoring["mse"] = metrics.make_scorer(mse, needs_proba=True)
    scoring["mse1"] = metrics.make_scorer(mse1, needs_proba=True)
    return scoring, scoring_proba
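# A typical way to consume such a scorer dictionary is cross_validate's
# multi-metric `scoring` parameter; a sketch with a trimmed dict so the
# example stays self-contained:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=200, random_state=0)
scoring = {name: get_scorer(name) for name in ("accuracy", "f1", "roc_auc")}
results = cross_validate(LogisticRegression(max_iter=1000), X, y,
                         scoring=scoring, cv=5)
print(results["test_accuracy"].mean(), results["test_roc_auc"].mean())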
def check_score_is_finite(scoring, estimator, input_data, labels):
    estimator = clone(estimator)
    assert np.isfinite(cross_val_score(estimator, input_data, labels,
                                       scoring=scoring)).all()
    estimator.fit(input_data, labels)
    assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels))
def _get_score_byname(scoring):
    from sklearn.metrics.scorer import get_scorer
    from sklearn.metrics import SCORERS
    # TODO: the metrics below do not map directly to sklearn:
    #   Classification: weighted_accuracy, accuracy_table, balanced_accuracy,
    #                   matthews_correlation, norm_macro_recall
    #   Regression, Time Series Forecasting: spearman_correlation,
    #                   normalized_root_mean_squared_error,
    #                   normalized_mean_absolute_error
    scorer = None
    if scoring.startswith("AUC"):
        scorer = get_scorer("roc_auc")
        average = scoring.split("_")[-1]
        scorer._kwargs['average'] = average
    elif scoring.startswith("log_loss"):
        scorer = get_scorer("neg_log_loss")
    # elif scoring.startswith("matthews_correlation"):
    #     scorer = get_scorer("matthews_corrcoef")
    elif scoring.startswith("precision_score"):
        scorer = get_scorer("precision")
        average = scoring.split("_")[-1]
        scorer._kwargs['average'] = average
    elif scoring.startswith("average_precision_score"):
        scorer = get_scorer("average_precision")
        average = scoring.split("_")[-1]
        scorer._kwargs['average'] = average
    elif scoring.startswith("recall_score"):
        scorer = get_scorer("recall")
        average = scoring.split("_")[-1]
        scorer._kwargs['average'] = average
    elif scoring.startswith("norm_macro_recall"):
        scorer = get_scorer("recall")
        scorer._kwargs['average'] = "macro"
    elif scoring.startswith("f1_score"):
        scorer = get_scorer("f1")
        average = scoring.split("_")[-1]
        scorer._kwargs['average'] = average
    # (A second, unreachable duplicate of the precision_score branch was
    # removed here.)
    elif scoring.startswith("spearman_correlation"):
        scorer = get_scorer("r2")
    elif scoring.startswith("r2_score"):
        scorer = get_scorer("r2")
    elif "mean_absolute_error" in scoring:
        scorer = get_scorer("mean_absolute_error")
    elif "root_mean_squared" in scoring:
        scorer = get_scorer("mean_squared_error")
    elif "median_absolute_error" in scoring:
        scorer = get_scorer("median_absolute_error")
    if scorer is None:
        scorer = get_scorer(scoring)
    return scorer
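# Patching `scorer._kwargs` works but relies on a private attribute. A more
# robust equivalent builds the scorer through the public API; a sketch for the
# macro-averaged F1 case:
from sklearn.metrics import f1_score, make_scorer

f1_macro_scorer = make_scorer(f1_score, average="macro")
# Or simply get_scorer("f1_macro"), since sklearn registers the averaged
# variants under their own names.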
def cv_fit_xgb_model(
    model,
    X_train,
    y_train,
    X_valid,
    y_valid,
    cv_nfold=5,
    early_stopping_rounds=50,
    missing=np.nan,
    eval_metric="auc",
    scoring=None,
    verbose=True,
):
    """Fit an xgb model with the best n_estimators using xgb's builtin cv.

    Note: this function changes the model's `n_estimators` attribute.

    Parameters
    ----------
    model : xgb model object
    X_train : pandas.DataFrame
        Training features data
    y_train : pandas.Series
        Training target data
    X_valid, y_valid : same as X_train, y_train, but used for validation
    cv_nfold : int
        Number of folds in CV
    early_stopping_rounds : int
        Activates early stopping. CV error needs to decrease at least every
        <early_stopping_rounds> round(s) to continue. The last entry in the
        evaluation history is the one from the best iteration.
    missing : float
        Value in the data to be treated as missing.
    eval_metric : str
        The metric used for validation data while training xgb.
        Should probably match `scoring`.
    scoring : str, callable or None, default=None
        See the `scoring` parameter description for
        sklearn.grid_search.GridSearchCV
    verbose : bool
        Print scoring summary to stdout

    Returns
    -------
    best_n_estimators : int
        Number of optimal estimators, i.e. boosting rounds
    train_score : float
        Performance of the best model on the training set
    valid_score : float
        Performance of the best model on the validation set

    Example
    -------
    model = xgb.XGBRegressor(
        learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1.0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, max_delta_step=0,
        objective='binary:logistic', nthread=4, seed=5
    )
    n_estimators, train_score, valid_score = cv_fit_xgb_model(
        model, X_train, y_train, X_valid, y_valid,
        cv_nfold=5, early_stopping_rounds=50, scoring='roc_auc', verbose=True
    )
    """
    # Find the best number of boosting rounds with xgb's builtin CV
    xgb_param = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values, missing=missing)
    cv_result = xgb.cv(
        xgb_param,
        dtrain,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=cv_nfold,
        metrics=[eval_metric],
        early_stopping_rounds=early_stopping_rounds,
        show_progress=False,  # renamed verbose_eval in later xgboost versions
    )
    best_n_estimators = cv_result.shape[0]
    model.set_params(n_estimators=best_n_estimators)

    # Refit the model with the best number of estimators
    model.fit(X_train, y_train, eval_metric=eval_metric)

    scorer = get_scorer(scoring)
    # Score training and validation data
    train_score = scorer(model, X_train, y_train)
    valid_score = scorer(model, X_valid, y_valid)

    # Print model report
    if verbose:
        print("\nModel Report")
        print("best n_estimators: {}".format(best_n_estimators))
        print("Score (Train): %f" % train_score)
        print("Score (Validation): %f" % valid_score)

    return best_n_estimators, train_score, valid_score
def scoring(self, value):
    # Setter (presumably decorated with @scoring.setter on the owning class):
    # keeps the scorer object in sync with the metric name.
    self._scoring = value
    self.scorer = scorer.get_scorer(value)
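# A minimal sketch of the property pattern this setter belongs to, assuming a
# class that keeps an sklearn scorer in sync with its metric name (the class
# and attribute names here are illustrative):
from sklearn.metrics import get_scorer

class Evaluator:
    def __init__(self, scoring="accuracy"):
        self.scoring = scoring  # routed through the setter below

    @property
    def scoring(self):
        return self._scoring

    @scoring.setter
    def scoring(self, value):
        self._scoring = value
        self.scorer = get_scorer(value)

ev = Evaluator("f1")
print(ev.scoring, ev.scorer)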