Code Example #1
File: mlp.py  Project: JuanUngredda/Input-Uncertainty
    def __init__(self, shuffle_seed=0):
        self.base_model = MLPClassifier
        self.fixed_params = mlp_gen_fixed

        self.space_x = JointSpace(mlp_gen_cfg)
        bounds = self.space_x.get_bounds()

        self.lb = bounds[:, 0]  # In warped space
        self.ub = bounds[:, 1]  # In warped space
        self.dim = bounds.shape[0]

        metric = "accuracy"
        self.scorer = get_scorer(metric)

        # Now setup data set
        data, target = datasets.load_digits(return_X_y=True)
        # data = pickle.load(open("cifar10_X.pkl", "rb"))
        # target = pickle.load(open("cifar10_y.pkl", "rb"))
        # data, target = data[:10000, :], target[:10000]

        # Do some validation on loaded data
        assert isinstance(data, np.ndarray)
        assert isinstance(target, np.ndarray)
        assert data.ndim == 2 and target.ndim == 1
        assert data.shape[0] == target.shape[0]
        assert data.size > 0
        assert data.dtype == np.float_
        assert np.all(np.isfinite(data))  # also catch nan
        assert target.dtype == np.int_
        assert np.all(np.isfinite(target))  # also catch nan

        # Always shuffle your data to be safe. Use fixed seed for reprod.
        self.data_X, self.data_y = shuffle(data,
                                           target,
                                           random_state=shuffle_seed)
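
Example #1 stores the scorer but never calls it; below is a minimal, self-contained sketch of the get_scorer calling convention (the dataset, model, and split sizes are illustrative, not taken from the project above).

# Sketch only: how a scorer obtained via get_scorer("accuracy") is typically invoked.
from sklearn import datasets
from sklearn.metrics import get_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle

data, target = datasets.load_digits(return_X_y=True)
X, y = shuffle(data, target, random_state=0)

clf = MLPClassifier(max_iter=300, random_state=0).fit(X[:1000], y[:1000])
scorer = get_scorer("accuracy")
# A scorer is called as scorer(estimator, X, y) and returns a single float.
print(scorer(clf, X[1000:], y[1000:]))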
Code Example #2
def pred(est_type, task, ests, X, y, scoring=None, thresh=None):
    pred_frames = []
    pred_scores = []
    perm_imps = []

    for est in ests:
        if scoring:
            scorer = get_scorer(scoring)
            ps = scorer(est, X, y)
        else:
            ps = est.score(X, y)

        pred_scores.append(round(ps, 3))
        pred_frames.append(
            pd.DataFrame(index=y.index.tolist(),
                         data={
                             'YBOCS_pred': est.predict(X),
                             'YBOCS_target': y
                         }))
        # if ps > thresh:
        #     perm_imp_test(task, est, ps, X, y, 1, scoring)
        # if task is gbl.clf:
        #     if est_type is gbl.linear_:
        #         pred_frames[i].insert(1, 'Confidence', est.decision_function(X))
        #     elif est_type is gbl.non_linear_:
        #         pred_frames[i].insert(1, 'Confidence', est.predict_proba(X))

        #perm_imps.append(perm_imp_test(est=est, base_score=ps, X=X, y=y, n_iter=3, scoring=scoring))

    return pred_scores, pred_frames  #, perm_imps
Code Example #3
    def __init__(self, estimator, scorer, cv=3):
        # Coerce cv to int when possible; otherwise keep it as-is (e.g. a CV splitter object).
        try:
            cv = int(cv)
        except (TypeError, ValueError):
            pass
        self.__est = estimator
        self.__cv = cv
        self.__scorer = get_scorer(scorer)
Code Example #4
def single_split(data, estimator, scoring):
    attrs, classes = utils.horizontal_split(data)
    X_train, X_test, y_train, y_test = train_test_split(attrs,
                                                        classes,
                                                        test_size=0.4)
    estimator.fit(X_train, y_train)
    scorer = get_scorer(scoring)
    return scorer(estimator, X_test, y_test)
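
A self-contained usage sketch of the same pattern, with a concrete dataset and estimator standing in for the project's utils.horizontal_split helper (all names below are placeholders).

# Sketch of the single_split pattern above with placeholder data and estimator.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split

attrs, classes = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(attrs,
                                                    classes,
                                                    test_size=0.4,
                                                    random_state=0)
estimator = LogisticRegression(max_iter=5000).fit(X_train, y_train)
scorer = get_scorer("roc_auc")
print(scorer(estimator, X_test, y_test))  # single held-out score, as in single_split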
Code Example #5
    def __init__(self, estimator, scorer, cv=3):
        # Coerce cv to int when possible; otherwise keep it as-is (e.g. a CV splitter object).
        try:
            cv = int(cv)
        except (TypeError, ValueError):
            pass
        self.__est = estimator
        self.__cv = cv
        self.__scorer = get_scorer(scorer)
Code Example #6
    def __init__(self, model, dataset, metric, shuffle_seed=0, data_root=None):
        """Build class that wraps sklearn classifier/regressor CV score for use as an objective function.

        Parameters
        ----------
        model : str
            Which classifier to use; must be a key in the `MODELS_CLF` or `MODELS_REG` dict, depending on whether the
            dataset is for classification or regression.
        dataset : str
            Which data set to use; must be a key in the `DATA_LOADERS` dict, or the name of a custom csv file.
        metric : str
            Which sklearn scoring metric to use, from the `SCORERS_CLF` list or `SCORERS_REG` dict depending on whether
            the dataset is for classification or regression.
        shuffle_seed : int
            Random seed to use when splitting the data into train and validation in the cross-validation splits. This
            is needed to keep the split constant across calls; otherwise varying splits would add extra noise to the
            objective function.
        data_root : str
            Root directory to look for all custom csv files.
        """
        TestFunction.__init__(self)
        data, target, problem_type = load_data(dataset, data_root=data_root)
        assert problem_type in (ProblemType.clf, ProblemType.reg)
        self.is_classifier = problem_type == ProblemType.clf

        # Do some validation on loaded data
        assert isinstance(data, np.ndarray)
        assert isinstance(target, np.ndarray)
        assert data.ndim == 2 and target.ndim == 1
        assert data.shape[0] == target.shape[0]
        assert data.size > 0
        assert data.dtype == np.float_
        assert np.all(np.isfinite(data))  # also catch nan
        assert target.dtype == (np.int_ if self.is_classifier else np.float_)
        assert np.all(np.isfinite(target))  # also catch nan

        model_lookup = MODELS_CLF if self.is_classifier else MODELS_REG
        base_model, fixed_params, api_config = model_lookup[model]

        # New members for model
        self.base_model = base_model
        self.fixed_params = fixed_params
        self.api_config = api_config

        # Always shuffle your data to be safe. Use fixed seed for reprod.
        self.data_X, self.data_y = shuffle(data,
                                           target,
                                           random_state=shuffle_seed)

        assert metric in METRICS, "Unknown metric %s" % metric
        assert metric in METRICS_LOOKUP[
            problem_type], "Incompatible metric %s with problem type %s" % (
                metric,
                problem_type,
            )
        self.scorer = get_scorer(SklearnModel._METRIC_MAP[metric])
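
The constructor above only builds self.scorer; below is a hypothetical sketch of how such a wrapper might use it as a cross-validated objective. The hyperparameter handling, cv=5, and the sign convention are assumptions for illustration, not the project's actual code.

# Hypothetical evaluation step for the wrapper above (not the project's code).
import numpy as np
from sklearn.model_selection import cross_val_score

def evaluate_sketch(self, params):
    model = self.base_model(**params, **self.fixed_params)
    # A scorer object can be passed directly as the `scoring` argument.
    scores = cross_val_score(model, self.data_X, self.data_y,
                             scoring=self.scorer, cv=5)
    # Many optimizers minimize, so the mean CV score is negated here.
    return -np.mean(scores)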
Code Example #7
    def __init__(self, clf, metric, cv, n_jobs=1, verbose=0, pre_dispatch='2*n_jobs'):
        # Coerce cv to int when possible; otherwise keep it as-is (e.g. a CV splitter object).
        try:
            cv = int(cv)
        except (TypeError, ValueError):
            pass

        self.clf = clf
        self.metric = get_scorer(metric)
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
Code Example #8
    def __init__(self,
                 clf,
                 metric,
                 cv,
                 n_jobs=1,
                 verbose=0,
                 pre_dispatch='2*n_jobs'):
        # Coerce cv to int when possible; otherwise keep it as-is (e.g. a CV splitter object).
        try:
            cv = int(cv)
        except (TypeError, ValueError):
            pass

        self.clf = clf
        self.metric = get_scorer(metric[0])
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
Code Example #9
    def fit_score(self, X, y):
        if isinstance(self.__cv, int):
            cross_valid = KFold(n_splits=self.__cv).split(X)
        else:
            cross_valid = self.__cv
        scorer = self.__scorer
        weight = self.__weight
        scores = []
        for train_index, test_index in cross_valid:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            est = clone(self.__est)
            est.fit(X_train, y_train)
            # Weighted sum over the configured scorers for this fold
            k_score = 0
            for i in range(len(scorer)):
                k_score += get_scorer(scorer[i])(est, X_test, y_test) * weight[i]
            # Append once per fold, after all scorers have been accumulated
            scores.append(k_score)
        return (np.mean(scores), np.std(scores))
Code Example #10
def perm_imp_test(task, est, base_score, X, y, n_iter=1, scoring=None):
    feats = [c for c in X.columns.tolist() if c not in gbl.clin_demog_feats]
    for f in feats:
        X_col = deepcopy(X.loc[:, f])
        score_diff = 0.0
        for _ in np.arange(n_iter):
            X.loc[:, f] = np.random.permutation(X.loc[:, f])
            if scoring:
                scorer = get_scorer(scoring)
                score_diff += base_score - scorer(est, X, y)
            else:
                score_diff += base_score - est.score(X, y)
            X.loc[:, f] = X_col
        if task is gbl.clf:
            gbl.fpi_clf.setdefault(f, []).append(score_diff / n_iter)
        elif task is gbl.reg:
            gbl.fpi_reg.setdefault(f, []).append(score_diff / n_iter)

    return
Code Example #11
def compute_feat_perm_imp(est,
                          base_score,
                          X,
                          y,
                          fpis_dict,
                          n_iter=3,
                          scoring=None):
    feats = [c for c in X.columns.tolist() if c not in gbl.clin_demog_feats]
    for f in feats:
        X_col = deepcopy(X.loc[:, f])
        score_diff = 0.0
        for _ in np.arange(n_iter):
            X.loc[:, f] = np.random.permutation(X.loc[:, f])
            if scoring:
                scorer = get_scorer(scoring)
                score_diff += base_score - scorer(est, X, y)
            else:
                score_diff += base_score - est.score(X, y)
            X.loc[:, f] = X_col
        fpis_dict.setdefault(f, []).append(score_diff / n_iter)
    return fpis_dict
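
Recent scikit-learn releases ship a built-in version of this shuffle-and-rescore loop; below is a sketch using sklearn.inspection.permutation_importance (available since 0.22). The estimator and dataset are placeholders, not the project's.

# Sketch: sklearn's built-in permutation importance does the same shuffle-and-rescore loop.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

X, y = load_diabetes(return_X_y=True, as_frame=True)
est = RandomForestRegressor(random_state=0).fit(X, y)

result = permutation_importance(est, X, y, scoring="r2", n_repeats=3, random_state=0)
for name, mean_drop in zip(X.columns, result.importances_mean):
    print(name, round(mean_drop, 4))  # mean score drop per permuted feature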
Code Example #12
    
#     lin_svc = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#                   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
#                   max_iter=-1, probability=False, random_state=None, shrinking=True,
#                   tol=0.001,verbose=False)
    rbf_svc = svm.SVC(C=1.0, kernel='rbf', gamma=0.7)
#     poly_svc = svm.SVC(C=1.0, kernel='poly', degree=3)
    
    #palette = itertools.cycle(seaborn.color_palette(n_colors = 10))
    scores_lin = []
    scores_rbf = []
    scores_poly = []
    lin_roc_auc_scorer = []
    rbf_roc_auc_scorer = []
    poly_roc_auc_scorer = []
    roc_auc_scorer = get_scorer("roc_auc")
    for C in C_2d_range:
        for gamma in gamma_2d_range:
            rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma)
            rbf_roc_auc_scorer = []
            for train, test in KFold(n=len(X), n_folds=10, random_state=42):
                X_train, y_train = X[train], y[train]
                X_test, y_test = X[test], y[test]
#                 lin_clf = lin_svc.fit(X_train, y_train)
                rbf_clf = rbf_svc.fit(X_train, y_train)
#                 poly_clf = poly_svc.fit(X_train, y_train)
#                 scores_lin.append(zero_one_loss((y_test),lin_clf.predict(X_test)))
                scores_rbf.append(zero_one_loss((y_test),rbf_clf.predict(X_test)))
#                 scores_poly.append(zero_one_loss((y_test),poly_clf.predict(X_test)))
#                 lin_roc_auc_scorer.append(roc_auc_scorer(lin_clf, X_test, y_test))
                rbf_roc_auc_scorer.append(roc_auc_scorer(rbf_clf, X_test, y_test))
Code Example #13
    #     lin_svc = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    #                   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    #                   max_iter=-1, probability=False, random_state=None, shrinking=True,
    #                   tol=0.001,verbose=False)
    rbf_svc = svm.SVC(C=1.0, kernel='rbf', gamma=0.7)
    #     poly_svc = svm.SVC(C=1.0, kernel='poly', degree=3)

    #palette = itertools.cycle(seaborn.color_palette(n_colors = 10))
    scores_lin = []
    scores_rbf = []
    scores_poly = []
    lin_roc_auc_scorer = []
    rbf_roc_auc_scorer = []
    poly_roc_auc_scorer = []
    roc_auc_scorer = get_scorer("roc_auc")
    for C in C_2d_range:
        for gamma in gamma_2d_range:
            rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma)
            rbf_roc_auc_scorer = []
            for train, test in KFold(n=len(X), n_folds=10, random_state=42):
                X_train, y_train = X[train], y[train]
                X_test, y_test = X[test], y[test]
                #                 lin_clf = lin_svc.fit(X_train, y_train)
                rbf_clf = rbf_svc.fit(X_train, y_train)
                #                 poly_clf = poly_svc.fit(X_train, y_train)
                #                 scores_lin.append(zero_one_loss((y_test),lin_clf.predict(X_test)))
                scores_rbf.append(
                    zero_one_loss((y_test), rbf_clf.predict(X_test)))
                #                 scores_poly.append(zero_one_loss((y_test),poly_clf.predict(X_test)))
                #                 lin_roc_auc_scorer.append(roc_auc_scorer(lin_clf, X_test, y_test))
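
Examples #12 and #13 use the pre-0.18 sklearn.cross_validation.KFold(n, n_folds) API; roughly the same loop with the current model_selection API would look like the sketch below. The dataset and parameter grids are placeholders.

# Sketch of the RBF-SVC grid evaluation above with the modern KFold API.
import numpy as np
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import get_scorer, zero_one_loss
from sklearn.model_selection import KFold

X, y = load_breast_cancer(return_X_y=True)
C_2d_range = [1e-2, 1.0, 1e2]        # placeholder grid
gamma_2d_range = [1e-2, 1.0, 1e2]    # placeholder grid
roc_auc_scorer = get_scorer("roc_auc")

for C in C_2d_range:
    for gamma in gamma_2d_range:
        rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma)
        fold_aucs, fold_losses = [], []
        # Modern API: KFold takes n_splits and yields indices via .split(X)
        for train, test in KFold(n_splits=10, shuffle=True, random_state=42).split(X):
            rbf_clf = rbf_svc.fit(X[train], y[train])
            fold_losses.append(zero_one_loss(y[test], rbf_clf.predict(X[test])))
            fold_aucs.append(roc_auc_scorer(rbf_clf, X[test], y[test]))
        print(C, gamma, np.mean(fold_aucs), np.mean(fold_losses))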
Code Example #14
def get_scoring():
    scoring = {}
    scoring_proba = {}
    scores_names = [
        'accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
        'average_precision', 'completeness_score', 'f1', 'f1_macro',
        'f1_micro', 'f1_weighted', 'fowlkes_mallows_score',
        'homogeneity_score', 'mutual_info_score', 'neg_log_loss',
        'normalized_mutual_info_score', 'precision', 'precision_macro',
        'precision_micro', 'precision_weighted', 'recall', 'recall_macro',
        'recall_micro', 'recall_weighted', 'roc_auc', 'v_measure_score'
    ]
    metrics_functions = [
        metrics.cohen_kappa_score, metrics.hinge_loss,
        metrics.matthews_corrcoef, metrics.accuracy_score, metrics.f1_score,
        metrics.hamming_loss, metrics.log_loss, metrics.precision_score,
        metrics.recall_score, metrics.zero_one_loss,
        metrics.average_precision_score, metrics.roc_auc_score
    ]

    pr_auc_scorer = metrics.make_scorer(pr_auc_score,
                                        greater_is_better=True,
                                        needs_proba=True)
    scoring = {x: get_scorer(x) for x in scores_names}
    scoring.update(
        {x.__name__: metrics.make_scorer(x)
         for x in metrics_functions})

    scoring["pr_auc"] = pr_auc_scorer

    scoring.update({
        'tp': metrics.make_scorer(tp),
        'tn': metrics.make_scorer(tn),
        'fp': metrics.make_scorer(fp),
        'fn': metrics.make_scorer(fn)
    })

    scoring.update({
        "cost_{0}_{1}".format(*x): metrics.make_scorer(cost,
                                                       fp_cost=x[0],
                                                       fn_cost=x[1])
        for x in product(range(1, 4), range(1, 4))
    })
    scoring.update({
        "mse_cost_{0}_{1}".format(*x): metrics.make_scorer(mse_cost,
                                                           fp_cost=x[0],
                                                           fn_cost=x[1],
                                                           needs_proba=True)
        for x in product(range(1, 4), range(1, 4))
    })

    scoring.update({
        "mse1_cost_{0}_{1}".format(*x): metrics.make_scorer(mse_cost1,
                                                            fp_cost=x[0],
                                                            fn_cost=x[1],
                                                            needs_proba=True)
        for x in product(range(1, 4), range(1, 4))
    })

    scoring["mse"] = metrics.make_scorer(mse, needs_proba=True)
    scoring["mse1"] = metrics.make_scorer(mse1, needs_proba=True)

    return scoring, scoring_proba
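
A dict of scorers like the one get_scoring() builds can be passed straight to cross_validate; below is a self-contained sketch with a small subset of entries (the custom project scorers such as pr_auc and the cost variants are omitted here, since they need the project's helper functions).

# Sketch: passing a dict of named scorers to cross_validate.
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_validate

scoring = {name: get_scorer(name) for name in ['accuracy', 'f1', 'roc_auc']}
scoring['matthews_corrcoef'] = metrics.make_scorer(metrics.matthews_corrcoef)

X, y = load_breast_cancer(return_X_y=True)
results = cross_validate(LogisticRegression(max_iter=5000), X, y, scoring=scoring, cv=5)
print(results['test_accuracy'], results['test_roc_auc'])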
Code Example #15
def check_score_is_finite(scoring, estimator, input_data, labels):
  estimator = clone(estimator)
  assert np.isfinite(cross_val_score(estimator, input_data, labels,
                                     scoring=scoring)).all()
  estimator.fit(input_data, labels)
  assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels))
Code Example #16
File: model_helper.py  Project: chrinide/a2ml
    def _get_score_byname(scoring):
        # NOTE: sklearn.metrics.scorer is the pre-0.24 module path; in current sklearn
        # releases, get_scorer is imported from sklearn.metrics directly.
        from sklearn.metrics.scorer import get_scorer
        from sklearn.metrics import SCORERS

        #TODO: below metrics does not directly map to sklearn:
        # Classification : weighted_accuracy, accuracy_table, balanced_accuracy, matthews_correlation,norm_macro_recall
        # Regression,  Time Series Forecasting:
        #spearman_correlation, normalized_root_mean_squared_error, normalized_mean_absolute_error
        scorer = None
        if scoring.startswith("AUC"):
            scorer = get_scorer("roc_auc")
            average = scoring.split("_")[-1]
            scorer._kwargs['average'] = average
        elif scoring.startswith("log_loss"):
            scorer = get_scorer("neg_log_loss")
        # elif scoring.startswith("matthews_correlation"):
        #     scorer = get_scorer("matthews_corrcoef")
        elif scoring.startswith("precision_score"):
            scorer = get_scorer("precision")
            average = scoring.split("_")[-1]
            scorer._kwargs['average'] = average
        elif scoring.startswith("average_precision_score"):
            scorer = get_scorer("average_precision")
            average = scoring.split("_")[-1]
            scorer._kwargs['average'] = average
        elif scoring.startswith("recall_score"):
            scorer = get_scorer("recall")
            average = scoring.split("_")[-1]
            scorer._kwargs['average'] = average
        elif scoring.startswith("norm_macro_recall"):
            scorer = get_scorer("recall")
            scorer._kwargs['average'] = "macro"
        elif scoring.startswith("f1_score"):
            scorer = get_scorer("f1")
            average = scoring.split("_")[-1]
            scorer._kwargs['average'] = average
        elif scoring.startswith("precision_score"):
            scorer = get_scorer("precision")
            average = scoring.split("_")[-1]
            scorer._kwargs['average'] = average
        elif scoring.startswith("spearman_correlation"):
            scorer = get_scorer("r2")
        elif scoring.startswith("r2_score"):
            scorer = get_scorer("r2")
        elif "mean_absolute_error" in scoring:
            scorer = get_scorer("mean_absolute_error")
        elif "root_mean_squared" in scoring:
            scorer = get_scorer("mean_squared_error")
        elif "median_absolute_error" in scoring:
            scorer = get_scorer("median_absolute_error")

        if scorer is None:
            scorer = get_scorer(scoring)

        return scorer
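
The mapping above works by overriding the private _kwargs of the scorer returned by get_scorer; below is a self-contained sketch of that trick. Note it relies on sklearn scorer internals, so it may break across versions.

# Sketch of the _kwargs-patching trick used above; _kwargs is a private sklearn attribute.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer

scorer = get_scorer("recall")            # defaults to binary recall
scorer._kwargs['average'] = 'macro'      # patch in macro averaging, as the code above does

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)
print(scorer(clf, X, y))                 # macro-averaged recall on a 3-class problem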
Code Example #17
File: ml.py  Project: jjinking/datsci
def cv_fit_xgb_model(
    model,
    X_train,
    y_train,
    X_valid,
    y_valid,
    cv_nfold=5,
    early_stopping_rounds=50,
    missing=np.nan,
    eval_metric="auc",
    scoring=None,
    verbose=True,
):
    """Fit xgb model with best n_estimators using xgb builtin cv
    Note: This function changes the model's `n_estimators` attribute

    Parameters
    ----------
    model : xgb model object

    X_train : pandas.DataFrame
        Training features data

    y_train : pandas.Series
        Training target data

    X_valid, y_valid : same as X_train, y_train, but used for validation

    cv_nfold : int
        Number of folds in CV

    early_stopping_rounds : int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.

    missing : float
        Value in the data to be treated as a missing value.

    eval_metric : str
        The metric to be used for validation data while training xgb
        Probably should match `scoring`

    scoring : str, callable or None, default=None
        See the `scoring` parameter description of
        sklearn.grid_search.GridSearchCV.

    verbose : bool
        Print scoring summary to stdout

    Returns
    -------
    best_n_estimators : int
        Number of optimal estimators, or boosting rounds

    train_score : float
        Performance of the best model on training set

    valid_score : float
        Performance of the best model on validation set

    Example
    -------

    model = xgb.XGBRegressor(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=1.0,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        max_delta_step=0,
        objective='binary:logistic',
        nthread=4,
        seed=5
    )

    n_estimators, train_score, valid_score = cv_fit_xgb_model(
        model, X_train, y_train, X_valid, y_valid, cv_nfold=5,
        early_stopping_rounds=50, scoring='roc_auc', verbose=True
    )
    """
    # Train cv
    xgb_param = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values, missing=missing)
    cv_result = xgb.cv(
        xgb_param,
        dtrain,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=cv_nfold,
        metrics=[eval_metric],
        early_stopping_rounds=early_stopping_rounds,
        show_progress=False,
    )
    best_n_estimators = cv_result.shape[0]
    model.set_params(n_estimators=best_n_estimators)

    # Train model
    model.fit(X_train, y_train, eval_metric=eval_metric)

    scorer = get_scorer(scoring)
    # Predict and score training data
    train_score = scorer(model, X_train, y_train)
    # Predict and score validation data
    valid_score = scorer(model, X_valid, y_valid)

    # Print model report:
    if verbose:
        print("\nModel Report")
        print("best n_estimators: {}".format(best_n_estimators))
        print("Score (Train): %f" % train_score)
        print("Score (Validation) : %f" % valid_score)

    return best_n_estimators, train_score, valid_score
Code Example #18
def check_score_is_finite(scoring, estimator, input_data, labels):
    estimator = clone(estimator)
    assert np.isfinite(
        cross_val_score(estimator, input_data, labels, scoring=scoring)).all()
    estimator.fit(input_data, labels)
    assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels))
Code Example #19
File: model.py  Project: simonm3/analysis
    def scoring(self, value):
        self._scoring = value
        self.scorer = scorer.get_scorer(value)