Python XGBClassifier.set_paramsの例、xgboost.XGBClassifier.set_params Pythonの例

コード例 #1

0

ファイルを表示

def load_architecture():
    """Build the soft-voting AdaBoost + XGBoost ensemble from saved parameters.

    The parameter files named by the ``BEST_ADA_L`` and ``BEST_XGB_L`` entries
    of ``logger.config_dict`` are read as JSON and applied to freshly created
    estimators with ``set_params``.

    Returns:
        An unfitted ``VotingClassifier`` that combines both estimators with
        equal weights.
    """

    def _read_best_params(tag, config_key):
        # Resolve the file name from the config, announce it, then parse it.
        filename = logger.config_dict[config_key]
        logger.log(
            "Loading params for {} from {} ...".format(tag, filename))
        with open(logger.get_model_file(filename, "large")) as handle:
            return json.load(handle)

    ada_model = AdaBoostClassifier(DecisionTreeClassifier())
    ada_settings = _read_best_params("ADA", 'BEST_ADA_L')
    ada_model.set_params(**ada_settings)

    xgb_settings = _read_best_params("XGB", 'BEST_XGB_L')
    xgb_model = XGBClassifier()
    xgb_model.set_params(**xgb_settings)

    members = [('ADA', ada_model), ('XGB', xgb_model)]
    comb_model = VotingClassifier(estimators=members,
                                  voting='soft',
                                  weights=[0.5, 0.5],
                                  n_jobs=-1)

    logger.log("Finish loading best architecture {}".format(comb_model))
    return comb_model

コード例 #2

0

ファイルを表示

def xgmethod(X,Y):
    """Tune the tree count with xgb.cv, fit an XGBClassifier and return test accuracy.

    Args:
        X: Feature matrix.
        Y: Binary target values.

    Returns:
        Accuracy of the fitted model on the 30% hold-out split.
    """
    # split data into train and test sets
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Fit the scaler on the training split only, then apply it to BOTH splits.
    # (Previously the transform result was discarded, so no scaling happened.)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # XGtrain matrix
    xgtrain = xgb.DMatrix(X_train, label=y_train)

    model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=100,
                          objective='binary:logistic')
    xgb_param = model.get_xgb_params()

    print ('Start cross validation')
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=500, nfold=10,
                      metrics=['auc'], early_stopping_rounds=50,
                      stratified=True, seed=1301)
    # With early stopping, the number of rows in the CV frame is the number
    # of boosting rounds that were kept.
    n_trees = cvresult.shape[0]
    print('Best number of trees = {}'.format(n_trees))

    model.set_params(n_estimators=n_trees)
    print('Fit on the trainingsdata')
    model.fit(X_train, y_train, eval_metric='auc')

    pred = model.predict(X_test, ntree_limit=n_trees)

    # make predictions for test data
    predictions = [round(value) for value in pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy

コード例 #3

0

ファイルを表示

def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    """Fit an XGBClassifier and return its accuracy on the test split.

    Keyword arguments are applied to the classifier through ``set_params``
    on top of the fixed ``random_state=9``.
    """
    classifier = XGBClassifier(random_state=9)
    classifier.set_params(**kwargs)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(y_test, predicted)

コード例 #4

0

ファイルを表示

ファイル: home_credit_default_risk.py プロジェクト: toanbkmt/COTAI

def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Train an XGBClassifier, report test metrics and plot feature importances.

    Args:
        X_train, X_test: Feature frames (``.values`` and ``.columns`` are used).
        y_train, y_test: Target values (``y_train.values`` is used).
        useTrainCV: When true, run ``xgb.cv`` first and set ``n_estimators``
            to the number of rounds it kept.
        cv_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience for ``xgb.cv``.

    Returns:
        ``(cvresult, alg)``: the cross-validation frame (``None`` when
        ``useTrainCV`` is false) and the fitted classifier.
    """
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    # Defined up front so the final return works when CV is skipped.
    cvresult = None
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        display(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistic")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score (test set): %f" % metrics.f1_score(y_test, predictions))

    # Feature importances, sorted from most to least important.
    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg

コード例 #5

0

ファイルを表示

def return_classifier(classifier, classifier_params):
    """
    Return a configured classifier object selected by name.

    Args:
        classifier: One of 'XGBoost', 'LogisticRegression',
            'KNeighborsClassifier', 'RandomForest', 'DecisionTree',
            'AdaBoost' or 'LinearSVC'.
        classifier_params: Parameters applied with ``set_params``. The dict
            is copied and never modified. For 'LinearSVC' it must contain a
            'cv_generator' entry, which is returned separately instead of
            being passed to the estimator.

    Returns:
        ``(clf, cv_generator)``; ``cv_generator`` is ``None`` unless the
        classifier is 'LinearSVC'.

    Raises:
        ValueError: If ``classifier`` is not a supported name.
        KeyError: If 'LinearSVC' is requested without 'cv_generator'.
    """
    cp = classifier_params.copy()

    # These estimators accept n_jobs; use every available core.
    if classifier in ['LogisticRegression', 'KNeighborsClassifier', 'RandomForest']:
        cp['n_jobs'] = -1

    if classifier == 'LinearSVC':
        cv_generator = cp['cv_generator']
    else:
        cv_generator = None

    if classifier == 'XGBoost':
        from xgboost import XGBClassifier
        clf = XGBClassifier()
    elif classifier == 'LogisticRegression':
        clf = linear_model.LogisticRegression()
    elif classifier == 'KNeighborsClassifier':
        # KNN has no random_state parameter; drop it if the caller supplied one.
        cp.pop('random_state', None)
        clf = neighbors.KNeighborsClassifier()
    elif classifier == 'RandomForest':
        clf = ensemble.RandomForestClassifier()
    elif classifier == 'DecisionTree':
        clf = tree.DecisionTreeClassifier()
    elif classifier == 'AdaBoost':
        clf = ensemble.AdaBoostClassifier()
    elif classifier == 'LinearSVC':
        # cv_generator is not an estimator parameter.
        del cp['cv_generator']
        clf = svm.LinearSVC()
    else:
        raise ValueError("Unknown classifier: {!r}".format(classifier))

    clf.set_params(**cp)
    return clf, cv_generator

コード例 #6

0

ファイルを表示

ファイル: run_xgb_param_search.py プロジェクト: Quasi-quant2010/Stacking

def get_xgb_feature_importance_plot(best_param_, experiment_, 
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):
    """Fit an XGBClassifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: Parameter dict for the classifier. A 'model_type' key,
            if present, is removed from it in place before ``set_params``.
        experiment_: Object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: Not used by this function; the output directory is
            ``<Config 'data.path'>/graph``.
        png_fname: File name of the saved image.
        score_threshold: Quantile of the positive scores used as a cut-off
            when more than 20 features have a score.

    Returns:
        True once the figure has been written.
    """
    # 1. fit the model and collect per-feature f-scores
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    # 'model_type' is bookkeeping, not an estimator parameter.
    best_param_.pop('model_type', None)
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)

    # Newer xgboost exposes get_booster(); older releases only had booster().
    if hasattr(clf, 'get_booster'):
        booster = clf.get_booster()
    else:
        booster = clf.booster()
    index2feature = booster.get_fscore()

    # Materialise the dict views so pandas receives plain lists.
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    fis = fis.sort_values(by='score', ascending=False)
    if len(fis.index) > 20:
        # Keep only the features at or above the requested score quantile.
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    # 3. save
    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True

コード例 #7

0

ファイルを表示

def xgbclf(params, X_train, y_train, X_test, y_test):
    """Fit with early stopping, refit at the best tree count and report test metrics.

    Args:
        params: Keyword arguments for ``XGBClassifier``.
        X_train, y_train: Training data.
        X_test, y_test: Evaluation data (also used for early stopping).

    Returns:
        The classifier refit on the training data with ``n_estimators`` set
        to the best tree count found by early stopping.
    """
    eval_set = [(X_train, y_train), (X_test, y_test)]

    model = XGBClassifier(**params).fit(X_train, y_train,
                                        eval_set=eval_set,
                                        eval_metric='auc',
                                        early_stopping_rounds=100,
                                        verbose=100)

    # Capture the early-stopping result once: the refit below runs without
    # early stopping, so the attribute should not be re-read afterwards.
    best_ntree_limit = model.best_ntree_limit

    model.set_params(**{'n_estimators': best_ntree_limit})
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test, ntree_limit=best_ntree_limit)

    abclf_cm = confusion_matrix(y_test, y_pred)
    print(abclf_cm)
    # Share of predicted-class-0 samples that are truly class 0.
    print(abclf_cm[0][0] / (abclf_cm[0][0] + abclf_cm[1][0]))
    print(classification_report(y_test, y_pred))
    print('\n')
    print("Model Final Generalization Accuracy: %.6f" %
          accuracy_score(y_test, y_pred))

    y_pred_proba = model.predict_proba(
        X_test, ntree_limit=best_ntree_limit)[:, 1]
    get_roc(y_test, y_pred_proba)
    return model

コード例 #8

0

ファイルを表示

ファイル: build.py プロジェクト: vp999/Xgboost_project

def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    """Train an XGBClassifier (seed=9) and return its test-set accuracy.

    Any keyword arguments are applied to the classifier with ``set_params``
    before fitting.
    """
    classifier = XGBClassifier(seed=9)
    if kwargs:
        classifier.set_params(**kwargs)
    classifier.fit(X_train, y_train)

    return accuracy_score(classifier.predict(X_test), y_test)

コード例 #9

0

ファイルを表示

def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    """Fit an XGBClassifier (seed=9) configured by ``kwargs``; return test accuracy."""
    classifier = XGBClassifier(seed=9)
    classifier.set_params(**kwargs)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(y_test, predicted)

コード例 #10

0

ファイルを表示

def train_xgb(X, y, params, save_path=None, save_path_booster=None):
    """Train a binary XGBoost model and wrap its booster for prediction.

    Args:
        X: Feature matrix (converted with ``np.asarray``).
        y: Labels; after flattening they must contain exactly the values 0 and 1.
        params: Estimator parameters. ``binary_threshold`` (default 0.5, or
            ``'auto'``) is extracted and not passed to XGBoost; ``n_jobs``
            gets a default when absent.
        save_path: Optional path where the wrapped model is pickled.
        save_path_booster: Optional path where the wrapper's booster is pickled.

    Returns:
        An ``XGBClassifierSKLWrapper`` around the trained booster.

    Raises:
        NotImplementedError: If the labels are not exactly {0, 1}.
    """
    # the threshold is not handled by XGB interface
    params, binary_threshold = _parse_param_and_delete(params,
                                                       'binary_threshold', .5)

    # n_jobs is handled by XGB SKL interface
    params = _parse_param_and_keep(params,
                                   name='n_jobs',
                                   default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()

    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only support binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False, )
    model = clone(model)
    model.set_params(**params)

    logging.info('Training...')
    model.fit(
        X,
        y,
        verbose=True,
    )

    # Save and re-load (feature-agnostic model). The temp file is removed
    # even when saving or loading fails.
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    try:
        model.get_booster().save_model(temp_file)
        booster = Booster(model_file=temp_file)
    finally:
        if os.path.exists(temp_file):
            os.remove(temp_file)

    if binary_threshold == 'auto':
        # Pick the score at the rank equal to the number of negatives, so the
        # predicted positive count on the training data matches the observed one.
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]

    logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster,
                                    features=X.shape[1],
                                    threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model

コード例 #11

0

ファイルを表示

class Hyperopt_xbc:
    """Hyperopt (TPE) search over XGBClassifier hyper-parameters.

    Each candidate is scored by the mean 10-fold cross-validated ROC AUC on
    the data passed to the constructor.
    """

    def __init__(self, X, y, seed):
        """Store the data and seed, and define the search space."""
        self.name = 'XGBoost'
        self.name_short = 'XBC'
        self.X = X
        self.y = y
        self.seed = seed
        self.clf = None
        self.best_acc = 0
        self.space = {
            'objective': 'binary:logistic',
            'max_depth': hp.choice('max_depth', range(5, 30, 1)),
            'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
            'n_estimators': hp.choice('n_estimators', range(10, 500, 10)),
            'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart']),
            'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
            'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
            'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }
        self.max_evals = 50

    def train_test(self, params):
        """Fit a classifier with ``params`` and return its mean CV ROC AUC."""
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        self.clf = XGBClassifier(**params)
        self.clf.fit(self.X, self.y)
        return cross_val_score(self.clf,
                               self.X,
                               self.y,
                               scoring='roc_auc',
                               cv=10).mean()

    def f(self, params):
        """Hyperopt objective: negated score, tracking the best score seen."""
        acc = self.train_test(params)
        if acc > self.best_acc:
            self.best_acc = acc
        return {'loss': -acc, 'status': STATUS_OK}

    def best(self):
        """Run the search and return the classifier configured with the best parameters.

        Returns:
            ``(clf, name, name_short, best_params, best_acc)``. ``clf`` is the
            last classifier fitted during the search with the best parameters
            applied through ``set_params``; it is not refit here.
        """
        trials = Trials()
        best = fmin(self.f,
                    self.space,
                    algo=tpe.suggest,
                    max_evals=self.max_evals,
                    rstate=np.random.RandomState(self.seed),
                    trials=trials)
        # fmin reports hp.choice parameters as *indices* into their option
        # lists; space_eval maps them back to the actual values.
        best_params = space_eval(self.space, best)
        self.clf.set_params(**best_params)
        return (self.clf, self.name, self.name_short, best_params,
                self.best_acc)
コード例 #12

0

ファイルを表示

ファイル: P5.py プロジェクト: sequent/XGBoost_stock_prediction

    def training(self):
        """
        Search max_depth 0..10 and keep the model with the best test accuracy.

        For each depth, XGBoost's cv (up to 1000 rounds, stopping after 50
        rounds without improvement) determines the number of trees. A
        classifier with that many trees is then fit on the training set and
        scored on the test set. The most accurate model is stored in
        ``self.best_xgb``.
        """
        best_depth = 0
        best_estimator = 0
        # Start below any reachable accuracy so the first depth is always
        # recorded; this keeps min_mse and self.best_xgb defined even if every
        # model scores 0.
        max_score = float('-inf')
        min_mse = None

        # The training matrix does not depend on the depth; build it once.
        xgtrain = xgboost.DMatrix(self.Xtrain.values, label=self.ytrain.values)

        for md in range(11):
            model = XGBClassifier(learning_rate=0.3, n_estimators=1000, max_depth=md, min_child_weight=1,
                                  gamma=1, subsample=1, colsample_bytree=0.1, reg_lambda=0, reg_alpha=1,
                                  random_state=42)
            xgb_param = model.get_xgb_params()

            cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=1000, early_stopping_rounds=50,
                                  nfold=8, metrics='auc', stratified=True, shuffle=True, seed=42,
                                  verbose_eval=False)
            # Number of rows == number of boosting rounds kept by early stopping.
            n = cvresult.shape[0]
            print("There are {} trees in the XGB model. CV-mean: {:.4f}, CV-std: {:.4f}.".format(
                n, cvresult.iloc[-1, 0], cvresult.iloc[-1, 1]))

            model.set_params(n_estimators=n)
            model.fit(self.Xtrain,
                      self.ytrain,
                      eval_metric=self._metric,
                      eval_set=[(self.Xtrain, self.ytrain), (self.Xtest, self.ytest)],
                      verbose=False)
            y_pred = model.predict(self.Xtest)
            score = accuracy_score(self.ytest, y_pred)
            mse = mean_squared_error(self.ytest, y_pred)

            if score > max_score:
                # min_mse is the MSE of the most accurate model so far.
                max_score = score
                min_mse = mse
                best_depth = md
                best_estimator = n
                self.best_xgb = model
            print("Accuracy score: " + str(round(score, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
            print("Mean square error: " + str(round(mse, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
        print("Best score: " + str(round(max_score, 4)) + " Best MSE: " + str(round(min_mse, 4)) + " at depth: " + str(
            best_depth) + " and estimator of " + str(best_estimator))

コード例 #13

0

ファイルを表示

ファイル: predictions.py プロジェクト: NakramR/kakart

def generateXGBoostPrediction(train, test):
    """Predict reorders with a sigmoid-calibrated XGBoost classifier.

    Args:
        train: Frame containing the feature columns and a 'reordered' target.
        test: Frame containing the feature columns.

    Returns:
        A DataFrame with 'user_id', 'product_id' and the predicted label
        'predy' for every row of ``test``.
    """
    print('\n##################\nXGBoost\n##################')
    features = [
        'orderfrequency', 'dayfrequency', 'days_without_product_order',
        'department_id', 'aisle_id', 'eval_days_since_prior_order',
        'numproductorders', 'totaluserorders', 'user_id', 'product_id'
    ]
    # Only these settings deviate from the XGBClassifier defaults.
    param = {
        'objective': 'binary:logistic',
        'max_depth': 4,
        'n_estimators': 80,
        'learning_rate': 0.1,
    }

    y_train = train['reordered']
    X_train = train[features]
    test = test[features]

    estimator = XGBClassifier()
    estimator.set_params(**param)

    # Calibrate the probabilities with 5-fold sigmoid calibration.
    metLearn = CalibratedClassifierCV(estimator, method='sigmoid', cv=5)
    metLearn.fit(X_train, y_train)
    y_pred = metLearn.predict(test)
    print('Predict counter : %s' % (Counter(y_pred)))

    df = pd.DataFrame(columns=('user_id', 'product_id', 'predy'))
    for key in ('user_id', 'product_id'):
        df[key] = test[key]
    df['predy'] = y_pred
    return df

コード例 #14

0

ファイルを表示

def final_xgb(X_train, y_train, X_test, y_test, scale_pos_weight, best_params,
              analysis):
    """Fit the final XGBoost model and save its error / AUC learning curves.

    Args:
        X_train, y_train: Training data.
        X_test, y_test: Validation data, used for the evaluation curves.
        scale_pos_weight: Positive-class weight passed to the classifier.
        best_params: Tuned keyword arguments for ``XGBClassifier``.
        analysis: Label used in the figure file name
            ``<fig_dir>/<analysis>_final_xgb_model.png``.

    Returns:
        The fitted classifier.
    """
    # Named `clf` so the conventional `xgb` module alias is not shadowed.
    clf = XGBClassifier(**best_params)
    # The parameter is spelled n_jobs; the previous `njobs` was not a
    # recognised estimator parameter.
    clf.set_params(n_jobs=4,
                   random_state=0,
                   objective='binary:logistic',
                   scale_pos_weight=scale_pos_weight)

    eval_set = [(X_train, y_train), (X_test, y_test)]
    eval_metric = ["error", "auc"]

    clf.fit(X_train,
            y_train,
            eval_metric=eval_metric,
            eval_set=eval_set,
            verbose=0)

    # validation_0 is the training set, validation_1 the validation set
    # (order of eval_set above).
    results = clf.evals_result()

    fig1, axes1 = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)
    axes1[0].plot(results['validation_0']['error'], label='Train Error')
    axes1[0].plot(results['validation_1']['error'], label='Validation Error')
    axes1[0].set_title("Final XGBoost Error")
    axes1[0].set_xlabel("Iteration")
    axes1[0].set_ylabel("Error")
    axes1[0].legend()

    axes1[1].plot(results['validation_0']['auc'], label='Train AUC-ROC')
    axes1[1].plot(results['validation_1']['auc'], label='Validation AUC-ROC')
    axes1[1].set_title("Final XGBoost AUC-ROC")
    axes1[1].set_xlabel("Iteration")
    axes1[1].set_ylabel("AUC")
    axes1[1].legend()

    fig1.tight_layout()

    fig1.savefig(fig_dir + '/{}_final_xgb_model.png'.format(analysis),
                 format='png',
                 dpi=300,
                 transparent=False)

    return clf

コード例 #15

0

ファイルを表示

ファイル: methods.py プロジェクト: camendola/DNN_VBFvsGGF

def opt_BDT(input, output, params, show, names):
    """Tune the tree count with xgb.cv, fit the BDT and report accuracy and AUC.

    Args:
        input: Feature matrix.
        output: Target values.
        params: Keyword arguments for ``XGBClassifier`` (``n_estimators`` is
            the upper bound on boosting rounds for the CV).
        show: When true, draw the separation, ROC and feature-importance plots.
        names: Feature names assigned to the booster for the importance plot.

    NOTE(review): the ``show`` branch reads ``channel`` and ``selection``,
    which are not defined in this function — presumably module-level globals.
    """
    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    X_train, X_test, y_train, y_test = train_test_split(input,
                                                        output,
                                                        test_size=0.2,
                                                        random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    # Keep only as many trees as early stopping retained.
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)

    # The former format string ended in a lone '%', which raises
    # "ValueError: incomplete format" at runtime.
    print("Accuracy: %.2f%%; AUC = %.4f" % (accuracy * 100, auc))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        # Only the target name is printed; nothing is written here.
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
コード例 #16

0

ファイルを表示

ファイル: oofs.py プロジェクト: Jie-Yuan/tql-Python

    def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test,
                    **kwargs):
        """Fit an XGBClassifier with early stopping and predict probabilities.

        ``self.params``, when not None, is applied with ``set_params``. The
        fitted estimator is stored in ``self.clf``.

        Returns:
            ``(valid_predict, test_predict)``: ``predict_proba`` outputs for
            the validation and test sets.
        """
        model = XGBClassifier()
        if self.params is not None:
            model.set_params(**self.params)

        # Both the training and the validation split are monitored.
        watched = [(X_train, y_train), (X_valid, y_valid)]
        self.clf = model.fit(X_train,
                             y_train,
                             eval_set=watched,
                             eval_metric=None,
                             verbose=100,
                             early_stopping_rounds=100)

        return model.predict_proba(X_valid), model.predict_proba(X_test)

コード例 #17

0

ファイルを表示

ファイル: meta_learner.py プロジェクト: acse-hw20/Struggle_2021_for_work

class ClassificationLearner:
    """XGBClassifier wrapper that holds out 10% of the data for early stopping."""

    def __init__(self, **kwargs):
        """Create the underlying XGBClassifier from ``kwargs``."""
        self.estimator = XGBClassifier(**kwargs)
        self.fit_info = None

    # noinspection PyPep8Naming
    # pylint: disable-msg=too-many-arguments
    # pylint: disable-msg=too-many-locals
    # pylint: disable-msg=invalid-name
    def fit(self, X, y):
        """Fit the estimator and record train/test AUC in ``self.fit_info``.

        A fixed 90/10 split provides the evaluation set for early stopping.
        When ``X`` has fewer than 10000 rows, ``search_parameters`` is run
        first and its result applied with ``set_params``.

        Returns:
            self
        """
        train_x, held_x, train_y, held_y = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)

        # Parameter search is only attempted on small data sets.
        if X.shape[0] < 10000:
            tuned = search_parameters(self.estimator, train_x, train_y)
            self.estimator.set_params(**tuned)

        self.estimator.fit(train_x,
                           train_y,
                           eval_set=[(held_x, held_y)],
                           early_stopping_rounds=10,
                           verbose=False)

        train_scores = self.predict_proba(train_x)[:, 1]
        train_auc = sklearn.metrics.roc_auc_score(train_y, train_scores)
        held_scores = self.predict_proba(held_x)[:, 1]
        test_auc = sklearn.metrics.roc_auc_score(held_y, held_scores)

        self.fit_info = 'Train/Test AUC: {:.2f}/{:.2f}'.format(
            train_auc, test_auc)
        return self

    def predict_proba(self, x):
        """Return the estimator's class probabilities for ``x``."""
        probabilities = self.estimator.predict_proba(x)
        return probabilities

    def predict(self, x):
        """Return the estimator's predicted labels for ``x``."""
        labels = self.estimator.predict(x)
        return labels

コード例 #18

0

ファイルを表示

def train_evaluate(training_dataset_path, validation_dataset_path, max_depth,
                   n_estimators, output_dir):
    """Train an XGBClassifier on the combined datasets and copy it to ``output_dir``.

    The training and validation CSVs are concatenated, so the model is fit
    on all rows. The model is pickled locally as ``xgb_model.pkl`` and then
    copied with ``gsutil cp`` to ``<output_dir>/<timestamp>/xgb_model.pkl``.

    Args:
        training_dataset_path: CSV path of the training data.
        validation_dataset_path: CSV path of the validation data.
        max_depth: Tree depth (converted with ``int``).
        n_estimators: Number of trees (converted with ``int``).
        output_dir: Destination prefix for the exported model.

    Raises:
        subprocess.CalledProcessError: If the ``gsutil`` copy fails.
    """
    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)
    df = pd.concat([df_train, df_validation])

    categorical_features = ['workclass', 'occupation']
    target = 'income_bracket'

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=categorical_features)

    # Change label to 0 if <=50K, 1 if >50K
    df[target] = df[target].apply(lambda x: 0 if x == ' <=50K' else 1)

    # Split features and labels into 2 different vars
    X_train = df.loc[:, df.columns != target]
    y_train = np.array(df[target])

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    grid = {'max_depth': int(max_depth), 'n_estimators': int(n_estimators)}

    model = XGBClassifier()
    model.set_params(**grid)
    model.fit(X_train, y_train)

    # Close the file before uploading so the pickle is fully flushed to disk.
    model_filename = 'xgb_model.pkl'
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)

    EXPORT_PATH = os.path.join(
        output_dir,
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    gcs_model_path = '{}/{}'.format(EXPORT_PATH, model_filename)
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path])
    print('Saved model in: {}'.format(gcs_model_path))

コード例 #19

0

ファイルを表示

 def return_model_assessment(self, args):
     """Fit an XGBClassifier for one hyper-parameter vector and return 1 - test F1.

     ``args`` is read positionally: element ``i`` is the value of the
     ``i``-th hyper-parameter name listed below. The fitted model and its
     train/test F1 scores are appended to ``self.models``,
     ``self.train_scores`` and ``self.test_scores``.
     """
     hyper_param_names = (
         'colsample_bylevel', 'colsample_bytree', 'gamma', 'learning_rate',
         'max_delta_step', 'max_depth', 'min_child_weight', 'n_estimators',
         'reg_alpha', 'reg_lambda', 'subsample',
     )
     params = {
         name: args[position]
         for position, name in enumerate(hyper_param_names)
     }

     model = XGBClassifier(random_state=self.seed, seed=self.seed)
     model.set_params(**params)
     self.models.append(
         model.fit(self.X_train, self.y_train, sample_weight=None))

     train_score = f1_score(model.predict(self.X_train), self.y_train)
     test_score = f1_score(model.predict(self.X_test), self.y_test)
     self.train_scores.append(train_score)
     self.test_scores.append(test_score)

     # Lower is better for the caller.
     return 1 - test_score

コード例 #20

0

ファイルを表示

def get_default_xgb_model(df):
    """Fit an XGBClassifier with the fixed default parameters on the transformed data.

    The training data comes from ``hs.get_final_data`` applied to ``df`` with
    the transformer returned by ``hs.get_data_transformer()``.

    Returns:
        The fitted classifier.
    """
    final_X, final_y = hs.get_final_data(df, hs.get_data_transformer())

    parameters = dict(
        nthread=1,
        objective='binary:logistic',
        learning_rate=0.01,
        max_depth=8,
        min_child_weight=3,
        silent=1,
        subsample=0.8,
        colsample_bytree=0.5,
        n_estimators=1000,
        missing=-999,
        seed=1337,
    )

    classifier = XGBClassifier(verbosity=0)
    classifier.set_params(**parameters)
    classifier.fit(final_X, final_y)
    return classifier

コード例 #21

0

ファイルを表示

 def modelXGBClassifier(self, trial: optuna.trial.Trial):
     """Return an XGBClassifier configured from Optuna suggestions.

     The trial's suggestions are layered over a fixed baseline configuration;
     entries of ``self.params`` take precedence over the suggestions.
     """
     # Sample the search space first, in a fixed order.
     opt_params = {}
     opt_params['max_depth'] = trial.suggest_int("max_depth", 2, 2**4)
     opt_params['learning_rate'] = trial.suggest_discrete_uniform(
         'learning_rate', 0.001, 1, 0.001)
     opt_params['n_estimators'] = trial.suggest_int(
         "n_estimators", 2, 2**10, log=True)
     opt_params['gamma'] = trial.suggest_loguniform('gamma', 1e-8, 1)
     opt_params['min_child_weight'] = trial.suggest_loguniform(
         'min_child_weight', 1e-8, 2**10)
     opt_params['subsample'] = trial.suggest_uniform('subsample', 0.1, 1)
     opt_params['colsample_bytree'] = trial.suggest_uniform(
         'colsample_bytree', 0.1, 1)
     opt_params['colsample_bylevel'] = trial.suggest_uniform(
         'colsample_bylevel', 0.1, 1)
     opt_params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-8, 10)
     opt_params['reg_lambda'] = trial.suggest_loguniform(
         'reg_lambda', 1e-8, 10)

     # Baseline configuration that the sampled values are applied on top of.
     baseline = {
         'max_depth': 3,
         'learning_rate': 0.1,
         'n_estimators': 100,
         'silent': True,
         'objective': "binary:logistic",
         'booster': 'gbtree',
         'n_jobs': 1,
         'gamma': 0,
         'min_child_weight': 1,
         'max_delta_step': 0,
         'subsample': 1,
         'colsample_bytree': 1,
         'colsample_bylevel': 1,
         'reg_alpha': 0,
         'reg_lambda': 1,
         'scale_pos_weight': 1,
         'base_score': 0.5,
         'random_state': 0,
         'missing': None,
     }
     clf = XGBClassifier(**baseline)

     overrides = dict(opt_params)
     overrides.update(self.params)
     clf.set_params(**overrides)
     return clf

コード例 #22

0

ファイルを表示

def log_xgboost(params, train_X, train_Y, test_X, test_Y):
    """Train an XGBClassifier inside an MLflow run and log its results.

    Logs every entry of ``params`` as an MLflow parameter, fits the model
    with the train and test sets as evaluation sets, then logs the learning
    curves, the confusion matrix, the metrics and the model itself.

    Args:
        params: Mapping of estimator parameters; logged to MLflow and
            applied to the classifier with ``set_params``.
        train_X: Training features.
        train_Y: Training labels; must expose ``.values.ravel()``.
        test_X: Evaluation features.
        test_Y: Evaluation labels; must expose ``.values.ravel()``.

    Returns:
        Tuple ``(model, predictions, acc, loss)``: the fitted model, its
        class predictions on ``test_X``, the accuracy, and the log loss
        computed from the predicted class probabilities.
    """
    import os

    with mlflow.start_run():
        for k, v in params.items():
            mlflow.log_param(k, v)
        mlflow.set_tag("state", "dev")

        # Flatten the labels once instead of on every use.
        train_labels = train_Y.values.ravel()
        test_labels = test_Y.values.ravel()

        xgc = XGBClassifier(objective="binary:logistic")
        xgc.set_params(**params)
        model = xgc.fit(train_X,
                        train_labels,
                        eval_set=[(train_X, train_labels),
                                  (test_X, test_labels)],
                        eval_metric=['error', 'logloss'],
                        verbose=0)
        predictions = model.predict(test_X)
        acc = accuracy_score(test_labels, predictions)
        # Log loss is defined on predicted probabilities. Feeding it hard
        # class labels (as before) only measures clipped 0/1 values.
        probabilities = model.predict_proba(test_X)
        loss = log_loss(test_labels, probabilities, labels=model.classes_)

        ## Plots
        # The artifacts are written to disk before being uploaded, so make
        # sure the scratch directory exists.
        os.makedirs("temp", exist_ok=True)
        error_plot = plot_learning(model, "error")
        error_plot.savefig("temp/error_plot.png")
        mlflow.log_artifact("temp/error_plot.png")
        loss_plot = plot_learning(model, "logloss")
        loss_plot.savefig("temp/logloss.png")
        mlflow.log_artifact("temp/logloss.png")
        conf_mat = confusion_matrix(test_labels, predictions)
        conf_mat_plot = sns.heatmap(conf_mat, annot=True, fmt='g')
        conf_mat_plot.figure.savefig("temp/confmat.png")
        mlflow.log_artifact("temp/confmat.png")
        mlflow.log_metrics({'log_loss': loss, 'accuracy': acc})

        mlflow.xgboost.log_model(model, "model")

        print(f"Model trained with parameters: {params}")

        return model, predictions, acc, loss

コード例 #23

0

ファイルを表示

        logger.info('test col: %s' % (add_col))
        for train_idx, test_idx in list(cv)[:1]:
            train_omit_idx = numpy.intersect1d(train_idx, omit_idx)
            logger.info('ommit size: %s %s' %
                        (train_idx.shape[0], len(train_omit_idx)))

            ans = []
            insample_ans = []
            for i in ['']:  #
                logger.info('model: %s' % i)
                cols = data.columns.values  # [col for col in feature_column if 'L%s' % i in col]
                logger.info('model xg: %s' % i)
                model = XGBClassifier(seed=0)
                #model = RandomForestClassifier(n_jobs=-1, random_state=0)
                gc.collect()
                model.set_params(**params)
                model.fit(data.ix[train_idx, cols], target[train_idx])

                ans = model.predict_proba(data.ix[test_idx, cols])[:, 1]
                insample_ans = model.predict_proba(data.ix[train_idx, cols])[:,
                                                                             1]

            logger.info('train_end')
            """
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = ids.ix[test_idx].values
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]

コード例 #24

0

ファイルを表示

ファイル: tunningXGB.py プロジェクト: jenniyanjie/data-scientist-collection

        subsample=0.9,
        colsample_bytree=0.7,
        objective='multi:softprob',
        scale_pos_weight=1,
        seed=0,
    )

    xgb_enc = OneHotEncoder(handle_unknown='ignore')
    xgb_enc.fit(X)  # since I am working mostly on categorical features

    estimate_nround = False
    if estimate_nround:
        logger.info('estimating the n_estimators...')
        best_n_rounds = estimate_xgb_nround(xgb_model, X, y)
        logger.info('complete estimating the n_estimators')
        xgb_model.set_params(n_estimators=best_n_rounds)
        xgb_model.fit(xgb_enc.transform(X), y)
        plot_importance_matrix(xgb_model, csv_path)
        sys.exit()

    # start tunning
    param_grid = {
        ### step 1 ###
        #                'max_depth': [3, 5, 7, 9],
        #                'min_child_weight': [1, 3, 5]
        ### best parameter for round 1: max_depth = 5, min_child_weight = 1 ###
        ### step 2 ###
        #                'max_depth': [4, 5, 6],
        #                'min_child_weight': [1, 2]
        ### best parameter for round 2: max_depth = 5, min_child_weight = 1 ###
        ### step 3 ###

コード例 #25

0

ファイルを表示

ファイル: XGBtmp.py プロジェクト: pkepley/kaggle.austin.animals

					objective= 'multi:softprob', 
					max_depth = 7, 
					gamma= .2)

# Use the native xgboost interface to cross-validate the number of boosting
# rounds, starting from the sklearn wrapper's parameters.
xgb_param = clf.get_xgb_params()
xgb_param['num_class'] = 5
xgb_param['eval_metric'] = 'mlogloss'
Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
# NOTE(review): `show_progress` is only accepted by some xgboost versions -
# confirm against the pinned release.
cvresult = xgb.cv(xgb_param,
                  Xg_train,
                  num_boost_round=clf.get_params()['n_estimators'],
                  nfold=5,
                  show_progress=True,
                  early_stopping_rounds=100)
# One row of `cvresult` per boosting round kept, so its length is the number
# of estimators to refit with.
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
best_outcome_score = cvresult.min()

# Prefer the grid-search model and fall back to `clf` when that fails.
# NOTE(review): presumably this guards against `grid` being undefined or
# unfitted - confirm. `Exception` (rather than a bare `except`) keeps
# KeyboardInterrupt and SystemExit from being swallowed.
try:
    # predict the outcome probabilities
    y_pred = grid.predict_proba(X_test)
except Exception:
    # predict the outcome probabilities
    y_pred = clf.predict_proba(X_test)


# Create a data frame
column_names = possible_outcomes[:]
# `pd.Index` on an int64 array builds the same integer index as the removed
# `pd.Int64Index` constructor and works on both old and new pandas.
idx = pd.Index(np.arange(1, 11457, dtype='int64'))

コード例 #26

0

ファイルを表示

ファイル: Combined models-Copy1.py プロジェクト: tonypeng1/Titanic

                  nfold=5,
                  metrics='auc', 
                  early_stopping_rounds=50,
                  seed=42
                  )

# NOTE(review): the bare expressions below have no effect when run as a
# plain script; presumably they are notebook cells kept for display - confirm.
cvresult.head()

cvresult.shape

# The number of rows in `cvresult` is used as the number of estimators.
xgb_best_param = {'n_estimators': cvresult.shape[0]}
xgb_best_param
# best n_estimators value to be used in the stack model

# update xgb with the optimal n_estimators
# NOTE(review): `xgb` is used here as an estimator object (it has
# `set_params`), not as the xgboost module - confirm where it is rebound.
xgb.set_params(**xgb_best_param)

# #### 2. Tune max_depth and min_child_weight

# Candidate values: max_depth in {2, 3}, min_child_weight in {1, 2, 3}.
parameter_grid = {
                    'max_depth': np.arange(2, 4),
                    'min_child_weight': np.arange(1, 4)
                 }
grid_xgb = GridSearchCV(xgb, parameter_grid, cv=cv_splitter, n_jobs=-1)
grid_xgb.fit(X_1, y)

grid_xgb.best_params_

# Merge the tuned values into the running dictionary of best parameters.
xgb_best_param.update(grid_xgb.best_params_)
xgb_best_param
# best parameter values to be used in the stack model

コード例 #27

0

ファイルを表示

ファイル: model.py プロジェクト: wkondrusiewicz/flavours-of-physics

    no_test=False)
# Two variable lists returned by the project `data` helper; only `var_kin`
# is used in the statements below.
var_kin, var_geo = data.variables_list()
# NOTE(review): shuffle=True without a random_state, so the folds differ
# between runs - confirm this is intended.
skf = StratifiedKFold(n_splits=n, shuffle=True)

# Estimator hyperparameters passed to the XGBClassifier constructor.
params = {
    'learning_rate': 0.05,
    'n_estimators': 100,
    'max_depth': 4,
    'subsample': 0.5,
    'n_jobs': 4,
    'min_child_weight': 15
}

# Training options: set on the estimator here and also handed to the
# project `clf.Classifier` wrapper below as `train_params`.
train_params = {'early_stopping_rounds': 10, 'verbose': 0}
xgb = XGBClassifier(**params)
xgb.set_params(**train_params)

# Wrap the estimator with the cross-validation splitter and the kinetic
# variable list, then fit, run the checks and predict on the test data.
xgb_kin = clf.Classifier(model=xgb,
                         cv=skf,
                         variables=var_kin,
                         model_name='XGBoost',
                         var_name='kinetic',
                         fig_name='xgb',
                         train_params=train_params)
xgb_kin.fit(train)
xgb_kin.check_ks_and_cvm(train,
                         check_agreement=check_agreement,
                         check_correlation=check_correlation)
xgb_kin.predict(data=test)
params = {
    'learning_rate': 0.05,

コード例 #28

0

ファイルを表示

ファイル: NICU_xgboost_wolabor_2016.py プロジェクト: bmeier01/CDC_Natality_Analysis

# Encode the training sample and the test set with the project
# `LabelEncoding` helper.
dsample = LabelEncoding(dsample)
dtest = LabelEncoding(test)

# 'AB_NICU' is the target column; every other column is a feature.
X_train = dsample.drop('AB_NICU', axis=1)
y_train = dsample['AB_NICU']
X_test = dtest.drop('AB_NICU', axis=1)
y_test = dtest['AB_NICU']

# Print the shapes of the four splits.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# XGBoost initial fit with default hyperparameters and a fixed random_state.
xgb = XGBClassifier()
xgb.set_params(random_state=0)
xgb.fit(X_train, y_train)
# Error is reported as 1 - score on each split.
print("The training error is: %.5f" % (1 - xgb.score(X_train, y_train)))
print("The test error is: %.5f" % (1 - xgb.score(X_test, y_test)))

# Commented out IPython magic to ensure Python compatibility.
# set the parameter grid
xgb_param_grid ={'learning_rate': [0.01,0.05,0.1],
                 'max_depth': [3,4,5,6],
                 'min_child_weight': [4,5,6],
                 'n_estimators': [100,200,300,400]}

# Grid search over the grid above: 5-fold CV, accuracy scoring, all cores.
grid_search_xgb = GridSearchCV(xgb, xgb_param_grid, scoring='accuracy', cv= 5, n_jobs=-1, return_train_score = True)
# NOTE(review): the only fit call in view is the commented-out magic below,
# so `grid_search_xgb` is never fitted here - confirm it is fitted elsewhere.
# %time grid_search_xgb.fit(X_train, y_train)

コード例 #29

0

ファイルを表示

    max_score = -100
    best_thresh = None
    pg = list(ParameterGrid(all_params))
    for i in range(data.shape[1]):
        thresh, score = mcc_optimize(data[:, i], target)
        logger.info('model:%s, thresh: %s, total score: %s, max_score: %s' %
                    (i, thresh, score, max_score))

    for i, params in enumerate(pg):
        logger.info('%s/%s param: %s' % (i + 1, len(pg), params))
        pred_proba_all = []
        y_true = []
        for train_idx, test_idx in cv:
            model = XGBClassifier(seed=0)
            #model = LogisticRegression(n_jobs=-1, class_weight='balanced')
            model.set_params(**params)

            model.fit(data[train_idx],
                      target[train_idx],
                      eval_metric=evalmcc_xgb_min,
                      verbose=False)

            #pred_proba = data[test_idx, -1]
            pred_proba = model.predict_proba(data[test_idx])[:, 1]
            pred_proba_all = numpy.r_[pred_proba_all, pred_proba]
            y_true = numpy.r_[y_true, target[test_idx]]
            score = roc_auc_score(target[test_idx], pred_proba)
            #logger.info('    score: %s' % score)
            #thresh, score = mcc_scoring(model, data[test_idx], target[test_idx])
            list_score.append(score)
            #logger.info('    thresh: %s' % thresh)

コード例 #30

0

ファイルを表示

    def rvs(self, random_state):
        """Return one element of ``self.support`` picked by ``random_state.choice``."""
        pick = random_state.choice
        return pick(self.support)


def search(param_dict,
           cv_obj,
           X,
           y,
           n_iter=1_000,
           skeleton=None,
           scoring='neg_log_loss',
           **kwargs):
    """Set up a randomized hyperparameter search around an estimator.

    NOTE(review): the portion visible here ends once the
    ``RandomizedSearchCV`` object is built; no fit or return statement is
    shown, and ``X`` / ``y`` are not used in the part shown.

    Args:
        param_dict: Mapping of parameter name to a value that is wrapped in
            ``Uniform`` to form the sampling distribution.
        cv_obj: Passed to ``RandomizedSearchCV`` as ``cv``.
        X: Feature data.
        y: Target data.
        n_iter: Number of parameter settings sampled by the search.
        skeleton: Estimator to tune. When ``None``, an
            ``XGBClassifier(n_jobs=1, random_state=SEED)`` is created.
        scoring: Passed to ``RandomizedSearchCV`` as ``scoring``.
        **kwargs: Extra options; only the presence of the
            ``'early_stopping_rounds'`` key is inspected in the part shown.
    """
    if skeleton is None:
        skeleton = XGBClassifier(n_jobs=1, random_state=SEED)
        # Raise n_estimators to 1000 when early stopping is requested.
        # This is applied to the default skeleton only, never to an
        # estimator supplied by the caller.
        if 'early_stopping_rounds' in kwargs:
            skeleton.set_params(n_estimators=1_000)

    # One `Uniform` distribution per tunable parameter.
    dist = {k: Uniform(v) for k, v in param_dict.items()}

    optim = RandomizedSearchCV(
        estimator=skeleton,
        param_distributions=dist,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv_obj,
        return_train_score=True,
        verbose=1,
        n_jobs=4,
        random_state=SEED,
    )

コード例 #31

0

ファイルを表示

Fitting the final XGBoost with parameters found on grid_cv.
Use all training data.
Test on test data.
#######################################################################
"""
# Start from the best parameters found by the grid search.
params = best_params
# params = {'colsample_bytree': 0.6,
#           'learning_rate': 0.01,
#           'max_depth': 3,
#           'n_estimators': 250,
#           'subsample': 1.0}

xgb = XGBClassifier(**params)
# The estimator parameter is `n_jobs`. The previous `njobs` spelling is not
# an XGBClassifier parameter, so the intended 4 threads were never applied.
xgb.set_params(silent=True,
               verbosity=0,
               n_jobs=4,
               random_state=0,
               objective='binary:logistic',
               scale_pos_weight=scale_pos_weight)

# Track both splits during training; xgboost names them 'validation_0'
# (train) and 'validation_1' (test) in the order given here.
eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["error", "auc"]

xgb.fit(X_train, y_train,
        eval_metric=eval_metric,
        eval_set=eval_set,
        verbose=False)

# Per-round metric history recorded for each entry of `eval_set`.
results = xgb.evals_result()

fig1, ax1 = plt.subplots()
ax1.plot(results['validation_0']['error'], label='Train Error')

コード例 #32

0

ファイルを表示

def get_model(PARAMS):
    """Create an XGBClassifier and apply the entries of ``PARAMS`` to it.

    ``PARAMS`` is unpacked as keyword arguments to ``set_params``; the
    configured, unfitted classifier is returned.
    """
    # `set_params` returns the estimator itself, so the call can be chained.
    return XGBClassifier().set_params(**PARAMS)