Code Example #1
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
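        # `xgtrain` (like `params`, `data`, `X`, `y`) is assumed to be defined in the enclosing script.
        # With early stopping, xgb.cv returns one row per boosting round actually kept,
        # so the row count below is the tuned number of trees.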
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        params['n_estimators'] = cvresult.shape[0]
        print("best tree size is {}".format(cvresult.shape[0]))
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("精确率Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
Code Example #2
def Create_Model(X_train, X_test, y_train, y_test, learning_rate, n_estimators,
                 max_depth, min_child_weight, gamma, subsample,
                 colsample_bytree, reg_alpha, eval_metric):

    ROCforest = XGBClassifier(learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha,
                              objective='binary:logistic',
                              nthread=4,
                              seed=12)

    cv_folds = 5

    xgb_param = ROCforest.get_xgb_params()
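    # get_xgb_params() exports the sklearn-style settings as a plain dict of native
    # booster parameters, which is what the low-level xgb.cv call expects.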
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=ROCforest.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics=eval_metric)

    ROCforest.set_params(n_estimators=cvresult.shape[0])

    ROCforest.fit(X_train, y_train)

    return ROCforest
Code Example #3
def xgb_model(x1, y1):
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight Rescale
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )
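    # scale_pos_weight below is set to (#negatives / #positives), the usual way to
    # re-balance the loss when the positive class is rare.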

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    imp = importances(xgb1, X_test, y_test)  # permutation
    imp = imp.reset_index()
    imp_ = imp[imp["Importance"] >= 0.0001]

    feats = imp_["Feature"].tolist()

    return imp, feats
Code Example #4
    def cv(self, cache=False):
        if not cache:
            rows = get_db_data(300000)
            features = rows[:, :-1]
            ys = rows[:, -1]
            try:
                dump_svmlight_file(features, ys, 'catarse.txt.all')
            except Exception as inst:
                print(inst)
            dtrain = xgb.DMatrix(features, label=ys)
            X = features
            y = ys
        else:
            # load file from text file, also binary buffer generated by xgboost
            dtrain = xgb.DMatrix('catarse_recommender/common/catarse.txt.all')
            data = load_svmlight_file(
                'catarse_recommender/common/catarse.txt.all')
            X = data[0]
            y = data[1]

        xgb1 = XGBClassifier(learning_rate=0.01,
                             n_estimators=800,
                             max_depth=4,
                             nthread=8,
                             objective='binary:logistic',
                             seed=27)

        xgb_param = xgb1.get_xgb_params()
        cvresult = xgb.cv(xgb_param,
                          dtrain,
                          num_boost_round=xgb1.get_params()['n_estimators'],
                          nfold=5,
                          metrics=['logloss', 'error'],
                          early_stopping_rounds=20,
                          stratified=True,
                          shuffle=True)
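        # stratified=True keeps the class ratio roughly constant across the 5 folds;
        # shuffle=True randomises row order before the split.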
        print(cvresult)
        with open("catarse_recommender/common/cv_result.obj", "wb") as filehandler:
            pickle.dump(cvresult, filehandler)
Code Example #5
    n_estimators=100,
    # objective="gblinear",
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    # n_classes=12,
    nthread=4,
    # scale_pos_weight=1,
    seed=27)
modelfit(xgb1, y, predictors)
# Predict
dtrain = xgb.DMatrix(predictors, y)
params = xgb1.get_xgb_params()
params['num_class'] = 12
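# Note: no num_boost_round is passed to xgb.train below, so it falls back to the
# library default (10 rounds) rather than the n_estimators presumably tuned by
# modelfit above; the native API ignores any sklearn-only keys left in params.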
model = xgb.train(dtrain=dtrain, params=params)
dtest = xgb.DMatrix(Xtest)
pred = pd.DataFrame(model.predict(dtest),
                    index=gatest.index,
                    columns=targetencoder.classes_)

#Step 2: Tune max_depth and min_child_weight
param_test1 = {
    'max_depth': [7, 9, 10],  #10 12
    'min_child_weight': [5, 7, 9]  #9 15
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                                                n_estimators=100,
                                                max_depth=5,
Code Example #6
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID','TARGET'], axis=1).values

y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=600,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.6815,
 colsample_bytree=0.701,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics=['auc'], early_stopping_rounds=50,
                  verbose_eval=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":y_test, "TARGET":output})
submission.to_csv("submission.csv", index=False)
Code Example #7
def modelfit(train,
             labels,
             test,
             features,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)

    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        labels,
                                                        test_size=test_percent,
                                                        random_state=23)

    xgb_param = model.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[features], y_train)
    xgcv = xgb.DMatrix(X_test[features])
    xgtest = xgb.DMatrix(test[features])
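    # Note: xgcv and xgtest are built but never used below; both evaluation and the
    # final test predictions go through the sklearn predict_proba API instead.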
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    print("n_estimators=")
    print(cvresult.shape[0])
    model.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    model.fit(X_train, y_train)

    ##training predictions
    proba = model.predict_proba(X_test)
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    #Print model report:
    #	print "\nModel Report"
    #	print "Accuracy : %.4g" % accuracy_score(y_train, preds)
    #	print "AUC Score (Train): %f" % roc_auc_score(y_train, preds)

    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    #	plt.show()

    ##test predictions
    test_proba = model.predict_proba(test)
    test_preds = test_proba[:, 1]

    return test_preds
Code Example #8
                        colsample_bytree=0.75,
                        min_child_weight=2,
                        eta=0.025,
                        gamma=0,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

    isCV = True
    cv_folds = 3
    early_stopping_rounds = 10
    predictors = [x for x in train_df.columns if x not in [target, IDcol]]

    if isCV:
        xgb_param = xgb.get_xgb_params()
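        # As excerpted, `xgb` names both the (truncated) XGBClassifier instance and the
        # xgboost module used for DMatrix/cv below; in the full source the estimator
        # presumably has a different name.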
        xgtrain = xgb.DMatrix(train_df[predictors].values,
                              label=train_df[target].values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=xgb.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=True)
        xgb.set_params(n_estimators=cvresult.shape[0])

    xgb.fit(train_df[predictors],
            train_df['acc_now_delinq'],
            eval_metric='auc')
    dtrain_predictions = xgb.predict(train_df[predictors])
Code Example #9
File: test1.py  Project: mircean/ML
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0]/tmp

    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]

    else:
        n_places_th_local = 2

        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]


    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)
        
    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:    
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
   
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.get_booster().best_score, 6)
                n_estimators = clf.get_booster().best_ntree_limit
            else:
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score)
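                # `cv` and `DMatrix` are presumably imported straight from xgboost;
                # feval plugs the project's custom scoring function into cross-validation.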

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
    
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)

                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)

                score = np.array(scores_local).mean()
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
    
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)

                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)

                    score = np.array(scores_local).mean()
                    print('  ', x_start, y_start, score)
                    scores_cv.append(score)

                score = np.array(scores_cv).mean()
    
    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx])    

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]
Code Example #10
def fit_xgboost(param_grid, param_table, train, col_type, find_n_estimator=False,
                cv_iterations=5, cv_folds=5, nthread=3, seed=1, verbose=0):

    target = col_type['target']
    features = col_type['features']
    ID = col_type['ID']

    start_time = strftime("%Y-%m-%d %H-%M", gmtime())
    pred_return = {}
    for params in param_table.itertuples(index=True, name='NamedTuple'):
        params = params._asdict()
        index = params['Index']
        params.pop('Index')  # remove "Index" from params

        params['objective'] = 'binary:logistic'
        params['nthread'] = nthread
        params['random_state'] = seed
        params['seed'] = seed
        params['silent'] = True

        xgb_model = XGBClassifier()
        xgb_model.set_params(**params)

        if find_n_estimator:
            xgb_train = xgb.DMatrix(train[features], label=train[target])
            cv_result = xgb.cv(
                xgb_model.get_xgb_params(),
                xgb_train,
                num_boost_round=int(params['n_estimators']),
                nfold=cv_folds,
                metrics='auc',
                early_stopping_rounds=50,
                seed=seed)

            best_n_estimator = cv_result.shape[0]
            param_table.at[index, 'n_estimators'] = best_n_estimator
            xgb_model.set_params(n_estimators=best_n_estimator)

        scores = []
        pred_all = []
        for cv_index in range(cv_iterations):
            pred = train.loc[:, [ID]]  # get only the ID column
            # k-fold cross validation
            skf = StratifiedKFold(n_splits=cv_folds, random_state=cv_index, shuffle=True)
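            # Repeating k-fold CV with a different random_state each iteration and
            # averaging the fold scores reduces the variance of the AUC estimate.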

            for train_index, dev_index in skf.split(train[features].values, train[target].values):
                X_train = train[features].iloc[train_index].values
                y_train = train[target].iloc[train_index].values

                X_dev = train[features].iloc[dev_index].values
                y_dev = train[target].iloc[dev_index].values

                # Fit the algorithm on train folds
                xgb_model.fit(X_train, y_train, eval_metric='auc')

                # Predict on dev fold
                pred_dev = xgb_model.predict_proba(X_dev)[:, 1]
                pred.loc[dev_index, 'Pred'] = pred_dev

                # Compute the score
                score = metrics.roc_auc_score(y_dev, pred_dev)
                scores.append(score)

            if len(pred_all) == 0:
                pred_all = pred
            else:
                pred_all = pd.concat([pred_all, pred], axis=0)

        pred_mean = pred_all.groupby(ID)['Pred'].mean()  # avg predict_proba for each ID
        score = metrics.roc_auc_score(train.sort_values(ID)[target].values,
                                      pred_mean)  # use avg pred to compute auc score
        pred_return['Pred_' + str(index)] = pred_mean  # store the pred result for use in stacking

        param_table.at[index, 'Score'] = score
        param_table.at[index, 'Score_Std'] = np.std(scores)

        if verbose == 1:
            print('{} : {}'.format(index, param_table.iloc[index, :]))

    param_table["Score_Weighted"] = param_table["Score"] - 0.1 * param_table["Score_Std"]

    # update_param_grid
    best_param_index = param_table["Score_Weighted"].idxmax()
    print("Param_grid size: {}".format(param_table.shape[0]))
    print("Current Score: {},  Score_Std: {}".format(param_table.loc[best_param_index, "Score"],
                                                     param_table.loc[best_param_index, "Score_Std"]))
    print("--------------------------")
    for param in param_grid:
        best_param = param_table.loc[best_param_index, param]
        if isinstance(param_grid[param], list):
            if len(param_grid[param]) > 1 or (len(param_grid[param]) == 1 and param_grid[param][0] != best_param):
                print("{}: tuned to {}".format(param, best_param))
        else:
            print("{}: tuned to {}".format(param, best_param))
        param_grid[param] = [best_param]

    return param_grid, pred_return
Code Example #11
# set grid search parameters
# reg_alpha = [1e-5, 1e-2, 0.1, 1]
reg_lambda = [1e-5, 1e-2, 0.1, 1]
# num_fits = len(reg_alpha)*5
num_fits = len(reg_lambda) * 5
param_grid = dict(
    # reg_alpha=reg_alpha,
    reg_lambda=reg_lambda)

kfold = StratifiedKFold(n_splits=5, shuffle=True)
grid_search = GridSearchCV(model,
                           param_grid,
                           scoring="neg_log_loss",
                           cv=kfold,
                           verbose=num_fits)
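# scoring="neg_log_loss" because GridSearchCV maximises its score, so log-loss is negated.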

start = time.time()
grid_result = grid_search.fit(X_train, y_train)

print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

end = time.time()
print("parameters: {}".format(model.get_xgb_params()))
print("\nTotal time : {:.2f} {}".format((end - start) / 60, "minutes"))
Code Example #12
def xgb_model2(x1, y1, ft):
    ## Remove features that negatively impact the model - Used after xgb_mode2 is already run once
    ##Copy results from XGB2_FEATS into 'unwanted'
    # unwanted = {'fever_unknown', 'other_ext_injury', 'med_angiotensin_ii_i', 'wbc_disease', 'proc_124', 'acute_bronch', 'hemorrhoid', 'chf', 'poison_psycho', 'eye_inflam', 'lower_limb_fract', 'biliary_tract', 'other_bone_disease', 'med_antifungal', 'spondylosis', 'secndry_malig', 'other_joint', 'neoplasm_unspec', 'chest_pain_nos', 'acq_foot_deform', 'mood', 'nonmalig_breast', 'schizo', 'suicide', 'osteo_arth', 'other_connective', 'medical_eval'}
    # ft = [e for e in ft if e not in unwanted]
    print("XGB Features:\n", ft, "\n")

    x1 = x1.loc[:, ft]
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight Rescale
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])
    xgb_cv_score = cross_val_score(
        xgb1, X_train, np.ravel(y_train), cv=10, scoring="roc_auc"
    )

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    D = feature_dependence_matrix(X_train)
    viz1 = plot_dependence_heatmap(D, figsize=(11, 10))
    viz1.save("output/Psych_XGB_feat_depend_" + outfile)

    xgb_predict = xgb1.predict(X_test)

    print("=== All AUC Scores [CV - Train] ===")
    print(xgb_cv_score, "\n")
    print("=== Mean AUC Score [CV - Train] ===")
    print(xgb_cv_score.mean(), "\n")
    print("=== Confusion Matrix [Test] ===")
    print(confusion_matrix(y_test, xgb_predict), "\n")
    print("=== Classification Report [Test] ===")
    print(classification_report(y_test, xgb_predict), "\n")
    print("=== AUC Score [Test] ===")
    print(roc_auc_score(y_test, xgb_predict), "\n")

    imp = importances(xgb1, X_test, y_test)  # permutation
    viz2 = plot_importances(imp)
    viz2.save("output/Psych_XGB_feat_imp_" + outfile)
    imp = imp.reset_index()
    imp_ = imp[imp["Importance"] < 0.00000]

    feats = imp_["Feature"].tolist()

    xgb_roc_auc = roc_auc_score(y_test, xgb_predict)
    fpr, tpr, thresholds = roc_curve(y_test, xgb1.predict_proba(X_test)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label="XGB Classifier (area = %0.3f)" % xgb_roc_auc)
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic: Clinical Data Only [XGB]")
    plt.legend(loc="lower right")
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.show()

    return imp, feats
Code Example #13
            },
            {
                'gamma': np.linspace(0, 0.5, 5)
            },
            {
                'n_estimators': list(range(500, 3000, 500)),
            },
            {
                'subsample': np.linspace(0, 1, 5),
                'colsample_bytree': np.linspace(0, 1, 5)
            },
            {
                'reg_alpha': [0, 1e-10, 1e-5, 1e-2]
            },
            {
                'learning_rate': [0.01]  # set learning rate low
            },
            {
                'n_estimators': list(range(100, 5000, 500))
            }
        ]
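    # The spaces above are tuned one group at a time (coordinate-descent style); the
    # learning rate is dropped to 0.01 near the end and n_estimators is re-tuned for it.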

    for i, param_space in enumerate(param_spaces):
        print('pct spaces tuned: ' + str(1. * i / len(param_spaces)))
        model = cv_grid(param_space, model, X, y)

        tuned_params_dict = model.get_xgb_params()
        alsDataManager.save_dict_as_json(
            output_dict=tuned_params_dict,
            output_path='xgboost_tuned_params.txt')
    save_model(mod_name=model_name, alg=model, cv_res=cv_results)

else:
    # Load the model
    model, cv_results = load_model(mod_name=model_name)

# Print parameters
display_param(mod_name=model_name, cv_res=cv_results, alg=model, grid_cv=False)

# Make predictions
train_pred, train_prob = predict_results(alg=model,
                                         d_train_x=train_x,
                                         d_train_y=train_y)

# Retrieve parameters
n_estimators = model.get_xgb_params()["n_estimators"]

model_list[model_name] = [model, train_pred, train_prob]

print("Test 1 Over")

# ======================== Step 2.2 : Test 2 ========================
print("=>=>=> Launching test 2")
model_name = "xgboost_2"
test = False

if test:
    # Define the model
    xgb_params = {
        "learning_rate": 0.1,
        "n_estimators": n_estimators,
Code Example #15
def set_parameters(set_name, golden_set, input_file):

    golden = str_to_bool(golden_set)

    #-------------------------------------------------------------------------

    #read in the directory that is being run
    data_dir = set_name

    #read in the parameters file and load it

    full_path = os.path.join(working_dir, "{0}".format(data_dir),
                             'params.yaml')
    stream = open(full_path, 'r')
    parameters = yaml.load(stream, Loader=yaml.FullLoader)

    #read in Hypatia data as pandas dataframe (2D structure), drop HIP numbers
    df = pd.read_csv(input_file)

    set_number = set_name

    #-------------------------------------------------------------------------

    if golden:
        df2 = df.copy()
        df2.loc[df2[(df2['Exo'] == 1)
                    & (df2['MaxPMass'] > parameters['gas_giant_mass'])].
                sample(10, random_state=np.random.RandomState()).index,
                'Exo'] = 0
        yy = df2.loc[df2['Exo'] == 0].index
        zz = df.loc[df['Exo'] == 0].index
        changed = [ind for ind in yy if not ind in zz]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2.copy()
        yy2 = df2.loc[df2['Exo'] == 0].index
        zz2 = df.loc[df['Exo'] == 0].index
        changed2 = [ind for ind in yy2 if not ind in zz2]
    #-------------------------------------------------------------------------

    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  #category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    df['MaxPMass'] = df['MaxPMass'].astype(np.number)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    df = df.drop(['HIP'], axis=1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')

    for key in parameters.keys():
        print('{0} = {1}'.format(key, parameters[key]))

    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']

    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']

    #Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml
    if (parameters['dropnans']):
        df = df[relevant_columns].dropna()

    print('Number of samples used in simulation: {0}'.format(df.shape[0]))

    print('')

    #Define the confusion matrix and other arrays
    cfm = np.zeros((2, 2))

    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)

    print('iteration \t estimators')
    print('---------------------------')

    #---------------------------XGBOOST LOOP----------------------------------------------

    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):

        #dataframe of 200 random hosts with giant planets
        df_iter_with_exo = df[(df['Exo'] == 1)
                              & (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        #dataframe of 200 random non hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())

        # make a new dataframe of the 400 star subset
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # make a dataframe of those stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]

        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], axis=1)
        # The Exo column (and hips)
        Y = df_train.Exo

        # Note: Using gbtree booster
        alg = XGBClassifier(
            learning_rate=0.1,            # def=0.3; lower rate prevents overfitting, keeps feature weights conservative
            n_estimators=1000,            # number of boosted trees to fit
            max_depth=6,                  # def=6; max depth of tree / complexity
            min_child_weight=1,           # def=1; min weight needed to continue leaf partitioning
            gamma=0,                      # def=0; minimum loss reduction required to make a partition on a leaf
            subsample=0.8,                # def=1; subsample ratio of the training set
            colsample_bytree=0.8,         # def=1; subsample ratio of columns when building each tree
            objective='binary:logistic',  # logistic regression for binary classification, outputs a probability
            nthread=1,                    # originally 8, but issue on laptop; def=max parallel threads
            scale_pos_weight=1,           # def=1; balance positive and negative weights
            seed=27)                      # def=0; random number seed

        #get input parameters of algorithm
        xgb_param = alg.get_xgb_params()

        #construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)

        #cross validation (CV) of xgboost to avoid overfitting
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)

        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])

        alg.fit(X[features], Y, eval_metric='auc')

        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]

        feat_imp = alg.get_booster().get_fscore()
        # See how the algorithm performs on the Exo data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)

        # Weighting function to ignore the null values
        normalized_features = pd.DataFrame(
            (1 -
             df_train[features].isnull().sum() / df_train[features].count()) *
            pd.Series(alg.get_booster().get_fscore()),
            columns=[iteration]).T

        #calculate the confusion matrix
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])
        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score

        df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index))
        df.loc[df_predict.index,
               'Predicted'] += alg.predict(df_predict[features])
        df.loc[df_predict.index, 'Prob'] = alg.predict(df_predict[features])

        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total,
             pd.Series(values, name=str(iteration))],
            axis=1)

        if (not iteration % 10):
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))

    #-------------------------------------------------------------------------

    # Calculate the confusion matrix
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()

    # Print confusion matrix
    print(np.round(cfm, 3))
    df['Prob'] = df['Predicted'] / df['Sampled']
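    # 'Prob' becomes the fraction of iterations in which a star, while held out of the
    # 400-star training sample, was predicted to be a planet host.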

    ###########-------------------Output List of Planets------------------------#########

    #Find the stars with >90% probability of hosting a planet, with the Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90)
                 & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))

    #Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name = data_dir+'/figures/planet_probabilities.csv'
    outfile = open(name, 'w')
    planetprobs.to_csv(outfile)
    outfile.close()

    #Create a second list with all stars in Hypatia and the probabilities
    planets2 = df[(df.Prob > .0)
                  & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    if golden:  #if 10 stars were randomly taken out
        changeddf = pd.DataFrame([])  #make empty dataframe
        for star in changedhips:  #loop over the 10 known planets hosts (defined at top)
            changeddf = changeddf.append(planets2.loc[planets2.index == star])
            if planets2.loc[
                    planets2.index ==
                    star].empty:  #catch for when a known planet host was cut (bc of abunds)
                temp = pd.Series([np.nan, np.nan, np.nan],
                                 index=['Sampled', 'Predicted', 'Prob'])
                temp.name = star
                changeddf = changeddf.append(
                    temp)  #append blank file (with star name as index)
        #Save golden set as a separate file with the date and time as a tag
        filename = '{0}/figures/goldenSetProbabilities' + str(
            datetime.today().strftime('-%h%d-%H%M')) + '.csv'
        changeddf.to_csv(filename.format(set_number), na_rep=" ")

    #Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name2 = data_dir+'/figures/planet_probabilitiesAll.csv'
    outfile2 = open(name2, 'w')
    planetprobs2.to_csv(outfile2)
    outfile2.close()

    ###########------------------------Save Files------------------------##########
    print('Saving data files')

    #Save files
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))

    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)

    print('Simulation completed successfully.')
    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
Code Example #16
# auc: Area under the curve

model = XGBClassifier(learning_rate=0.1,
                      n_estimators=1000,
                      max_depth=10,
                      min_child_weight=1,
                      gamma=0,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      objective='multi:softmax',
                      nthread=4,
                      scale_pos_weight=1,
                      seed=27,
                      num_class=3)

xgb_param = model.get_xgb_params()

xgtrain = xgb.DMatrix(X, y)

cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=1000,  #model.get_params()['n_estimators'], 
    nfold=5,
    metrics='merror',
    early_stopping_rounds=50,
    stratified=True)
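# 'merror' is the multiclass classification error rate; stratified=True keeps the
# three-class proportions consistent across folds.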

print('\ntraining error')
print(cvresult['train-merror-mean'])
print('\nvalidation error')
Code Example #17
                     max_depth=10,
                     max_delta_step=2,
                     random_state=42)

param_xgb = {'colsample_bytree': [0.7, 0.8, 0.9, 1], 'subsample': [0.9, 1]}
xgbc = model_fit(xgbc, xtrain, ytrain, param_xgb, False)

xgbc.fit(xtrain, ytrain)
bestpred_xgb = print_feature_importance(xgbc)

ypred_xgb = Test_Set_Report(xgbc, xtest, ytest)

# xgb.cv is used to get the actual number of n_estimators required based on the learning rate,
# it uses early_stopping_rounds to get the optimal value
xdtrain = xgb.DMatrix(xtrain, label=ytrain)
cvresult_xgb = xgb.cv(xgbc.get_xgb_params(),
                      xdtrain,
                      nfold=5,
                      num_boost_round=5000,
                      metrics='auc',
                      early_stopping_rounds=50)
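# cvresult_xgb.shape[0] is the optimal round count found by early stopping, though it
# is not written back into xgbc in this excerpt.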
''' Storing the predicted results along with the actual prediction for each phone number in a csv file'''

FinalResult_Python = pd.concat([FinalResult_Python, ypred_xgb], axis=1)
FinalResult_Python.rename(columns={0: 'Predicted Churn'}, inplace=True)
FinalResult_Python['Predicted Churn'] = FinalResult_Python[
    'Predicted Churn'].map({
        0: 'False',
        1: 'True'
    })
Code Example #18
            'wifi_infos'
        ]
    ]

    # Auto-tuning step 1: determine the optimal number of estimators.
    xgb0 = XGBClassifier(learning_rate=0.1,
                         n_estimators=n_estimators,
                         max_depth=max_depth,
                         min_child_weight=min_child_weight,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         objective='multi:softmax',
                         scale_pos_weight=1,
                         seed=0)

    xgb_param = xgb0.get_xgb_params()
    xgb_param['num_class'] = num_class
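    # num_class is added by hand because the parameter dump from the sklearn wrapper
    # does not carry it, and the native cv call needs it for multi:softmax.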
    xgtrain = xgb.DMatrix(df_train[feature], label=df_train['label'])
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=xgb0.get_params()['n_estimators'],
                      nfold=5,
                      metrics='merror',
                      early_stopping_rounds=20,
                      verbose_eval=True)
    n_estimators = cvresult.shape[0]
    print("n_estimators : %s" % n_estimators)

    # Auto-tuning step 2: determine the optimal max_depth and min_child_weight.
    modelfit1(df_train, feature)
    # Auto-tuning step 3: determine the optimal subsample and colsample_bytree.
Code Example #19
File: try.py  Project: Fred-Tian/AI_CMCC
    X_test = np.zeros((X_test_o.shape[0], X_test_o.shape[1] + 6), np.int64)
    y_train = y_train_o
    y_test = y_test_o
    for i in range(X_train_o.shape[0]):
        error_total[X_train_o[i, 0], y_train_o[i]] += 1
    l = X_train_o.shape[1]
    for i in range(X_train.shape[0]):
        X_train[i] = np.append(X_train_o[i], error_total[X_train_o[i, 0]])
        X_train[i, l + y_train_o[i]] -= 1
    for i in range(X_test.shape[0]):
        X_test[i] = np.append(X_test_o[i], error_total[X_test_o[i, 0]])

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_test, y_test)
    #    dtrain = xgb.DMatrix(X_train[tr],y_train[tr])
    #    dvalid = xgb.DMatrix(X_train[va],y_train[va])
    watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]
    bst = xgb.train(dtrain=dtrain,
                    num_boost_round=2000,
                    evals=watchlist,
                    verbose_eval=50,
                    params=xgbc.get_xgb_params(),
                    early_stopping_rounds=100)
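    # Early stopping monitors the last entry in `evals` ('valid_data'); the resulting
    # best_ntree_limit is then used when predicting below.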
    y_train_pred_prod[va] += bst.predict(xgb.DMatrix(X_test),
                                         ntree_limit=bst.best_ntree_limit)
    print(bst.best_ntree_limit)
#    y_test_pred_prod += bst.predict(xgb.DMatrix(X_test), ntree_limit=bst.best_ntree_limit)
#print('the roc_auc_score for train:',metrics.accuracy_score(y_test,np.argmax(y_test_pred_prod,axis=1)))
print('the accuracy for train:',
      metrics.accuracy_score(y, np.argmax(y_train_pred_prod, axis=1)))
print(time.clock() - start)
Code Example #20
ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.transform(x_test)
print('train.shape', x_train.shape, 'test.shape', x_test.shape)

xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=500,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=1)

xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(x_train_ss, label=y_train)
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics='auc',
                  early_stopping_rounds=50,
                  verbose_eval=10)

print('n_estimators', cvresult.shape[0])
print('test-auc:', cvresult.iloc[cvresult.shape[0] - 1, 0])
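# Positional indexing depends on the column order of the cv DataFrame; selecting
# cvresult['test-auc-mean'].iloc[-1] is unambiguous.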
xgb1.set_params(n_estimators=cvresult.shape[0])
print('model', xgb1)

#n_estimators 137
Code Example #21
File: XGBoost_CV.py  Project: AppTheHeart/-APP
clf = XGBClassifier(
    n_estimators=5000,  # total number of boosting iterations
    max_depth=4,  # tree depth
    min_child_weight=1,  # ...
    subsample=0.65,
    colsample_bytree=0.7,
    learning_rate=0.01,
    objective='multi:softmax',  # loss to minimise; multi-class, predicts the class label
    num_class=5,  # number of classes
    gamma=0,  # penalty parameter (minimum loss reduction for a split)
    reg_alpha=0.05,
    reg_lambda=0.05,
    nthread=4,
    seed=27)
xgtrain = xgb.DMatrix(X, label=y)
xgb_param = clf.get_xgb_params()
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=5000,
                  nfold=5,
                  metrics=['mlogloss'],
                  early_stopping_rounds=50,
                  stratified=True,
                  seed=1301)

print('Best number of trees = {}'.format(cvresult.shape[0]))
clf.set_params(n_estimators=cvresult.shape[0],
               use_label_encoder=False)  # set clf to the best number of trees found by cv
clf.fit(X, y, eval_metric='merror')
dtest_x = xgb.DMatrix(X1)
pre = clf.predict(X1)
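# Note: dtest_x is never used; the sklearn-style predict takes the raw feature matrix X1 directly.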