Example #1
def train_xgb(model=False):
    '''
    input:  model -- when True, return the configured (unfitted) classifier
    output: None; the trained model is pickled to xgb-model.pkl
    '''
    print('train_xgb')
    global log

    params = grid_search_xgb(True)

    clf = XGBClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'xgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_child_weight: %d' % params['min_child_weight']
    log += ', gamma: %.1f' % params['gamma']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    model = train(clf)
    with open('xgb-model.pkl', 'wb') as file:
        pickle.dump(model, file)

    print('train_xgb end')
    return
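
A minimal reload sketch, assuming the pickle written above; X_new and its feature count are placeholders, not part of the original pipeline:

import pickle
import numpy as np

# Reload the classifier persisted by train_xgb above.
with open('xgb-model.pkl', 'rb') as f:
    clf = pickle.load(f)

# X_new stands in for whatever feature matrix the surrounding pipeline produces;
# 10 is only a placeholder for the real feature count.
X_new = np.random.rand(5, 10)
print(clf.predict_proba(X_new)[:, 1])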
Example #2
def Create_Model(X_train, X_test, y_train, y_test, learning_rate, n_estimators,
                 max_depth, min_child_weight, gamma, subsample,
                 colsample_bytree, reg_alpha, eval_metric):

    ROCforest = XGBClassifier(learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha,
                              objective='binary:logistic',
                              nthread=4,
                              seed=12)

    cv_folds = 5

    xgb_param = ROCforest.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=ROCforest.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics=eval_metric,
                      # assumed addition: without early stopping, cvresult always has
                      # n_estimators rows and the set_params() call below is a no-op
                      early_stopping_rounds=50)

    ROCforest.set_params(n_estimators=cvresult.shape[0])

    ROCforest.fit(X_train, y_train)

    return ROCforest
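
A hedged usage sketch for Create_Model on synthetic data; the imports mirror what the example itself relies on (xgboost as xgb, XGBClassifier), and all parameter values below are illustrative:

import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data; Create_Model expects objects with a .values attribute.
X, y = make_classification(n_samples=500, n_features=10, random_state=12)
X, y = pd.DataFrame(X), pd.Series(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

clf = Create_Model(X_train, X_test, y_train, y_test,
                   learning_rate=0.1, n_estimators=200, max_depth=4,
                   min_child_weight=1, gamma=0, subsample=0.8,
                   colsample_bytree=0.8, reg_alpha=0, eval_metric='auc')
print('boosting rounds kept:', clf.get_params()['n_estimators'])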
Example #3
def xgb_model(x1, y1):
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight Rescale
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    imp = importances(xgb1, X_test, y_test)  # permutation
    imp = imp.reset_index()
    imp_ = imp[imp["Importance"] >= 0.0001]

    feats = imp_["Feature"].tolist()

    return imp, feats
Example #4
    def cv(self, cache=False):
        if not cache:
            rows = get_db_data(300000)
            features = rows[:, :-1]
            ys = rows[:, -1]
            try:
                dump_svmlight_file(features, ys, 'catarse.txt.all')
            except Exception as inst:
                print(inst)
            dtrain = xgb.DMatrix(features, label=ys)
            X = features
            y = ys
        else:
            # load file from text file, also binary buffer generated by xgboost
            dtrain = xgb.DMatrix('catarse_recommender/common/catarse.txt.all')
            data = load_svmlight_file(
                'catarse_recommender/common/catarse.txt.all')
            X = data[0]
            y = data[1]

        xgb1 = XGBClassifier(learning_rate=0.01,
                             n_estimators=800,
                             max_depth=4,
                             nthread=8,
                             objective='binary:logistic',
                             seed=27)

        xgb_param = xgb1.get_xgb_params()
        cvresult = xgb.cv(xgb_param,
                          dtrain,
                          num_boost_round=xgb1.get_params()['n_estimators'],
                          nfold=5,
                          metrics=['logloss', 'error'],
                          early_stopping_rounds=20,
                          stratified=True,
                          shuffle=True)
        print(cvresult)
        # Use a plain string path and a context manager so the file is closed.
        with open("catarse_recommender/common/cv_result.obj", "wb") as filehandler:
            pickle.dump(cvresult, filehandler)
Example #5
def train_xgb(model=False):
    global log

    params = grid_search_xgb(True)

    clf = XGBClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'xgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_child_weight: %d' % params['min_child_weight']
    log += ', gamma: %.1f' % params['gamma']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    return train(clf)
Example #6
    d_train = xgb.DMatrix(x_train, y_train)
    d_test = xgb.DMatrix(x_test, y_test)
    watchlist = [(d_train, "train"), (d_test, "test")]
    clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=100,
                        objective="binary:logistic",
                        eval_metric="logloss",
                        min_child_weight=1,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        reg_alpha=0,
                        reg_lambda=1,
                        n_jobs=-1,
                        nthread=-1,
                        seed=3)
    params = clf.get_params()
    evals_res = {}
    model_sklearn = xgb.train(params=params,
                              dtrain=d_train,
                              # n_estimators is an sklearn-wrapper parameter that xgb.train ignores,
                              # so pass the round count explicitly (it would otherwise default to 10)
                              num_boost_round=params['n_estimators'],
                              evals=watchlist,
                              evals_result=evals_res,
                              early_stopping_rounds=10,
                              verbose_eval=True)
    y_hat = model_sklearn.predict(d_test)
    df_evals = pd.DataFrame({
        "loss_train": evals_res.get("train").get("logloss"),
        "loss_test": evals_res.get("test").get("logloss")
    })
    df_evals.plot()

    y_pred = np.where(y_hat <= 0.5, 0, 1)
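
A short follow-up sketch for scoring the thresholded predictions above; it assumes y_test, y_hat and y_pred from the snippet and only adds the scikit-learn metric imports:

from sklearn.metrics import accuracy_score, log_loss

# y_hat holds predicted probabilities, y_pred the 0/1 labels cut at 0.5 above.
print('accuracy:', accuracy_score(y_test, y_pred))
print('log loss:', log_loss(y_test, y_hat))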
Example #7
def modelfit(train,
             labels,
             test,
             features,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)

    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        labels,
                                                        test_size=test_percent,
                                                        random_state=23)

    xgb_param = model.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[features], y_train)
    xgcv = xgb.DMatrix(X_test[features])
    xgtest = xgb.DMatrix(test[features])
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    print("n_estimators=")
    print(cvresult.shape[0])
    model.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    model.fit(X_train, y_train)

    ##training predictions
    proba = model.predict_proba(X_test)
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    #Print model report:
    #	print "\nModel Report"
    #	print "Accuracy : %.4g" % accuracy_score(y_train, preds)
    #	print "AUC Score (Train): %f" % roc_auc_score(y_train, preds)

    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    #	plt.show()

    ##test predictions
    test_proba = model.predict_proba(test)
    test_preds = test_proba[:, 1]

    return test_preds
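
As an alternative to the manual get_fscore() plot inside modelfit, a hedged sketch using xgboost's built-in importance plot on the fitted wrapper (this would run inside modelfit, before it returns; max_num_features is illustrative):

import matplotlib.pyplot as plt
import xgboost as xgb

# Plot the top features of the fitted booster.
xgb.plot_importance(model.get_booster(), max_num_features=20)
plt.tight_layout()
plt.show()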
Example #8
def xgb_model2(x1, y1, ft):
    ## Remove features that negatively impact the model - used after xgb_model2 has already been run once
    ##Copy results from XGB2_FEATS into 'unwanted'
    # unwanted = {'fever_unknown', 'other_ext_injury', 'med_angiotensin_ii_i', 'wbc_disease', 'proc_124', 'acute_bronch', 'hemorrhoid', 'chf', 'poison_psycho', 'eye_inflam', 'lower_limb_fract', 'biliary_tract', 'other_bone_disease', 'med_antifungal', 'spondylosis', 'secndry_malig', 'other_joint', 'neoplasm_unspec', 'chest_pain_nos', 'acq_foot_deform', 'mood', 'nonmalig_breast', 'schizo', 'suicide', 'osteo_arth', 'other_connective', 'medical_eval'}
    # ft = [e for e in ft if e not in unwanted]
    print("XGB Features:\n", ft, "\n")

    x1 = x1.loc[:, ft]
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight Rescale
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])
    xgb_cv_score = cross_val_score(
        xgb1, X_train, np.ravel(y_train), cv=10, scoring="roc_auc"
    )

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    D = feature_dependence_matrix(X_train)
    viz1 = plot_dependence_heatmap(D, figsize=(11, 10))
    viz1.save("output/Psych_XGB_feat_depend_" + outfile)

    xgb_predict = xgb1.predict(X_test)

    print("=== All AUC Scores [CV - Train] ===")
    print(xgb_cv_score, "\n")
    print("=== Mean AUC Score [CV - Train] ===")
    print(xgb_cv_score.mean(), "\n")
    print("=== Confusion Matrix [Test] ===")
    print(confusion_matrix(y_test, xgb_predict), "\n")
    print("=== Classification Report [Test] ===")
    print(classification_report(y_test, xgb_predict), "\n")
    print("=== AUC Score [Test] ===")
    print(roc_auc_score(y_test, xgb_predict), "\n")

    imp = importances(xgb1, X_test, y_test)  # permutation
    viz2 = plot_importances(imp)
    viz2.save("output/Psych_XGB_feat_imp_" + outfile)
    imp = imp.reset_index()
    imp_ = imp[imp["Importance"] < 0.00000]

    feats = imp_["Feature"].tolist()

    xgb_roc_auc = roc_auc_score(y_test, xgb_predict)
    fpr, tpr, thresholds = roc_curve(y_test, xgb1.predict_proba(X_test)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label="XGB Classifier (area = %0.3f)" % xgb_roc_auc)
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic: Clinical Data Only [XGB]")
    plt.legend(loc="lower right")
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.show()

    return imp, feats
Example #9
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID','TARGET'], axis=1).values

y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=600,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.6815,
                     colsample_bytree=0.701,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

xgtrain = xgb.DMatrix(X_train, label=y_train)
# show_progress was removed from xgb.cv; verbose_eval plays the same role
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
                  metrics=['auc'], early_stopping_rounds=50, verbose_eval=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":y_test, "TARGET":output})
submission.to_csv("submission.csv", index=False)
Example #10
class ParamTuner:
    def __init__(self, X_train, y_train):
        self._clf = XGBClassifier(learning_rate=0.01,
                                  n_estimators=1000,
                                  max_depth=5,
                                  min_child_weight=1,
                                  gamma=0,
                                  subsample=0.8,
                                  colsample_bytree=0.8,
                                  objective='binary:logistic',
                                  scale_pos_weight=1,
                                  seed=0)
        self._dtrain = xgb.DMatrix(X_train, label=y_train)
        self._X_train = X_train
        self._y_train = y_train

    @property
    def clf(self):
        return self._clf

    def show_params(self):
        logging.info("-" * 40)
        logging.info("current params:\n" + str(self._clf.get_params()))
        logging.info("-" * 40)

    def get_param(self, name):
        return self._clf.get_params()[name]

    def set_param(self, name, value):
        self._clf.set_params(**{name: value})

    def set_params(self, params):
        self._clf.set_params(**params)

    def tune_num_boost_round(self):
        logging.info("turn num_boost_round")
        history = xgb.cv(self._clf.get_params(),
                         dtrain=self._dtrain,
                         num_boost_round=NUM_BOOST_ROUND,
                         nfold=CV_FOLDS,
                         metrics='auc',
                         early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                         show_stdv=True)
        logging.info("tail of history:\n" + str(history.tail(1)))
        logging.info("learning rate: %f, best boosting num: %d" %
                     (self.get_param('learning_rate'), history.shape[0]))
        self.set_param('n_estimators', history.shape[0])
        self.show_params()

    def grid_search(self, param_grid):
        logging.info("grid search on %s" % param_grid.keys())
        # Note: the iid= argument and grid_scores_ were removed from scikit-learn;
        # use cv_results_ to inspect per-candidate scores instead.
        gs = GridSearchCV(estimator=self._clf,
                          param_grid=param_grid,
                          scoring='roc_auc',
                          n_jobs=-1,
                          cv=CV_FOLDS)
        gs.fit(X=self._X_train, y=self._y_train)
        logging.info("mean test scores:\n" + str(gs.cv_results_['mean_test_score']))
        logging.info("best_params: " + str(gs.best_params_))
        logging.info("best_score: " + str(gs.best_score_))
        self.set_params(gs.best_params_)
        self.show_params()
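
A hedged usage sketch for ParamTuner; NUM_BOOST_ROUND, CV_FOLDS and EARLY_STOPPING_ROUNDS are module-level constants in the original code, so illustrative values are defined here and everything is assumed to live in one module:

import logging
from sklearn.datasets import make_classification

logging.basicConfig(level=logging.INFO)

# Illustrative stand-ins for the module-level constants the class refers to.
NUM_BOOST_ROUND = 500
CV_FOLDS = 5
EARLY_STOPPING_ROUNDS = 30

X, y = make_classification(n_samples=400, n_features=12, random_state=0)
tuner = ParamTuner(X, y)
tuner.tune_num_boost_round()
tuner.grid_search({'max_depth': [3, 5, 7], 'min_child_weight': [1, 3]})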
Example #11
def train_model_xgb_cv(X_train, X_test, y_train, y_test):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    xgb_sklearn = XGBClassifier(learning_rate=0.1,
                                n_estimators=300,
                                max_depth=3,
                                min_child_weight=1,
                                gamma=0.3,
                                subsample=0.6,
                                colsample_bytree=0.7,
                                objective='binary:logistic',
                                nthread=4,
                                seed=27,
                                reg_lambda=0.01)

    xgb_params = xgb_sklearn.get_params()
    cvresult = xgb.cv(xgb_params,
                      dtrain,
                      num_boost_round=xgb_params['n_estimators'],
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    pred_y = xgb_sklearn.predict(X_test)
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]
    # auc
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)
    # error
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)

    # grid search
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.1,
            n_estimators=300,
            # max_depth=3,
            min_child_weight=1,
            gamma=0.3,
            subsample=0.6,
            colsample_bytree=0.7,
            objective='binary:logistic',
            nthread=4,
            seed=27,
            reg_lambda=0.01),
        param_grid=params,
        cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(
        'C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png',
        dpi=300)
    plt.show()
Example #12
def set_parameters(set_name, golden_set, input_file):

    golden = str_to_bool(golden_set)

    #-------------------------------------------------------------------------

    #read in the directory that is being run
    data_dir = set_name

    #read in the parameters file and load it

    full_path = os.path.join(working_dir, "{0}".format(data_dir),
                             'params.yaml')
    with open(full_path, 'r') as stream:
        parameters = yaml.load(stream, Loader=yaml.FullLoader)

    #read in Hypatia data as pandas dataframe (2D structure), drop HIP numbers
    df = pd.read_csv(input_file)

    set_number = set_name

    #-------------------------------------------------------------------------

    if golden:
        df2 = df.copy()
        df2.loc[df2[(df2['Exo'] == 1)
                    & (df2['MaxPMass'] > parameters['gas_giant_mass'])].
                sample(10, random_state=np.random.RandomState()).index,
                'Exo'] = 0
        yy = df2.loc[df2['Exo'] == 0].index
        zz = df.loc[df['Exo'] == 0].index
        changed = [ind for ind in yy if not ind in zz]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2.copy()
        yy2 = df2.loc[df2['Exo'] == 0].index
        zz2 = df.loc[df['Exo'] == 0].index
        changed2 = [ind for ind in yy2 if not ind in zz2]
    #-------------------------------------------------------------------------

    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  #category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    df['MaxPMass'] = df['MaxPMass'].astype(np.number)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    df = df.drop(['HIP'], axis=1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')

    for key in parameters.keys():
        print('{0} = {1}'.format(key, parameters[key]))

    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']

    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']

    #Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml
    if (parameters['dropnans']):
        df = df[relevant_columns].dropna()

    print('Number of samples used in simulation: {0}'.format(df.shape[0]))

    print('')

    #Define the confusion matrix and other arrays
    cfm = np.zeros((2, 2))

    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)

    print('iteration \t estimators')
    print('---------------------------')

    #---------------------------XGBOOST LOOP----------------------------------------------

    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):

        #dataframe of 200 random hosts with giant planets
        df_iter_with_exo = df[(df['Exo'] == 1)
                              & (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        #dataframe of 200 random non hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())

        # make a new dataframe of the 400 star subset
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # make a dataframe of those stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]

        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], axis=1)
        # The Exo column (and hips)
        Y = df_train.Exo

        # Note: Using gbtree booster
        alg = XGBClassifier(
            learning_rate=0.1,  #def=0.3, prevents overfitting and makes feature weights conservative
            n_estimators=1000,  #number of boosted trees to fit
            max_depth=6,  #def=6, max depth of tree/complexity
            min_child_weight=1,  #def=1, min weight needed to continue leaf partitioning
            gamma=0,  #def=0, minimum loss reduction required to make a partition on a leaf
            subsample=0.8,  #def=1, subsample ratio of the training set
            colsample_bytree=0.8,  #def=1, subsample ratio of columns when making each tree
            objective='binary:logistic',  #def=linear, logistic regression for binary classification, outputs probability
            nthread=1,  #originally 8, but caused an issue on a laptop; def=max, number of parallel threads used to run xgboost
            scale_pos_weight=1,  #def=1, balance positive and negative weights
            seed=27)  #def=0, random number seed

        #get input parameters of algorithm
        xgb_param = alg.get_xgb_params()

        #construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)

        #cross validation (CV) of xgboost to avoid overfitting
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)

        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])

        alg.fit(X[features], Y, eval_metric='auc')

        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]

        feat_imp = alg.get_booster().get_fscore()
        # See how the algorithm performs on the Exo data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)

        # Weighting function to ignore the null values
        normalized_features = pd.DataFrame(
            (1 -
             df_train[features].isnull().sum() / df_train[features].count()) *
            pd.Series(alg.get_booster().get_fscore()),
            columns=[iteration]).T

        #calculate the confusion matrix
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])
        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score

        df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index))
        df.loc[df_predict.index,
               'Predicted'] += alg.predict(df_predict[features])
        # Note: predict() returns 0/1 class labels here, not calibrated probabilities.
        df.loc[df_predict.index, 'Prob'] = alg.predict(df_predict[features])

        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total,
             pd.Series(values, name=str(iteration))],
            axis=1)

        if (not iteration % 10):
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))

    #-------------------------------------------------------------------------

    # Calculate the confusion matrix
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()

    # Print confusion matrix
    print(np.round(cfm, 3))
    df['Prob'] = df['Predicted'] / df['Sampled']

    ###########-------------------Output List of Planets------------------------#########

    #Find the stars with >90% probability of hosting a planet, with the Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90)
                 & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))

    #Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name = data_dir+'/figures/planet_probabilities.csv'
    outfile = open(name, 'w')
    planetprobs.to_csv(outfile)
    outfile.close()

    #Create a second list with all stars in Hypatia and the probabilities
    planets2 = df[(df.Prob > .0)
                  & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    if golden:  #if 10 stars were randomly taken out
        changeddf = pd.DataFrame([])  #make empty dataframe
        for star in changedhips:  #loop over the 10 known planet hosts (defined at top)
            row = planets2.loc[planets2.index == star]
            if row.empty:  #catch for when a known planet host was cut (bc of abunds)
                #build a blank row with the star name as index
                row = pd.DataFrame([[np.nan, np.nan, np.nan]],
                                   columns=['Sampled', 'Predicted', 'Prob'],
                                   index=[star])
            #DataFrame.append was removed from pandas, so concatenate instead
            changeddf = pd.concat([changeddf, row])
        #Save golden set as a separate file with the date and time as a tag
        filename = '{0}/figures/goldenSetProbabilities' + str(
            datetime.today().strftime('-%h%d-%H%M')) + '.csv'
        changeddf.to_csv(filename.format(set_number), na_rep=" ")

    #Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name2 = data_dir+'/figures/planet_probabilitiesAll.csv'
    outfile2 = open(name2, 'w')
    planetprobs2.to_csv(outfile2)
    outfile2.close()

    ###########------------------------Save Files------------------------##########
    print('Saving data files')

    #Save files
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))

    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)

    print('Simulation completed successfully.')
    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
Example #13
    label_encoder = LabelEncoder()
    encoded_y_train = label_encoder.fit_transform(y_train)

    xgb = XGBClassifier(
        max_depth=args.max_depth,
        learning_rate=args.learning_rate,
        n_estimators=args.n_estimators,
        objective="multi:softprob",
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=args.subsample,
        colsample_bytree=args.colsample_bytree,
        colsample_bylevel=args.colsample_bylevel,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        base_score=0.5,
        missing=None,
        silent=True,
        nthread=-1,
        seed=42
    )

    # Pre-0.18 scikit-learn cross-validation API: KFold(n, n_folds=...).
    kf = KFold(len(x_train), n_folds=10, random_state=42)

    score = cross_val_score(xgb, x_train, encoded_y_train,
                            cv=kf, scoring=ndcg_scorer)

    print(xgb.get_params(), score.mean())
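
The KFold signature above is from scikit-learn releases before 0.18; a hedged sketch of the roughly equivalent call on current scikit-learn (same fold count and seed, with x_train, encoded_y_train, the xgb estimator and ndcg_scorer still assumed to be defined as above):

from sklearn.model_selection import KFold, cross_val_score

# Modern replacement for KFold(len(x_train), n_folds=10, random_state=42);
# random_state now requires shuffle=True.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(xgb, x_train, encoded_y_train, cv=kf, scoring=ndcg_scorer)
print(score.mean())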
Example #14
            'subsample': 0.7,  # random subsampling of training instances
            'colsample_bytree': 0.7,  # column subsampling when building each tree
            'min_child_weight': 3,
            # Defaults to 1: the minimum sum of instance weights (hessian) required in a leaf.
            # For 0-1 classification on imbalanced data, if h is around 0.01,
            # min_child_weight=1 means a leaf must hold at least ~100 samples.
            # This parameter strongly affects the result: it bounds the minimum sum of second
            # derivatives in a leaf, and the smaller it is, the easier it is to overfit.
            'silent': 0,  # set to 1 to suppress run-time output; 0 is usually preferable
            # ineffective here: 'eta': 0.007,  # acts like the learning rate
            'seed': 1000,
            'nthread': 7,  # number of CPU threads
            # 'eval_metric': 'auc'
        }

        # Train the model
        model = XGBClassifier()  # build the model
        model.get_params()  # get the current parameters
        model.set_params(**params)  # apply the parameters above
        # Start training
        model.fit(aTrain_X, aTrain_Y, eval_metric='auc')

        # Save the model
        score0 = 0  # model.score(aTrain_X, aTrain_Y)
        score1 = model.score(aTest_X, aTest_Y)
        if score1 > 0.745:
            pickle.dump(
                model,
                open(
                    '{}/qa_data/pre_trained_models/xgboost_qaquality_21_60dz_s{}.pkl'
                    .format(cur_dir, round(score1, 3)), 'wb'))
            print('====> yes found good xgboost model')
        # print(i+1, score)  # print the accuracy of each training round
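
The min_child_weight note above can be checked empirically; a hedged sketch comparing a few settings on synthetic, imbalanced data (all values below are illustrative):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Imbalanced toy data; larger min_child_weight values constrain leaf size and curb overfitting.
X, y = make_classification(n_samples=2000, n_features=20, weights=[0.9, 0.1], random_state=0)
for mcw in (1, 3, 10):
    clf = XGBClassifier(n_estimators=100, min_child_weight=mcw)
    auc = cross_val_score(clf, X, y, cv=3, scoring='roc_auc').mean()
    print('min_child_weight=%d  cv auc=%.4f' % (mcw, auc))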
Example #15
    encoded_y_train = label_encoder.fit_transform(y_train)

    xgb = XGBClassifier(max_depth=args.max_depth,
                        learning_rate=args.learning_rate,
                        n_estimators=args.n_estimators,
                        objective="multi:softprob",
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=args.subsample,
                        colsample_bytree=args.colsample_bytree,
                        colsample_bylevel=args.colsample_bylevel,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        base_score=0.5,
                        missing=None,
                        silent=True,
                        nthread=-1,
                        seed=42)

    # Pre-0.18 scikit-learn cross-validation API: KFold(n, n_folds=...).
    kf = KFold(len(x_train), n_folds=10, random_state=42)

    score = cross_val_score(xgb,
                            x_train,
                            encoded_y_train,
                            cv=kf,
                            scoring=ndcg_scorer)

    print(xgb.get_params(), score.mean())
Example #16
class XGBOOST(BaseEstimator):
    """
        This class inherits from BaseEstimator and wraps SKLEARN
        RandomForestClassifier or RandomForestRegressor estimator

        ...
        
        Attributes
        ----------

        estimator_parameters : dict
            parameter values
        name : string
            name of the estimator
        tune_parameters: dict
            Hyperparameter optimization settings
        
        Methods
        -------

        build(X)
            Instance the estimator optimizing it
            if tune=true.

    """
    def __init__(self, X, Y, parameters, conveyor):
        # Initialize parent class
        try:
            BaseEstimator.__init__(self, X, Y, parameters, conveyor)
            LOG.debug('Initialize BaseEstimator parent class')
        except Exception as e:
            self.conveyor.setError(
                f'Error initializing BaseEstimator parent class with exception: {e}'
            )
            LOG.error(
                f'Error initializing BaseEstimator parent class with exception: {e}'
            )
            return

        try:
            import xgboost as xgb
            xgb.set_config(verbosity=0)
        except:
            LOG.error('XGboost not found, please revise your environment')

        # Load estimator parameters
        self.estimator_parameters = self.param.getDict('XGBOOST_parameters')

        # Load tune parameters
        self.tune_parameters = self.param.getDict('XGBOOST_optimize')

        if self.param.getVal('quantitative'):
            self.estimator_parameters['objective'] = 'reg:squarederror'
            self.name = "XGB-Regressor"
        else:
            self.estimator_parameters['objective'] = 'binary:logistic'
            self.name = "XGB-Classifier"

        # Missing values must be defined. Otherwise they come back as 'nan', which cannot be
        # converted to JSON and causes trouble at several points
        self.estimator_parameters['missing'] = -99.99999

    def build(self):
        '''Build a new XGBOOST model with the X and Y numpy matrices '''

        try:
            from xgboost.sklearn import XGBClassifier
            from xgboost.sklearn import XGBRegressor
        except Exception as e:
            return False, 'XGboost not found, please revise your environment'

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))
        results.append(('model', 'model type', 'XGBOOST'))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing XGBOOST estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                else:
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                    params = self.estimator.get_params()
                    params['num_class'] = 2
                    self.optimize(X, Y, self.estimator, self.tune_parameters)

            except Exception as e:
                return False, f'Exception optimizing XGBOOST estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative XGBOOST model")

                    self.estimator = XGBRegressor(**self.estimator_parameters)
                else:
                    LOG.info("Building Qualitative XGBOOST model")

                    self.estimator = XGBClassifier(**self.estimator_parameters)

                # self.estimator.fit(X, Y)
                # self.feature_importances = self.estimator.feature_importances_
                self.regularBuild(X, Y)

            except Exception as e:
                return False, f'Exception building XGBOOST estimator with exception {e}'

        if not self.param.getVal('conformal'):
            return True, results

        self.estimator_temp = self.estimator
        success, error = self.conformalBuild(X, Y)
        if success:
            return True, results
        else:
            return False, error
Example #17
                     seed=27)

Xtrain = Xtrain.tocsr()
mask = np.random.choice([False, True], Xtrain.shape[0], p=[0.75, 0.25])
not_mask = ~mask
#kf = list(StratifiedKFold(y, n_folds=10, shuffle=True, random_state=4242))[0]
#Xtr, Xte = Xtrain[kf[0], :], Xtrain[kf[1], :]
#ytr, yte = y[kf[0]], y[kf[1]]
#print('Training set: ' + str(Xtr.shape))
#print('Validation set: ' + str(Xte.shape))
dtrain = xgb.DMatrix(Xtrain[not_mask], label=y[not_mask])
dtrain_watch = xgb.DMatrix(Xtrain[mask], label=y[mask])
dtest = xgb.DMatrix(Xtest)
evallist = [(dtrain, 'train'), (dtrain_watch, 'eval')]
dtrain = xgb.DMatrix(Xtrain, label=y)
params = xgb4.get_params()
params['num_class'] = 12
model = xgb.train(params=params,
                  dtrain=dtrain,
                  evals=evallist,
                  early_stopping_rounds=4,
                  verbose_eval=1,
                  num_boost_round=100)
#model = xgb.train(params=params, dtrain=dtrain,verbose_eval=1,num_boost_round=100)
preds = pd.DataFrame(model.predict(dtest),
                     index=gatest.index,
                     columns=targetencoder.classes_)
preds.to_csv('LT_pred_xgboost2.csv', index=True)
#model = modelfit(xgb4,y,predictors)
#dtest=xgb.DMatrix(Xtest)
#pred1 = pd.DataFrame(model.predict_proba(dtest), index = gatest.index, columns=targetencoder.classes_)
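
Since params['num_class'] = 12 above implies a multi-class softprob objective, model.predict(dtest) returns one probability column per class; a hedged sketch of recovering hard class labels from the preds frame built above:

# Each row of preds holds 12 class probabilities; take the most likely class per row.
hard_labels = preds.idxmax(axis=1)
print(hard_labels.head())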
Example #18
class XGBOOST(BaseEstimator):
    """
        This class inherits from BaseEstimator and wraps SKLEARN
        RandomForestClassifier or RandomForestRegressor estimator

        ...
        
        Attributes
        ----------

        estimator_parameters : dict
            parameter values
        name : string
            name of the estimator
        tune_parameters: dict
            Hyperparameter optimization settings
        
        Methods
        -------

        build(X)
            Instance the estimator optimizing it
            if tune=true.

    """
    def __init__(self, X, Y, parameters, conveyor):
        # Initialize parent class
        try:
            BaseEstimator.__init__(self, X, Y, parameters, conveyor)
            LOG.debug('Initialize BaseEstimator parent class')
        except Exception as e:
            self.conveyor.setError(
                f'Error initializing BaseEstimator parent class with exception: {e}'
            )
            LOG.error(
                f'Error initializing BaseEstimator parent class with exception: {e}'
            )
            return

        # Load estimator parameters
        self.estimator_parameters = self.param.getDict('XGBOOST_parameters')

        # Load tune parameters
        self.tune_parameters = self.param.getDict('XGBOOST_optimize')

        if self.param.getVal('quantitative'):
            self.estimator_parameters['objective'] = 'reg:squarederror'
            self.name = "XGB-Regressor"
        else:
            self.estimator_parameters['objective'] = 'binary:logistic'
            self.name = "XGB-Classifier"

        # Missing values must be defined. Otherwise they come back as 'nan', which cannot be
        # converted to JSON and causes trouble at several points
        self.estimator_parameters['missing'] = -99.99999

    def build(self):
        '''Build a new XGBOOST model with the X and Y numpy matrices '''

        try:
            from xgboost.sklearn import XGBClassifier
            from xgboost.sklearn import XGBRegressor
        except Exception as e:
            return False, 'XGboost not found, please revise your environment'

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))
        results.append(('model', 'model type', 'XGBOOST'))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing XGBOOST estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                else:
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                    params = self.estimator.get_params()
                    params['num_class'] = 2
                    self.optimize(X, Y, self.estimator, self.tune_parameters)

            except Exception as e:
                return False, f'Exception optimizing XGBOOST estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative XGBOOST model")

                    self.estimator = XGBRegressor(**self.estimator_parameters)
                else:
                    LOG.info("Building Qualitative XGBOOST model")

                    self.estimator = XGBClassifier(**self.estimator_parameters)

                self.estimator.fit(X, Y)
                LOG.debug(self.estimator)

            except Exception as e:
                return False, f'Exception building XGBOOST estimator with exception {e}'

        if not self.param.getVal('conformal'):
            return True, results

        self.estimator_temp = self.estimator
        success, error = self.conformalBuild(X, Y)
        if success:
            return True, results
        else:
            return False, error


## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
print('train.shape', x_train.shape, 'test.shape', x_test.shape)

xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=500,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=1)

xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(x_train_ss, label=y_train)
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics='auc',
                  early_stopping_rounds=50,
                  verbose_eval=10)

print('n_estimators', cvresult.shape[0])
print('test-auc:', cvresult.iloc[cvresult.shape[0] - 1, 0])
xgb1.set_params(n_estimators=cvresult.shape[0])
print('model', xgb1)

#n_estimators 137
#test-auc: 0.8731962

tuned_parameters = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         n_estimators=20000,
                         max_depth=9,
                         min_child_weight=15,
                         gamma=0,
                         subsample=0.9,
                         colsample_bylevel=0.7,
                         colsample_bytree=0.7,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=1,
                         seed=27)

    # tune n_estimators
    modelfit(xgb1, train, predictors)

    params = xgb1.get_params()
    print(params)

    # tune the max_depth and min_child_weight parameters
    # param_test1 = {
    #     'max_depth': range(3, 10, 2),
    #     'min_child_weight': range(1, 6, 2)
    # }
    # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.01, n_estimators=563, max_depth=5,
    #                                                 min_child_weight=1, gamma=0, subsample=0.9, colsample_bytree=0.7,colsample_bylevel=0.7,
    #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
    #                                                 seed=27),
    #                         param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    # gsearch1.fit(train[predictors], train[target])
    # print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)