Ejemplo n.º 1
0
def predict(train, test):
    """Fit an XGBoost classifier on ``train`` and print predictions for ``test``.

    Args:
        train: DataFrame-like object holding the feature columns plus a
            ``next_step`` label column. It is left unmodified by this call.
        test: Feature data handed to ``predict`` and ``predict_proba``.

    Returns:
        None. The predicted labels and the class probabilities are printed.
    """
    train_X = train.drop(['next_step'], axis=1)
    # Read the label column rather than ``pop``-ing it, so the caller's frame
    # still contains ``next_step`` after this function returns.
    train_Y = train['next_step']
    bst = XGBClassifier()
    bst.fit(train_X, train_Y, eval_metric='auc')
    # NOTE(review): loading 'xgboost.model' into the booster right after
    # fitting looks like it replaces the freshly trained trees -- confirm
    # that this is intended.
    bst.get_booster().load_model('xgboost.model')
    pred = bst.predict(test)
    pred_prob = bst.predict_proba(test)
    print(pred)
    print(pred_prob)
Ejemplo n.º 2
0
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Train an ``XGBClassifier`` built from the module-level ``params``.

    A random 30% sample of the module-level ``data`` frame is used for the
    printed report. When ``useTrainCV`` is true, ``xgb.cv`` is run on the
    module-level ``xgtrain`` and the number of rows of its result is used as
    ``n_estimators`` (also written back into ``params``).

    Args:
        useTrainCV: Whether to run the cross-validation step first.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        The classifier fitted on the module-level ``X`` and ``y``.
    """
    alg = XGBClassifier(**params)
    report_df = data.sample(frac=0.3)
    pX = report_df.drop('LABEL', axis=1)
    py = report_df['LABEL']

    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=xgb_param['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept by xgb.cv.
        best_rounds = cvresult.shape[0]
        print(best_rounds)
        alg.set_params(n_estimators=best_rounds)
        params['n_estimators'] = best_rounds
        print("best tree size is {}".format(best_rounds))

    # Fit on the full module-level training data, then report on the sample.
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)

    accuracy = metrics.accuracy_score(py, y_pred)
    print("精确率Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    print(metrics.classification_report(py, y_pred))

    split_counts = alg.get_booster().get_fscore()
    feat_imp = pd.Series(split_counts).sort_values(ascending=False)
    print(feat_imp)
    return alg
Ejemplo n.º 3
0
def get_xgb_features_and_values(
        clf: XGBClassifier) -> Tuple[List[Features], List[LeafValues]]:
    """Extract split and leaf information from a fitted classifier's text dump.

    The booster is dumped (with statistics) to a temporary file, which is read
    back and then deleted. Two regular expressions are applied to the text:

    * ``[f<idx><<threshold>]`` split conditions, returned as
      ``(feature_index, threshold)`` tuples;
    * ``leaf=<value>,`` entries, which are grouped two at a time into
      ``(first, second)`` tuples (a trailing unpaired value is dropped).

    Args:
        clf: Classifier whose ``get_booster().dump_model`` is called.

    Returns:
        A ``(features_thresholds, left_right)`` tuple of lists.
    """
    fd, fout = mkstemp(text=True)
    # Only the path is needed; release the descriptor right away so the file
    # is not held open twice while it is written and re-read.
    os.close(fd)

    try:
        clf.get_booster().dump_model(fout, with_stats=True)

        with open(fout, "r") as fin:
            txt = fin.read()
    finally:
        # mkstemp never removes the file itself.
        os.remove(fout)

    # Raw strings: same patterns as before, without invalid "\[" escapes.
    # NOTE(review): the threshold pattern requires a leading digit, so a split
    # on a negative threshold would not be matched -- confirm inputs.
    pat = r"\[f([0-9]+)<([0-9]+.*[0-9-e]*)\]"
    features_thresholds = [(int(feature), float(threshold))
                           for feature, threshold in re.findall(pat, txt)]

    leaves = [
        float(value)
        for value in re.findall(r"leaf=(-{,1}[0-9]+.[0-9-e]+),", txt)
    ]
    # Pair consecutive leaf values; presumably each tree is a single split
    # with a left and a right leaf -- TODO confirm against the model config.
    left_right = cast(List[Tuple[float, float]],
                      list(zip(leaves[0::2], leaves[1::2])))

    return features_thresholds, left_right
Ejemplo n.º 4
0
def evaluate_model(model_params):
    """Train, persist and evaluate an ``XGBClassifier``.

    The model is fitted on the data returned by ``get_transformed_data``,
    dumped with ``joblib`` to a time-stamped ``danCdmaModel_*.pkl`` file and
    then scored on the data loaded from ``cdma_train2.csv``.

    Args:
        model_params: Keyword arguments forwarded to ``XGBClassifier``.

    Returns:
        The fitted classifier.
    """
    model = XGBClassifier(**model_params)

    data, X_train, y_train = get_transformed_data(frac=1)
    model.fit(X_train, y_train, eval_metric=metrics.f1_score)

    stamp = datetime.now().strftime('%d%H%M')
    joblib.dump(model, 'danCdmaModel_{}.pkl'.format(stamp))

    # Drop the training data before the evaluation set is loaded.
    del data, X_train, y_train

    data, X_test, y_test = get_transformed_data(fname='cdma_train2.csv',
                                                frac=1)
    y_pred = model.predict(X_test)

    print("Accuracy: %.2f%%" %
          (metrics.accuracy_score(y_test, y_pred) * 100.0))
    print('auc:', metrics.roc_auc_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))

    split_counts = model.get_booster().get_fscore()
    print(pd.Series(split_counts).sort_values(ascending=False))
    return model
Ejemplo n.º 5
0
def evaluate_model(model_params):
    """Train an ``XGBClassifier`` on a split of the module-level ``data``.

    ``data`` is split 67/33 (``random_state=7``) on the ``LABEL`` column; the
    model is fitted on the training part, evaluated on the test part, and
    dumped with ``joblib`` to a time-stamped
    ``lossWarnBroadBandModel_*.pkl`` file.

    Args:
        model_params: Keyword arguments forwarded to ``XGBClassifier``.

    Returns:
        The fitted classifier.
    """
    model = XGBClassifier(**model_params)
    AX = data.drop('LABEL', axis=1)
    ay = data['LABEL']
    X_train, X_test, y_train, y_test = train_test_split(AX,
                                                        ay,
                                                        test_size=0.33,
                                                        random_state=7)
    model.fit(X_train, y_train, eval_metric=metrics.f1_score)
    y_pred = model.predict(X_test)
    # Was ``metrics.acpredict_probacuracy_score`` -- a corrupted attribute
    # name that raised AttributeError before any metric was printed.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(y_test, y_pred))
    train_report = metrics.classification_report(y_test, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    # The importances were computed but never reported; print them.
    print(feat_imp)
    joblib.dump(
        model, 'lossWarnBroadBandModel_{}.pkl'.format(
            datetime.now().strftime('%d%H%M')))
    return model
    ###
    #   FEATURE IMPORTANCE
    ###

    feat_df = get_feature_importance(
        best_xgb_rf, annotated_df, cols_to_drop=['GRID', 'label', 'partition'])
    barplot_feat_importance(feat_df,
                            top_n=25,
                            plt_prefix=output_suffix,
                            fig_file=FEATURE_FIG_FILE)

    # shap values
    train_df, _ = extract_train_df(annotated_df)
    test_df, _ = extract_test_df(annotated_df)

    xgb_booster = best_xgb_rf.get_booster()
    calc_write_shap(X_train,
                    y_train,
                    xgb_booster,
                    train_df,
                    shap_pickle_file=TRAIN_SHAP_FILE,
                    top_feat_file=TRAIN_TOP_FEAT_SHAP_FILE)
    calc_write_shap(X_test,
                    y_test,
                    xgb_booster,
                    test_df,
                    shap_pickle_file=TEST_SHAP_FILE,
                    top_feat_file=TEST_TOP_FEAT_SHAP_FILE)

    ###
    #   WRITE
Ejemplo n.º 7
0
def train_model_xgb_cv(X_train, X_test, y_train, y_test):
    """Tune, fit and evaluate an ``XGBClassifier``, then plot importances.

    Steps performed:
      1. ``xgb.cv`` (5 folds, AUC, early stopping after 5 rounds) picks the
         number of estimators.
      2. The classifier is refitted with that value and AUC / error on the
         test split are printed.
      3. A ``GridSearchCV`` over ``max_depth`` is run and its results printed.
      4. The feature scores of the classifier from step 2 are plotted and
         saved to a hard-coded PNG path.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Hyper-parameters shared by the CV model and the grid-search estimator;
    # only ``max_depth`` differs between the two.
    shared_kwargs = dict(learning_rate=0.1,
                         n_estimators=300,
                         min_child_weight=1,
                         gamma=0.3,
                         subsample=0.6,
                         colsample_bytree=0.7,
                         objective='binary:logistic',
                         nthread=4,
                         seed=27,
                         reg_lambda=0.01)

    xgb_sklearn = XGBClassifier(max_depth=3, **shared_kwargs)

    xgb_params = xgb_sklearn.get_params()
    boost_rounds = xgb_params['n_estimators']
    cvresult = xgb.cv(xgb_params,
                      dtrain,
                      num_boost_round=boost_rounds,
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    pred_y = xgb_sklearn.predict(X_test)
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]

    # AUC computed from the second probability column
    print('AUC: ', roc_auc_score(y_test, pred_y_prob))

    # Error rate = 1 - score returned by the estimator
    print('error: ', 1 - xgb_sklearn.score(X_test, y_test))

    # Grid search over tree depth, everything else held fixed
    depth_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(estimator=XGBClassifier(**shared_kwargs),
                         param_grid=depth_grid,
                         cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    # Horizontal bar chart of the feature scores, lowest score first
    raw_scores = xgb_sklearn.get_booster().get_fscore(fmap='xgb.fmap')
    feat_imp = pd.Series(raw_scores).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(
        'C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png',
        dpi=300)
    plt.show()
Ejemplo n.º 8
0
def set_parameters(set_name, golden_set, input_file):
    """Run the repeated XGBoost train/predict simulation for one data set.

    The function loads ``params.yaml`` from ``<working_dir>/<set_name>``,
    reads ``input_file`` as CSV, and for ``N_iterations`` rounds trains an
    ``XGBClassifier`` on ``N_samples`` random rows with ``Exo == 1`` (and
    ``MaxPMass`` above ``gas_giant_mass``) plus ``N_samples`` random rows with
    ``Exo == 0``. Every row that was not in the training sample is predicted
    on, and the per-row counters ``Sampled`` / ``Predicted`` are accumulated.
    At the end ``Prob = Predicted / Sampled`` is computed and several CSV,
    pickle and ``.npy`` files are written below ``set_name``.

    Args:
        set_name: Directory name of the run; used both to locate
            ``params.yaml`` and as the prefix of every output path.
        golden_set: Value converted with ``str_to_bool``. When true, 10
            random rows with ``Exo == 1`` and ``MaxPMass`` above
            ``gas_giant_mass`` are relabelled ``Exo = 0`` before training and
            their probabilities are written to a separate CSV.
        input_file: Path of the CSV file read with ``pd.read_csv``.

    Returns:
        None. Results are printed and written to disk.

    NOTE(review): ``working_dir`` and ``nan`` are not defined in this
    function; presumably they are module-level names -- confirm.
    """

    golden = str_to_bool(golden_set)

    #-------------------------------------------------------------------------

    #read in the directory that is being run
    data_dir = set_name

    #read in the parameters file and load it

    full_path = os.path.join(working_dir, "{0}".format(data_dir),
                             'params.yaml')
    # NOTE(review): the stream is never closed explicitly.
    stream = open(full_path, 'r')
    parameters = yaml.load(stream, Loader=yaml.FullLoader)

    #read in Hypatia data as pandas dataframe (2D structure); the HIP column
    #is dropped further below, after it has been copied into the index
    df = pd.read_csv(input_file)

    set_number = set_name

    #-------------------------------------------------------------------------

    if golden:
        # Relabel 10 randomly chosen rows (Exo == 1 and MaxPMass above the
        # gas-giant threshold) as Exo = 0 on a copy of the data.
        df2 = df.copy()
        df2.loc[df2[(df2['Exo'] == 1)
                    & (df2['MaxPMass'] > parameters['gas_giant_mass'])].
                sample(10, random_state=np.random.RandomState()).index,
                'Exo'] = 0
        # Indices with Exo == 0 after (yy) and before (zz) the relabelling;
        # their difference is the set of rows that were changed.
        yy = df2.loc[df2['Exo'] == 0].index
        zz = df.loc[df['Exo'] == 0].index
        changed = [ind for ind in yy if not ind in zz]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2.copy()
        # df is now a copy of df2, so yy2 and zz2 select the same rows;
        # changed2 is not used again in this function.
        yy2 = df2.loc[df2['Exo'] == 0].index
        zz2 = df.loc[df['Exo'] == 0].index
        changed2 = [ind for ind in yy2 if not ind in zz2]
    #-------------------------------------------------------------------------

    # Index the frame by HIP, set column dtypes and add the two counters.
    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  #category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    df['MaxPMass'] = df['MaxPMass'].astype(np.number)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    df = df.drop(['HIP'], 1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')

    for key in parameters.keys():
        print('{0} = {1}'.format(key, parameters[key]))

    # Pull the individual settings out of the loaded YAML mapping.
    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']

    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']

    #Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml
    if (parameters['dropnans']):
        df = df[relevant_columns].dropna()

    print('Number of samples used in simulation: {0}'.format(df.shape[0]))

    print('')

    #Define the confusion matrix and other arrays
    cfm = np.zeros((2, 2))

    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)

    print('iteration \t estimators')
    print('---------------------------')

    #---------------------------XGBOOST LOOP----------------------------------------------

    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):

        #dataframe of N_samples random hosts with giant planets
        df_iter_with_exo = df[(df['Exo'] == 1)
                              & (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        #dataframe of N_samples random non hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())

        # make a new dataframe from the two samples (2 * N_samples rows)
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # make a dataframe of those stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]

        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], 1)
        # The Exo column (and hips)
        Y = df_train.Exo

        # Note: Using gbtree booster
        alg = XGBClassifier(
            learning_rate=
            0.1,  #def=0.3, prevents overfitting and makes feature weight conservative
            n_estimators=1000,  #number of boosted trees to fit
            max_depth=6,  #def=6, max depth of tree/complexity
            min_child_weight=
            1,  #def=1, min weight needed to continue leaf partitioning
            gamma=
            0,  #def=0, minimum loss reduction required to make partition on a leaf
            subsample=0.8,  #def=1, subsample ratio of the training set
            colsample_bytree=
            0.8,  #def=1, subsample ratio of columns when making each tree
            objective=
            'binary:logistic',  #def=linear, logistic regression for binary classification, output probability
            nthread=
            1,  #originall = 8, but issue on laptop...def=max, number of parallel threads used to run xgboost
            scale_pos_weight=1,  #def=1, balance positive and neg weights
            seed=27)  #def=0, random number seed

        #get input parameters of algorithm
        xgb_param = alg.get_xgb_params()

        #construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)

        #cross validation (CV) of xgboost to avoid overfitting
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)

        # Use the number of rows of the CV result as the number of trees.
        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])

        alg.fit(X[features], Y, eval_metric='auc')

        # Predictions on the training sample itself (labels and the second
        # probability column).
        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]

        feat_imp = alg.get_booster().get_fscore()
        # See how the algorithm performs on the Exo data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)

        # Weighting function to ignore the null values
        normalized_features = pd.DataFrame(
            (1 -
             df_train[features].isnull().sum() / df_train[features].count()) *
            pd.Series(alg.get_booster().get_fscore()),
            columns=[iteration]).T

        # Accumulate feature scores, training metrics and the confusion matrix
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        # NOTE(review): this is reassigned on every iteration, so it ends up
        # as feat_imp_train plus only the last iteration's normalized row --
        # confirm that this is intended.
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])
        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score

        # Update the per-row counters for every row outside the training set.
        df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index))
        df.loc[df_predict.index,
               'Predicted'] += alg.predict(df_predict[features])
        # NOTE(review): 'Prob' is filled from alg.predict (labels), not from
        # predict_proba -- confirm which one is wanted.
        df.loc[df_predict.index, 'Prob'] = alg.predict(df_predict[features])

        # Append this iteration's 'Prob' column, named after the iteration.
        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total,
             pd.Series(values, name=str(iteration))],
            axis=1)

        # Checkpoint the accumulated probabilities every 10th iteration.
        if (not iteration % 10):
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))

    #-------------------------------------------------------------------------

    # Average the confusion matrix over iterations, then normalize each row
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()

    # Print confusion matrix
    print(np.round(cfm, 3))
    # Overwrite 'Prob' with the fraction of predictions that were positive.
    df['Prob'] = df['Predicted'] / df['Sampled']

    ###########-------------------Output List of Planets------------------------#########

    #Find the stars with >90% probability of hosting a planet, with the Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90)
                 & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))

    #Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name = data_dir+'/figures/planet_probabilities.csv'
    outfile = open(name, 'w')
    planetprobs.to_csv(outfile)
    outfile.close()

    #Create a second list with all Exo == 0 stars that have Prob > 0
    planets2 = df[(df.Prob > .0)
                  & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    if golden:  #if 10 stars were randomly taken out
        changeddf = pd.DataFrame([])  #make empty dataframe
        for star in changedhips:  #loop over the 10 known planets hosts (defined at top)
            changeddf = changeddf.append(planets2.loc[planets2.index == star])
            if planets2.loc[
                    planets2.index ==
                    star].empty:  #catch for when a known planet host was cut (bc of abunds)
                temp = pd.Series([nan, nan, nan],
                                 index=['Sampled', 'Predicted', 'Prob'])
                temp.name = star
                changeddf = changeddf.append(
                    temp)  #append blank row (with star name as index)
        #Save golden set as a separate file with the date and time as a tag
        filename = '{0}/figures/goldenSetProbabilities' + str(
            datetime.today().strftime('-%h%d-%H%M')) + '.csv'
        changeddf.to_csv(filename.format(set_number), na_rep=" ")

    #Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name2 = data_dir+'/figures/planet_probabilitiesAll.csv'
    outfile2 = open(name2, 'w')
    planetprobs2.to_csv(outfile2)
    outfile2.close()

    ###########------------------------Save Files------------------------##########
    print('Saving data files')

    #Save files
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))

    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)

    print('Simulation completed successfully.')
    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
Ejemplo n.º 9
0
        'seed': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'objective': 'binary:logistic',
        'max_depth': 3
    }

    # log model params
    for key in params:
        mlflow.log_param(key, params[key])

    # train XGBoost model
    gbtree = XGBClassifier(**params)
    gbtree.fit(train_features, train_labels)

    importances = gbtree.get_booster().get_fscore()
    print(importances)

    # get predictions
    y_pred = gbtree.predict(test_features)

    accuracy = accuracy_score(test_labels, y_pred)
    print("Accuracy: %.1f%%" % (accuracy * 100.0))

    # log accuracy metric
    mlflow.log_metric("accuracy", accuracy)

    sns.set(font_scale=1.5)
    xgb.plot_importance(gbtree)
    plt.savefig("importance.png", dpi=200, bbox_inches="tight")
Ejemplo n.º 10
0
# Fit the model on the training split
model.fit(X_train, y_train, eval_metric='merror')

# Predict on the test split: labels and the second predict_proba column
predictions = model.predict(X_test)
predprob = model.predict_proba(X_test)[:, 1]

# Print model report: accuracy on the training and on the test split
print("\nModel Report")
train_accuracy = metrics.accuracy_score(y_train, model.predict(X_train))
print("Training Accuracy : %.4g" % train_accuracy)
test_accuracy = metrics.accuracy_score(y_test, model.predict(X_test))
print("Testing Accuracy : %.4g" % test_accuracy)

# Bar chart of the booster's feature scores, highest first
importance_scores = model.get_booster().get_fscore()
feat_imp = pd.Series(importance_scores).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
''' PARAMETER TUNING '''
''' Tune max_depth and min_child_weight '''
# phase1 with large subset
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}

gsearch1 = GridSearchCV(estimator=model,
                        param_grid=param_test1,
                        scoring='accuracy',
                        n_jobs=2,
                        iid=False,
Ejemplo n.º 11
0
    def train_model(self):
        """Train one ``XGBClassifier`` per K-fold split and blend test predictions.

        For each of the ``self.n_folds`` folds of ``self.X_train`` /
        ``self.y_train`` a new model is built from ``self.params`` and fitted
        with both parts of the split as ``eval_set``. The train and
        validation ROC AUC are appended to ``self.train_aucs`` and
        ``self.valid_aucs``, the fold's test-set probabilities (second
        column, divided by ``self.n_folds``) are added to
        ``self.submission[self.target]``, and the booster's feature scores
        are stored in ``self.feature_importances['fold_<k>']``.

        Returns:
            The model trained on the last fold.
        """
        # KFold for cross-validation
        folds = KFold(n_splits=self.n_folds)

        # Reset the blended prediction column. This previously used the bare
        # name ``target``, which is not defined in this method; every other
        # access in the method goes through ``self.target``.
        self.submission[self.target] = 0

        training_start_time = time()
        for fold, (train_index,
                   valid_index) in enumerate(folds.split(self.X_train)):
            start_time = time()
            print('Training on Fold {}'.format(fold + 1))

            model = XGBClassifier(**self.params)

            # make train and valid set
            X_train, X_valid = self.X_train.iloc[
                train_index], self.X_train.iloc[valid_index]
            y_train, y_valid = self.y_train.iloc[
                train_index], self.y_train.iloc[valid_index]

            # train the model
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=self.metric,
                      verbose=self.verbose)

            # Drop each feature frame as soon as its predictions exist.
            train_pred = model.predict_proba(X_train)[:, 1]
            del X_train

            valid_pred = model.predict_proba(X_valid)[:, 1]
            del X_valid

            # train and valid roc_auc
            self.train_aucs.append(roc_auc_score(y_train, train_pred))
            self.valid_aucs.append(roc_auc_score(y_valid, valid_pred))

            del y_train, train_pred
            del y_valid, valid_pred

            # NOTE(review): indexing by ``fold`` assumes both AUC lists were
            # empty when this method was called -- confirm.
            print('ROC AUC on Train: {}'.format(self.train_aucs[fold]))
            print('ROC AUC on Validation: {}'.format(self.valid_aucs[fold]))

            # test set predictions for KFold: each fold adds 1/n_folds of
            # its probabilities
            test_pred = model.predict_proba(self.X_test)[:, 1]
            self.submission[self.target] = self.submission[
                self.target] + test_pred / self.n_folds

            gc.collect()

            print('Fold {} finished in {}'.format(
                fold + 1,
                str(datetime.timedelta(seconds=time() - start_time))))
            print("=" * 30)
            print()

            self.feature_importances['fold_{}'.format(fold + 1)] = pd.Series(
                model.get_booster().get_fscore())

        print('-' * 30)
        print('Training has finished!')
        print('Total training time is {}'.format(
            str(datetime.timedelta(seconds=time() - training_start_time))))
        print('Mean AUC on Train: ', np.mean(self.train_aucs))
        print('Mean AUC on Validation: ', np.mean(self.valid_aucs))
        print('-' * 30)

        return model
Ejemplo n.º 12
0
# 10.1 Print feature importance
#      https://stackoverflow.com/a/52777909
#      https://towardsdatascience.com/be-careful-when-interpreting-your-features-importance-in-xgboost-6e16132588e7
"""
importance_type

    ‘weight’ -      the number of times a feature is used to split the data across all trees.
    ‘gain’ -        the average gain across all splits the feature is used in.
    ‘cover’ -       the average coverage across all splits the feature is used in.
    ‘total_gain’ -  the total gain across all splits the feature is used in.
    ‘total_cover’ - the total coverage across all splits the feature is used in.

"""
# 11.0 Collect the 'weight' importance scores into a DataFrame,
#      sorted from highest to lowest score
feature_important = model_gs.get_booster().get_score(importance_type='weight')
feature_important
keys = list(feature_important)
values = list(feature_important.values())

data = (
    pd.DataFrame(data=values, index=keys, columns=["score"])
    .sort_values(by="score", ascending=False)
)

# 11.1 Compare the results in the following DataFrame
#      with that obtained using PermutationImportance
#      of eli5 below.