Example #1
# Split into training and testing sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20

seed = 7
test_size = 0.3
X = credit_data.loc[:, credit_data.columns != 'default payment next month']
y = credit_data[['default payment next month']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed)

colnames = credit_data.columns
y = np.array(credit_data[colnames[-1]])

# XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# tune parameters
# use Ctrl+1 to select all
#from pandas.core.categorical import Categorical
#from scipy.sparse import csr_matrix
#import numpy as np
############################################################
# filename = r'C:\Users\Admin\Downloads\bestRFmodelV1.sav'
# loadedModel = pickle.load(open(filename,'rb'))
# loadedModel
############################################################

## BASELINE TESTS ##
#############################################################################################################################################################

Xtrain, Xtest, ytrain, ytest = preProcessingPipeline( set(allFeatures)-set(['Weight','Height']) )
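
# compute_performance_Array is not defined in this excerpt; a minimal, hypothetical
# sketch, assuming it simply returns test-set accuracy:
from sklearn.metrics import accuracy_score

def compute_performance_Array(predictions, y_true):
    # fraction of held-out samples predicted correctly
    return accuracy_score(y_true, predictions)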

#Compute baseline performance (accuracy on test set) for each model type:
acc_rf = compute_performance_Array(  RandomForestClassifier(random_state=1) .fit(Xtrain,ytrain).predict(Xtest), ytest) #RandomForestClassifier
acc_lg = compute_performance_Array(  LogisticRegression(random_state=1)     .fit(Xtrain,ytrain).predict(Xtest), ytest) #LogisticRegressionClassifier
acc_nn = compute_performance_Array(  KNeighborsClassifier()                 .fit(Xtrain,ytrain).predict(Xtest), ytest) #KNeighborsClassifier
acc_gb = compute_performance_Array(  XGBClassifier(random_state=1)          .fit(Xtrain,ytrain).predict(Xtest), ytest) #XGBClassifier
acc_nb = compute_performance_Array(  GaussianNB()                           .fit(Xtrain,ytrain).predict(Xtest), ytest) #GaussianNB

print("Random Forest Accuracy:"         , acc_rf)
print("Logistic Regression Accuracy:"   , acc_lg)
print("k-Nearest Neighbours Accuracy:"  , acc_nn)
print("XGBoost Accuracy:"               , acc_gb)
print("Naive Bayes Accuracy:"           , acc_nb)

#############################################################################################################################################################
## FEATURE TUNING ##
#############################################################################################################################################################

feature_results_rf = recursiveFeatureSearch( RandomForestClassifier(random_state=1), list(set(allFeatures) - set(['Weight','Height'])) )
feature_results_gb = recursiveFeatureSearch( XGBClassifier(random_state=1),          list(set(allFeatures) - set(['Weight','Height'])) )
def ZCR(x):
    # zero-crossing rate of a series (used as a custom aggregator below)
    return float("{0:.2f}".format(
        (((np.array(x)[:-1] * np.array(x)[1:]) < 0).sum()) / len(x)))
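
# The other custom aggregators used in the groupby below (q1, q3, IQR, RMS) are not
# defined in this excerpt; a minimal sketch under the standard definitions:
def q1(x):
    return np.percentile(x, 25)

def q3(x):
    return np.percentile(x, 75)

def IQR(x):
    # interquartile range
    return np.percentile(x, 75) - np.percentile(x, 25)

def RMS(x):
    # root mean square
    return np.sqrt(np.mean(np.square(x)))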


grouped = train[features].groupby('id')
X_train = grouped.agg(['max', 'min', 'mean', 'mad', q1, q3, IQR, RMS, ZCR])
X_test = test[features].groupby('id').agg(
    ['max', 'min', 'mean', 'mad', q1, q3, IQR, RMS, ZCR])
y_train = train_label['label']

from xgboost import XGBClassifier

xgb_wrapper = XGBClassifier(n_estimators=400,
                            learning_rate=0.3,
                            max_depth=3,
                            min_child_weight=5,
                            gamma=0.3,
                            subsample=0.9,
                            colsample_bytree=0.4)
xgb_wrapper.fit(X_train, y_train)

w_preds = xgb_wrapper.predict(X_test)
w_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

y_pred = xgb_wrapper.predict_proba(X_test)

submission.iloc[:, 1:] = y_pred
submission

submission.to_csv('xgboost_q1q3_iqrrmszcr.csv', index=False)
Example #4
def train():
    print("Starting writing classifier training...")
    if USE_POS_TAG:
        df = pd.read_csv(
            path.join(path.dirname(__file__), 'data/scrapeResultPOS.csv'))
    else:
        df = pd.read_csv(
            path.join(path.dirname(__file__), 'data/scrapeResultCleaned.csv'))

    # missing_rows = []

    # for i in range(len(df)):
    #     if df.loc[i, 'text'] != df.loc[i, 'text']:
    #         missing_rows.append(i)

    # df = df.drop(missing_rows).reset_index().drop(['index', 'id'], axis=1)

    # df = df.drop_duplicates(subset='text', keep='first')
    # df = df.drop_duplicates(subset='link', keep='first')

    count_fake = 0
    count_real = 0
    rows_to_drop = []

    for index, row in df.iterrows():
        if row['label'] == 1:
            if count_fake > 5000:
                # cap the fake articles at ~5000: collect surplus rows and drop
                # them after the loop (drop() is not in-place)
                rows_to_drop.append(index)
                continue
            count_fake += 1
        else:
            count_real += 1
    df = df.drop(rows_to_drop)
    print("Number of fake articles is ", count_fake)
    print("Number of real articles is ", count_real)

    # Set `y`
    y = df.label

    # Drop the `label` column (assign the result; drop() is not in-place by default)
    df = df.drop("label", axis=1)

    # Make training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(df['text'],
                                                        y,
                                                        test_size=0.2,
                                                        random_state=53)

    # Initialize the `tfidf_vectorizer`
    if USE_POS_TAG:
        vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                     stop_words='english',
                                     min_df=2,
                                     norm='l2',
                                     strip_accents='unicode',
                                     lowercase=True)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     ngram_range=(1, 2),
                                     stop_words='english',
                                     max_df=0.8,
                                     min_df=0.01,
                                     max_features=5000,
                                     strip_accents='unicode')

    # Fit and transform the training data
    X_train = vectorizer.fit_transform(X_train)

    # Transform the test set
    X_test = vectorizer.transform(X_test)

    clf = XGBClassifier()
    clf.fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)

    print("Classification Report Writing")
    print(metrics.classification_report(Y_test, Y_predicted))

    if (USE_POS_TAG):
        modelFile = path.join(path.dirname(__file__), "model-POS.xgb")
    else:
        modelFile = path.join(path.dirname(__file__), "model.xgb")
    outfile = open(modelFile, 'wb')
    pickle.dump(clf, outfile)
    outfile.close()

    if (USE_POS_TAG):
        vectorizerFile = path.join(path.dirname(__file__),
                                   "vectorizer-POS.tfidf")
    else:
        vectorizerFile = path.join(path.dirname(__file__), "vectorizer.tfidf")

    outfile = open(vectorizerFile, 'wb')
    pickle.dump(vectorizer, outfile)
    outfile.close()

    return clf, vectorizer
Example #5
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

print('y_train class distribution')
print(y_train.value_counts(normalize=True))

print('y_test class distribution')
print(y_test.value_counts(normalize=True))

scorers = {
    'precision_score': make_scorer(precision_score),
    'f1_score': make_scorer(f1_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

lgbm = LGBMClassifier()
knn = KNeighborsClassifier()
catb = CatBoostClassifier()
xgb = XGBClassifier()
et = ExtraTreesClassifier()

#Extratrees Parameters
n_estimators = np.arange(50,350)
max_depth = np.arange(5,350)
max_features = ['sqrt', 'log2']
param_grid = dict(n_estimators = n_estimators, max_features = max_features, max_depth = max_depth)
et = ExtraTreesClassifier()
randomized = RandomizedSearchCV(et, param_grid, scoring = 'f1', cv = 2, n_iter = 10)
randomized.fit(X_train,y_train)
randomized.best_estimator_

#knn Parameters
#k_range = np.arange(1,100)
#weights = ["uniform","distance"]
#
# print(model)

# preds = model.predict(test[:,0:(n2-1)])
# accuracy=accuracy_score(test[:,(n2-1)], preds)

xtrain = train[:, 0:(n2 - 1)]
ytrain = train[:, (n2 - 1)]
xtest = test[:, 0:(n2 - 1)]
ytest = test[:, (n2 - 1)]
model = XGBClassifier(learning_rate=0.1,
                      n_estimators=1000,
                      max_depth=10,
                      min_child_weight=3,
                      gamma=0,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      objective='binary:logistic',
                      nthread=4,
                      scale_pos_weight=1,
                      seed=27)
model.fit(xtrain, ytrain)  # fit on the training split only, keeping xtest held out for evaluation

preds = model.predict(xtest)
accuracy = accuracy_score(ytest, preds)

df = pd.read_csv('test.csv', delimiter=',')

Embarked_map = {'S': 0, 'C': 1, 'Q': 2}
sex_map = {'male': 1, 'female': 0}
Example #7
# Visualize the count
sns.countplot(df['status'])

#Get the Data type
df.dtypes

#Create the feature data set
X = df.drop(['name'], axis=1)
X = np.array(X.drop(['status'], axis=1))
y = np.array(df['status'])

#Split the data into 80% training and 20% testing data sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Transform the feature data to be values between 0 and 1
sc = MinMaxScaler(feature_range=(0, 1))
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Create the XGBClassifier
model = XGBClassifier().fit(x_train, y_train)

#get the models predictions
predictions = model.predict(x_test)
predictions

y_test

#get the models accuracy, precision, recall and the f1- score
print(classification_report(y_test, predictions))
print(' ')
print(' ')
from sklearn.model_selection import cross_val_score
#train model with cv of 10
cv_scores = cross_val_score(model, X, y, cv=10)

#Display the results
print('List of Cross-Validation Scores:', cv_scores)
print('Mean of Cross-Validation Scores:{}'.format(np.mean(cv_scores)))

"""# Model 1: XGBoost Classification
(Parallel Tree Gradient Boosting)
"""

Model = "XGBClassifier()" # Adds to title in viz
model = XGBClassifier() # Create the Model

train_test_ml_model(X_train, y_train, X_test, Model)
cross_val(X, y, Model)

"""# Model 2: K-Nearest Neighbors Classification (KNN)"""

# Attempt 1: Out of Box

#n_neighbors=5 out of the box
Model = "KNeighborsClassifier"
model = KNeighborsClassifier()

train_test_ml_model(X_train,y_train,X_test,Model)
cross_val(X, y, Model)
Example #9
    #print (td[col])
X_train = td[[x for x in td.columns if 'class' not in x]]
#print (X_train)
Y_train = td['class']
#print (Y_train)
y = preprocessing.LabelEncoder()
for col in td1.columns:
    td1[col] = y.fit_transform(td1[col])
X_test = td1[[x for x in td.columns if 'class' not in x]]
#print (X_test)
Y_test = td1['class']
#print (Y_test)

#import xgboost as xgb
from xgboost import XGBClassifier
xgb_data = XGBClassifier().fit(X_train, Y_train)
#print (xgb_data)

xgb_predictions = xgb_data.predict(X_test)
#print (xgb_predictions)

# model accuracy for X_test
acc_train = xgb_data.score(X_train, Y_train)
#print (acc_train)
accuracy = xgb_data.score(X_test, Y_test)
print(accuracy)

# creating a confusion matrix
cm = confusion_matrix(Y_test, xgb_predictions)
#print (cm)
print(classification_report(Y_test, xgb_predictions))
    prepare_data(train_values_df, test_values_df, train_labels_df)

# pipeline to place median for NaNs and normalize data
# prepared_X_train_values = feature_pipeline(train_values_df, num_attrib, cat_attrib)
# prepared_X_test_values = feature_pipeline(test_values_df, num_attrib, cat_attrib)

prepared_X_train_values, prepared_test_values = \
    target_encode_multiclass(train_values_df, train_labels_df, test_values_df)

# generating stratified training and validation data sets from sparse matrices
prepared_X_strat_train, y_strat_train_df, prepared_X_strat_val, y_strat_val_df = \
    stratified_shuffle_data_split(prepared_X_train_values, train_labels_df)

# classifiers employed for training
classifier_dict = {
                    'xgb_clf': XGBClassifier(n_estimators=500, learning_rate=0.3, colsample_bytree=0.3,
                                             subsample=0.3, early_stopping_rounds=50, verbosity=0),
                    'sgd_clf': SGDClassifier(loss='modified_huber', n_jobs=-1, early_stopping=True),
                    # 'rf_clf': RandomForestClassifier(n_estimators=500, n_jobs=-1),
                    'cat_clf': CatBoostClassifier(iterations=2e3, allow_writing_files=False,
                                                  learning_rate=0.3, loss_function='MultiClass',
                                                  custom_metric=['Accuracy', 'AUC', 'TotalF1'],
                                                  verbose=100),
                    'ada_clf': AdaBoostClassifier(n_estimators=100, learning_rate=0.3),
                   }

# creates list of named classifier tuples for training
clf_list = clf_func(classifier_dict)

# runs actual training on classifiers and outputs results to screen
run_clf(prepared_X_strat_train, prepared_X_strat_val, y_strat_train_df, y_strat_val_df, clf_list, model_dir)
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y,
                       top_n_features):
    #randomforest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [2, 3],
        'max_depth': [20]
    }
    rf_grid = model_selection.GridSearchCV(rf_est,
                                           rf_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' +
          str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier:')
    print(str(features_top_n_rf[:10]))

    #AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est,
                                            ada_param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' +
          str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    #ExtraTree
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [3, 4],
        'max_depth': [20]
    }
    et_grid = model_selection.GridSearchCV(et_est,
                                           et_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' +
          str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {
        'n_estimators': [500],
        'learning_rate': [0.01, 0.1],
        'max_depth': [20]
    }
    gb_grid = model_selection.GridSearchCV(gb_est,
                                           gb_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' +
          str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est,
                                           dt_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' +
          str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # XGBClassifier
    XGB_est = XGBClassifier(random_state=0)
    XGB_param_grid = {'n_estimators': [60], 'max_depth': [9]}
    XGB_grid = model_selection.GridSearchCV(XGB_est,
                                            XGB_param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1)
    XGB_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best XGB Params:' + str(XGB_grid.best_params_))
    print('Top N Features Best XGB Score:' + str(XGB_grid.best_score_))
    print('Top N Features XGB Train Score:' +
          str(XGB_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_XGB = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        XGB_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_XGB = feature_imp_sorted_XGB.head(top_n_features)['feature']
    print('Sample 10 Features from XGB Classifier:')
    print(str(features_top_n_XGB[:10]))

    # merge the six models' selected features (feature fusion)
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et,
        features_top_n_gb, features_top_n_dt, features_top_n_XGB
    ],
                               ignore_index=True).drop_duplicates()
    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
        feature_imp_sorted_gb, feature_imp_sorted_dt, feature_imp_sorted_XGB
    ],
                                    ignore_index=True)
    # features_top_n = pd.concat([features_top_n_dt],ignore_index=True).drop_duplicates()
    # features_importance = pd.concat([feature_imp_sorted_dt], ignore_index=True)
    # features_top_n = FeatureUnion([('randomforest',RandomForestClassifier()), ('AdaBoost',AdaBoostClassifier()),('ExtraTree',ExtraTreesClassifier()),
    #                                ('GradientBoosting',GradientBoostingClassifier()),('DecisionTree',DecisionTreeClassifier()),('XGBClassifier',XGBClassifier())])
    # features_importance = FeatureUnion([('randomforest',RandomForestClassifier()), ('AdaBoost',AdaBoostClassifier()),('ExtraTree',ExtraTreesClassifier()),
    #                                ('GradientBoosting',GradientBoostingClassifier()),('DecisionTree',DecisionTreeClassifier()),('XGBClassifier',XGBClassifier())])

    return features_top_n, features_importance
Example #12
        x = frame[features]
        y = frame[[Dependent]]

        # short
        X_train = frame[features].iloc[:108, :]
        y_train = frame[Dependent].iloc[:108]
        X_test = frame[features].iloc[108:, :]
        y_test = frame[Dependent].iloc[108:]

        sc = StandardScaler()
        sc.fit(X_train[feature])
        X_train_std = sc.transform(X_train[feature])
        X_test_std = sc.transform(X_test[feature])

        ml = XGBClassifier(n_estimators=100,
                           min_child_weight=1,
                           max_depth=6,
                           gamma=0)
        ml.fit(X_train_std, y_train)
        y_pred = ml.predict(X_test_std)

        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)

        print('accuracy : %.3f' % accuracy)
        print('precision : %.3f' % precision)
        print('recall : %.3f' % recall)

        X_test['LM3DN'] = y_test
        X_test['LM3DN_pred'] = y_pred
        X_test.to_csv("XGB_Kospi_LM3DN_result.csv", encoding='cp949')
Example #13
x_test = dict_vec.transform(x_test.to_dict(orient='records'))

# x_train['Sex']=(x_train['Sex']=='male').astype('int')
# x_test['Sex']=(x_test['Sex']=='male').astype('int')
# all_Embarked=x_train['Embarked'].unique().tolist()
# x_train['Embarked']=x_train['Embarked'].apply(lambda x:all_Embarked.index(x))
# x_test['Embarked']=x_test['Embarked'].apply(lambda x:all_Embarked.index(x))
'''
3. Model training (random forest, xgboost), cross-validation,
    learning-curve plotting, and CSV output
'''
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

rfc = RandomForestClassifier()
xgbc = XGBClassifier()

from sklearn.model_selection import cross_val_score, ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=78)
rfc_scores = cross_val_score(rfc, x_train, y_train, cv=cv)
xgbc_scores = cross_val_score(xgbc, x_train, y_train, cv=cv)
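# Report each model's mean cross-validated accuracy
print('RandomForestClassifier CV accuracy: %.3f' % rfc_scores.mean())
print('XGBClassifier CV accuracy: %.3f' % xgbc_scores.mean())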

#Plot learning curves to check how well the models fit
from learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 6))
plt.subplot(121)
plot_learning_curve(xgbc, 'xgbc', x_train, y_train, cv=cv)
plt.subplot(122)
Example #14
# ABT

abt = pd.concat([df[target], df_numerical_imputed, df_categorical_encoded],
                axis=1)

# Model Training #

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    abt[abt.columns.difference(target)],
    abt[target],
    test_size=0.33,
    random_state=27513)

# Build XGBoost classifier
xgb = XGBClassifier(max_depth=4,
                    subsample=0.9,
                    objective='binary:logistic',
                    n_estimators=100,
                    learning_rate=0.1)
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb.fit(X_train,
        y_train.values.ravel(),
        early_stopping_rounds=10,
        eval_metric=["error", "logloss"],
        eval_set=eval_set,
        verbose=True)

output = open('./experiment_xgboost/xgboost.pickle', 'wb')
joblib.dump(xgb, output)
output.close()
    plt.savefig('../img/gradient_boost_confusionmatrix.png')
    return ax


df = pd.read_csv('../data/clean_train.csv')

X = df.drop(['churn', 'Unnamed: 0'], axis=1)
y = df['churn']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.33)

model = XGBClassifier(booster='gbtree',
                      learning_rate=0.7,
                      max_depth=3,
                      n_estimators=30,
                      nthread=-1,
                      gamma=0.7,
                      max_delta_step=3,
                      min_child_weight=5,
                      subsample=1)
model.fit(X_train, y_train)

pred = model.predict(X_test)

#print(grid_search(X_train,y_train,model))
graph()
#print(cmatrix(y_test,pred))
plot_confusion_matrix(y_test, pred)
print('Precision:', prec(y_test, pred))
print('Recall:', rec(y_test, pred))
print('Accuracy:', acc(y_test, pred))
Example #16
# Define Variables
###############################################################################

print("Defining variables...")
# DATASETS TO BE GENERATED
DATASET_NAMES = ["5k_95_5_6d","5k_85_15_6d","5k_70_30_6d","10k_95_5_7d","10k_85_15_7d","10k_70_30_7d",
                 "15k_95_5_9d","15k_85_15_9d","15k_70_30_9d"]

#DATASET PARAMETERS
n = [5000,5000,5000,10000,10000,10000,15000,15000,15000]
#n = [300,300,300,400,400,400,500,500,500]
d = [6,6,6,7,7,7,9,9,9]
w = [[.95,.05],[.85,.15],[.70,.30],[.95,.05],[.85,.15],[.70,.30],[.95,.05],[.85,.15],[.70,.30]]

# MODELS TO BE FITTED
MODELS = [SVC(),RandomForestClassifier(),XGBClassifier()]
TRAINED_MODELS = [[SVC() for _ in DATASET_NAMES],
                  [RandomForestClassifier() for _ in DATASET_NAMES],
                  [XGBClassifier() for _ in DATASET_NAMES]]  # one fresh model per dataset
# MODEL NAMES
MODEL_NAME = ['SVM','RF','GB_DT']

# MODEL PARAMETERS 
M_PARAMS = [{'kernel':['linear'],'gamma': [0.1, 0.01, 0.001,1,1.5,5,10]},
              {'max_depth': [3, 5, 6, 7],'min_samples_split': [3, 5, 6, 7],'n_estimators':[10,50,100]},
              {'max_depth': [5,6,7,8], 'gamma': [0.1, 0.01, 0.001],'learning_rate': [0.05,0.1, 0.2, 0.3]}]
print("Variables successfully defined")
###############################################################################
# Generate and save the various datasets
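# The generation loop itself is cut off in this excerpt; a minimal sketch, assuming
# sklearn's make_classification with the n, d and w parameters defined above
# (the CSV file names are hypothetical):
from sklearn.datasets import make_classification
import pandas as pd

for name, n_i, d_i, w_i in zip(DATASET_NAMES, n, d, w):
    X_i, y_i = make_classification(n_samples=n_i, n_features=d_i,
                                   weights=w_i, random_state=42)
    pd.DataFrame(X_i).assign(label=y_i).to_csv(name + ".csv", index=False)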
    dataset['Embarked'] = label.fit_transform(dataset['Embarked'])
    dataset['Title'] = dataset['Name'].str.split(
        ", ", expand=True)[1].str.split(".", expand=True)[0]
    dataset['Title'] = dataset['Title'].map({
        'Mr': 0,
        'Mrs': 1,
        'Miss': 2,
        'Master': 3
    })
    dataset['Title'] = dataset['Title'].fillna(4)
    dataset['Sex'] = label.fit_transform(dataset['Sex'])

features = train_set[[
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'
]]
survival = train_set[['Survived']]

test_X = test_set[[
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'
]]

#Training Model
XGB_Model = XGBClassifier()
XGB_Model.fit(features, survival.values.ravel())

#Testing Model and Submitting
XGBy_pred = XGB_Model.predict(test_X)  #Predicting based on testing data
submit = [test_set['PassengerId'], XGBy_pred]
submit = DataFrame(submit, index=['PassengerId', 'Survived']).T
submit = submit.set_index('PassengerId')
submit.to_csv('predictions.csv')
Example #18
    sc = StandardScaler()
    sc.fit(food_X)
    
    food_X = sc.transform(food_X)
    

    XGB_cv = []
    KNN_cv = []
    SVM_cv = []
    RFC_cv = []
    LR_cv = []
    
    #X_train, X_test, y_train, y_test = train_test_split(food_X, food_y, test_size=0.3,shuffle=False)

    #XGB
    XGB_model = XGBClassifier(min_child_weight=0.1,max_depth=7)
    
    #KNN
    KNN_model = KNeighborsClassifier()

    #SVM
    SVM_model = SVC(kernel = 'linear',probability = True)

    #Random forest
    RFC_model = RandomForestClassifier(n_estimators=100,n_jobs=5)

    #Logistic regression
    LR_model = LogisticRegression()

    scores_x = cross_val_score(XGB_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')
    scores_k = cross_val_score(KNN_model,food_X,food_y,cv=5,scoring='neg_root_mean_squared_error')
Example #19
y_pred = classifier.predict(x_test)
y_pred = scPrice.inverse_transform(y_pred)
y_pred = np.squeeze(y_pred)

output = pd.DataFrame({ 'Id' : ids, 'SalePrice': y_pred })
output.to_csv('house_prediction_NN.csv', index = False)






############################ Xgboost ###################################### 

from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train.ravel())
y_pred = classifier.predict(x_test)
y_pred = scPrice.inverse_transform(y_pred)
y_pred = np.squeeze(y_pred)

output = pd.DataFrame({ 'Id' : ids, 'SalePrice': y_pred })

output.to_csv('house_prediction_xgboost.csv', index = False)





Example #20
def Retrain_Model_10_Iterates_SVMSMOTE(target,
                                       title,
                                       max_depth=3,
                                       n_esti=160,
                                       lr=0.1,
                                       withexperience=False,
                                       color='YlGnBu'):
    matrics = []
    seed(2145)
    groups = df_model_draft['HospID']
    if withexperience is False:
        X = df_model_draft.drop(
            ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM'], axis=1)
        y = df_model_draft[target]
    else:
        X = df_model_draft.drop([
            'SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM',
            'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
            'HospID_total_cardiac_surgery', 'surgid_total_cardiac_surgery',
            'surgid_total_CABG', 'surgid_Reop_CABG'
        ],
                                axis=1)
        y = df_model_draft[target]

    print(groups.shape)
    print(groups.unique())
    gss = GroupShuffleSplit(n_splits=10, train_size=.8, random_state=42)
    gss.get_n_splits()
    i = 1
    for train_idx, test_idx in gss.split(X, y, groups):
        print("TRAIN:", train_idx, "TEST:", test_idx)
        if (i == 1):
            X = X.drop(['HospID'], axis=1)

        print(X.columns.tolist())
        X_train = X.loc[train_idx]
        y_train = y.loc[train_idx]

        X_test = X.loc[test_idx]
        y_test = y.loc[test_idx]
        print("\nTRAIN DATAFRAME\n", X_train.shape)
        print("\nTEST DATAFRAME\n", X_test.shape)
        # summarize class distribution

        sm = SVMSMOTE()  # SVMSMOTE(random_state=21)
        # fit and apply the transform
        X_over, y_over = sm.fit_resample(X_train, y_train)

        # summarize class distribution
        print("after oversampling with SVMSMOTE")
        counter = Counter(y_over)
        print(counter)
        estimate = counter[0] / counter[1]
        print('Estimate: %.3f' % estimate)

        model = XGBClassifier(objective='binary:logistic',
                              eval_metric='logloss',
                              max_depth=max_depth,
                              learning_rate=lr,
                              n_estimators=n_esti)
        model.fit(X_over, y_over)
        y_pred = model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        mats = Make_Confusion_Matrix(cm,
                                     categories=categories,
                                     cmap=color,
                                     title=title,
                                     group_names=labels,
                                     y_pred=y_pred,
                                     y_test=y_test)
        auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
        mats['AUROC'] = auc
        matrics.append(mats)
        i = i + 1
    return matrics
Example #21
basicparameter = {
    'scale_pos_weight': 1,
    'max_delta_step': 5,
    'n_jobs': 1,
    'random_state': 0,
    'max_depth': 5,
    'min_child_weight': 3,
    'n_estimators': 300,
    'subsample': 1.0,  #0.9,
    'colsample_bytree': 0.5,
    'reg_lambda': 10,
    'reg_alpha': 0.1,
    'learning_rate': 0.01,
    'gamma': 0.1
}

xgb = XGBClassifier(**basicparameter)
# xgb=XGBClassifier() # apply the default parameters
xgb.fit(Xtrain, Ytrain)

#  score the model
print('============================= XGBoost =============================')
score(xgb, Xtrain, Ytrain, Xtest, Ytest)

print('============================== SHAP ===============================')
explainer = shap.TreeExplainer(xgb)  # define the explainer
shap_values = explainer.shap_values(X)  # use all data for analysis
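
# A typical follow-up: visualize the global feature impact with SHAP's summary plot,
# using the shap_values computed above.
shap.summary_plot(shap_values, X)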


def gen_data(inputs, X):
    """ creates a data Frame with inputs and X for statistics with shap """
    df1 = pd.DataFrame()
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.9080067906471255
exported_pipeline = XGBClassifier(learning_rate=0.1,
                                  max_depth=8,
                                  min_child_weight=12,
                                  n_estimators=100,
                                  nthread=1,
                                  subsample=0.9000000000000001)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
    (logistic_Y_test_predic.reshape(-1, 1), svc_Y_test_predic.reshape(-1, 1),
     knn_Y_test_predic.reshape(-1, 1), gauss_bayes_Y_test_predic.reshape(
         -1, 1), perceptron_Y_test_predic.reshape(
             -1, 1), sgd_Y_test_predic.reshape(
                 -1, 1), decision_tree_Y_test_predic.reshape(
                     -1, 1), random_forest_Y_test_predic.reshape(-1, 1)),
    axis=1)

# Model 9: xgboost
from xgboost import XGBClassifier
gbm = XGBClassifier(
    #learning_rate = 0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    #gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1).fit(X_train, y_train)
predictions = gbm.predict(X_test)
print("gbm score: " + str(gbm.score(X_train, y_train)))

# Generate Submission File
StackingSubmission = pd.DataFrame({
    'PassengerId': test_PassengerId,
    'Survived': predictions
})
StackingSubmission.to_csv("./data/StackingSubmission.csv", index=False)
Example #24
c = np.vstack((ID, predictions2)).transpose()
columns = ['id_num', 'is_pass']
index = range(len(predictions2))
obj1 = pd.DataFrame(c, index, columns)

# In[162]:

obj1

# ### XGBoost algorithm

# In[188]:

from xgboost import XGBClassifier

clf = XGBClassifier()
x_train = train_base[predictors]  ##training features
y_train = train_base["is_pass"]  ##training target

clf.fit(x_train, y_train)  ##fit the model
test_predict = clf.predict(test_base[predictors])

# In[189]:

ID = test_base['id_num']
ID = np.array(ID)
c = np.vstack((ID, test_predict)).transpose()
columns = ['id_num', 'is_pass']
index = range(11684)
obj2 = pd.DataFrame(c, index, columns)
Example #25
def build_model(X, y, cross=5, models=['xgb']):
    """
    Need support for more models, along with cross validation and feature importances which can be easily taken out
    something like
    build_model(X,y,cross = 5,model)
        if model == 'xgb':
            ...
        if model == 'logistic'
            ...
    """
    best_score = 0
    acc = 0  # ensures the best-model comparison below works even if a branch does not set acc
    seed = 7
    test_size = 0.30
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed)
    for model1 in models:
        if model1 == 'xgb':
            print("XGBoost Classifier: \n")
            model = XGBClassifier()
            model.fit(X_train, y_train)
            joblib.dump(model, 'xgb.pkl')
            pred = model.predict(X_test)
            pred = pred.astype(int)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
            post_proc(X, model)
        if model1 == 'Logistic':
            print("\n Logistic Classifier: \n")
            model = LogisticRegression(solver='liblinear')
            model.fit(X_train, y_train)
            joblib.dump(model, 'logi.pkl')
            pred = model.predict(X_test)
            prob = model.predict_proba(X_test)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
            cm = confusion_matrix(y_test, pred)
            fig, ax = plt.subplots(figsize=(8, 8))
            ax.imshow(cm)
            ax.grid(False)
            ax.xaxis.set(ticks=(0, 1),
                         ticklabels=('Predicted 0s', 'Predicted 1s'))
            ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
            ax.set_ylim(1.5, -0.5)
            for i in range(2):
                for j in range(2):
                    ax.text(j,
                            i,
                            cm[i, j],
                            ha='center',
                            va='center',
                            color='red')
            plt.show()
            Logi = pickle.dumps(model)
        if model1 == 'auto':
            print("\n Auto: \n")
            tpot = TPOTClassifier(verbosity=2,
                                  max_time_mins=2,
                                  scoring='balanced_accuracy')
            tpot.fit(X_train, y_train)
            print(tpot.score(X_test, y_test))
        if model1 == 'SVM':
            print("\n SVM: \n")
            model = svm.NuSVC(gamma='auto')
            model.fit(X_train, y_train)
            joblib.dump(model, 'svm.pkl')
            pred = model.predict(X_test)
            pred = pred.astype(int)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
        if model1 == 'RandomForest':
            print("\n Random Forest: \n")
            model = RandomForestClassifier()
            model.fit(X_train, y_train)
            joblib.dump(model, 'rf.pkl')
            pred = model.predict(X_test)
            pred = pred.astype(int)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100

        if acc > best_score:
            best_score = acc
            model2 = model
            joblib.dump(model2, 'best.pkl')
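
# Example call, assuming X and y have already been prepared: build_model compares
# the requested models and saves the best one as 'best.pkl'.
build_model(X, y, cross=5, models=['xgb', 'Logistic', 'RandomForest'])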
Example #26
class OOF(object):
    """Out-of-fold prediction
    # TODO: support regression as well

    LightGBM grows trees node by node, while XGBoost grows them level by level
    https://blog.csdn.net/friyal/article/details/82758532
    CatBoost always uses fully symmetric (oblivious) binary trees; it argues that symmetric
    trees help avoid overfitting, add robustness, and greatly speed up prediction.
        It also encodes each category by its frequency (plus hyperparameters) to generate new numerical features
    # https://blog.csdn.net/linxid/article/details/80723811
    """
    _params = {
        'metric': 'auc',
        'learning_rate': 0.01,
        'n_estimators': 30000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'class_weight': 'balanced',  ##
        'scale_pos_weight': 1,  ##
        'random_state': 2019,
        'verbosity': -1
    }
    lgb = LGBMClassifier(n_jobs=16, **_params)  # TODO: move commonly used models into a separate module
    xgb = XGBClassifier()
    cat = CatBoostClassifier(n_estimators=20000,
                             learning_rate=0.05,
                             loss_function='Logloss',
                             eval_metric='AUC',
                             random_state=2019)

    def __init__(self,
                 estimator=None,
                 folds=None,
                 early_stopping_rounds=300,
                 verbose=100):
        self.estimator = self.lgb if estimator is None else estimator  # note: lgb uses `metric`, xgb uses `eval_metric`
        self.folds = folds if folds else StratifiedKFold(
            5, shuffle=True, random_state=2019)  # RepeatedStratifiedKFold is also supported
        self.model_type = self.estimator.__repr__()

        self.early_stopping_rounds = early_stopping_rounds
        self.verbose = verbose
        # self.estimator_agrs = self.getfullargspec(self.estimator.fit).args if hasattr(self.estimator, 'fit') else None

    def fit(self,
            X,
            y,
            X_test,
            feval=None,
            cat_feats=None,
            exclude_columns=None,
            epochs=16,
            batch_size=128,
            oof2csv=False,
            plot=False):
        """
        # TODO: Rank 融合
        :param X: 保证索引唯一
        :param y:
        :param X_test:
        :param feval: roc_auc_score(y_true, y_score)
        :param cat_feats: 类别特征索引
        :param exclude_columns:
        仅针对 nn
        :param epochs:
        :param batch_size:
        :return:
        """
        # coerce inputs to DataFrames/Series and reset indices
        if isinstance(y, pd.Series):
            y.reset_index(drop=True, inplace=True)

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            X_test = pd.DataFrame(X_test)
        else:
            X.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

        # OOF evaluation metric (defaults to roc_auc_score)
        feval = feval if feval else roc_auc_score

        # drop columns that should be excluded from training
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
            X, X_test = X[feats], X_test[feats]

        # Score
        if hasattr(feval, '__repr__'):
            score_name = feval.__repr__().split()[1]
        else:
            score_name = None

        # cv num
        if hasattr(self.folds, 'n_splits'):
            num_cv = self.folds.n_splits
        else:
            num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(X.shape[0])
        sub_preds = np.zeros((X_test.shape[0], num_cv))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx,
                     valid_idx) in enumerate(self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))

            X_train, y_train = X.iloc[train_idx], y[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

            if not hasattr(self.estimator, 'fit'):
                print("This estimator has no fit method")
                break
            else:
                if 'LGBMClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'LGBMRegressor' in self.model_type:
                    # reg_objs = ['regression_l1', 'regression_l2', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        # eval_metric='l2',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'XGBClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'XGBRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        # eval_metric='rmse',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'CatBoostClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'CatBoostRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)

                elif 'RGFClassifier' in self.model_type:
                    pass
                elif 'RGFRegressor' in self.model_type:
                    pass

                # https://www.cnblogs.com/flyu6/p/7691106.html
                elif 'KerasClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)
                elif 'KerasRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)

                elif self.model_type == 'GLM':
                    # TODO: support other model types
                    self.estimator = GLM(y_train,
                                         X_train,
                                         family=families.Binomial())
                    self.estimator = self.estimator.fit().predict(X)
                else:
                    # plain sklearn estimator
                    print('Sklearn Fitting ...')
                    self.estimator.fit(X_train, y_train)

                # compute and store the fold predictions
                # TODO: needs changes for multiclass problems
                if hasattr(self.estimator, 'predict_proba'):
                    oof_preds[valid_idx] = self.estimator.predict_proba(
                        X_valid)[:, 1]
                    sub_preds[:, n_fold -
                              1] = self.estimator.predict_proba(X_test)[:, 1]
                else:
                    oof_preds[valid_idx] = self.estimator.predict(X_valid)
                    sub_preds[:, n_fold - 1] = self.estimator.predict(X_test)

            if plot and hasattr(self.estimator, 'feature_importances_'):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = X.columns
                fold_importance_df[
                    "importance"] = self.estimator.feature_importances_
                fold_importance_df["fold"] = n_fold
                self.feature_importance_df = fold_importance_df.append(
                    self.feature_importance_df)

        # store the outputs
        self.oof_preds = oof_preds
        self.sub_preds = sub_preds.mean(1)
        self.sub_preds_rank = pd.DataFrame(sub_preds).rank().mean(
            1) / sub_preds.shape[0]  # rank-averaged predictions (works for AUC)

        try:
            self.score = feval(y, self.oof_preds)
        except Exception as e:
            self.score = 0
            print('Error feval:', e)

        print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
              (score_name, self.score, time.ctime()))

        # persist the OOF and simple-average test predictions
        if oof2csv:
            pd.Series(np.append(self.oof_preds, self.sub_preds), name='oof') \
                .to_csv('OOF %s %.4f.csv' % (time.ctime(), self.score), index=False)

        # optionally plot feature importances
        if plot:
            self.feature_importance_df.sort_values(['fold', 'importance'],
                                                   0,
                                                   False,
                                                   inplace=True)
            self.plot_importances(self.feature_importance_df, len(X.columns))

    def plot_importances(self, df, topk=64):
        """Display/plot feature importance"""
        assert "feature" in df.columns and "importance" in df.columns, 'df must have ["feature", "importance"] columns'

        data = (df[["feature", "importance"
                    ]].groupby("feature").mean().reset_index().sort_values(
                        "importance", 0, False))[:topk]

        self.feature_importance_df_agg = data
        plt.figure(figsize=(12, topk // 4))
        sns.barplot(x="importance",
                    y="feature",
                    data=data.assign(feature='col_' +
                                     data.feature.astype(str)))
        plt.title('Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('importances.png')
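
# A minimal usage sketch of the OOF helper above, assuming X, y and X_test are
# already-prepared DataFrames/Series (the estimator defaults to the class-level LGBMClassifier):
oof = OOF(early_stopping_rounds=100)
oof.fit(X, y, X_test, feval=roc_auc_score)
test_pred = oof.sub_preds  # fold-averaged predictions for X_test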
Example #27
# =====================================================================================================================

# define GridSearchCV parameters
cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
ind_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic'
}

# define model
model = GridSearchCV(
    XGBClassifier(**ind_params),
    cv_params,
    scoring='accuracy',  # accuracy as the evaluation metric
    cv=5,  # 5-fold cross-validation
    n_jobs=-1)  # number of parallel jobs; -1 uses all CPU cores (1 is the default)

# =====================================================================================================================


def XGB_TRAIN_EVA():

    # prepare train data
    train_data = pd.read_csv(TRAIN_DATA_PATH)
    train_data.pop('index')
    Y_train = train_data.pop('income')
    X_train = train_data
from sklearn.model_selection import cross_validate

scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

results = cross_validate(knnclassifier, X_train, y_train, cv=10, scoring=list(scoring.values()), 
                         return_train_score=False)
print('K-fold cross-validation results:')
for name, scorer in scoring.items():
    mean = results['test_%s' % scorer].mean()
    if scorer == 'neg_log_loss':
        mean = -mean  # report log loss as a positive number
    std = results['test_%s' % scorer].std()
    print(knnclassifier.__class__.__name__ + " average %s: %.3f (+/-%.3f)" % (name, mean, std))

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
xgclassifier = XGBClassifier()
xgclassifier.fit(X_train, y_train)

# Predicting the Test set results
xg_pred = xgclassifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, xg_pred)
print(cm)
print("Model Accuracy for XGBoost:",metrics.accuracy_score(y_test, xg_pred))

print(classification_report(y_test,xg_pred))

"""Feature importance and weight determination"""
Example #29
                     scoring='f1',
                     return_train_score=True)
gs_rf.fit(X_train, y_train)
'''
Best Estimator
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=10, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

best_f1 = .39
'''
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

xgb = XGBClassifier()
cv_xg = cross_validate(xgb, X_train, y_train, scoring=['accuracy', 'f1'])
'''
 'test_accuracy': array([0.6727133 , 0.67297048, 0.67420049]),
 'test_f1': array([0.15816528, 0.20663931, 0.2000755 ])}
'''

svc = SVC()
cv_svc = cross_validate(svc, X_train, y_train, scoring=['accuracy', 'f1'])
'''
 'test_accuracy': array([0.667794  , 0.66774293, 0.66774293]),
 'test_f1': array([0., 0., 0.])}
'''
Example #30
# Instantiate the classifiers
n_estimators = config["threads"]
n_jobs = int(n_estimators / 2 + 1)

svm_classifier = svm.SVC(kernel='poly',
                         degree=3,
                         gamma='scale',
                         verbose=True,
                         max_iter=1000,
                         cache_size=5000,
                         random_state=now,
                         probability=True)
xgb_classifier = XGBClassifier(n_estimators=100,
                               verbosity=2,
                               nthread=config["threads"],
                               max_depth=4,
                               subsample=0.5)
rf_classifier = RandomForestClassifier(n_estimators=100,
                                       verbose=2,
                                       n_jobs=config["threads"],
                                       random_state=now)
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 25),
                               max_iter=1000,
                               n_iter_no_change=50,
                               activation='relu',
                               solver='adam',
                               random_state=now,
                               verbose=True)
lda_classifier = LDA(solver='svd')