def kfold_cv(X_train, y_train,idx,k):
    """Fit and score an XGBClassifier on the first stratified fold only.

    ``k`` folds are built, but the loop exits after the first one, so exactly
    one train/validation split is fitted and evaluated.

    Args:
        X_train: 2-D feature array, indexed as ``X_train[rows, :]``.
        y_train: label array aligned with ``X_train``.
        idx: array of row identifiers aligned with ``X_train``.
        k: number of stratified folds to build.

    Returns:
        Tuple ``(ypred, yreal, idx)``: the second column of ``predict_proba``
        for the held-out fold, the true labels of that fold, and the entries
        of ``idx`` belonging to that fold.
    """
    # Legacy scikit-learn API: the splitter is built from the labels and
    # iterated directly.
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    for train_index, test_index in kf:
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        clf=XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)
        clf.fit(X_train_cv,(y_train_cv),eval_metric="logloss",eval_set=[(X_test_cv, y_test_cv)])
        y_pred=clf.predict_proba(X_test_cv).T[1]
        # Single-argument print calls give the same output on Python 2 and 3.
        print(y_pred.shape)
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print(xx[-1])
        # Deliberately stop after the first fold.
        break
    print("%s average: %s std %s" % (xx, np.mean(xx), np.std(xx)))
    return ypred,yreal,idx
# One-hot encode each categorical feature: the original column is dropped
# and replaced by its dummy columns.
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

# split df into test and training data
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# use xgboost XGBClassifier
xgb = XGBClassifier(max_depth=8, learning_rate=0.075, n_estimators=250,
                    objective='multi:softprob', subsample=0.75,
                    colsample_bytree=0.85, seed=13)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# select the 5 highest probability classes
ids = []  # list ids
cts = []  # list countries
for i, idx in enumerate(id_test):
    ids += [idx] * 5
    # Keep only the five best class indices before decoding them, instead of
    # decoding every class and slicing afterwards.
    top5 = np.argsort(y_pred[i])[::-1][:5]
    cts += le.inverse_transform(top5).tolist()

# generate output 'pysub.csv'
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('/Users/ianmurray/Documents/kaggle/airbnb/output/pysub.csv',index=False)
def xgboostcv(max_depth, learning_rate, n_estimators, subsample, colsample_bytree, gamma, min_child_weight, silent=True, nthread=-1, seed=1234):
    """Train an XGBClassifier with the given hyper-parameters and score it.

    The model is fitted on ``x0``/``y0`` and validated on ``x1``/``y1``; all
    four names are taken from the enclosing scope. ``max_depth`` and
    ``n_estimators`` are truncated to ``int`` before use.

    Returns:
        The negated log loss on ``(x1, y1)``, so that larger is better.
    """
    settings = dict(
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        silent=silent,
        nthread=nthread,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        min_child_weight=min_child_weight,
        seed=seed,
        objective="binary:logistic",
    )
    clf = XGBClassifier(**settings)
    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)],
            early_stopping_rounds=25)
    validation_proba = clf.predict_proba(x1)
    return -log_loss(y1, validation_proba)
def _encode_id_columns(frame):
    """Encode ``countrycode``, ``browserid`` and ``devid`` of ``frame`` in place."""
    frame['countrycode'] = frame['countrycode'].apply(lambda x: ord(x))
    # Missing values are encoded as the placeholder strings before myfunc.
    frame['browserid'] = frame['browserid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    frame['devid'] = frame['devid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))


def main():
    """Train an XGBClassifier on the click data and write test probabilities.

    Reads the train/test CSV files, encodes the id-like columns, reports the
    accuracy of a model on a 33% hold-out split, then fits on all training
    rows and writes the predicted click probability per test ``ID`` to
    ``predictions.csv``.
    """
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)

    _encode_id_columns(training_data)
    _encode_id_columns(prediction_data)

    features=['siteid','offerid','category','merchant','countrycode','browserid','devid']
    target="click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y= training_data[target]
    ids = prediction_data["ID"]

    seed =7
    test_size=0.33
    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=test_size,random_state=seed)

    print("Training...")
    # The hold-out score must come from a model that never saw X_test;
    # scoring the full-data model on a slice of its own training rows only
    # reports training accuracy.
    eval_model = XGBClassifier()
    eval_model.fit(X_train, y_train)
    y_pred=eval_model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)

    # The model used for the submission is trained on all of the training data.
    model = XGBClassifier()
    model.fit(X, Y)

    print("Predicting...")
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability':results})
    joined = pd.DataFrame(ids).join(results_df)

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
def train_model_xgb(train_x, train_y, xgb_features):
    """Fit an early-stopped XGBClassifier on a stratified 90/10 split.

    Args:
        train_x: pandas DataFrame of features.
        train_y: pandas Series of labels aligned with ``train_x``.
        xgb_features: mapping with the keys ``max_depth``, ``learning_rate``,
            ``n_estimators``, ``subsample``, ``colsample_bytree`` and
            ``min_child_weight``.

    Returns:
        Tuple ``(model, predictions)`` where ``predictions`` is a Series named
        ``PredictedProb`` holding the second ``predict_proba`` column for the
        training part of the split.
    """
    # Legacy scikit-learn API: the splitter is built from the labels and
    # iterated directly.
    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.1)
    # Only the last generated split is used, so just advance to it.
    for train_index, test_index in train_ind:
        pass

    # The splitter yields positions, so index positionally. `.ix` mixed label
    # and positional lookup and has been removed from pandas.
    x_train = train_x.iloc[train_index, :]
    y_train = train_y.iloc[train_index]
    x_eval = train_x.iloc[test_index, :]
    y_eval = train_y.iloc[test_index]

    #Classifier
    model = XGBClassifier(max_depth=xgb_features['max_depth'],
                          learning_rate=xgb_features['learning_rate'],
                          n_estimators=int(xgb_features['n_estimators']),
                          objective='binary:logistic',
                          subsample=xgb_features['subsample'],
                          colsample_bytree=xgb_features['colsample_bytree'],
                          min_child_weight=xgb_features['min_child_weight'])
    # gives 0.458
    model = model.fit(x_train, y_train, verbose=True, eval_metric='logloss',
                      eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    # best_iteration is 0-based, so passing it as ntree_limit drops the best
    # tree (and 0 means "use all trees"); best_ntree_limit is the matching
    # tree count.
    predictions = pd.Series(
        model.predict_proba(x_train, ntree_limit=model.best_ntree_limit)[:, 1],
        name='PredictedProb')
    return model, predictions
def cv_BDT(input, output, params, show, channel, selection, names):
    """Run 5-fold stratified cross-validation of an XGBClassifier.

    For every fold, accuracy and ROC AUC are printed for both the held-out
    and the training part; the mean and standard deviation over folds are
    printed at the end.

    Args:
        input: feature array indexable by an array of row positions.
        output: label array aligned with ``input``.
        params: keyword arguments forwarded to ``XGBClassifier``.
        show: when truthy, plot separation, ROC and feature importance for
            the model of the last fold.
        channel: used to build the plot name.
        selection: used to build the plot name.
        names: feature names assigned to the booster before plotting.
    """
    def _evaluate(fitted, features, labels):
        # Returns (accuracy, auc) of `fitted` on the given sample.
        y_prob = fitted.predict_proba(features)
        y_pred = fitted.predict(features)
        prediction = [round(value) for value in y_pred]
        auc = roc_auc_score(labels, y_prob[:, 1])
        accuracy = accuracy_score(labels, prediction)
        return accuracy, auc

    cvscores = []
    AUC = []
    cvscores_train = []
    AUC_train = []
    # Keyword arguments: recent scikit-learn releases reject positional
    # shuffle/random_state.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3456)
    for train, test in kfold.split(input, output):
        model = XGBClassifier(**params)
        X_train, X_test, y_train, y_test = (
            input[train],
            input[test],
            output[train],
            output[test],
        )
        model.fit(X_train, y_train)

        accuracy, auc = _evaluate(model, X_test, y_test)
        print("Accuracy: %.2f%%; AUC = %.4f%%" % (accuracy * 100, auc))
        cvscores.append(accuracy * 100)
        AUC.append(auc)

        accuracy, auc = _evaluate(model, X_train, y_train)
        print("Accuracy train: %.2f%%; AUC = %.4f%%" % (accuracy * 100, auc))
        cvscores_train.append(accuracy * 100)
        AUC_train.append(auc)

    print("Accuracy test = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" % (np.mean(cvscores), np.std(cvscores), np.mean(AUC), np.std(AUC)))
    print("Accuracy train = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" % (
        np.mean(cvscores_train),
        np.std(cvscores_train),
        np.mean(AUC_train),
        np.std(AUC_train),
    ))

    if show:
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        # NOTE(review): the path is only printed here; nothing in this
        # function writes the model to it.
        print("Save to %s" % modelname)
        # The plots use the model and held-out sample of the last fold.
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster(), max_num_features=15, importance_type="gain")
        plt.subplots_adjust(left=0.3)
        plt.show()
# Train on the current training split and monitor AUC on both the training
# and the validation split.
bst = XGBClassifier(objective = 'binary:logistic', max_depth = 4, learning_rate= 0.01, subsample=0.8, colsample_bytree=0.4, n_estimators=1650, min_child_weight=1, silent=False)
bst.fit(trainingSet[feature_names], np.array(trainingSet['TARGET']), eval_metric='auc',
        eval_set=[(trainingSet[feature_names], trainingSet['TARGET']),
                  (validationSet[feature_names], validationSet['TARGET'])],
        verbose=100)

# Collect the validation predictions keyed by ID.
preds = bst.predict_proba(validationSet[feature_names])[:, 1]
tmp = pd.DataFrame({'ID': validationSet['ID'], pred_name: preds})
# DataFrame.append has been removed from pandas; pd.concat is the
# equivalent that works on old and new versions alike.
eval_matrix = pd.concat([eval_matrix, tmp], ignore_index=True)

# Release the per-split objects before the next allocation.
del trainingSet, validationSet, bst, val_ids, idx
gc.collect()

bst = XGBClassifier(objective = 'binary:logistic', max_depth = 4, learning_rate= 0.01, subsample=0.8, colsample_bytree=0.4, n_estimators=1650, min_child_weight=1, silent=False, nthread=-1)
# Target, feature matrix (every column except 'Exited') and the polynomial
# feature matrix shared by all fits and reports below.
y = df_train.Exited
X = df_train.loc[:, df_train.columns != 'Exited']
X_pol2 = df_train_pol2

# Fit Random Forest classifier
RF = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',max_depth=8, max_features=6, max_leaf_nodes=None,min_impurity_decrease=0.0,
                            min_impurity_split=None,min_samples_leaf=1, min_samples_split=3,min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
                            oob_score=False, random_state=None, verbose=0,warm_start=False)
RF.fit(X, y)

#########################################################################################
# Fit Extreme Gradient Boost Classifier
XGB = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,colsample_bytree=1, gamma=0.01, learning_rate=0.1, max_delta_step=0,max_depth=7,
                    min_child_weight=5, missing=None, n_estimators=20,n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,reg_alpha=0,
                    reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1)
XGB.fit(X, y)

#########################################################################################
# Classification report on the training data for every fitted model, in the
# same order as before; log_pol2 is the only one using the polynomial features.
for fitted_model, model_features in (
        (log_primal, X),
        (log_pol2, X_pol2),
        (SVM_RBF, X),
        (SVM_POL, X),
        (RF, X),
        (XGB, X)):
    print(classification_report(y, fitted_model.predict(model_features)))

# AUC score and ROC curve points per model.
auc_log_primal, fpr_log_primal, tpr_log_primal = get_auc_scores(
    y, log_primal.predict(X), log_primal.predict_proba(X)[:,1])
auc_log_pol2, fpr_log_pol2, tpr_log_pol2 = get_auc_scores(
    y, log_pol2.predict(X_pol2), log_pol2.predict_proba(X_pol2)[:,1])
auc_SVM_RBF, fpr_SVM_RBF, tpr_SVM_RBF = get_auc_scores(
    y, SVM_RBF.predict(X), SVM_RBF.predict_proba(X)[:,1])
auc_SVM_POL, fpr_SVM_POL, tpr_SVM_POL = get_auc_scores(
    y, SVM_POL.predict(X), SVM_POL.predict_proba(X)[:,1])
auc_RF, fpr_RF, tpr_RF = get_auc_scores(
    y, RF.predict(X), RF.predict_proba(X)[:,1])
auc_XGB, fpr_XGB, tpr_XGB = get_auc_scores(
    y, XGB.predict(X), XGB.predict_proba(X)[:,1])
# Sweep the gamma parameter and record the weighted training ROC AUC, once
# from hard class predictions (A) and once from predicted probabilities (B).
gamma_loss = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
for n in gamma_loss:
    model = XGBClassifier(max_depth=6, n_estimators=459, min_child_weight=1, gamma=n, learning_rate=0.1, n_jobs=40)
    model.fit(X_train, y_train, sample_weight=w_train)

    # Single-argument print calls give the same output on Python 2 and 3.
    train_pred_A = model.predict(X_train)
    print('train_pred = %s' % (train_pred_A,))
    train_pred_proba_B = model.predict_proba(X_train)[:, 1]
    print('train_pred_proba = %s' % (train_pred_proba_B,))

    false_positive_rate_A, true_positive_rate_A, thresholds_A = roc_curve(
        y_train, train_pred_A, sample_weight=w_train)
    roc_auc_A = auc(false_positive_rate_A, true_positive_rate_A)
    false_positive_rate_B, true_positive_rate_B, thresholds_B = roc_curve(
        y_train, train_pred_proba_B, sample_weight=w_train)
    roc_auc_B = auc(false_positive_rate_B, true_positive_rate_B)

    print('Train results---------predict')
    train_results_A.append(roc_auc_A)
    print(n, train_results_A)
    print('train results--------predict_proba')
# Cross-validated Gini score; each fold's training part is split again so
# that early stopping has its own evaluation set.
scores = np.empty(n_splits)
for i_fold, (fold_train_index, fold_test_index) in enumerate(kf.split(X, y)):
    print("Training for " + early_stopping_string + " fold " + str(i_fold + 1) + "/" + str(n_splits))

    # Split train_index into train set and eval set for early stopping.
    fold_X = X.loc[fold_train_index, :]
    fold_y = y[fold_train_index]
    X_train, X_test, y_train, y_test = train_test_split(fold_X, fold_y, test_size=0.2, stratify=fold_y)

    # eval_set: A list of (X, y) pairs to use as a validation set for early stopping
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], **fit_params)

    if early_stopping:
        # Get best iteration based on eval_set.
        sorted_iteration_scores = np.argsort(
            model.evals_result()['validation_0']['auc'])
        best_round = sorted_iteration_scores[-1]

        # Evaluate on test_index.
        # best_round is a 0-based position in the metric history, while
        # ntree_limit is a tree count (and 0 means "use all trees"), so the
        # limit must be best_round + 1.
        proba = model.predict_proba(X.loc[fold_test_index, :],
                                    ntree_limit=int(best_round) + 1)[:, 1]
    else:
        proba = model.predict_proba(X.loc[fold_test_index, :])[:, 1]

    y_true = y[fold_test_index]
    scores[i_fold] = eval_gini(y_true, proba)

# Report error.
print('For ' + early_stopping_string + ', Gini score mean (standard deviation): ' + str(np.mean(scores)) + ' (' + str(np.sqrt(np.var(scores))) + ')')
eval_set=[(trainingSet[feature_names], np.array(trainingSet["TARGET"])), (validationSet[feature_names], np.array(validationSet["TARGET"]))], early_stopping_rounds=200,verbose=20) ll = gbm.best_score best_iter = gbm.best_iteration cv.append(ll) biter.append(best_iter) print "---auc : %0.6f\n" %ll print "---best_iter: %d\n" %best_iter gc.collect() gbm = XGBClassifier(max_depth=4, learning_rate = 0.01, n_estimators=370, subsample=0.8, colsample_bytree=0.5, objective="binary:logistic", silent = False, min_child_weight=5, nthread=-1) gbm.fit(train[feature_names], np.array(train["TARGET"]), eval_metric = "auc", eval_set = [(train[feature_names], np.array(train["TARGET"]))], verbose=20) tpreds = gbm.predict_proba(test[feature_names])[:, 1] df = pd.DataFrame({"ID" : test["ID"], "TARGET" : tpreds }) submission_name = "stacked_xgb_3.csv" df.to_csv(os.path.join(output_dir, submission_name), index = False)
X = train_df[main_cols]
y = train_df.target.astype(int)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

model = XGBClassifier()
model.fit(X_train, y_train)

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Optimize model parameters
# I run this code in google colab to make the execution much faster and use the best params in the next code
# The key must be spelled 'min_child_weight'; a misspelled key is not the
# real parameter, so the search would not actually tune it.
param_grid = {'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1],
              'subsample': [0.6, 0.8, 1.0],
              'max_depth': [3, 5]
              }
model = GridSearchCV(model, param_grid,n_jobs=-1,verbose=2,cv=5)
model.fit(X_train, y_train)
print(model.best_params_)

# Make predictions
y_pred = model.predict_proba(X_test)[:, 1]

# Check the auc score of the model
print(f'XGB AUC score on the X_test is: {roc_auc_score(y_test, y_pred)}\n')

# print classification report
#print(classification_report(y_test, [1 if x >= 0.5 else 0 for x in y_pred]))
def predict(self, X):
    """Return the second column of ``XGBClassifier.predict_proba`` for ``X``."""
    class_probabilities = XGBClassifier.predict_proba(self, X)
    return class_probabilities[:, 1]
def leave_one_trial_out(x, y, log, label_type, normalization, show_roc, num_fold=15, seed=0, verbose=True, select_top_k_feature=None):
    """Normalization and k-fold cross validation over shuffled trials

    All subjects' trials are flattened into one sample set, shuffled with
    ``seed`` and cut into ``num_fold`` contiguous test folds of equal size
    (``len(y) // num_fold``).

    Args:
        :param x: # people x # trials x (# channels x # features)
        :param y: # people x # trials
        :param log: (# channels x # features); not used by this function
        :param label_type: {'rating', 'thought', 'withhold'}
        :param normalization: normalize feature or not
        :param show_roc: show ROC curve or not
        :param num_fold: do num_fold cross validation
        :param seed: use to fix the training and testing set
        :param verbose: show the result of each fold or not
        :param select_top_k_feature: select top k feature from training set, if None: use all features

    Returns:
        Mean precision, mean recall and mean F1 score over the folds.
    """
    if normalization:
        x = normalize(x)

    x = x.reshape(x.shape[0] * x.shape[1], -1)
    y = y.reshape(y.shape[0] * y.shape[1])
    x, y = convert_to_binary_label_and_remove_threshold(x, y, label_type=label_type)
    x, y = shuffle(x, y, random_state=seed)

    clf = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        reg_alpha=1,
    )

    test_fold_len = len(y) // num_fold
    precision_list, recall_list, f1_list = list(), list(), list()
    tprs, aucs, mean_fpr = [], [], np.linspace(0, 1, 100)

    print(
        'Start {}-fold leave one trial out cross validation'.format(num_fold))
    for fold in range(num_fold):
        # x.shape: # data x # features, y.shape: # data
        # start and end idx of testing fold
        start_idx, end_idx = test_fold_len * fold, test_fold_len * (fold + 1)
        test_idx = np.arange(start_idx, end_idx, 1)
        x_train, x_test = np.delete(x, test_idx, axis=0), x[test_idx]
        y_train, y_test = np.delete(y, test_idx, axis=0), y[test_idx]

        if select_top_k_feature is not None:
            # Ranking is computed on the training part only.
            feature_ranking = rank_feature(x_train, y_train)
            x_train = x_train[:, feature_ranking[:select_top_k_feature]]
            x_test = x_test[:, feature_ranking[:select_top_k_feature]]

        # train and predict
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # plot roc curve of each fold
        probas_ = clf.predict_proba(
            x_test)  # shape: len x 2 (prob of neg, prob of pos)
        fpr, tpr, _ = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 alpha=0.3,
                 label='ROC fold {} (AUC={:.2f})'.format(fold, roc_auc))

        # be used to plot mean roc
        # np.interp is used instead of a bare `interp` name, which depends
        # on a deprecated SciPy alias of the same function.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0  # mean_tpr[0] = 0
        aucs.append(roc_auc)

        # confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred,
                                          labels=[0, 1]).ravel()
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)

        # used to calculate mean precision, recall, f1 score
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        if verbose:
            print(
                'Test on fold {}: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'
                .format(fold + 1, precision, recall, f1))

    print('---------------------------')
    print('Average: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'.format(
        np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)))

    # plot mean auc
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr,
             mean_tpr,
             color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' %
             (mean_auc, std_auc),
             lw=2,
             alpha=.8)

    # plot chance level roc
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             alpha=.8)  # plot chance level ROC
    plt.legend()
    if show_roc:
        plt.show()

    return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Raw string: '\m' and '\V' are not valid escape sequences, so the plain
# literal only worked by accident and warns on recent Python versions.
data = pd.read_csv(r'C:\mldata\V2.csv', header = None, encoding = "ISO-8859-1")
# Missing values in column 9 (the target) are replaced by 1.
data[9] = data[9].fillna(1)
y = data.loc[:,9].values
x = data.loc[:,0:7].values

test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)

# Four boosted models differing only in tree depth and seed.
model1 = XGBClassifier(n_estimators= 5000,max_depth=3,n_jobs=-1,seed = 1)
model1.fit(X_train, y_train)
# make predictions for test data
y_pred1 = model1.predict_proba(X_test)

model2 = XGBClassifier(n_estimators= 5000,max_depth=4, n_jobs=-1,seed = 2)
model2.fit(X_train, y_train)
# make predictions for test data
y_pred2 = model2.predict_proba(X_test)

model3 = XGBClassifier(n_estimators= 5000,max_depth=5, n_jobs=-1,seed = 3)
model3.fit(X_train, y_train)
# make predictions for test data
y_pred3 = model3.predict_proba(X_test)

model4 = XGBClassifier(n_estimators= 5000,max_depth=2 ,n_jobs=-1,seed = 4)
model4.fit(X_train, y_train)
def leave_one_subject_out(x, y, log, label_type):
    """Normalization and Leave one subject out cross validation

    Each subject in turn is held out as the test set while the classifier is
    trained on all other subjects. Per-subject precision/recall/F1 and the
    averages are printed, and the ROC curves are plotted and shown.

    Args:
        :param x: # people x # trials x (# channels x # features)
        :param y: # people x # trials
        :param log: (# channels x # features); not used by this function
        :param label_type: forwarded to
            convert_to_binary_label_and_remove_threshold
    """
    x = normalize(x)

    clf = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        reg_alpha=1,
    )

    precision_list, recall_list, f1_list = list(), list(), list()
    tprs, aucs, mean_fpr = [], [], np.linspace(0, 1, 100)

    for subject in range(len(x)):
        x_train, y_train = np.delete(x, subject, axis=0), np.delete(y, subject, axis=0)
        x_test, y_test = x[subject], y[subject]

        # reshape x and y, and convert label to binary and remove threshold
        x_train, y_train = convert_to_binary_label_and_remove_threshold(
            x_train.reshape(-1, x.shape[2]), y_train.reshape(-1), label_type)
        x_test, y_test = convert_to_binary_label_and_remove_threshold(
            x_test.reshape(-1, x.shape[2]), y_test.reshape(-1), label_type)

        # train and predict
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # plot roc curve of each fold (subject)
        probas_ = clf.predict_proba(
            x_test)  # shape: len x 2 (prob of neg, prob of pos)
        fpr, tpr, _ = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 alpha=0.3,
                 label='ROC fold {} (AUC={:.2f})'.format(subject, roc_auc))

        # be used to plot mean roc
        # np.interp is used instead of a bare `interp` name, which depends
        # on a deprecated SciPy alias of the same function.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0  # mean_tpr[0] = 0
        aucs.append(roc_auc)

        # confusion matrix
        # labels=[0, 1] forces a 2x2 matrix; without it a subject whose
        # trials and predictions contain a single class yields a 1x1 matrix
        # and the four-way unpacking fails.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred,
                                          labels=[0, 1]).ravel()
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)

        # used to calculate mean precision, recall, f1 score
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        print(
            'Test on subject {}: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'
            .format(subject + 1, precision, recall, f1))

    print('---------------------------')
    print('Average: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'.format(
        np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)))

    # plot mean auc
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr,
             mean_tpr,
             color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' %
             (mean_auc, std_auc),
             lw=2,
             alpha=.8)

    # plot chance level roc
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             alpha=.8)  # plot chance level ROC
    plt.show()
def cv_depth_weight(self):
    """
    Runs cross-validation by grid-searching through depth and child_weight
    values.

    For every (depth, child_weight) pair a model is trained for each
    configured test window on all rows before that window, and the
    predictions of all windows are scored together with a weighted log
    loss. When the score improves on ``self.best_cv_score``, the optimal
    parameters and the predictions are stored on the instance.
    """
    from xgboost import XGBClassifier

    for depth in self.depth_range:
        for child_weight in self.child_weight_range:
            # Per-window results are collected in lists and concatenated
            # once; DataFrame.append / Series.append have been removed
            # from pandas.
            window_probs = []
            window_targets = []
            dates = []
            self.log_loss_weights = []

            for test_name in range(1, self.test_name + 1):
                self.cv_start = self.cv_params[test_name]['cv_start']
                self.cv_end = self.cv_params[test_name]['cv_end']
                self.get_cv_indices()

                # Train on every row before the first index of the window.
                training_x = self.full_df.loc[:(self.cv_indices[0] - 1),
                                              self.feature_names]
                self.training_y = self.full_df.loc[:(self.cv_indices[0] - 1),
                                                   self.output_name]

                # The scaler is fitted on the training rows only and then
                # applied to the test rows.
                scaler = StandardScaler()
                scaler.fit(training_x)
                training_x_scaled = scaler.transform(training_x)
                testing_x = self.full_df[self.feature_names].loc[
                    self.cv_indices]
                testing_x_scaled = scaler.transform(testing_x)

                model = XGBClassifier(max_depth=depth,
                                      min_child_weight=child_weight,
                                      gamma=0,
                                      learning_rate=0.1,
                                      n_estimators=100,
                                      reg_lambda=0.01,
                                      reg_alpha=0,
                                      subsample=1,
                                      colsample_bytree=1,
                                      objective='binary:logistic',
                                      booster='gbtree',
                                      silent=True,
                                      random_state=123)
                model.fit(X=training_x_scaled, y=self.training_y)

                self.testing_y = self.full_df[self.output_name].loc[
                    self.cv_indices]
                self.calculate_log_loss_weights()

                window_probs.append(
                    pd.DataFrame(model.predict_proba(testing_x_scaled)))
                window_targets.append(self.testing_y)
                dates.extend(self.full_df['Dates'].loc[self.cv_indices])

            all_predicted_probs = pd.concat(window_probs, ignore_index=True)
            all_testing_y = pd.concat(window_targets)

            log_loss_score = log_loss(y_true=all_testing_y,
                                      y_pred=all_predicted_probs,
                                      sample_weight=self.log_loss_weights)

            if log_loss_score < self.best_cv_score:
                self.best_cv_score = log_loss_score
                self.optimal_depth = depth
                self.optimal_child_weight = child_weight
                self.xgboost_cv_predictions['Dates'] = dates
                self.xgboost_cv_predictions[
                    'True'] = all_testing_y.to_list()
                self.xgboost_cv_predictions[
                    'Predicted'] = all_predicted_probs[1].to_list()
def run_xgboost_prediction(self):
    """
    Performs prediction on the hold-out sample.

    Trains on every row before the first prediction index using the
    optimal parameters from ``self.xgboost_optimal_params``, then stores
    the weighted log loss, the predictions and the feature importances on
    the instance.
    """
    from xgboost import XGBClassifier

    self.optimal_depth = self.xgboost_optimal_params['Depth']
    self.optimal_child_weight = self.xgboost_optimal_params[
        'Min Child Weight']
    self.optimal_lambda = self.xgboost_optimal_params['Lambda']

    dates = []
    self.log_loss_weights = []

    training_x = self.full_df.loc[:(self.pred_indices[0] - 1),
                                  self.feature_names]
    self.training_y = self.full_df.loc[:(self.pred_indices[0] - 1),
                                       self.output_name]

    # The scaler is fitted on the training rows only and then applied to
    # the test rows.
    scaler = StandardScaler()
    scaler.fit(training_x)
    training_x_scaled = scaler.transform(training_x)

    model = XGBClassifier(max_depth=self.optimal_depth,
                          min_child_weight=self.optimal_child_weight,
                          gamma=0,
                          learning_rate=0.1,
                          n_estimators=100,
                          reg_lambda=self.optimal_lambda,
                          reg_alpha=0,
                          subsample=1,
                          colsample_bytree=1,
                          objective='binary:logistic',
                          booster='gbtree',
                          silent=True,
                          random_state=123)
    model.fit(X=training_x_scaled, y=self.training_y)

    self.importances = pd.DataFrame(model.feature_importances_).T
    self.importances.rename(columns=self.feature_dict, inplace=True)

    testing_x = self.full_df[self.feature_names].loc[self.pred_indices]
    testing_x_scaled = scaler.transform(testing_x)
    self.testing_y = self.full_df[self.output_name].loc[self.pred_indices]
    self.calculate_log_loss_weights()

    # There is a single hold-out window, so the prediction frame and the
    # target series are used directly; the removed pandas `append` calls
    # only copied them into empty containers.
    all_predicted_probs = pd.DataFrame(model.predict_proba(testing_x_scaled))
    all_testing_y = self.testing_y
    dates.extend(self.full_df['Dates'].loc[self.pred_indices])

    self.xgboost_pred_error = log_loss(y_true=all_testing_y,
                                       y_pred=all_predicted_probs,
                                       sample_weight=self.log_loss_weights)
    self.xgboost_predictions['Dates'] = dates
    self.xgboost_predictions['True'] = all_testing_y.to_list()
    self.xgboost_predictions['Predicted'] = all_predicted_probs[1].to_list()
    self.metadata['Importances'] = self.importances.to_dict()

#MIT License
#
#Copyright (c) 2019 Terrence Zhang
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
bst = XGBClassifier(max_depth=8, learning_rate = 0.01, n_estimators=2100, subsample=0.9, colsample_bytree=0.45, objective="binary:logistic", silent = False, min_child_weight=1, nthread=-1) bst.fit(X_train, y_train, eval_metric= "logloss", eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=200) preds = bst.predict_proba(X_valid)[:, 1] ll = log_loss(validationSet["target"], preds) df = pd.DataFrame({"ID" : validationSet["ID"], pred_name : preds}) eval_matrix = eval_matrix.append(df, ignore_index = True) print "fold : {} | logloss: {}".format(i+1, ll) del trainingSet, validationSet, bst, preds, ll, X_train, X_valid, y_train, y_valid gc.collect() X_train = train[feature_names].copy() y_train = np.array(train["target"].copy()) bst = XGBClassifier(max_depth=8, learning_rate = 0.01, n_estimators=2100, subsample=0.9, colsample_bytree=0.45, objective="binary:logistic",
print confusion_matrix(real, pred) from sklearn.metrics import cohen_kappa_score kappa = cohen_kappa_score(real, pred) fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1) roc_auc = metrics.auc(fpr, tpr) print 'Accuracy = ', float(cm[0][0] + cm[1][1]) / len(real) print 'kappa score = ', kappa print 'AUC Score = ', metrics.auc(fpr, tpr) print 'recall = ', tpr[1] print 'precision = ', float(cm[1][1]) / (cm[1][1] + cm[0][1]) #Getting the probability scores predictions = model.predict_proba(X_test) print predictions addresses = housenum + ' ' + address #Addresses with fire and risk score risk = [] for row in predictions: risk.append(row[1]) cols = { "Address": addresses, "Fire": pred, "RiskScore": risk, "state_desc": state_desc, "school_desc": school_desc,
# Hyper-parameters for the boosted model (learning_rate is defined outside
# this snippet).
n_estimators = 600
max_depth = 6
subsample = 0.9
colsample_bytree = 0.85
min_child_weight = 1 # default
eval_metrics = ['auc']
# Both the training and the test split are monitored during fitting.
eval_sets = [(X_train, y_train), (X_test, y_test)]
xgb = XGBClassifier(seed=0, learning_rate=learning_rate, n_estimators=n_estimators, min_child_weight=min_child_weight, max_depth=max_depth, colsample_bytree=colsample_bytree, subsample=subsample)
print("Fitting the model")
xgb = xgb.fit(X_train, y_train, eval_metric=eval_metrics, eval_set=eval_sets, verbose=False)
print("Predicting Probabilities")
# Last predict_proba column is stored as the model's score.
probs['xgb'] = xgb.predict_proba(X_test)[:, -1]
print("Computing AUC")
# One AUC history per eval set; after the transpose, rows are boosting
# rounds and columns follow the order of eval_sets (train, test).
auc_test = [xgb.evals_result_['validation_%d' % i]['auc'] for i in range(len(eval_sets))]
auc_test = np.array(auc_test, dtype=float).T
# Row index of the highest AUC for each eval set.
auc_best_round = np.argmax(auc_test, axis=0)
auc_best = [auc_test[auc_best_round[0], 0], auc_test[auc_best_round[1], 1]]
print('Best AUC train=%f (round=%d), test=%f (round=%d)' % (auc_best[0], auc_best_round[0], auc_best[1], auc_best_round[1]))
print('Validation')
test_probs = pd.DataFrame()
test_probs['xgb_valid'] = xgb.predict_proba(df_test)[:,-1]
print(test_probs['xgb_valid'].head())
# NOTE(review): the whole single-column frame is passed as the score
# argument rather than the 'xgb_valid' column - confirm this is intended.
fpr, tpr, thresholds = metrics.roc_curve(df_test_target, test_probs, pos_label=1)
subsample=0.4, subsample_freq=1, colsample_bytree=0.4, random_state=2019, num_leaves=10, min_child_samples=20, max_depth=3) clf_xgb.fit(train_X, train_y, \ eval_set=[(train_X, train_y), (val_X, val_y)], \ early_stopping_rounds=10) joblib.dump(clf, 'treemodel/xgb.model') # predict print('predict...') test_pred_prob_1 = clf.predict_proba(test_X, num_iteration=clf.best_iteration_) test_pred_prob_2 = clf_xgb.predict_proba(test_X) test_pred_prob = (test_pred_prob_1 + test_pred_prob_2) / 2 sub = pd.read_csv(path_data + file_test, parse_dates=['due_date']) prob_cols = ['prob_{}'.format(i) for i in range(33)] for i, f in enumerate(prob_cols): sub[f] = test_pred_prob[:, i] sub_example = pd.read_csv('../result/submission_sample.csv', parse_dates=['repay_date']) sub_example = pd.merge(sub_example, sub, how='left', on='listing_id') sub_example['days'] = (sub_example['due_date'] - sub_example['repay_date']).dt.days test_prob = sub_example[prob_cols].values test_labels = sub_example['days'].values test_prob = [test_prob[i][test_labels[i]] for i in range(test_prob.shape[0])] sub_example['repay_amt'] = sub_example['due_amt'] * test_prob sub_example[['listing_id', 'repay_date', 'repay_amt']].to_csv('sub.csv',
print(roc_auc_score(y, preds)) # pick the best threshold out-of-fold thresholds = np.linspace(0.01, 0.99, 50) mcc = np.array([matthews_corrcoef(y, preds>thr) for thr in thresholds]) best_threshold = thresholds[mcc.argmax()] print(mcc.max()) # load test data test_X = np.concatenate([ pd.read_csv("../Data/test_date.csv", index_col=0, dtype=np.float32, usecols=np.concatenate([[0], important_indices[important_indices<1156]+1])).values, pd.read_csv("../Data/test_numeric.csv", index_col=0, dtype=np.float32, usecols=np.concatenate([[0], important_indices[important_indices>=1156] +1 - 1156])).values ], axis=1) # generate predictions at the chosen threshold preds = (clf.predict_proba(test_X)[:,1] > best_threshold).astype(np.int8) # and submit location = '../Submission/' import time timestr = time.strftime("%Y%m%d-%H%M%S") filename = location + 'sub-' + timestr + '.csv.gz' sub = pd.read_csv("../Data/sample_submission.csv", index_col=0) sub["Response"] = preds sub.to_csv(filename, compression="gzip")
# Load the raw train / test tables.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Per-column count of missing values (only visible in an interactive session).
train.isnull().sum()

# Impute missing values with a 3-nearest-neighbour imputer.
# NOTE(review): the code below indexes the results by column name, so
# ``complete`` is assumed to return a DataFrame that keeps the string
# columns -- confirm against the KNN implementation in use.
train = KNN(k=3).complete(train)
test = KNN(k=3).complete(test)

# Integer-encode the categorical columns.  The encoder is fitted on the union
# of train and test values so that a category receives the SAME code in both
# tables; fitting separately on each table (as before) silently assigned
# different integers to the same category.
cat = ['genre', 'certificate', 'distributor']
for col in cat:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Drop identifiers and the target from the feature matrices.
non_feature_cols = ['year', 'oscar', 'movie_name', 'actor_name', 'href']
train_X = train.drop(non_feature_cols, axis=1)
test_X = test.drop(non_feature_cols, axis=1)
train_Y = train['oscar']

# Fit a default XGBoost classifier on the training features.
model = XGBClassifier()
model.fit(train_X, train_Y)

# Positive-class probability for every test row, indexed by movie name.
pred_xgb = model.predict_proba(test_X)[:, 1]
xgb_prediction = pd.DataFrame(pred_xgb, test['movie_name'])
class XGBClassifierCV(object):
    """Cross-validated ``XGBClassifier`` that produces out-of-fold predictions.

    After :meth:`fit`:

    * ``oof_train`` holds, for every training row, the positive-class
      probability predicted by the model(s) that did not train on that row
      (averaged over repeats when ``n_repeats`` is given);
    * ``oof_test`` holds the mean positive-class probability over all folds
      for every test row;
    * ``oof_test_rank`` holds the mean per-fold rank of every test row,
      divided by the number of test rows.
    """

    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        """
        :param params: optional dict forwarded to ``XGBClassifier.set_params``
        :param cv: number of stratified folds
        :param random_state: seed for the fold splitter
        :param n_repeats: if truthy, repeat the stratified K-fold this many
            times (``RepeatedStratifiedKFold``)
        """
        self.clf = XGBClassifier()
        if params:
            self.clf.set_params(**params)

        # Keyword arguments are required here: the second positional argument
        # of RepeatedStratifiedKFold is ``n_repeats`` (not ``shuffle``), so the
        # former positional ``True`` silently meant "one repeat".
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=random_state)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(n_splits=cv,
                                       shuffle=True,
                                       random_state=random_state)
            self._num_preds = cv
        # How many times each training row lands in a validation fold.
        self._n_repeats = n_repeats or 1

    def fit(self,
            X,
            y,
            X_test=None,
            feval=roc_auc_score,
            sample_weight=None,
            eval_metric='auc',
            early_stopping_rounds=100,
            verbose=100,
            xgb_model=None,
            sample_weight_eval_set=None,
            callbacks=None):
        """Train one model per fold and collect out-of-fold predictions.

        ``X``, ``y`` and ``X_test`` must support integer-array indexing
        (e.g. numpy arrays).

        :param X_test: optional test features; when omitted the first row of
            ``X`` is used as a placeholder
        :param feval: ``feval(y, oof_train)`` scoring callable, or a falsy
            value to skip scoring
        :param sample_weight: optional per-row weights aligned with ``X``;
            each fold receives only the weights of its training rows
        :return: the ``feval`` score of the out-of-fold training predictions,
            or ``None`` when ``feval`` is falsy

        All remaining arguments are forwarded to ``XGBClassifier.fit``.
        """
        if X_test is None:
            X_test = X[:1]

        n_repeats = getattr(self, '_n_repeats', 1)
        weights = None if sample_weight is None else np.asarray(sample_weight)

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))
        for n_fold, (train_index, valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            self.clf.fit(
                X_train,
                y_train,
                # weights must match the rows of this fold, not the whole set
                sample_weight=None if weights is None else weights[train_index],
                eval_set=eval_set,
                eval_metric=eval_metric,
                early_stopping_rounds=early_stopping_rounds,
                verbose=verbose,
                xgb_model=xgb_model,
                sample_weight_eval_set=sample_weight_eval_set,
                callbacks=callbacks)

            # With repeats every row is validated ``n_repeats`` times, so the
            # contributions are averaged rather than overwritten.
            self.oof_train[valid_index] += (
                self.clf.predict_proba(X_valid)[:, 1] / n_repeats)
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]

        # Test-set out-of-fold outputs: mean rank (scaled by the number of
        # test rows) and mean probability across folds.
        self.oof_test_rank = pd.DataFrame(self.oof_test).rank().mean(1) / len(
            self.oof_test)
        self.oof_test = self.oof_test.mean(1)

        # Score of the out-of-fold predictions on the training set.
        if feval:
            oof_train_score = feval(y, self.oof_train)
            print(
                f"\n\033[94mCV Score: {oof_train_score} ended at {time.ctime()}\033[0m"
            )
            return oof_train_score

    def oof_save(self, file='./oof_train_and_test.csv'):
        """Write ``oof_train`` followed by ``oof_test`` as one CSV column."""
        assert isinstance(file, str)
        stacked = np.append(self.oof_train, self.oof_test)
        # ``columns`` must be list-like; a bare string raises TypeError.
        pd.DataFrame(stacked, columns=['oof_train_and_test']).to_csv(
            file, index=False)
def learn(title,
          data_loader,
          allow_printing,
          calculate_rhos,
          SVM,
          XGBOOST,
          NN,
          cross_validation,
          create_coeff_plots,
          check_all_parameters,
          svm_parameters,
          xgb_parameters,
          create_pca_plots,
          test_size,
          edge_percent,
          BINARY=True):
    """Run the requested learning pipelines for one task.

    All outputs are written below a folder named ``title`` (created in the
    current working directory).  The working directory is changed while the
    function runs and is restored to its starting value on exit, including
    when an exception is raised.

    :param title: task title; used as output folder name and passed to the
        data loader and plotting helpers
    :param data_loader: object providing ``get_learning_data``,
        ``get_id_to_features_map``, ``get_weights`` and
        ``get_confusin_matrix_names``
    :param allow_printing: currently unused (the check that guarded
        ``print_confusion_matrix`` is commented out)
    :param calculate_rhos: draw the rho calculation figure
    :param SVM: run the SVM pipeline
    :param XGBOOST: run the XGBoost pipeline
    :param NN: run the neural-network pipeline
    :param cross_validation: number of random train/test splits per classifier
    :param create_coeff_plots: compute and plot SVM coefficient averages
    :param check_all_parameters: evaluate a built-in parameter grid instead of
        the single configuration in ``svm_parameters`` / ``xgb_parameters``
    :param svm_parameters: dict with keys ``kernel``, ``C``, ``gamma``
    :param xgb_parameters: dict with keys ``max_depth``, ``learning_rate``,
        ``n_estimators``, ``objective``, ``gamma``, ``min_child_weight``
    :param create_pca_plots: run the PCA t-test and 2D/3D plots
    :param test_size: test fraction handed to ``train_test_split``
    :param edge_percent: forwarded to ``plot_bacteria_coeff_average``
    :param BINARY: whether the task is binary; selects ROC-AUC vs. multi-class
        handling and the sort key of the result tables
    """
    start_dir = os.path.abspath(os.path.curdir)
    task_dir = os.path.join(start_dir, title)
    _enter_dir(task_dir)
    try:
        print("learning..." + title)
        ids, tag_map, task_name = data_loader.get_learning_data(title)
        id_to_features_map = data_loader.get_id_to_features_map
        # keep only the samples that have a feature vector
        known_ids = [
            sample_id for sample_id in ids
            if sample_id in id_to_features_map.keys()
        ]
        X = [id_to_features_map[sample_id] for sample_id in known_ids]
        y = [tag_map[sample_id] for sample_id in known_ids]

        if calculate_rhos:
            print("calculating rho")
            draw_rhos_calculation_figure(tag_map,
                                         data_loader.get_preproccessed_data,
                                         title,
                                         data_loader._taxnomy_level,
                                         ids_list=ids,
                                         save_folder="rhos")

        if create_pca_plots:
            PCA_t_test(group_1=[x for x, tag in zip(X, y) if tag == 0],
                       group_2=[x for x, tag in zip(X, y) if tag == 1],
                       title="T test for PCA dimentions on " + task_name,
                       save=True,
                       folder="PCA")
            plot_data_3d(X, y, data_name=task_name.capitalize(), save=True,
                         folder="PCA")
            plot_data_2d(X, y, data_name=task_name.capitalize(), save=True,
                         folder="PCA")

        if SVM:
            _enter_dir(os.path.join(task_dir, "SVM"))
            _learn_svm(X, y, data_loader, title, task_name, cross_validation,
                       create_coeff_plots, check_all_parameters,
                       svm_parameters, test_size, edge_percent, BINARY)

        if XGBOOST:
            _enter_dir(os.path.join(task_dir, "XGBOOST"))
            _learn_xgboost(X, y, data_loader, title, task_name,
                           cross_validation, check_all_parameters,
                           xgb_parameters, test_size, BINARY)

        if NN:
            # the NN pipeline runs from the task folder itself
            os.chdir(task_dir)
            _learn_nn(X, y, title)
    finally:
        # Always hand the caller back the directory it started in; the former
        # relative ``chdir("../..")`` ended one level too high or too low
        # depending on which pipelines had run.
        os.chdir(start_dir)


def _enter_dir(path):
    """Create ``path`` if needed and make it the working directory."""
    os.makedirs(path, exist_ok=True)
    os.chdir(path)


def _random_splits(X, y, repetitions, **split_kwargs):
    """Draw ``repetitions`` shuffled train/test splits; return four lists."""
    X_trains, X_tests, y_trains, y_tests = [], [], [], []
    for _ in range(repetitions):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, shuffle=True, **split_kwargs)
        X_trains.append(X_train)
        X_tests.append(X_test)
        y_trains.append(y_train)
        y_tests.append(y_test)
    return X_trains, X_tests, y_trains, y_tests


def _learn_svm(X, y, data_loader, title, task_name, cross_validation,
               create_coeff_plots, check_all_parameters, svm_parameters,
               test_size, edge_percent, BINARY):
    """Evaluate SVM classifier(s) in the current directory and log results."""
    print("SVM...")
    # every classifier appends its results to one shared file
    svm_results_file = Path("all_svm_results.csv")
    if not svm_results_file.exists():
        pd.DataFrame(columns=[
            'KERNEL', 'GAMMA', 'C', 'TRAIN-AUC', 'TRAIN-ACC', 'TEST-AUC',
            'TEST-ACC'
        ]).to_csv(svm_results_file, index=False)

    if check_all_parameters:
        svm_tuned_parameters = {
            # other kernels ('rbf', 'poly', 'sigmoid') are not searched
            'kernel': ['linear'],
            'gamma': ['auto', 'scale'],
            'C': [0.01, 0.1, 1, 10, 100, 1000]
        }
        weights = data_loader.get_weights()
        optional_classifiers = [
            svm.SVC(kernel=kernel, C=C, gamma=gamma, class_weight=weights)
            for kernel in svm_tuned_parameters['kernel']
            for gamma in svm_tuned_parameters['gamma']
            for C in svm_tuned_parameters['C']
        ]
    else:
        optional_classifiers = [
            svm.SVC(kernel=svm_parameters['kernel'],
                    C=svm_parameters['C'],
                    gamma=svm_parameters['gamma'],
                    class_weight='balanced')
        ]

    for clf in optional_classifiers:
        all_svm_results = pd.read_csv(svm_results_file)
        # gamma may be a string ('auto'/'scale') or a number, hence str()
        clf_folder_name = ("k=" + clf.kernel + "_c=" + str(clf.C) + "_g=" +
                           str(clf.gamma))
        if not os.path.exists(clf_folder_name):
            os.makedirs(clf_folder_name)

        X_trains, X_tests, y_trains, y_tests = _random_splits(
            X, y, cross_validation, test_size=test_size)

        svm_coefs, bacteria_coeff_average, bacterias = [], [], None
        svm_y_test_from_all_iter = np.array([])
        svm_y_score_from_all_iter = np.array([])
        svm_y_pred_from_all_iter = np.array([])
        train_accuracies, test_accuracies, confusion_matrixes = [], [], []
        y_train_preds, y_train_scores, y_test_preds = [], [], []

        for iter_num in range(cross_validation):
            print('------------------------------\niteration number ' +
                  str(iter_num))
            clf.fit(X_trains[iter_num], y_trains[iter_num])

            # decision_function scores feed the ROC curves; predict() feeds
            # accuracy and the confusion matrix
            y_score = clf.decision_function(X_tests[iter_num])
            y_pred = clf.predict(X_tests[iter_num])
            y_test_preds.append(y_pred)
            train_pred = clf.predict(X_trains[iter_num])
            y_train_preds.append(train_pred)
            y_train_scores.append(clf.decision_function(X_trains[iter_num]))

            train_accuracies.append(
                accuracy_score(y_trains[iter_num], train_pred))
            test_accuracies.append(accuracy_score(y_tests[iter_num], y_pred))
            confusion_matrixes.append(
                confusion_matrix(y_tests[iter_num], y_pred))

            if BINARY:
                roc_auc(y_tests[iter_num],
                        y_pred,
                        visualize=False,
                        graph_title='SVM\n' + str(iter_num),
                        save=True,
                        folder=task_name)

            # accumulate labels / predictions / scores over all iterations
            svm_y_test_from_all_iter = np.append(svm_y_test_from_all_iter,
                                                 y_tests[iter_num])
            svm_y_pred_from_all_iter = np.append(svm_y_pred_from_all_iter,
                                                 list(y_pred))
            if svm_y_score_from_all_iter.size > 0:
                svm_y_score_from_all_iter = np.concatenate(
                    (svm_y_score_from_all_iter, y_score), axis=0)
            else:
                svm_y_score_from_all_iter = y_score

            if create_coeff_plots:
                svm_coefs, bacterias, _, bacteria_coeff_average = \
                    svm_calc_bacteria_coeff_average(data_loader, clf,
                                                    svm_coefs,
                                                    bacteria_coeff_average)

        _, _, _, _, train_auc, test_auc, train_rho, test_rho = \
            calc_auc_on_joined_results(cross_validation, y_trains,
                                       y_train_preds, y_tests, y_test_preds)

        print("\n------------------------------")
        names = data_loader.get_confusin_matrix_names()
        # NOTE(review): the XGBoost pipeline calls edit_confusion_matrix with
        # two extra leading arguments (title, data_loader); one of the two
        # call sites presumably does not match the helper's signature.
        confusion_matrix_average, confusion_matrix_acc = edit_confusion_matrix(
            confusion_matrixes, "SVM", names, BINARY=BINARY)

        if BINARY:
            _, _, _, svm_roc_auc = roc_auc(
                svm_y_test_from_all_iter.astype(int),
                svm_y_score_from_all_iter,
                visualize=True,
                graph_title='SVM\n' + task_name.capitalize() +
                " AUC on all iterations",
                save=True,
                folder=clf_folder_name)
            res_path = os.path.join(clf_folder_name,
                                    str(round(svm_roc_auc, 5)))
        else:
            svm_roc_auc = 0
            res_path = clf_folder_name
        if not os.path.exists(res_path):
            os.mkdir(res_path)

        if create_coeff_plots:
            plot_bacteria_coeff_average(bacteria_coeff_average, len(names),
                                        data_loader, title, task_name,
                                        bacterias, cross_validation, "SVM",
                                        res_path, BINARY, edge_percent)

        print_confusion_matrix(confusion_matrix_average, names,
                               confusion_matrix_acc, "SVM", task_name,
                               res_path)

        if BINARY:
            train_labels = np.array(y_trains).astype(int).flatten()
            train_scores = np.array(y_train_scores).flatten()
            _, _, _, svm_train_roc_auc = roc_auc(train_labels,
                                                 train_scores,
                                                 visualize=False,
                                                 graph_title="train auc",
                                                 save=False,
                                                 folder=res_path)
        else:
            svm_train_roc_auc = 0
            multi_class_roc_auc(svm_y_test_from_all_iter.astype(int),
                                svm_y_score_from_all_iter,
                                names,
                                graph_title='SVM\n' + task_name.capitalize() +
                                " AUC on all iterations",
                                save=True,
                                folder=res_path)

        save_results(task_name, train_auc, test_auc, train_rho, test_rho,
                     confusion_matrix_average, confusion_matrix_acc,
                     train_accuracies, test_accuracies,
                     svm_y_score_from_all_iter, svm_y_pred_from_all_iter,
                     svm_y_test_from_all_iter, "SVM", res_path)

        # value order must follow the CSV header: KERNEL, GAMMA, C, ...
        all_svm_results.loc[len(all_svm_results)] = [
            clf.kernel, clf.gamma, clf.C, svm_train_roc_auc,
            np.mean(train_accuracies), svm_roc_auc,
            np.mean(test_accuracies)
        ]
        sort_key = 'TEST-AUC' if BINARY else 'TEST-ACC'
        all_svm_results = all_svm_results.sort_values(by=[sort_key],
                                                      ascending=False)
        all_svm_results.to_csv(svm_results_file, index=False)


def _learn_xgboost(X, y, data_loader, title, task_name, cross_validation,
                   check_all_parameters, xgb_parameters, test_size, BINARY):
    """Evaluate XGBoost classifier(s) in the current directory and log results."""
    import itertools

    print("XGBOOST...")
    # every classifier appends its results to one shared file
    xgb_results_file = Path("all_xgb_results.csv")
    if not xgb_results_file.exists():
        pd.DataFrame(columns=[
            'LR', 'MAX-DEPTH', 'N-ESTIMATORS', 'OBJECTIVE', 'GAMMA',
            'MIN-CHILD-WEIGHT', 'BOOSTER', 'TRAIN-AUC', 'TRAIN-ACC',
            'TEST-AUC', 'TEST-ACC'
        ]).to_csv(xgb_results_file, index=False)

    if check_all_parameters:
        xgboost_tuned_parameters = {
            'learning_rate': [0.01, 0.05, 0.1],
            'objective': ['binary:logistic'],
            'n_estimators': [1000],
            'max_depth': [3, 5, 7, 9],
            'min_child_weight': [1, 5, 9],
            'gamma': [0.0, 0.5, 1, 5, 9]
        }
        grid_order = ('max_depth', 'learning_rate', 'n_estimators',
                      'objective', 'gamma', 'min_child_weight')
        optional_classifiers = [
            XGBClassifier(booster='gblinear', **dict(zip(grid_order, combo)))
            for combo in itertools.product(
                *(xgboost_tuned_parameters[key] for key in grid_order))
        ]
    else:
        optional_classifiers = [
            XGBClassifier(
                max_depth=xgb_parameters['max_depth'],
                learning_rate=xgb_parameters['learning_rate'],
                n_estimators=xgb_parameters['n_estimators'],
                objective=xgb_parameters['objective'],
                gamma=xgb_parameters['gamma'],
                min_child_weight=xgb_parameters['min_child_weight'],
                booster='gblinear')
        ]

    for clf in optional_classifiers:
        all_xgb_results = pd.read_csv(xgb_results_file)
        clf_folder_name = ("d=" + str(clf.max_depth) + "_lr=" +
                           str(clf.learning_rate) + "_e=" +
                           str(clf.n_estimators) + "_o=" + clf.objective +
                           "_g=" + str(clf.gamma) + "_m=" +
                           str(clf.min_child_weight) + "_b=" + clf.booster)
        if not os.path.exists(clf_folder_name):
            os.makedirs(clf_folder_name)

        X_trains, X_tests, y_trains, y_tests = _random_splits(
            X, y, cross_validation, test_size=test_size, stratify=y)

        xgb_y_test_from_all_iter = np.array([])
        xgb_y_score_from_all_iter = np.array([])
        xgb_y_pred_from_all_iter = np.array([])
        train_accuracies, test_accuracies, confusion_matrixes = [], [], []
        y_train_preds, y_train_scores, y_test_preds = [], [], []

        for iter_num in range(cross_validation):
            print("------------------------------\niteration number " +
                  str(iter_num))
            train_labels = np.array(y_trains[iter_num])
            # weight each sample by (1 - frequency of its class); labels are
            # used as positions into the sorted unique classes
            classes_sum = [
                np.sum(train_labels == unique_class)
                for unique_class in np.unique(train_labels)
            ]
            classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
            weights = [classes_ratio[a] for a in train_labels]

            clf.fit(np.array(X_trains[iter_num]),
                    train_labels,
                    sample_weight=weights)

            y_score = clf.predict_proba(X_tests[iter_num])
            y_pred = clf.predict(X_tests[iter_num])
            y_test_preds.append(y_pred)
            train_pred = clf.predict(X_trains[iter_num])
            y_train_preds.append(train_pred)
            y_train_scores.append(clf.predict_proba(X_trains[iter_num]))

            train_accuracies.append(
                accuracy_score(y_trains[iter_num], train_pred))
            test_accuracies.append(accuracy_score(y_tests[iter_num], y_pred))
            confusion_matrixes.append(
                confusion_matrix(y_tests[iter_num], y_pred))

            if BINARY:
                roc_auc(y_tests[iter_num],
                        y_pred,
                        visualize=True,
                        graph_title='XGB\n' + str(iter_num),
                        folder=task_name)

            # accumulate labels / predictions / scores over all iterations
            xgb_y_test_from_all_iter = np.append(xgb_y_test_from_all_iter,
                                                 y_tests[iter_num])
            xgb_y_pred_from_all_iter = np.append(xgb_y_pred_from_all_iter,
                                                 y_pred)
            if xgb_y_score_from_all_iter.size > 0:
                xgb_y_score_from_all_iter = np.concatenate(
                    (xgb_y_score_from_all_iter, y_score), axis=0)
            else:
                xgb_y_score_from_all_iter = y_score

        _, _, _, _, train_auc, test_auc, train_rho, test_rho = \
            calc_auc_on_joined_results(cross_validation, y_trains,
                                       y_train_preds, y_tests, y_test_preds)

        names = data_loader.get_confusin_matrix_names()
        # NOTE(review): the SVM pipeline calls edit_confusion_matrix without
        # the leading (title, data_loader) arguments; one of the two call
        # sites presumably does not match the helper's signature.
        confusion_matrix_average, confusion_matrix_acc = \
            edit_confusion_matrix(title, confusion_matrixes, data_loader,
                                  "XGB", names, BINARY=BINARY)

        if BINARY:
            _, _, _, xgb_roc_auc = roc_auc(
                xgb_y_test_from_all_iter.astype(int),
                xgb_y_score_from_all_iter[:, 1],
                visualize=True,
                graph_title='XGB\n' + task_name.capitalize() +
                " AUC on all iterations",
                save=True,
                folder=clf_folder_name)
            res_path = os.path.join(clf_folder_name,
                                    str(round(xgb_roc_auc, 5)))
        else:
            xgb_roc_auc = 0
            res_path = clf_folder_name
        if not os.path.exists(res_path):
            os.mkdir(res_path)

        print_confusion_matrix(confusion_matrix_average, names,
                               confusion_matrix_acc, "XGB", task_name,
                               res_path)

        if BINARY:
            all_train_labels = np.array(y_trains).astype(int).flatten()
            # Column 1 of predict_proba is the positive class -- the same
            # column the test AUC above uses.  The former ``s[::2]`` picked
            # column 0 and therefore scored the negative class.
            positive_train_scores = np.concatenate(
                [scores[:, 1] for scores in y_train_scores])
            _, _, _, xgb_train_roc_auc = roc_auc(all_train_labels,
                                                 positive_train_scores,
                                                 visualize=False,
                                                 graph_title="",
                                                 save=False,
                                                 folder=res_path)
        else:
            xgb_train_roc_auc = 0
            multi_class_roc_auc(xgb_y_test_from_all_iter.astype(int),
                                xgb_y_score_from_all_iter,
                                names,
                                graph_title='XGB\n' + task_name.capitalize() +
                                " AUC on all iterations",
                                save=True,
                                folder=res_path)

        save_results(task_name, train_auc, test_auc, train_rho, test_rho,
                     confusion_matrix_average, confusion_matrix_acc,
                     train_accuracies, test_accuracies,
                     xgb_y_score_from_all_iter, xgb_y_pred_from_all_iter,
                     xgb_y_test_from_all_iter, "XGB", res_path)

        all_xgb_results.loc[len(all_xgb_results)] = [
            clf.learning_rate, clf.max_depth, clf.n_estimators, clf.objective,
            clf.gamma, clf.min_child_weight, clf.booster, xgb_train_roc_auc,
            np.mean(train_accuracies), xgb_roc_auc,
            np.mean(test_accuracies)
        ]
        sort_key = 'TEST-AUC' if BINARY else 'TEST-ACC'
        all_xgb_results = all_xgb_results.sort_values(by=[sort_key],
                                                      ascending=False)
        all_xgb_results.to_csv(xgb_results_file, index=False)


def _learn_nn(X, y, title):
    """Run ``nn_main`` once per hyper-parameter combination."""
    import itertools

    if not os.path.exists("NN"):
        os.makedirs("NN")
    param_dict = {
        "lr": [0.005],
        "test_size": [0.2],
        "batch_size": [16],
        "shuffle": [True],
        "num_workers": [4],
        "epochs": [100]
    }
    combos = itertools.product(param_dict['lr'], param_dict['test_size'],
                               param_dict['batch_size'],
                               param_dict['shuffle'],
                               param_dict['num_workers'],
                               param_dict['epochs'])
    for lr, test_size, batch_size, shuffle, num_workers, epochs in combos:
        clf_folder_name = ("lr=" + str(lr) + "_t=" + str(test_size) +
                           "_bs=" + str(batch_size) + "_s=" + str(shuffle) +
                           "_nw=" + str(num_workers) + "_e=" + str(epochs))
        if not os.path.exists(clf_folder_name):
            os.makedirs(clf_folder_name)
        # NOTE(review): the trailing literals 4 and 100 equal the only
        # num_workers / epochs values in param_dict; presumably they should be
        # the loop variables -- confirm against nn_main's signature.
        nn_main(X, y, title, clf_folder_name, 46, 200, 100, 1, lr, test_size,
                batch_size, shuffle, 4, 100)
biter.append(best_iter) print "---log_loss: %0.6f\n" %ll print "---best_iter: %d\n" %best_iter gc.collect() best_i = np.mean(biter) + 50 # train on whole data gbm = XGBClassifier(max_depth=8, learning_rate = 0.01, n_estimators=best_i, subsample=0.9, colsample_bytree=0.45, objective="binary:logistic", silent = False, min_child_weight=1, nthread=-1) gbm.fit(train_processed, target, eval_metric="logloss", eval_set = [(train_processed, target)], verbose=20) tid = test_processed["ID"].copy() assert (len(tid) == 114393), "test length does not match!" test_processed.drop(["ID", "target", "train_flag"], axis = 1, inplace = True) tpreds = gbm.predict_proba(test_processed)[:, 1] sub = pd.DataFrame({"ID" : tid, "PredictedProb" : tpreds}) submission_file = os.path.join(submission_dir, "xgb_denormalized.csv") sub.to_csv(submission_file, index = False) end_time = datetime.now() print 'elapsed time: {}'.format(end_time - start_time)
test_size=1 / 3, random_state=0) for subsample in np.arange(0.5, 1, 0.05): i = i + 1 xgb = XGBClassifier(subsample=subsample, early_stopping_rounds=100) xgb.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=[(X_trainVal, y_trainVal), (X_testVal, y_testVal)], verbose=100) y_pred_rm_xgb = xgb.predict(X_test) # get roc/auc info Y_score = xgb.predict_proba(X_test)[:, 1] fpr = dict() tpr = dict() fpr, tpr, _ = roc_curve(y_test, Y_score) auc_fit = auc(fpr, tpr) roc_auc_df_subs.loc[i] = [subsample, auc_fit] #El mejor es un subsample de 1 roc_auc_df_subs = roc_auc_df_subs.drop_duplicates() plt.style.use('seaborn-pastel') plt.plot(roc_auc_df_subs.subsample, roc_auc_df_subs.auc_fit) plt.title('AUCROC vs Subsample') plt.xlabel('Número de subsample') plt.ylabel('AUCROC') plt.show()
class XGBoostModel:
    """XGBoost classifier wrapper, optionally stacked on a random forest.

    When ``use_rfc`` is true, the pickled ``RFCModel`` is loaded and its
    probabilities are appended to the feature table as column ``rfc_proba``.
    """

    def __init__(self, use_rfc=True):
        self.use_rfc = use_rfc
        if self.use_rfc:
            # Instantiate Random Forest Classifier
            self.rfc = RFCModel()
            self.rfc.unpickle()

    def load_train_data(self):
        """Load the training data and create a stratified 80/20 split."""
        self.df, y, _ = clean_df('data/data.json', training=True)
        if self.use_rfc:
            # Include results from random forest classifier as new column
            self.df['rfc_proba'] = self.rfc.predict_proba_all()
        X = self.df.values
        self.features = self.df.columns
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, stratify=y, random_state=42)

    def load_test_data(self):
        """Load ``data/data_point.json``; return ``(feature values, oid)``."""
        self.df, _, oid = clean_df('data/data_point.json', training=False)
        if self.use_rfc:
            # Include results from random forest classifier as new column
            self.df['rfc_proba'] = self.rfc.predict_proba(
                'data/data_point.json')
        return self.df.values, oid

    def load_one(self, one_json):
        """Load one JSON record given as a string; return ``(values, oid)``."""
        self.df, _, oid = clean_df('[' + one_json + ']', training=False)
        if self.use_rfc:
            # Include results from random forest classifier as new column.
            # NOTE(review): the forest is scored on 'data/data_point.json',
            # not on ``one_json`` -- looks like a copy of load_test_data;
            # confirm which input RFCModel.predict_proba should receive.
            self.df['rfc_proba'] = self.rfc.predict_proba(
                'data/data_point.json')
        return self.df.values, oid

    def fit(self):
        """Fit the XGBoost model on the training split."""
        self.model = XGBClassifier(
            max_depth=8,
            # reg_alpha=.8,
            n_estimators=200,
            scale_pos_weight=10.13,
            learning_rate=0.1)
        self.model.fit(self.X_train, self.y_train)

    def _booster(self):
        """Return the underlying Booster.

        Prefers ``get_booster()`` and falls back to the legacy ``booster()``
        method for models that do not provide it.
        """
        getter = getattr(self.model, 'get_booster', None)
        if getter is None:
            getter = self.model.booster
        return getter()

    @property
    def feature_importances_(self):
        """F-score of every booster feature, normalised to sum to 1.

        Returns all zeros when the booster reports no scores at all (instead
        of dividing by zero).
        """
        #See https://github.com/dmlc/xgboost/commit/dd477ac903eb6f658d6fb2984763c3f8a4516389#diff-2c197a11c1b576e821f5942be9eab74c
        b = self._booster()
        fs = b.get_fscore()
        all_features = np.array([fs.get(f, 0.) for f in b.feature_names],
                                dtype=np.float32)
        total = all_features.sum()
        if total == 0:
            return all_features
        return all_features / total

    def plot_features(self, save_img_dir=None, img_name_prefix='', ext='svg'):
        '''
        Plot the normalised F-score of every feature as a line and a bar chart.

        save_img_dir: directory to save the images in, WITHOUT trailing slash
            (e.g. 'static/images'); nothing is saved when None
        img_name_prefix: string prepended to the image file names, so several
            sets of images can be kept side by side
        ext: image file extension; use 'svg' for web
        '''
        # this is needed to fix label clipping in saved files
        from matplotlib import rcParams
        rcParams.update({'figure.autolayout': True})

        #severely modified from https://gist.github.com/light94/6c42df29f3232ae31e52
        b = self._booster()
        fs = b.get_fscore()
        # the booster's scores are keyed 'f0', 'f1', ...; map them to the
        # column names recorded by load_train_data
        all_features = {
            self.features[i]: float(fs.get('f' + str(i), 0.))
            for i in range(len(b.feature_names))
        }
        importance = sorted(all_features.items(), key=itemgetter(1))
        ff = pd.DataFrame(importance, columns=['feature', 'fscore'])
        total = ff['fscore'].sum()
        if total:  # avoid 0/0 when the booster reports no scores
            ff['fscore'] = ff['fscore'] / total

        #"plot 1"
        ax = ff.fscore.plot(xticks=ff.index, rot=65)
        ax.set_xticklabels(ff.feature)
        plt.title('XGBoost F-scores by feature')
        if save_img_dir is not None:
            plt.savefig('{}/{}feature_fscores.{}'.format(
                save_img_dir, img_name_prefix, ext))
        plt.show()

        #"plot 2"
        ff.plot(kind='barh', x='feature', y='fscore', legend=False,
                figsize=(6, 10))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        if save_img_dir is not None:
            plt.savefig('{}/{}features_barh.{}'.format(save_img_dir,
                                                       img_name_prefix, ext))
        plt.show()
        plt.close()

    def pickle(self):
        """Persist the fitted model to ``data/XGBoostModel.pkl``."""
        _pickle(self.model, 'data/XGBoostModel.pkl')

    def unpickle(self):
        """Load the model from ``data/XGBoostModel.pkl``."""
        self.model = _unpickle('data/XGBoostModel.pkl')

    def score(self):
        """Print accuracy, F1 and the confusion matrix on the test split."""
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        print("f1: %.2f" % f1)
        print('Confusion matrix')
        print(np.array([['TN', 'FP'], ['FN', 'TP']]))
        print(confusion_matrix(self.y_test, y_pred))

    def predict(self, X):
        """Return the model's class predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return column 1 of the model's ``predict_proba`` output for ``X``."""
        prob = self.model.predict_proba(X)
        return prob[:, 1]
thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') # Compute confusion matrix cnf_matrix = confusion_matrix(y_valid, predictions) np.set_printoptions(precision=2) # Plot non-normalized confusion matrix plt.figure() plot_confusion_matrix(cnf_matrix, classes=le.classes_, title='Confusion matrix, without normalization') plt.show() predictions_test = clfr.predict_proba(test_X) predictions_test = pd.DataFrame(predictions_test).reset_index(drop=True) submission_df = pd.concat([test_df['id'],predictions_test], axis=1).reset_index(drop=True) submission_df = submission_df.rename(columns = le_name_mapping) submission_df.to_csv('xgb1.csv', index=False)
colsample_bytree=1, subsample=1) # In[20]: model_A.fit(X_train_A, y_train_A) model_B.fit(X_train_B, y_train_B) model_C.fit(X_train_C, y_train_C) # In[21]: a_preds = model_A.predict_proba(X_test_A) b_preds = model_B.predict_proba(X_test_B) c_preds = model_C.predict_proba(X_test_C) # In[ ]: def make_country_sub(preds, test_feat, country): # make sure we code the country correctly country_codes = ['A', 'B', 'C'] # get just the poor probabilities country_sub = pd.DataFrame(data=preds[:, 1], # proba p=1 columns=['poor'], index=test_feat.index)
nikkei_pred_lag = model_lag.predict(poly_nyse_test_lag)

# In[29]:

# Scatter of the test returns with the two fitted curves on top.
plt.scatter(return_datas_test['NYSE'], return_datas_test['NIKKEI'])
plt.plot(nyse_new[:, 0], nikkei_pred, 'r')
plt.plot(nyse_test_new[:, 0], nikkei_test_pred, 'g')
plt.legend(['Predicted line', 'Test data', 'Observed data'])
plt.show()

# In[ ]:

from xgboost import XGBClassifier

# XGBoost is an implementation of gradient boosted decision trees
xgmodel = XGBClassifier(max_depth=6,
                        learning_rate=0.1,
                        n_estimators=100,
                        n_jobs=16,
                        scale_pos_weight=4,
                        missing=np.nan,
                        gamma=16,
                        eval_metric="auc",
                        reg_lambda=40,
                        reg_alpha=40)
xgmodel.fit(nikkei_train, nyse_train)

# In[ ]:

# The module is ``sklearn.metrics`` (plural); ``sklearn.metric`` does not exist.
from sklearn.metrics import roc_auc_score

# predict_proba needs the feature matrix; score the data the model was fit on.
y_train_predcted = xgmodel.predict_proba(nikkei_train)
# L1 stacking would be improved by actually doing another proper kfold

# Standardise the level-0 blend features (scaler fitted on train only) and
# append them to the raw features.
std = StandardScaler()
dataset_blend_train = std.fit_transform(dataset_blend_train)
dataset_blend_test = std.transform(dataset_blend_test)
X_train_l1 = np.hstack([X_train, dataset_blend_train])
X_test_l1 = np.hstack([X_test, dataset_blend_test])

# gbm w/ same params
print('GBM L1')
gbm_l1 = XGBClassifier(
    seed=0,
    learning_rate=gbm_learning_rate,
    n_estimators=gbm_n_estimators,
    min_child_weight=gbm_min_child_weight,
    max_depth=gbm_max_depth,
    colsample_bytree=gbm_colsample_bytree,
    subsample=gbm_subsample,
)
gbm_l1.fit(X_train_l1, y_train)
# AUC on the training data itself, using the last probability column
gbm_l1_train_proba = gbm_l1.predict_proba(X_train_l1)[:, -1]
print('GBM L1 AUC: %f' % roc_auc_score(y_train, gbm_l1_train_proba))

# nn w/ same params
nn_l1 = Sequential()
for layer in (
        Dense(32, input_shape=(X_train_l1.shape[1],), activation='sigmoid'),
        Dropout(0.25),
        Dense(32, activation='sigmoid'),
        Dropout(0.25),
        Dense(1, activation='sigmoid'),
):
    nn_l1.add(layer)
opt = SGD(lr=nn_sgd_lr, decay=nn_sgd_decay, momentum=nn_sgd_momentum,
          nesterov=True)
nn_l1.compile(loss='binary_crossentropy', optimizer=opt)
print('NN L1')
nn_l1.fit(X_train_l1, y_train, verbose=0, nb_epoch=100)
nn_l1_train_proba = nn_l1.predict_proba(X_train_l1)[:, -1]
print('NN L1 AUC: %f' % roc_auc_score(y_train, nn_l1_train_proba))
#model = linear_model.LogisticRegression(C=1e5)
model.fit(X_train, y_train)

pred_type = input('Predict on avr or ind? ')
# NOTE(review): the original indentation was lost; everything below is assumed
# to belong to the 'ind' branch -- confirm against the full script.
if pred_type == 'ind':
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("###########################################")
    print("###########################################")

    from sklearn.metrics import roc_curve, auc
    probs = model.predict_proba(X_test)
    # roc_curve expects the score of the POSITIVE class, which is column 1 of
    # predict_proba for 0/1 labels; column 0 (used before) is the
    # negative-class probability and mirrors the curve.
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)

    # method I: plt
    import matplotlib.pyplot as plt
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.ion()
#     'xgb__learning_rate': (0.01, 0.03, 0.05),
#     'xgb__colsample_bytree': (0.8, 0.85)
# }
#
# grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='roc_auc', cv=3)
# grid_search.fit(scaled_X_train, y_train)
# print 'Best score: %.3f'%grid_search.best_score_
# print 'Best parameters set:'
# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print '\t%s: %r' %(param_name, best_parameters[param_name])
#
# predictions = grid_search.predict(scaled_X_test)
# print classification_report(y_test, predictions)
#
# for param_name in parameters.keys():
#     xgb_args[param_name[5:]] = best_parameters[param_name]
#
# print 'xgb_args:', xgb_args

# Standardise the features; the scaler is fitted on the training data only.
final_scaler = preprocessing.StandardScaler()
scaled_final_train_df = final_scaler.fit_transform(final_train_df)
scaled_final_test_df = final_scaler.transform(final_test_df)

# Fit on the full training set and score the test set (positive class).
classifier = XGBClassifier(**xgb_args)
classifier.fit(scaled_final_train_df, final_targets_df)
output = classifier.predict_proba(scaled_final_test_df)[:, 1]

# Name the Series so the value column is headed 'TARGET', and label the single
# index level 'ID'.  Passing both names as ``index_label`` produced a header
# with more cells than the data rows have columns.
S = Series(output, index=Ids, name='TARGET')
S.to_csv('Santander_xgboost_results_1.csv', header=True, index_label='ID')
# Fit a default classifier on the training split and report hold-out metrics.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print(xgb.score(X_test, y_test))
# log_loss is defined on predicted probabilities; feeding it hard class labels
# (as before) only penalises misclassifications with the clipping constant.
print(log_loss(y_test, xgb.predict_proba(X_test)))
print(f1_score(y_test, xgb.predict(X_test)))

# Print and plot the per-feature importances.
importance2 = xgb.feature_importances_
for i, v in enumerate(importance2):
    print('Feature: %0d, Score: %.5f' % (i, v))
import matplotlib.pyplot as plt
plt.bar(range(len(importance2)), importance2)
plt.show()

# Refit on all labelled data and export class probabilities for the test set.
XGB = XGBClassifier()
XGB.fit(X, y)
y_pred = XGB.predict_proba(df_test)
Y = pd.DataFrame(y_pred)
Y.to_excel("output1.xlsx", index=False)
# Pick the max_depth in 5..9 with the best mean 5-fold CV accuracy.
max_score = 0
for depth in range(5, 10):
    model = XGBClassifier(max_depth=depth)
    kf = KFold(len(y), n_folds=5, random_state=42, shuffle=True)
    # Using accuracy because the final table uses it as its measure
    fold_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
    score = fold_scores.mean()
    print('Cross validation score =', score)
    print('max_depth =', depth)
    if score <= max_score:
        continue
    max_score, max_n = score, depth

print('Max Cross validation score =', max_score)
print('Max max_depth =', max_n)

# Refit on all data with the selected depth and predict class probabilities.
model = XGBClassifier(max_depth=max_n)
model.fit(X, y)
prediction = model.predict_proba(test_pred)

# Just to see what features are important and what are not
print(model.feature_importances_)

# Step 3. Save data to file.
# Probability column k is written under the k-th outcome name below.
outcome_names = ["Adoption", "Died", "Euthanasia", "Return_to_owner",
                 "Transfer"]
submission_columns = {"ID": test["ID"]}
for column_index, outcome in enumerate(outcome_names):
    submission_columns[outcome] = prediction[:, column_index]
submission = pd.DataFrame(submission_columns)
# Summary statistics of the cross-validation accuracies.
CV_accuracy = accuracies.mean()
CV_std = accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [
    dict(
        reg_lambda=[0.1, 0.5, 1, 2, 5, 10, 30, 50],
        n_estimators=[50, 75, 100, 300, 301],
        learning_rate=[0.01, 0.02, 0.05, 0.1, 0.5, 1],
        max_depth=[3, 4, 5, 6, 8, 10],
        subsample=[0.1, 0.2, 0.5, 0.75, 0.85, 1],
    )
]
# 10-fold search scored by negative log loss; fit() returns the search object
grid_search = GridSearchCV(
    estimator=classifier1,
    param_grid=parameters,
    scoring="neg_log_loss",
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)
best_metric = grid_search.best_score_
best_params = grid_search.best_params_

# Predicting the Test Set results
# NOTE(review): predictions come from classifier1 / classifier2 as they are,
# not from grid_search.best_estimator_ -- confirm this is intended.
y_pred1 = classifier1.predict_proba(X_test)[:, 1]
y_pred2 = classifier2.predict_proba(X_test)[:, 1]
y_pred_NN = classifier_NN.predict(X_test)

# Creating predictions from ensemble models: 25% classifier1, 75% NN
weighted_first = 0.25 * y_pred1
weighted_nn = 0.75 * y_pred_NN
ensemble1 = (weighted_first + weighted_nn.T).T

# Writing the results to a csv file
np.savetxt('results.csv', ensemble1)
Xg_train, num_boost_round = clf.get_params()['n_estimators'], nfold = 5, show_progress = True, early_stopping_rounds = 100) clf.set_params(n_estimators=cvresult.shape[0]) clf.fit(X_train, y_train) best_outcome_params = clf.get_params() best_outcome_score = cvresult.min() try: # predict the outcome probabilities y_pred = grid.predict_proba(X_test) except: # predict the outcome probabilities y_pred = clf.predict_proba(X_test) # Create a data frame column_names = possible_outcomes[:] idx = pd.Int64Index(np.arange(1,11457, dtype='int64')) idx.rename('ID', inplace=True) df = pd.DataFrame(index = idx, data=y_pred, columns=column_names) # write it to file, timestamp it import time, datetime ts = time.time() submission_time_stamp = datetime.datetime.fromtimestamp(ts).strftime('%Y.%m.%d.%H.%M.%S') df.to_csv('./Data/xgb_submission_'+submission_time_stamp+'.csv',header=True) # save parameters to file:
train.loc[train_series.isnull(), train_name] = -9999 #train_series.mean() #and Test tmp_len = len(test[test_series.isnull()]) if tmp_len>0: test.loc[test_series.isnull(), test_name] = -9999 #train_series.mean() #TODO X_train = train X_test = test extc = XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)#max_features= 50,criterion= 'entropy',min_samples_split= 4, #max_depth= 50, min_samples_leaf= 4) y_test=pd.read_csv('good/xgb4.csv')['real'].values extc.fit(X_train,target,eval_metric="logloss",eval_set=[(X_test, y_test)]) print('Predict...') y_pred = extc.predict_proba(X_test) #print y_pred pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('mycv1.csv',index=False) y=pd.read_csv('good/xgb4.csv')['real'].values yp=y_pred[:,1] score=str(llfun(y,yp))[2:] print sys.argv[0],score import subprocess cmd='cp mycv1.csv vabackup/mycv%s.csv'%score subprocess.call(cmd,shell=True) cmd='cp mycv.py vabackup/mycv%s.py'%score subprocess.call(cmd,shell=True)
# Build the list of second-layer model specifications.  Each entry is
# [estimator, <int>, <int>, name]; the meaning of the two integers is defined
# by the consumer of this list, which is not visible here.
# NOTE(review): the original indentation was lost; the nesting below is a best
# guess -- confirm against the full script.
model_cnt = 0
XGBmodels = []
seeds = [0, 1000]
for one in seeds:
    for max_depth in [3]:
        for learning_rate in [0.05]:
            model_cnt += 1
            model = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=500, silent=True, \
                                  objective='binary:logistic', nthread=-1, gamma=0, min_child_weight=1, \
                                  max_delta_step=0, subsample=1, colsample_bytree=0.8, colsample_bylevel=0.8, \
                                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=one, missing=None)
            # the same estimator object is registered twice, under two names
            # and with a different first integer (50 vs. 20)
            XGBmodels.append([model, 50, 5, 'xgb'+str(model_cnt)])
            model_cnt += 1
            XGBmodels.append([model, 20, 5, 'xgb'+str(model_cnt)])
# NOTE(review): ``model`` has not been fitted anywhere in this snippet, so this
# predict_proba call presumably relies on code not shown here -- verify.
layer_2_valid['xgb_fe'] += model.predict_proba(valid_data[cols].as_matrix())[:, 0]
model_cnt += 1
# A random forest is appended to the same list, under an 'rf' name.
model = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=None, \
                               min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, \
                               max_features=0.2, max_leaf_nodes=None, min_impurity_split=1e-07, \
                               bootstrap=True, oob_score=False, n_jobs=30, random_state=None, verbose=0, \
                               warm_start=False, class_weight=None)
XGBmodels.append([model, 50, 5, 'rf'+str(model_cnt)])
# Settings for the logistic-regression models; the counter starts over.
LRmodels = []
seeds = [0]
Cs = [0.15]
tols = [0.0001]
model_cnt = 0
def main():
    """Train, calibrate and score an XGBoost tagger on bootstrapped 3-way splits.

    Reads the configuration named on the command line and builds the training
    dataframe (from ``args.input_file`` or via ``read_full_files``).  For each
    bootstrap iteration the frame is split into three subsets with ``NSplit``
    and their roles are rotated: the classifier is fitted on the first, the
    polynomial calibration on the second, and the third is used to evaluate.
    Collected results are pickled to ``crossval_training_dump.pkl``; when
    ``args.plot_dir`` is given the mean ROC curve is saved there.  A summary
    is printed at the end.

    Exits with status 1 when no configuration could be parsed.
    """
    args = parse_args()
    config = parse_config(args.config_file)
    if config is None:
        print('No configuration file is defined. '
              'Define one with `--config-file`.')
        sys.exit(1)

    if args.plot_dir is not None:
        # makedirs also creates missing parents and, with exist_ok, removes
        # the race between the former isdir() check and mkdir().
        os.makedirs(args.plot_dir, exist_ok=True)

    index_cols = config['index_features']
    event_cols = config['unique_event_features']

    # this will be the training dataframe
    if args.input_file:
        merged_training_df = read_root(args.input_file, stop=args.stop)
        merged_training_df.set_index(index_cols, inplace=True)
        # duplicates may have ended up in the root file
        len_before = len(merged_training_df)
        merged_training_df.drop_duplicates(inplace=True)
        # Guard against an empty input frame (former ZeroDivisionError).
        dropped_fraction = (1 - len(merged_training_df) / len_before
                            if len_before else 0.0)
        print(f'Dropped {dropped_fraction * 100:.5f}%'
              ' duplicated entries in dataframe')
    else:
        merged_training_df = read_full_files(args, config)

    # in every case, define a proper target
    merged_training_df['target'] = merged_training_df.eval(
        config['target_eval'])

    # sort for performance
    merged_training_df.sort_index(inplace=True)

    print_avg_tagging_info(merged_training_df, config)

    mva_features = config['mva_features']
    total_event_number = get_event_number(config)
    # First SigYield_sw entry of every event, summed over all events.
    selected_event_number = (merged_training_df.groupby(
        event_cols).SigYield_sw.head(1).sum())

    # build BDT model and train the classifier nBootstrap x 3 times
    xgb_kwargs = config['xgb_kwargs']
    n_jobs = config['n_jobs']
    sorting_feature = config['sorting_feature']

    bootstrap_roc_aucs = []
    bootstrap_scores = []
    bootstrap_d2s = []
    bootstrap_roc_curves = []
    bootstrap_calibration_params = []

    nBootstrap = args.n_bootstrap or config['n_bootstrap']
    print('Starting bootstrapping.')
    # 6 ticks per bootstrap: 3 for creating the subsets + 1 per rotation.
    pbar = tqdm(total=nBootstrap * 6)
    for _ in range(nBootstrap):
        # yield 3-fold split for CV
        df_sets = [merged_training_df.iloc[indices]
                   for indices in NSplit(merged_training_df)]
        # try to compensate for slow subset creation
        pbar.update(3)

        for i in range(3):
            df1, df2, df3 = (df_sets[i % 3],
                             df_sets[(i + 1) % 3],
                             df_sets[(i + 2) % 3])
            model = XGBClassifier(nthread=n_jobs, **xgb_kwargs)
            model.fit(df1[mva_features], df1.target,
                      sample_weight=df1.SigYield_sw)
            roc1 = roc_auc_score(df1.target,
                                 model.predict_proba(df1[mva_features])[:, 1])
            probas = model.predict_proba(df2[mva_features])[:, 1]
            roc2 = roc_auc_score(df2.target, probas)

            # calibrate
            calibrator = PolynomialLogisticRegression(power=3,
                                                      solver='lbfgs',
                                                      n_jobs=n_jobs)
            calibrator.fit(probas.reshape(-1, 1), df2.target,
                           sample_weight=df2.SigYield_sw)
            bootstrap_calibration_params.append(calibrator.lr.coef_)

            probas = model.predict_proba(df3[mva_features])[:, 1]
            calib_probas = calibrator.predict_proba(probas)[:, 1]
            roc3 = roc_auc_score(df3.target, calib_probas)

            # concatenating here, since df3 is a view on the main df and will
            # throw warnings when adding any columns to it
            df3 = pd.concat([
                df3.reset_index(),
                pd.Series(calib_probas, name='calib_probas'),
            ], axis=1)

            # Keep, per event, the row with the largest sorting feature.
            best_indices = df3.groupby(event_cols)[sorting_feature].idxmax()
            best_particles = df3.loc[best_indices]

            bootstrap_roc_aucs.append([roc1, roc2, roc3])
            score = tagging_power_score(
                best_particles, config,
                efficiency=selected_event_number / total_event_number,
                etas='calib_probas')
            if args.plot_dir is not None:
                fpr, tpr = roc_curve(
                    best_particles.target,
                    best_particles.calib_probas,
                    sample_weight=best_particles.SigYield_sw)[:2]
                bootstrap_roc_curves.append([fpr, tpr])

            bootstrap_scores.append(score)
            bootstrap_d2s.append(
                d2_score(best_particles.calib_probas,
                         sample_weight=best_particles.SigYield_sw))
            pbar.update(1)

    pbar.close()

    # pickle bootstrap results
    with open('crossval_training_dump.pkl', 'bw') as f:
        pickle.dump(dict(
            roc_curves=bootstrap_roc_curves,
            tagging_power_scores=bootstrap_scores,
            d2_scores=bootstrap_d2s,
        ), f)

    # plot roc curve on request
    if args.plot_dir is not None:
        print('Plotting ROC curves...', end=' ')
        # The individual curves may have different lengths.  Building one
        # ndarray from them is ragged (an error on recent NumPy), so the list
        # is iterated directly and every curve is cut to the shortest length
        # -- the same hacky alignment as before.
        min_roc_shape = min(len(fpr) for fpr, _ in bootstrap_roc_curves)
        fprs = np.array([fpr[:min_roc_shape]
                         for fpr, _ in bootstrap_roc_curves])
        tprs = np.array([tpr[:min_roc_shape]
                         for _, tpr in bootstrap_roc_curves])

        plt.style.use('ggplot')
        plt.rcParams['figure.figsize'] = (6, 6)
        plt.rcParams['font.size'] = 12
        plt.plot([0, 1], '--', label='random')
        plt.plot(fprs.mean(axis=0), tprs.mean(axis=0),
                 label='Mean ROC curve')
        plt.fill_between(fprs.mean(axis=0),
                         tprs.mean(axis=0) - tprs.std(axis=0),
                         tprs.mean(axis=0) + tprs.std(axis=0),
                         label=r'$\pm 1 \sigma$ area', alpha=0.4)
        plt.xlim(-0.05, 1.05)
        plt.ylim(0, 1.05)
        plt.text(1, 0.05, 'LHCb unofficial',
                 verticalalignment='bottom', horizontalalignment='right')
        plt.legend(loc='best')
        plt.xlabel('false positive rate')
        plt.ylabel('true positive rate')
        filename = os.path.join(args.plot_dir, 'ROC-curves.pdf')
        plt.savefig(filename, bbox_inches='tight')
        print('done.')

    # Mean and spread over all bootstrap rotations, in percent.
    d2 = 100 * ufloat(np.mean(bootstrap_d2s), np.std(bootstrap_d2s))
    eff = 100 * ufloat(np.mean(noms(bootstrap_scores)),
                       np.std(noms(bootstrap_scores)))

    print(dedent(f"""
        CalibrationParams:
        {np.array(bootstrap_calibration_params).mean(axis=0)}
        {np.array(bootstrap_calibration_params).std(axis=0)}
        ROC AUCs:
        {np.array(bootstrap_roc_aucs).mean(axis=0)}
        {np.array(bootstrap_roc_aucs).std(axis=0)}
        Final {nBootstrap}-fold bootstrap performance
        D2 = {d2}%
        ε_eff = {eff}%"""))
n_jobs=-1, cv=kfold) result = grid_search.fit(np.array(features), labels) # summarize results print("Best: %f using %s" % (result.best_score_, result.best_params_)) means, stdevs = [], [] for params, mean_score, scores in result.grid_scores_: stdev = scores.std() means.append(mean_score) stdevs.append(stdev) print("%f (%f) with: %r" % (mean_score, stdev, params)) ### final training #features,labels = get_training_data() model = XGBClassifier(learning_rate=0.1, max_depth=8, min_child_weight=200, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', scale_pos_weight=1.0, seed=27) model.fit(np.array(features), labels) ### final prediction ids, test_x = get_testing_data() predicted_y = model.predict_proba(np.array(test_x)) predicted_is = predicted_y[:, 1] write_results(ids, predicted_is, fname='rs4.csv')
sample_weight=w_train, # instance weights eval_set = [(x_train,y_train), (x_val,y_val)], # a list of (X,y) tuple pairs to use as validation sets ---> validation_0=train, validation_1=validation sample_weight_eval_set = [w_train, w_val], # list of arrays storing instances weights for the i-th validation set eval_metric = ['auc', 'error'], # list of parameters under eval_metric: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters early_stopping_rounds=50, # validation metric needs to improve at least once in every early_stopping_rounds round(s) verbose=100) results = model.evals_result() # takes the results from the BDT training above n_estimators = len(results['validation_0']['error']) # number of rounds used for the BDT training auc_train = results['validation_0']['auc'] # subsample: auc for training auc_val = results['validation_1']['auc'] # subsample: auc for validation error_train = results['validation_0']['error'] # subsample: error for training error_val = results['validation_1']['error'] # subsample: error for validation # save the bdt result to our dataframe df.loc[:,'bdt_score'] = model.predict_proba(df[variables])[:,1] df_overlay.loc[:,'bdt_score'] = model.predict_proba(df_overlay[variables])[:,1] # ==================== # # MAKE PLOTS # # ==================== # printtitle('Making plots...') # --- plot auc and error for training and validation plt.figure(figsize=(15,5)) plt.subplot(121) plt.plot(range(0,n_estimators), auc_train, c='blue', label='train') plt.plot(range(0,n_estimators), auc_val, c='orange', label='validation')
def main():
    """Cross-validate an inclusive XGBoost tagger with repeated 3-way splits.

    Reads the configured ROOT files in chunks into one dataframe, derives the
    ``event_id`` and ``target`` columns, and then, ``nfold`` times, splits the
    data into three subsets with ``NSplit`` and rotates their roles: fit the
    classifier on the first, fit the polynomial calibration on the
    per-event best candidates of the second, and score the calibrated
    per-event best candidates of the third.  Mean and standard deviation of
    the collected scores are printed.

    Exits with status 1 when no configuration could be parsed.

    Raises:
        ValueError: if the BDT feature list cannot be derived from the config.
    """
    args = parse_args()
    config = parse_config(args.config_file)
    if config is None:
        print('No configuration file is defined. '
              'Define one with `--config-file`.')
        sys.exit(1)

    # read dataset
    files = config['files']
    if 'filepath' in config:
        files = [config['filepath'] + f for f in files]
    kwargs = config['pandas_kwargs']

    print('Reading ', end='')
    # Count the entries of all input trees to size the progress bar.
    entries = 0
    for f in files:
        rootfile = ROOT.TFile(f)
        tree = rootfile.Get(kwargs['key'])
        entries += tree.GetEntries()
        # Release the file handle right away instead of keeping every input
        # file open until garbage collection.
        rootfile.Close()
    maxslices = args.max_slices
    chunksize = kwargs['chunksize']
    total = (maxslices
             if maxslices is not None and maxslices < (entries / chunksize)
             else (entries / chunksize))
    print(total * chunksize, 'events.')
    df = pd.concat(list(
        tqdm(
            islice(
                read_root(files, flatten=True, **kwargs), maxslices),
            total=total)))

    # rename the tagging particle branches
    df.rename(columns=dict(zip(df.columns,
        [c.replace(config['tagging_particle_prefix'], 'tp').replace('-', '_')
            for c in df.columns])),
        inplace=True)
    df['event_id'] = df.runNumber.apply(str) + '_' + df.eventNumber.apply(str)
    if 'invert_target' in config and config['invert_target']:
        df['target'] = np.sign(df.B_ID) != np.sign(df.tp_ID)
    else:
        df['target'] = np.sign(df.B_ID) == np.sign(df.tp_ID)

    # read features and selections
    try:
        if 'inclusive_mva_features' in config:
            mva_features = ['tp_' + f for f in config['inclusive_mva_features']]
        else:
            # The feature name is the first token of each selection string.
            mva_features = ['tp_' + f.split(' ')[0]
                            for f in config['selections']]
    except Exception as err:
        # Formerly a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit; the original cause is now chained.
        raise ValueError('Tried to parse features for the BDT.'
                         ' Either provide well-formatted `selections` or'
                         ' define a `inclusive_mva_features` set.') from err

    # build BDT model and train the classifier n_cv x 3 times
    xgb_kwargs = config['xgb_kwargs']
    n_jobs = config['n_jobs']

    bootstrap_scores = []
    bootstrap_d2s = []
    nfold = (args.bootstrap_folds
             if args.bootstrap_folds is not None
             else config['n_cv'])
    print('Starting bootstrapping.')
    pbar = tqdm(total=nfold * 3)
    for _ in range(nfold):
        # yield 3-fold split for CV
        df_sets = [df.iloc[indices] for indices in NSplit(df)]

        for i in range(3):
            df1, df2, df3 = (df_sets[i % 3].copy(),
                             df_sets[(i + 1) % 3].copy(),
                             df_sets[(i + 2) % 3].copy())
            model = XGBClassifier(nthread=n_jobs, **xgb_kwargs)
            # NOTE(review): an unused ``sample_weight`` local derived from the
            # ``training_weights`` config flag used to be computed here.  The
            # fit has always used SigYield_sw unconditionally, which is kept;
            # confirm whether the flag was meant to switch the weights off.
            model.fit(df1[mva_features], df1.target,
                      sample_weight=df1.SigYield_sw)

            # Per event, keep the candidate with the highest probability.
            df2['probas'] = model.predict_proba(df2[mva_features])[:, 1]
            df2.reset_index(inplace=True, drop=True)
            df2_max = df2.iloc[df2.groupby('event_id')['probas'].idxmax()].copy()
            df3['probas'] = model.predict_proba(df3[mva_features])[:, 1]
            df3.reset_index(inplace=True, drop=True)
            df3_max = df3.iloc[df3.groupby('event_id')['probas'].idxmax()].copy()

            # calibrate
            calibrator = PolynomialLogisticRegression(power=4,
                                                      solver='lbfgs',
                                                      n_jobs=n_jobs)
            # ``Series.reshape`` no longer exists in pandas; reshape the
            # underlying ndarray instead (same result on old versions).
            calibrator.fit(df2_max.probas.values.reshape(-1, 1),
                           df2_max.target,
                           sample_weight=df2_max.SigYield_sw)

            df3_max['calib_probas'] = calibrator.predict_proba(df3_max.probas)[:, 1]

            score = tagging_power_score(df3_max.calib_probas,
                                        tot_event_number=get_event_number(df3_max),
                                        sample_weight=df3_max.SigYield_sw)
            bootstrap_scores.append(score)
            bootstrap_d2s.append(d2_score(df3_max.calib_probas,
                                          sample_weight=df3_max.SigYield_sw))
            pbar.update(1)

    pbar.close()
    print(dedent("""\
        Final {}-fold bootstrap performance
        D2 = {:<6}%
        ε_eff = {:<6}%""")
          .format(nfold,
                  100 * ufloat(np.mean(bootstrap_d2s), np.std(bootstrap_d2s)),
                  100 * ufloat(np.mean(noms(bootstrap_scores)),
                               np.std(noms(bootstrap_scores)))))
# Announce how wide the stacked first-level feature arrays should be.
n_models_total = len(models_1) + len(models_2)
print('We have %d classes and %d models TOTAL so in resulting arrays '
      'we expect to see %d columns.'
      % (n_classes, n_models_total, n_classes * n_models_total))

# Start from zero-column arrays so the result keeps its row count (and a
# float dtype) even when no saved prediction file is found.
train_parts = [np.zeros((X_train.shape[0], 0))]
test_parts = [np.zeros((X_test.shape[0], 0))]

# Load results: every *.npy file in the working directory is read in sorted
# order; element 0 is stacked onto the train side, element 1 onto the test side.
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    train_parts.append(S[0])
    test_parts.append(S[1])

S_train_all = np.column_stack(train_parts)
S_test_all = np.column_stack(test_parts)

print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                      n_estimators=100, max_depth=3)

# Fit 2nd level model on the stacked first-level outputs
model = model.fit(S_train_all, y_train)

# Predict class probabilities for the test side
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))
def saveTrainingsDev(self, df, imp_features, save_pred=False):
    """Fit one XGBoost model per hyper-parameter row and record its dev KS.

    For every row of ``self.params_df`` an ``XGBClassifier`` is fitted on
    ``df[imp_features]`` against the first target column named in
    ``self.dictionary['target']``.  Per row, the fitted model is dumped with
    joblib under ``saved_objects/``, the feature importances (and, when
    ``save_pred`` is true, the predictions) are written under ``results/``,
    and a KS table is computed with ``self.ksTable``.  One summary row per
    parameter set is finally written to
    ``results/summary_df_params_xgb_<version>.csv``.

    Args:
        df: development dataframe holding features, target and id columns.
        imp_features: list of feature column names to train on.
        save_pred: when true, also write the second-class probabilities.

    Returns:
        None.
    """
    logging.debug(
        "inside saveTrainingsDev Module of XgbSelection Class . dictionary is : {}"
        .format(self.dictionary))
    print('Iterating on different hyper parameters..')
    version = self.version
    target_col = self.dictionary['target'][0]
    base_path = self.dictionary['path'] + '/'

    # Copy so that adding columns never writes into a slice of ``df``.
    out = df.loc[:, self.dictionary['id'] +
                 self.dictionary['performance']].copy()
    out['actual'] = df[target_col]

    # ``DataFrame.append`` was removed in pandas 2.0: collect the one-row
    # frames and concatenate them once after the loop.
    summary_frames = []
    identifier_prefix = str(len(imp_features)) + 'var'
    # Short tags used to build file names from parameter names.
    alias = {
        'n_estimators': 'est',
        'max_depth': 'max_dep',
        'subsample': 'sub_s',
        'learning_rate': 'learn_r',
        'colsample_bytree': 'col_samp',
        'reg_lambda': 'lambda',
        'gamma': 'gamma',
        'min_child_weight': 'mcw'
    }
    summary_columns = ['dev_ks', 'dev_ro_break', 'dev_ks_decile',
                       'dev_capture']

    for idx, row in self.params_df.astype(object).iterrows():
        print('Iteration {0} of {1}'.format(idx + 1,
                                            self.params_df.shape[0]))
        # A parameter without an alias falls back to its own name instead of
        # yielding None (which made ''.join raise TypeError).
        params_str = [
            alias.get(name, name) + value
            for name, value in zip(row.index, row.values.astype(str))
        ]
        identifier = (identifier_prefix + '_'.join(params_str) + '_' +
                      version)
        param = row.to_dict()
        #model = XGBClassifier(seed = 10, **params, nthread = 10)
        model = XGBClassifier(seed=10,
                              learning_rate=param['learning_rate'],
                              colsample_bytree=param['colsample_bytree'],
                              n_estimators=param['n_estimators'],
                              subsample=param['subsample'],
                              max_depth=param['max_depth'],
                              gamma=param['gamma'],
                              min_child_weight=param['min_child_weight'],
                              nthread=10)
        model.fit(df.loc[:, imp_features], df[target_col])
        joblib.dump(model, base_path + 'saved_objects/xgb_' + identifier)

        feature_imp = pd.DataFrame({
            'feature_names': imp_features,
            'importance': model.feature_importances_
        })
        feature_imp.to_csv(base_path + 'results/feature_importance_' +
                           identifier + '.csv',
                           index=False)

        # Scores are computed on the same data the model was fitted on.
        score = model.predict_proba(df.loc[:, imp_features])
        if save_pred:
            out['pred'] = score[:, 1]
            out.to_csv(base_path + 'results/pred_dev_' + identifier + '.csv',
                       index=False)

        ks = self.ksTable(score[:, 1], df[target_col],
                          'dev_xgb_' + identifier)
        # Position of the first increase in 'No.Res' between consecutive
        # rows of the KS table (+2), or NaN when there is none.
        breaks = np.diff(ks['No.Res']) > 0
        if breaks.any():
            break_dec = int(np.flatnonzero(breaks)[0]) + 2
        else:
            break_dec = np.nan
        ks_val = ks.KS.max()
        ks_decile = ks.KS.idxmax() + 1
        capture = ks['percent_cum_res'][3]

        summary_frames.append(
            pd.DataFrame(
                [list(row.values) + [ks_val, break_dec, ks_decile, capture]],
                columns=list(row.index) + summary_columns))

    # pd.concat refuses an empty list; keep the former empty-frame result.
    summary_df = (pd.concat(summary_frames)
                  if summary_frames else pd.DataFrame())
    summary_df.to_csv(base_path + 'results/summary_df_params_xgb_' +
                      version + '.csv',
                      index=False)
    logging.debug(
        "saveTrainingsDev module of XgbSelection Class executed successfully. summary is :{} "
        .format(summary_df))
    logging.debug(" dictionary is :{} ".format(self.dictionary))
# ada = CustomizedAdaBoostClassifier(n_estimators=100)
# ada.fit(X, y)
# result0_tmp = ada.predict(X_test)

# Fit three classifiers on the same data and keep their class probabilities.
d_tree = DecisionTreeClassifier(max_depth=8)
d_tree.fit(X, y)
result1 = d_tree.predict_proba(X_test)

G = GradientBoostingClassifier(max_depth=6, n_estimators=150)
G.fit(X, y)
result2 = G.predict_proba(X_test)

xg = XGBClassifier(max_depth=8, n_estimators=100)
xg.fit(X, y)
result3 = xg.predict_proba(X_test)

# Sweep the decision threshold in steps of 0.05 and write one prediction
# file per threshold.
# NOTE(review): the threshold is accumulated in floating point, so printed
# values and file names can carry rounding noise.
threshold = 0.1
threshold_dict = {}
while threshold < 0.95:
    print('===========\nthreshold: ', threshold)
    # Label 0 when the first-column probability exceeds the threshold,
    # otherwise label 1.
    result1_tmp = [0 if row[0] > threshold else 1 for row in result1]
    result2_tmp = [0 if row[0] > threshold else 1 for row in result2]
    result3_tmp = [0 if row[0] > threshold else 1 for row in result3]
    final_result_list = [result1_tmp, result2_tmp, result3_tmp]

    train_profit, tpp, opf, ofp, off = customize_acc(
        y_test, ensemble(final_result_list))
    total = tpp + opf + ofp + off
    print(threshold, train_profit, tpp, opf, ofp, off,
          tpp / ofp, (tpp + ofp) / total)

    final_df = pandas.DataFrame({'predict_y': ensemble(final_result_list)})
    final_df.to_csv(str(threshold) + '_jan_pred_result.csv', index=None)
    threshold = threshold + 0.05
X=np.hstack([train[good+goodx].as_matrix(),train1.as_matrix()]) Xt=np.hstack([test[good+goodx].as_matrix(),test1.as_matrix()]) from sklearn.feature_extraction import DictVectorizer vec = DictVectorizer() names_categorical = [] cand=['v40','v63','v109'] for name in train.columns.values : if train[name].value_counts().shape[0]<1000 or name in cand:# and name not in good: train[name] = map(str, train[name]) test[name] = map(str, test[name]) names_categorical.append(name) print name,train[name].value_counts().shape[0] X_sparse = vec.fit_transform(train[names_categorical].T.to_dict().values()) Xt_sparse = vec.transform(test[names_categorical].T.to_dict().values()) idx=np.array(train.index) del train gc.collect() X=sparse.hstack([X,X_sparse],format='csr')#.toarray() Xt=sparse.hstack([Xt,Xt_sparse],format='csr') print X.shape,y.shape,Xt.shape clf=XGBClassifier(max_depth=11,colsample_bytree=0.5,learning_rate=0.01,n_estimators=1200,nthread=-1) clf.fit(X,y) idx=np.array(test.index)#id_test yp=clf.predict_proba(Xt).T[1] s=pd.DataFrame({idname:idx,'PredictedProb':yp}) s.to_csv('xgb10.csv',index=False)
def pdpVarReduction(self, devset, valsets, valnames, feature_names, params):
    """Iteratively drop features until ``generatePDP`` removes none.

    Starting from ``feature_names``, an ``XGBClassifier`` is fitted on
    ``devset`` with the given ``params``; the model and its feature
    importances are saved, and KS statistics are computed with
    ``self.ksTable`` on the development set and on every validation set.
    ``self.generatePDP`` then returns the features to keep.  The loop repeats
    with the reduced list until an iteration removes no feature.  One summary
    row per iteration is written to
    ``results/summary_df_nonflat_pdp_xgb_<version>_<identifier>.csv``.

    Args:
        devset: development dataframe (features and target column).
        valsets: iterable of validation dataframes.
        valnames: names matching ``valsets``; used as column/file prefixes.
        feature_names: initial list of feature column names.
        params: mapping holding the XGBoost hyper-parameters read below.

    Returns:
        The list of features kept after the last iteration.
    """

    def ks_summary(ks):
        # Condense a KS table into (max KS, first break position or NaN,
        # 1-based index label of the max KS, 'percent_cum_res' at label 3).
        breaks = np.diff(ks['No.Res']) > 0
        if breaks.any():
            break_dec = int(np.flatnonzero(breaks)[0]) + 2
        else:
            break_dec = np.nan
        return ks.KS.max(), break_dec, ks.KS.idxmax() + 1, \
            ks['percent_cum_res'][3]

    logging.debug("inside pdpVarReduction Module of XgbSelection Class .")
    print('Reducing variable using partial dependency plots ..')
    y = self.dictionary['target'][0]
    version = self.version
    base_path = self.dictionary['path'] + '/'
    num_flat = len(feature_names)
    nonflat = feature_names
    # ``DataFrame.append`` was removed in pandas 2.0: collect the per-
    # iteration frames and concatenate them once at the end.
    summary_frames = []
    dct = collections.OrderedDict(params)
    identifier = '_'.join(key + str(value) for key, value in dct.items())
    X_train = devset

    while num_flat > 0:
        # Tag shared by every artefact written in this iteration.
        tag = version + '_' + identifier + '_' + str(len(nonflat))
        curr_X = X_train[nonflat]
        target = X_train[y]
        #model = XGBClassifier(seed=10, **params, nthread=10)
        model = XGBClassifier(seed=10,
                              learning_rate=params['learning_rate'],
                              colsample_bytree=params['colsample_bytree'],
                              n_estimators=params['n_estimators'],
                              subsample=params['subsample'],
                              max_depth=params['max_depth'],
                              gamma=params['gamma'],
                              min_child_weight=params['min_child_weight'],
                              nthread=10)
        model.fit(curr_X, target)
        joblib.dump(model,
                    base_path + 'saved_objects/xgb_nonflat_pdp_' + tag +
                    '.joblib',
                    compress=1)

        feature_imp = pd.DataFrame({
            'feature_names': nonflat,
            'importance': model.feature_importances_
        })
        feature_imp.to_csv(base_path +
                           'results/feature_importance_nonflat_pdp_' + tag +
                           '.csv',
                           index=False)

        # Development-set statistics (same data the model was fitted on).
        score = model.predict_proba(curr_X)
        ks = self.ksTable(score[:, 1], target,
                          'dev' + '_xgb_nonflat_pdp_' + tag)
        ks_val, break_dec, ks_decile, capture = ks_summary(ks)
        summary_df = pd.DataFrame(
            [[len(nonflat), ks_val, break_dec, ks_decile, capture]],
            columns=[
                'feature_count', 'dev_ks', 'dev_ro_break', 'dev_ks_decile',
                'dev_capture'
            ])

        # Same statistics for every validation set, added as new columns.
        for X_test, dset in zip(valsets, valnames):
            score = model.predict_proba(X_test[nonflat])
            ks = self.ksTable(score[:, 1], X_test[y],
                              dset + '_xgb_nonflat_pdp_' + tag)
            ks_val, break_dec, ks_decile, capture = ks_summary(ks)
            summary_df[dset + '_ks'] = ks_val
            summary_df[dset + '_ro_break'] = break_dec
            summary_df[dset + '_ks_decile'] = ks_decile
            summary_df[dset + '_capture'] = capture
            # Relative KS drop from dev to this validation set, in percent.
            summary_df['dev_' + dset + '_ks_diff'] = (
                summary_df['dev_ks'] -
                summary_df[dset + '_ks']) * 100 / summary_df['dev_ks']

        summary_frames.append(summary_df)

        nonflat_prev = nonflat
        # The directory that is created is now the one handed to
        # generatePDP.  Previously the base path was joined onto itself,
        # which pointed somewhere else whenever the path was relative.
        pdp_dir = base_path + 'PDP/' + tag
        if not os.path.exists(pdp_dir):
            os.makedirs(pdp_dir)
        nonflat = self.generatePDP(model, X_train, nonflat, pdp_dir)
        # Stop once an iteration removes no feature.
        num_flat = len(set(nonflat_prev) - set(nonflat))

    # pd.concat refuses an empty list; keep the former empty-frame result.
    summary_df_pdp = (pd.concat(summary_frames)
                      if summary_frames else pd.DataFrame())
    summary_df_pdp.to_csv(base_path +
                          'results/summary_df_nonflat_pdp_xgb_' + version +
                          '_' + identifier + '.csv',
                          index=False)
    logging.debug(
        "pdpvarreduction Module of XgbSelection Class executed successfully."
    )
    return nonflat
loo = LeaveOneOut() y_pred_list = [] auc = [] auc_train = [] for train_index, test_index in loo.split(X): train_index = list(train_index) # print("%s %s" % (train_index, test_index)) X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y[train_index], y[test_index] model = XGBClassifier(max_depth=3, n_estimators=250, learning_rate=15 / 100, #objective='multi:softmax', objective='binary:logistic', scale_pos_weight=(np.sum(y_train == -1) / np.sum(y_train == 1)), reg_lambda=250) model.fit(X_train, y_train) pred_train = model.predict_proba(X_train)[:, 1] auc_train.append(metrics.roc_auc_score(y_train, pred_train)) y_pred = model.predict_proba(X_test)[:, 1] y_pred_list.append(y_pred[0]) try: auc = metrics.roc_auc_score(y, y_pred_list) except: pass scores = round(auc, 2) scores_train = round(np.array(auc_train).mean(), 2) train_accuracy.append(scores_train) test_accuracy.append(round(scores.mean(), 2)) train_accuracy_all = [] test_accuracy_all = [] def pca_graph(max_num_of_pcas = max_num_of_pcas):
#import matplotlib.pyplot as plt
#
## summarize history for accuracy
##plt.plot(model.eval_metric['auc'])
#plt.plot(eval_metric['error'])
#plt.title('Error')
#plt.ylabel('error')
#plt.xlabel('n_trees')
#plt.legend(['train', 'test'], loc='upper left')
#
#plt.show()
##plt.savefig('/home/vljchr004/msc-hpc/feedforward_python/fig/feed_forward_2_history1.png', bbox_inches='tight')
#
#plt.close()

# make predictions for test data
y_pred = model.predict_proba(x_test)
# predict_proba yields one probability column per class, so rounding each row
# does not produce labels.  Take the most probable class instead.
predictions = model.classes_[np.argmax(y_pred, axis=1)]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# save model to file (the handle is closed even if pickling fails)
with open("C:/Users/gerhard/Documents/msc-hpc/xgb/xgb0.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# The raw probabilities and the reference labels are stored alongside.
np.savetxt("C:/Users/gerhard/Documents/msc-hpc/xgb/xgb0_preds.csv",y_pred,delimiter=", ")
np.savetxt("C:/Users/gerhard/Documents/msc-hpc/xgb/xgb0_y_test.csv",y_test,delimiter=", ")
del df['TARGET']
# del df['ID']
id = df_test['ID']
# del df_test['ID']

# Project the training features onto 250 principal components.
pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

# First level: three classifiers fitted on the PCA features.  Their
# probabilities are taken on the same rows they were trained on.
random_forest = RandomForestClassifier(n_estimators=30, max_depth=5, max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)
# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4)
m2_xgb.fit(train_pcaed, target)
m2_xgbed = m2_xgb.predict_proba(train_pcaed)

# The L1 penalty is not supported by the ``lbfgs`` solver that newer
# scikit-learn uses by default; ``liblinear`` (the former default) is pinned
# so the model is the same on old and new versions.
logistic_regression = LogisticRegression(penalty='l1', solver='liblinear')
logistic_regression.fit(train_pcaed, target)
logistic_regressioned = logistic_regression.predict_proba(train_pcaed)

# Second level: logistic regression on the concatenated probabilities.
combined = np.concatenate([forested, m2_xgbed, logistic_regressioned], axis=1)

log_reg = LogisticRegression()
log_reg.fit(combined, target)

# NOTE(review): ``combined`` holds in-sample first-level predictions, so this
# cross-validated AUC is likely optimistic -- confirm before relying on it.
scores = cross_validation.cross_val_score(log_reg, combined, target, cv=5, scoring='roc_auc')
print(scores.mean(), scores)
print('-' * 53)
print('Final Results')
print('XGBOOST: %f' % xgboostBO.res['max']['max_val'])

# Build and Run on the full data set K-fold times for bagging: one model per
# seed, each with its own 90/10 train/validation split and output file.
seeds = [1234, 5434, 87897, 123125, 88888]
for seed_bag in seeds:
    X_train, X_valid, y_train, y_valid = train_test_split(
        train, train_labels, test_size=0.1, random_state=seed_bag)

    # Best hyper-parameters found by the optimiser; the tree depth and the
    # number of estimators are cast to int.
    best_params = xgboostBO.res['max']['max_params']
    clf = XGBClassifier(
        max_depth=int(best_params['max_depth']),
        learning_rate=best_params['learning_rate'],
        n_estimators=int(best_params['n_estimators']),
        gamma=best_params['gamma'],
        min_child_weight=best_params['min_child_weight'],
        max_delta_step=best_params['max_delta_step'],
        subsample=best_params['subsample'],
        colsample_bytree=best_params['colsample_bytree'],
        seed=seed_bag,
        objective="binary:logistic")

    # Early stopping monitors AUC on the held-out 10 %.
    clf.fit(X_train, y_train,
            eval_metric="auc",
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=20)
    print('Prediction Complete')

    # Second-class probabilities, indexed by the test labels.
    preds = clf.predict_proba(test)[:, 1]
    submission = pd.DataFrame(preds, index=test_labels, columns=['target'])
    outfile_seed = '../output/xgb_autotune' + str(seed_bag) + '.csv'
    submission.to_csv(outfile_seed)