def build_prediction_model(path, percentage, para_tuning_mark, last_mark):
    """Train and evaluate an XGBoost classifier on one train/dev/test split.

    Reads ``train_<percentage>``, ``dev_<percentage>`` and
    ``test_<percentage>`` as CSV from ``path`` (with a ``_last`` suffix when
    ``last_mark`` is true). Depending on ``para_tuning_mark`` it either hands
    the data to ``para_tuning_0`` or fits a fixed-parameter model and prints
    AUC, F1 and Cohen's kappa for the test split.

    Args:
        path: Prefix prepended verbatim to the file names (it must already
            end with a path separator if one is needed).
        percentage: Value embedded in the file names via ``str(percentage)``.
        para_tuning_mark: When true, run parameter tuning instead of the
            fixed-parameter model.
        last_mark: When true, read the ``_last`` files; for
            ``percentage == 1.0`` the function returns without doing anything.

    Returns:
        None. Results are printed.

    Raises:
        ZeroDivisionError: If the training labels contain no ``1``.
    """
    # Read data
    suffix = ""
    if last_mark:
        if percentage == 1.0:
            return
        suffix = "_last"
    tag = str(percentage) + suffix
    train = pandas.read_csv(path + "train_" + tag)
    dev = pandas.read_csv(path + "dev_" + tag)
    test = pandas.read_csv(path + "test_" + tag)

    # Check whether there are any columns with all zeros
    nonzero_colums = train.loc[:, (train != 0).any(axis=0)].columns

    # Scale: ratio of negative to positive training labels. Counted with
    # vectorised comparisons because Series.iteritems() no longer exists in
    # pandas 2.x. int() keeps plain-Python division semantics.
    labels = train['label']
    negatives = int((labels == 0).sum())
    positives = int((labels == 1).sum())
    scale_value = negatives / float(positives)

    # Build prediction model
    predictors = [x for x in nonzero_colums if x != 'label']

    if para_tuning_mark:
        # Parameter tuning guide:
        # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
        # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

        # Parameter: learning_rate
        para_tuning_0(train, dev, test, scale_value)
        # para_tuning_1(train, dev, test, scale_value)
        # para_tuning_2(train, dev, test, scale_value)
        # para_tuning_3(train, dev, test, scale_value)
        # para_tuning_4(train, dev, test, scale_value)
    else:
        xgb = XGBClassifier(learning_rate=0.015,
                            n_estimators=686,
                            max_depth=9,
                            min_child_weight=5,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            reg_alpha=0.01,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=scale_value,
                            seed=27)

        # NOTE(review): newer xgboost releases expect eval_metric on the
        # constructor rather than on fit() - confirm against the installed
        # version before upgrading.
        xgb.fit(train[predictors], train['label'], eval_metric='auc')

        # Column 1 of predict_proba is used as the score for label 1.
        dtest_predprob = xgb.predict_proba(test[predictors])[:, 1]
        hard_predictions = dtest_predprob.round()

        print("AUC/F1 Score/Kappa (Test):\t%f\t%f\t%f\t" %
              (metrics.roc_auc_score(test['label'], dtest_predprob),
               metrics.f1_score(test['label'], hard_predictions),
               metrics.cohen_kappa_score(test['label'], hard_predictions)))
def train(ite):
    """Fit the shared ``xgb`` model on a re-sampled set and store test scores.

    Draws 700 rows from the module-level ``train_target_0`` frame, stacks
    them on top of ``train_target_1``, fits the module-level ``xgb`` model
    on the result and writes the column-1 probabilities for ``test_data``
    into ``res[ite]``.

    Args:
        ite: Key or index under which the predictions are stored in ``res``.

    Returns:
        None. ``res`` is updated in place.
    """
    # Local import: DataFrame.append() was removed in pandas 2.0, so the two
    # frames are combined with pandas.concat instead.
    import pandas

    # The original printed an undefined/global ``i``; the iteration id passed
    # in is what identifies this run.
    print(ite)

    # Original note (translated): the data shows 1 : 0 = 17 : 2 (> 0.5)
    sampled_negatives = train_target_0.sample(700)
    data = pandas.concat([sampled_negatives, train_target_1])

    # Split the label column off the features.
    y_ = data.pop('target')

    xgb.fit(data, y_)
    # train_p[ite] = xgb.predict(train_data)
    res[ite] = xgb.predict_proba(test_data)[:, 1]
def trainxgb(model_id, train_x, train_y, valid_x, valid_y, test_x):
    """Train one randomly parameterised XGBoost model and save its output.

    The training data is shuffled, several hyper-parameters are drawn from
    the ``random`` module, and the model is fitted with the validation split
    as its watchlist. Validation and test probabilities are written to CSV
    only when ``test(valid_y, valid_predictions)`` is below 0.450.

    Args:
        model_id: Identifier embedded in the output file names.
        train_x, train_y: Training features and labels.
        valid_x, valid_y: Validation features and labels.
        test_x: Features to predict for the results file.

    Returns:
        None.
    """
    train_x, train_y = shuffle(train_x, train_y)

    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    # The draws below are made in exactly the order in which the keyword
    # arguments used to be evaluated, so a seeded run yields the same values.
    learning_rate = random.uniform(0.01, 0.05)
    max_depth = random.randint(10, 20)
    max_samples = random.uniform(0.0, 1.0)
    max_features = random.uniform(0.0, 1.0)
    max_delta_step = random.randint(1, 10)
    min_child_weight = random.randint(1, 10)
    initial_bias = random.uniform(0.0, 1.0)

    model = XGBoostClassifier(
        base_estimator='gbtree',
        objective='multi:softprob',
        metric='mlogloss',
        num_classes=9,
        learning_rate=learning_rate,
        max_depth=max_depth,
        max_samples=max_samples,
        max_features=max_features,
        max_delta_step=max_delta_step,
        min_child_weight=min_child_weight,
        min_loss_reduction=1,
        l1_weight=0.0,
        l2_weight=0.0,
        l2_on_bias=False,
        gamma=0.02,
        # The keyword is spelled "inital_bias" by the wrapper class.
        inital_bias=initial_bias,
        random_state=random_state,
        watchlist=[[valid_x, valid_y]],
        n_jobs=30,
        n_iter=3000,
    )
    model.fit(train_x, train_y)

    valid_predictions = model.predict_proba(valid_x)
    # Only models scoring below the threshold are kept.
    if not (test(valid_y, valid_predictions) < 0.450):
        return

    test_predictions = model.predict_proba(test_x)
    data.saveData(valid_predictions,
                  "../valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_predictions,
                  "../results/results_" + str(model_id) + ".csv")
def predict(data, slot):
    """Return column-1 probabilities from the XGBoost model saved for ``slot``.

    The model is loaded with ``joblib`` from ``MODELS_D % slot``.

    Earlier experiments blended this score with a KNN model (0.8 / 0.2) and
    with an SGD model (0.9 / 0.1); both blends are disabled, so the XGBoost
    probability is returned unchanged.

    Args:
        data: Feature matrix passed straight to ``predict_proba``.
        slot: Value substituted into the ``MODELS_D`` path template.

    Returns:
        The second column of ``predict_proba(data)``.
    """
    model = joblib.load(MODELS_D % slot)
    return model.predict_proba(data)[:, 1]
def test_xgb(test_tbl, xgb_model, train_list):
    """Score a fitted model on a test table and report AUC and KS.

    The table is run through ``data_preprocess``, reduced to the columns in
    ``train_list`` (in that order) and scored with ``xgb_model``. AUC and the
    ``'ks'`` entry of ``run_ks`` are printed tab-separated, and the KS detail
    is handed to ``print_ks``.

    Args:
        test_tbl: Table identifier passed to ``data_preprocess``; also used
            as the prefix of the KS detail name.
        xgb_model: Fitted classifier exposing ``predict_proba``.
        train_list: Feature names the model was trained on.

    Returns:
        None.
    """
    # Only the features and labels are needed; the other two return values
    # of data_preprocess are ignored.
    df_test_x, df_test_y, _f_list_test, _df_median = data_preprocess(test_tbl)

    # Rebuild the feature frame column by column so the columns follow
    # train_list.
    df_test = pd.DataFrame()
    for e in train_list:
        df_test[e] = df_test_x[e]
    # df_test.fillna(-1, inplace=True)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Read test done')

    test_y = np.array(df_test_y)
    test_x = np.array(df_test)

    # Column 1 of predict_proba is the score used for both metrics.
    positive_proba = xgb_model.predict_proba(test_x)[:, 1]

    ks_dict = run_ks(test_y, positive_proba)
    auc = roc_auc_score(test_y, positive_proba)
    print("%f\t%f" % (auc, ks_dict['ks']))
    print_ks(ks_dict, test_tbl + '_score_ks_detail')
def xgboost_param_solution():
    """Fit the fixed-parameter XGBoost model and write a submission file.

    Loads ``train.csv`` and ``test.csv`` through ``load_data``, label-encodes
    the ``target`` column, fits on every column except ``target`` and ``id``,
    and passes the predicted class probabilities for the test set to
    ``write_submission``.

    Returns:
        None.
    """
    settings = dict(
        alpha=0,
        booster='gbtree',
        colsample_bytree=0.459971793632,
        early_stopping_rounds=30,
        eta=0.0305648288294,
        eval_metric='mlogloss',
        gamma=0.0669039612464,
        l=0,
        lambda_bias=0,
        max_delta_step=4,
        max_depth=14,
        min_child_weight=8,
        nthread=4,
        ntree_limit=0,
        num_class=9,
        num_round=1000,
        objective='multi:softprob',
        seed=84425,
        silent=0,
        subsample=0.972607582489,
        use_buffer=True,
    )
    model = XGBoostClassifier(**settings)

    train = load_data('train.csv')
    test = load_data('test.csv')

    # Replace the class names with integer codes.
    encoder = preprocessing.LabelEncoder()
    train['target'] = encoder.fit_transform(train['target'])

    excluded = ('target', 'id')
    feature_cols = [col for col in train.columns if col not in excluded]

    model.fit(train[feature_cols], train['target'])
    preds = model.predict_proba(test[feature_cols])

    write_submission(test['id'], preds,
                     'submissions/xgboost_param_solution_76.csv')
print ('F1 Score',f1_score(y_val,RF2preds_val))
print ('ROC AUC Score',roc_auc_score(y_train,RF2predprob_train))
print ('ROC AUC Score',roc_auc_score(y_val,RF2predprob_val))

# As we can see, we got an improved score on the tuned datasets, and the
# validation set had a better F1 score than both our first RF and both LRs,
# but our AUC score did go down somewhat. Next we will compare XGBoost with
# these models to see if it does better. First, let's look at the confusion
# matrices and classification reports to help us figure out what we still
# need to improve.
print ('Training Confusion Matrix',confusion_matrix(y_train,RF2preds_train))
print ('Val Confusion Matrix',confusion_matrix(y_val,RF2preds_val))
print ('Training Classification report',classification_report(y_train,RF2preds_train))
print ('Val Classification Report',classification_report(y_val,RF2preds_val))

# Now that we have improved the model with the RF, we can see whether
# XGBoost gets us better numbers and then choose the best model. Let's start
# with the base model.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

# Base predictions for the train and validation sets: the predicted
# probabilities (column 1) used for the AUC score.
xgbpredprob_train = xgb.predict_proba(x_train)[:, 1]
xgbpredprob_val = xgb.predict_proba(x_val)[:, 1]

# The class predictions, used for the F1 score and for recall and precision
# if we want them.
xgbpreds_train = xgb.predict(x_train)
xgbpreds_val = xgb.predict(x_val)

# Let's look at the error to assess the fit. We use the "aucpr" eval metric
# as the F1-related score. We could also use "auc", but we are focusing more
# on F1 for prediction and will get the AUC later. This re-runs the fit and
# evaluates it against the validation set to see what we get without any
# tuning. The predictions above were made before this refit, so the results
# printed next come from the fit without the evaluation step and serve as
# the general baseline.
eval_set = [(x_val, y_val)]
eval_metric = ["aucpr","error"]
%time xgb.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=2)

# We get a good base score with a low error to start; let's see if we can
# improve. (aucpr 0.913701, error 0.125)

# Results from the initial baseline model (without evaluation) that we want
# to improve.
print ('F1 Score',f1_score(y_train,xgbpreds_train))
# initial the model xgb = xgb.XGBClassifier(parameters=xgb_parameters) """## Training and validation""" # split validation set X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=['user_id','product_id','reordered']), train['reordered'], test_size=0.3, random_state=42) # fit the model xgb.fit(X_train, y_train) # make prediction y_pred = (xgb.predict_proba(X_val)[:, 1] >= 0.21).astype('int') #setting a threshold !pip install scikit-plot # evaluation from sklearn.metrics import f1_score, classification_report from scikitplot.metrics import plot_confusion_matrix from scikitplot.classifiers import plot_feature_importances print('F1 Score: {}'.format(f1_score(y_pred, y_val))) print(classification_report(y_pred, y_val)) # plot confusion matrix plot_confusion_matrix(y_pred, y_val) # plot importance features = train.drop(columns=['user_id','product_id','reordered']) plot_feature_importances(xgb, feature_names=features.columns, x_tick_rotation=90, max_num_features=20, figsize=(10,8))
def alation_test(path):
    """Run a feature-ablation study with the fixed-parameter XGBoost model.

    Reads ``train_1.0`` and ``test_1.0`` as CSV from ``path``. For each
    feature group in turn, a model is trained on every non-all-zero column
    except that group (and ``label``), and AUC, F1 and Cohen's kappa on the
    test split are printed. The groups are the eight hand-listed
    non-linguistic groups, followed by three slices of the remaining columns.

    Args:
        path: Prefix prepended verbatim to the file names.

    Returns:
        None. Results are printed.

    Raises:
        ZeroDivisionError: If the training labels contain no ``1``.
    """
    # Read data
    train = pandas.read_csv(path + "train_1.0")
    test = pandas.read_csv(path + "test_1.0")

    # Check whether there are any columns with all zeros
    nonzero_colums = train.loc[:, (train != 0).any(axis=0)].columns

    # Scale: ratio of negative to positive training labels. Vectorised
    # because Series.iteritems() no longer exists in pandas 2.x.
    labels = train['label']
    negatives = int((labels == 0).sum())
    positives = int((labels == 1).sum())
    scale_value = negatives / float(positives)

    # Build prediction model
    non_linguistic_features = [
        'duration', 'utterance_tutor', 'utterance_student', 'words_tutor',
        'words_student', 'unique_words_tutor', 'unique_words_student',
        'unique_concepts_tutor', 'unique_concepts_student', 'new_words_tutor',
        'new_words_student', 'new_concepts_tutor', 'new_concepts_student',
        'wait_time', 'responsiveness_mean', 'alignment_all',
        'alignment_concept', 'complexity_tutor', 'complexity_student',
        'questions_tutor', 'questions_student', 'sentiment_tutor',
        'sentiment_student', 'tutor_experience', 'student_experience'
    ]

    features_groups = [
        ['duration', 'utterance_tutor', 'utterance_student', 'words_tutor',
         'words_student'],
        ['unique_words_tutor', 'unique_words_student',
         'unique_concepts_tutor', 'unique_concepts_student',
         'new_words_tutor', 'new_words_student', 'new_concepts_tutor',
         'new_concepts_student'],
        ['wait_time', 'responsiveness_mean'],
        ['alignment_all', 'alignment_concept'],
        ['complexity_tutor', 'complexity_student'],
        ['questions_tutor', 'questions_student'],
        ['sentiment_tutor', 'sentiment_student'],
        ['tutor_experience', 'student_experience'],
    ]

    # Columns outside the hand-listed features. Computed once because it
    # does not change between iterations.
    remaining_columns = [
        x for x in train.columns if x not in non_linguistic_features
    ]

    # Feature groups
    k = 0  # index of the current 100-column slice of remaining_columns
    for i in range(len(features_groups) + 3):
        # To run only the slices of the remaining columns, skip the
        # hand-listed groups here:
        # if i < len(features_groups):
        #     continue

        if i < len(features_groups):
            print(features_groups[i])
            # Build a new list so features_groups itself is left untouched.
            features_group = features_groups[i] + ['label']
        else:
            # NOTE(review): the slices start at offset 1, i.e. the first
            # remaining column is never part of a slice - confirm intent.
            if i < len(features_groups) + 2:
                features_group = remaining_columns[100 * k + 1:
                                                   100 * (k + 1) + 1]
            else:
                # Trigrams
                features_group = remaining_columns[100 * k + 1:]
            features_group.append('label')
            k += 1

        # Train and test use the same predictor set.
        predictors = [x for x in nonzero_colums if x not in features_group]

        xgb = XGBClassifier(learning_rate=0.015,
                            n_estimators=686,
                            max_depth=9,
                            min_child_weight=5,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            reg_alpha=0.01,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=scale_value,
                            seed=27)

        xgb.fit(train[predictors], train['label'], eval_metric='auc')

        dtest_predprob = xgb.predict_proba(test[predictors])[:, 1]
        hard_predictions = dtest_predprob.round()

        print("AUC/F1 Score/Kappa (Test):\t%f\t%f\t%f\t" %
              (metrics.roc_auc_score(test['label'], dtest_predprob),
               metrics.f1_score(test['label'], hard_predictions),
               metrics.cohen_kappa_score(test['label'], hard_predictions)))
        print('')
result = bayes_cv_tuner.fit(train_mod_std[selected_features].values,
                            target_mod.values,
                            callback=status_print)

#best_params_mod = {
#
#}

# Unpack the tuned settings as keyword arguments; passing the dict
# positionally does not set the individual hyper-parameters.
# NOTE: this rebinds the name ``xgb`` from the xgboost module to the model.
xgb = xgb.XGBClassifier(**best_params)

print("3.1 model development")
# Out-of-fold scores for the training rows and fold-averaged test scores.
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(
        folds.split(train_mod.values, target_mod.values)):
    print("Fold {}".format(fold_ + 1))

    xgb.fit(train_mod.iloc[trn_idx][selected_features],
            target_mod.iloc[trn_idx])

    # Was ``selectedfeatures`` (missing underscore), a NameError at runtime.
    oof_xgb[val_idx] = xgb.predict_proba(
        train_mod.iloc[val_idx][selected_features])[:, 1]

    # Each fold contributes an equal share to the test prediction.
    predictions_xgb += xgb.predict_proba(
        test_mod[selected_features])[:, 1] / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target_mod, oof_xgb)))

sub_df = pd.DataFrame({"ID_code": test["ID_code"].values})
sub_df["target"] = predictions_xgb
sub_df.to_csv("../result/submission_xgb_mod.csv", index=False)
# One-hot encode every feature named in ohe_feats: the original column is
# dropped and its dummy columns are appended.
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = pd.concat((df_all.drop([f], axis=1), df_all_dummy), axis=1)

#dtrain = xgb.DMatrix(df_all.values, label=labels, missing=np.nan)

# Split the combined matrix back into train and test rows at piv_train.
vals = df_all.values
X, X_test = vals[:piv_train], vals[piv_train:]
le = LabelEncoder()
y = le.fit_transform(labels)

# Classifier
xgb = XGBClassifier(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# For every test id keep the 5 classes with the highest probability, most
# likely first.
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids.extend([idx] * 5)
    cts.extend(
        le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('../out/submission.csv', index=False)
## Find optimal weights for bagging (use both geometric and arithmetic progressions)
## ap1 * [XGBOOST^gp1 * NN^gp2] + ap2 * [ET]

# In[ ]:

X_train,X_test,y_train,y_test=train_test_split(train[predictors].values,train[target].values.ravel(),test_size=0.3)

# In[ ]:

# ``params`` must be unpacked: a bare positional argument after keyword
# arguments is a SyntaxError.
# NOTE(review): assumes XGBoostClassifier takes the booster settings as
# keyword arguments - confirm against the wrapper's signature.
xgb=XGBoostClassifier(num_class=2,num_boost_round=148,**params)
xgb.fit(X_train,y_train)
probs1=xgb.predict_proba(X_test)
probs1=probs1[:,1]
# Hard labels at a 0.4 probability threshold.
XGBOOST=(probs1>0.4).astype('int')

# In[ ]:

nn = KerasClassifier(build_fn=base_model, nb_epoch=25, batch_size=64, verbose=2)
## Tune the model? No, it takes too long.
nn.fit(X_train,y_train)
probs2=nn.predict_proba(X_test)
probs2=probs2[:,1]
# Same 0.4 threshold as for the XGBoost labels.
NN=(probs2>0.4).astype('int')

# In[ ]:
def train_pred(Xtrain, ytrain, Xvalid, isfindSTOP=False):
    """Fit an XGBoost classifier and return column-1 probabilities for ``Xvalid``.

    With ``isfindSTOP`` false the model is fitted on all of
    ``Xtrain``/``ytrain``. With ``isfindSTOP`` true, 10% of the training data
    is held out (stratified on the labels), the model is fitted on the rest
    with early stopping after 50 rounds using ``gini_xgb_min``, and the
    normalised Gini of the held-out and training parts is printed.

    Args:
        Xtrain: Training features.
        ytrain: Training labels.
        Xvalid: Features to predict.
        isfindSTOP: Whether to run the early-stopping variant.

    Returns:
        The second column of ``predict_proba(Xvalid)``.
    """
    from sklearn.model_selection import train_test_split

    print('%%%%%%%%%%%%%%%%%%%% Start train model. %%%%%%%%%%%%%%%%%%%%')

    # Fixed parameters. The tree count is 1000 in both modes; per the
    # original note it was previously tested using early stopping.
    n_trees = 1000
    model_settings = {
        'objective': 'binary:logistic',
        # A fresh seed is drawn for every call.
        'seed': np.random.randint(0, 1000000),
        'learning_rate': 0.07,
        'n_estimators': n_trees,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'max_depth': 7,
        'min_child_weight': 10,
        'gamma': 20,
        'reg_alpha': 0.0,
        'reg_lambda': 2.0,
        'scale_pos_weight': 1.3,
    }

    # Define XGBoost classifier
    xgb = XGBClassifier(**model_settings)

    if not isfindSTOP:
        # Fit the classifier instance on the full training data
        xgb.fit(Xtrain, ytrain)
    else:
        # Use a stratified split due to the very imbalanced label classes
        X_train, X_val, y_train, y_val = train_test_split(
            Xtrain, ytrain, test_size=0.1, stratify=ytrain)

        # Fit on the reduced training part, monitoring both parts
        xgb.fit(X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                early_stopping_rounds=50,
                eval_metric=gini_xgb_min)

        # Report the Gini coefficient on both parts
        gini_train = gini_normalized(y_train,
                                     xgb.predict_proba(X_train)[:, 1])
        gini_val = gini_normalized(y_val, xgb.predict_proba(X_val)[:, 1])
        print("Val, Train Gini coef : %.5f %.5f" % (gini_val, gini_train))

    # Predict the requested set
    return xgb.predict_proba(Xvalid)[:, 1]
# 导入第三方包 import xgboost import numpy as np # 构建XGBoost分类器 xgboost = xgboost.XGBClassifier() # 使用重抽样后的数据,对其建模 xgboost.fit(over_samples_X, over_samples_y) # 将模型运用到测试数据集中 resample_pred = xgboost.predict(np.array(X_test)) # 返回模型的预测效果 print('模型的准确率为:\n', metrics.accuracy_score(y_test, resample_pred)) print('模型的评估报告:\n', metrics.classification_report(y_test, resample_pred)) # 计算欺诈交易的概率值,用于生成ROC曲线的数据 y_score = xgboost.predict_proba(np.array(X_test))[:, 1] fpr, tpr, threshold = metrics.roc_curve(y_test, y_score) # 计算AUC的值 roc_auc = metrics.auc(fpr, tpr) # 绘制面积图 plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black') # 添加边际线 plt.plot(fpr, tpr, color='black', lw=1) # 添加对角线 plt.plot([0, 1], [0, 1], color='red', linestyle='--') # 添加文本信息 plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc) # 添加x轴与y轴标签 plt.xlabel('1-Specificity') plt.ylabel('Sensitivity')
import xgboost as xgb
# Fit a default XGBoost classifier.
# NOTE: this rebinds the name ``xgb`` from the module alias to the model.
xgb=xgb.XGBClassifier()
xgb.fit(X_train, y_train)


# In[73]:

# Bar chart of the fitted model's feature importances, one bar per feature
# position.
plt.bar(range(len(xgb.feature_importances_)), xgb.feature_importances_)
plt.show()


# In[74]:

# Class probabilities for the held-out set, passed to print_metrics with 0.5.
probabilities = xgb.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5)


# In[75]:

# Predict labels for ``test1`` and write them next to the customer ids.
solution=xgb.predict(test1)
my_submission=pd.DataFrame({'CustomerID':test.CustomerID,'BikeBuyer': solution})
my_submission.to_csv('XgboostClassifierMicrosoft01.csv', index=False)


# In[76]:

from sklearn.neural_network import MLPClassifier
n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, subsample=0.9, verbosity=1) X_train_, X_valid = X.iloc[train_index], X.iloc[valid_index] y_train_, y_valid = y.iloc[train_index], y.iloc[valid_index] xgb.fit(X_train_, y_train_) del X_train_, y_train_ pred = xgb.predict_proba(test_X)[:, 1] val = xgb.predict_proba(X_valid)[:, 1] del xgb, X_valid print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val))) del val, y_valid xgb_sub['isFraud'] = xgb_sub['isFraud'] + pred / n_fold del pred gc.collect() xgb_sub.to_csv('sub_xgb.csv', index=False) # ### ensemble # In[63]: if PREDICT:
# random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,Y_train), verbose=3, random_state=27) # start_time = timer(None) # timing starts from this point for "start_time" variable # search = random_search.fit(X_train, Y_train) # timer(start_time) # print(search.best_params_) print('Start training XGB') start_time = timer(None) xgb.fit(X_train, Y_train, eval_metric='auc') timer(start_time) print("Start predicting XGB") start_time = timer(None) predictions = xgb.predict(X_test) timer(start_time) pred_proba = xgb.predict_proba(X_test)[:, 1] print('Statistics') print("AUC : %f" % metrics.roc_auc_score(Y_test, pred_proba)) print("F1 Score: %f" % metrics.f1_score(Y_test, predictions)) print('**********************************') clf = LogisticRegression(C=1e5) print('Start training log regression') start_time = timer(None) clf.fit(X_train, Y_train) timer(start_time) print("Score: ", clf.score(X_test, Y_test)) start_time = timer(None) print("Start predicting log regression") y_pred = clf.predict(X_test) timer(start_time)
'n_estimators': 27, 'subsample': 0.45} ''' xgb.best_score_ # 0.83585339132974634 xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=27, objective='multi:softprob', subsample=0.4, colsample_bytree=0.5, seed=0) xgb.fit(X, y) xgb_predictions = xgb.predict_proba(X_test) # Put these in a good form to spit out xgb_predictions = xgb_predictions.ravel() # Have to ensure these are in the same order, yep, looks good classes = np.tile(xgb.classes_, X_test.shape[0]) ids = np.repeat(test["id"].values, 12) print(xgb_predictions.shape) print(classes.shape) print(ids.shape) print(test_users['id'].shape) print(test['id'].shape) # We want to make this a list of most likely occurances
objective='multi:softmax', sub_sample=1, num_class=4, n_gpus=0) # error evaluation for multiclass training xgb.fit(X_train, y_train) # In[ ]: from sklearn.preprocessing import LabelEncoder labels = LabelEncoder() y_train_labels_fit = labels.fit(y_train) y_train_lables_trf = labels.transform(y_train) test_pred = pd.DataFrame(xgb.predict_proba(X_test), columns=labels.classes_) # In[ ]: #test_pred = pd.DataFrame(bst.predict(X_test1), columns=labels.classes_) q = { 'ID': test_data["ID"], 'no_financial_services': test_pred[0], 'other_only': test_pred[1], 'mm_only': test_pred[2], 'mm_plus': test_pred[3] } df_pred1 = pd.DataFrame(data=q) df_pred1 = df_pred1[[ 'ID', 'no_financial_services', 'other_only', 'mm_only', 'mm_plus' ]]
random_state=20, n_jobs=4) lr = LogisticRegression() lr.fit(train_x, train_y) lr_pred = lr.predict_proba(test_x)[:, 1] lgb.fit(train_x, train_y) lgb_pred = lgb.predict_proba(test_x)[:, 1] gbdt.fit(train_x, train_y) gbdt_pred = gbdt.predict_proba(test_x)[:, 1] xgb.fit(train_x, train_y) xgb_pred = xgb.predict_proba(test_x)[:, 1] y_pred = 0.7 * lgb_pred + 0.15 * xgb_pred + 0.15 * gbdt_pred auc = roc_auc_score(test_y, y_pred) print("xgboost+lightgbm+gbdt的加权auc是{}".format(auc)) mine = MINE() mine.compute_score(lr_pred, xgb_pred) print("lr和xgb的mic:{}".format(mine.mic())) """ class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin): def __init__(self, base_models, meta_model, n_folds=5): self.base_models = base_models
ans = svc.predict_proba(xvalid_tfv) print "The log loss using tfidVectorizer for SVM is " + str(multiclass_logloss(yvalid,ans)) svc = MultinomialNB(C=1.0, kernel='rbf', degree=3, probability=True) svc.fit(xtrain_cv, ytrain) ans = svc.predict_proba(xvalid_cv) print "The log loss using CountVectorizer for SVM is " + str(multiclass_logloss(yvalid,ans)) ''' Predicting using xgboost''' xgb = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1) xgb.fit(xtrain_tfv, ytrain) ans = xgb.predict_proba(xvalid_tfv) print "The log loss using tfidVectorizer for xgboost is " + str(multiclass_logloss(yvalid,ans)) xgb.fit(xtrain_cv, ytrain) ans = xgb.predict_proba(xvalid_cv) print "The log loss using CountVectorizer for xgboost is " + str(multiclass_logloss(yvalid,ans)) Grid Search mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True) svd = TruncatedSVD() scl = preprocessing.StandardScaler() lr_model = LogisticRegression()
name = SLOT_FMT % i ans.__setattr__(name, ans.__getattr__(name).astype(float)) feature_frames = [] for i in range(46, 76): data = pd.read_csv('../sess/%03d.csv' % i, index_col='user_id') feature_frames.append(data) features = pd.concat(feature_frames) user = pd.read_csv('../public/user_create_time.csv', index_col='user_id') user2 = user['user_create_time'].str.split('-', 1, expand=True) user2 = user2.astype('int') for index, row in features.iterrows(): user_row = user2.loc[index] features.at[index, 'created'] = (user_row[0] - 2016) * 12 + user_row[1] guess = {} for slot in range(0, 28): xgb = pickle.load(open('../xgb-models/%d' % slot, 'rb')) pred_xgb = xgb.predict_proba(features)[:, 1] # sgd = pickle.load(open('../sgd-models/%d' % slot, 'rb')) # pred_sgd = sgd.predict_proba(features)[:, 1] # proba = 0.9 * pred_xgb + 0.1 * pred_sgd guess[SLOT_FMT % slot] = pred_xgb for i in range(0, len(features)): for slot in range(0, 28): ans.iat[i, slot] = guess[SLOT_FMT % slot][i]
# Input data: switch between the alternative files by (un)commenting.
# data_parent = pd.read_csv('septic_patients_data.csv')
data_parent = pd.read_csv('sample_test_data.csv')
#data_parent = pd.read_csv('results.csv')

# Model inputs, scaled with the previously saved scaler.
datax_temp = data_parent[[
    'tissue_extraction', 'temp_fin', 'ph', 'hb', 'lactate'
]]
scaler = joblib.load("scaler.save")
datax = scaler.transform(datax_temp)

# Load the persisted models. Each file is opened in a ``with`` block so the
# handle is closed straight after loading.
# NOTE: pickle executes code from the file on load - only load model files
# from a trusted source.
with open("xgboost.dat", "rb") as model_file:
    xgb = pickle.load(model_file)
with open("svm.dat", "rb") as model_file:
    svm = pickle.load(model_file)
with open("lr.dat", "rb") as model_file:
    lr = pickle.load(model_file)
with open("randomforest.dat", "rb") as model_file:
    randomforest = pickle.load(model_file)

# Average the column-1 probabilities of the XGBoost and random-forest models.
xgb_pred = pd.DataFrame(xgb.predict_proba(datax)[:, 1])
rf_pred = pd.DataFrame(randomforest.predict_proba(datax)[:, 1])
temp = pd.concat([xgb_pred, rf_pred], axis=1)
temp['avg'] = temp.mean(axis=1)

# Identifiers, unscaled inputs and the averaged score side by side.
combined_df = pd.concat([
    pd.DataFrame(data_parent[['subject_id', 'datetime']]),
    pd.DataFrame(datax_temp),
    temp['avg'],
], axis=1)

# ``f`` assigns the category for each row.
combined_df['patient_category'] = combined_df.apply(f, axis=1)
# critical_patients = combined_df.loc[combined_df['patient_category'].isin(['very-critical', 'critical', 'moderate-critical'])]
combined_df.to_csv('critical_patients_records.csv')
# DMatrix versions of the sample split. These two lines must run before
# ``xgb`` is rebound to the classifier below.
T_train_sample_xgb = xgb.DMatrix(X_train_sample, Y_train_sample)
X_test_sample_xgb = xgb.DMatrix(X_test_sample)

# NOTE: from here on ``xgb`` is the classifier, no longer the xgboost module.
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=200, objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
#scores: XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=50
#0.8183974444336767 121 rounds

# Map the sample test labels to their numeric codes.
Y_test_sample = test_sample["country_destination"]
Y_test_sample = Y_test_sample.map(country_num_dic)

# Null counts per column; the result is not stored.
X_train_sample.isnull().sum()

# Monitor both splits and stop after 10 rounds without improvement.
eval_set = [(X_train_sample, Y_train_sample), (X_test_sample, Y_test_sample)]
xgb.fit(X_train_sample, Y_train_sample, eval_set = eval_set, eval_metric = 'mlogloss', early_stopping_rounds= 10)
Y_pred_sample = xgb.predict_proba(X_test_sample)

# Numeric labels and ids for the sample and full sets.
y_le_train_sample = (train_sample['country_destination'].map(country_num_dic)).values
y_le_test_sample = (test_sample['country_destination'].map(country_num_dic)).values
y_le_train = (train['country_destination'].map(country_num_dic)).values
id_train = train['id'].values
id_train_sample = train_sample['id'].values
id_test_sample = test_sample['id'].values
id_test = test['id'].values

#------------- TRAIN SAMPLE PREDICTION --------------------------
ids = [] #list of ids
cts = [] #list of countries
def forest_model(test=True, grid_cv=False, save_final_results=True):
    '''
    Execute the final model.

    Encodes ``train_full``/``target_full`` and splits them into train and
    test parts. It can then grid-search XGBoost parameters on the full data,
    fit the final model on the training part, log its metrics on the test
    part, and predict class probabilities for ``final_X_test``.

    The train/test split, the best grid-search parameters (``GS_CV``), the
    final predictions (``f_pred``) and the per-class ``accuracies`` are
    stored in module-level globals.

    Args:
        test: Log evaluation metrics on the held-out part (only used when
            ``save_final_results`` is true).
        grid_cv: Run the grid search and store its best parameters in
            ``GS_CV``.
        save_final_results: Fit the final model and compute ``f_pred``.

    Returns:
        None.
    '''
    global train_full, target_full
    global X_train, X_test, Y_train, Y_test
    global final_X_test, GS_CV, f_pred, accuracies

    # logging.warn() was removed in Python 3.13; logging.warning() is the
    # supported spelling on every version.
    logging.warning('Create boosted trees model with training data')

    ## Encode categories ##
    le = LabelEncoder()
    lb = LabelBinarizer()
    cat_full = le.fit_transform(np.array(target_full).ravel())

    mcl = MultiColumnLabelEncoder()
    ohe = OneHotEncoder()
    im = Imputer(strategy='most_frequent')
    im2 = Imputer(strategy='mean')
    # Categorical columns: label-encode, impute the most frequent value,
    # then one-hot encode.
    p = Pipeline([('mcl', mcl), ('im', im), ('ohe', ohe)])

    ## full dataset ##
    X = np.concatenate(
        (p.fit_transform(train_full[CAT_COLS]).todense(),
         im2.fit_transform(np.array(train_full[NUM_COLS]))),
        axis=1)
    Y = cat_full

    ## Set up X,Y data for modeling ##
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        X, Y, test_size=TEST_SIZE, random_state=0)

    if grid_cv:
        ## Run grid search to find optimal parameters ##
        params_grid = {
            'max_depth': [15, 20, 25],
            'subsample': [0.25, 0.5],
            # 'colsample_bytree':[ 0.25, 0.5, 0.75 ] ,
        }
        logging.warning(
            'Running grid search CV with params: {}'.format(params_grid))

        ndcg = make_scorer(ndcg_score, needs_proba=True, k=5)
        xgb = XGBClassifier(n_estimators=50,
                            objective='multi:softprob',
                            seed=0)
        cv = GridSearchCV(xgb, params_grid, scoring=ndcg).fit(X, Y)
        logging.warning('Best XGB params: {}'.format(cv.best_params_))
        GS_CV = cv.best_params_

    ## Run model with all data and save ##
    if save_final_results:
        # Write results to a csv file
        # NOTE: sorting is not done here
        logging.warning('Make predictions for final test set')

        # NOTE(review): when grid_cv is false, GS_CV must already exist as a
        # module-level global, otherwise this raises NameError.
        xgb = XGBClassifier(learning_rate=0.1,
                            n_estimators=500,
                            objective='multi:softprob',
                            seed=0,
                            **GS_CV)
        xgb.fit(X_train, Y_train)

        if test:
            logging.warning('Test prediction accuracy')
            p_pred = xgb.predict(X_test)
            p_pred_p = xgb.predict_proba(X_test)
            # Binarised test labels for the ranking metrics.
            cat_tst_lb = lb.fit_transform(Y_test)

            logging.warning('Accuracy: ' + str(np.mean(p_pred == Y_test)))
            # classification_report takes (y_true, y_pred); with the
            # arguments swapped, precision and recall trade places.
            logging.warning('\n' + classification_report(Y_test, p_pred))
            logging.warning('Log Loss: {}'.format(log_loss(Y_test, p_pred_p)))
            logging.warning('Label Ranking Precision score: {}'.format(
                label_ranking_average_precision_score(cat_tst_lb, p_pred_p)))
            logging.warning('Label Ranking loss: {}'.format(
                label_ranking_loss(cat_tst_lb, p_pred_p)))
            logging.warning('NDCG score: {}'.format(
                ndcg_score(cat_tst_lb, p_pred_p, k=5)))

            # Per class: share of the predictions of that class that were
            # correct.
            # NOTE(review): the array is sized by the number of distinct
            # test labels but indexed by label value, so it assumes those
            # labels are 0..n-1 without gaps - confirm.
            categories = set(Y_test)
            accuracies = np.zeros(len(categories))
            for c in categories:
                predicted_c = p_pred == c
                accuracies[c] = np.sum(
                    p_pred[predicted_c] == Y_test[predicted_c]) * 1.0
                accuracies[c] /= p_pred[predicted_c].shape[0]

        # Encode the final test set with the already fitted transformers and
        # predict its class probabilities.
        X = np.concatenate(
            (p.transform(final_X_test[CAT_COLS]).todense(),
             im2.transform(np.array(final_X_test[NUM_COLS]))),
            axis=1)
        f_pred = xgb.predict_proba(X)
{'colsample_bytree': 0.5, 'gamma': 0.15, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 6, 'n_estimators': 27, 'subsample': 0.45} ''' xgb.best_score_ # 0.83585339132974634 xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=27, objective='multi:softprob', subsample=0.4, colsample_bytree=0.5, seed=0) xgb.fit(X, y) xgb_predictions = xgb.predict_proba(X_test) # Put these in a good form to spit out xgb_predictions = xgb_predictions.ravel() # Have to ensure these are in the same order, yep, looks good classes = np.tile(xgb.classes_, X_test.shape[0]) ids = np.repeat(test["id"].values, 12) print(xgb_predictions.shape) print(classes.shape) print(ids.shape) print(test_users['id'].shape) print(test['id'].shape) # We want to make this a list of most likely occurances
y = le.fit_transform(labels)
X_test = vals[piv_train:]
print("PRIV", piv_train)
print()

# Classifier
xgb = XGBClassifier(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
print("X", X.shape, " Y", y.shape)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# For every test id keep the 5 classes with the highest probability, most
# likely first.
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids.extend([idx] * 5)
    cts.extend(
        le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('../output/sub.csv', index=False)
#myfunc()