print('RandomForest on SMOTED Training Data:')

# The timer is started before the slicing, so the reported time covers both
# the train/test slicing of X_all and the SMOTE resampling.
tic = time()

# The first len(y_train) rows of X_all are used as the training part and the
# remaining rows as the test part.
n_train_rows = y_train.shape[0]
X_train = X_all[:n_train_rows]
X_test = X_all[n_train_rows:]

# NOTE(review): newer imbalanced-learn releases name this method
# fit_resample -- confirm against the installed version before changing it.
oversampler = SMOTE()
X_resampled, y_resampled = oversampler.fit_sample(X_train, y_train)
toc = time()
print('SMOTE time = ', toc - tic)

## create temporary train and test sets to examine the goodness of fits
# NOTE(review): the split is taken from the already-resampled data, so the
# held-out half also contains SMOTE output -- confirm this is intended.
X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.5,
    random_state=42,
)

# train and predict
rf_outputs = compute_feature_importances_RF(X_train_tmp, y_train_tmp, X_test_tmp)
importances, std, y_pred_tmp, exec_time = rf_outputs

# test
confusion_matrix = calculate_confusion_matrix(
    np.ravel(y_test_tmp), np.ravel(y_pred_tmp))
true_neg = confusion_matrix[0, 0]
false_pos = confusion_matrix[0, 1]
false_neg = confusion_matrix[1, 0]
true_pos = confusion_matrix[1, 1]
# Figures noted by the original author:
# TP = 573312, FP = 52579, FN = 197, TN = 520939
print(confusion_matrix)
print(classification_report(y_test_tmp, y_pred_tmp))

# select features
# Author's note: moving the threshold from 0.9 to 0.95 selected the same
# 68 features.
top_features = filter_for_top_features(
    X_train.columns, importances, threshold_imps=0.95)

## Try ADASYN and compare the top_features sets
#tic = time()
#X_resampled2, y_resampled2 = ADASYN().fit_sample(X_train, y_train)
#toc = time()
subsample=0.25, colsample_bytree=0.2, max_delta_step=0, gamma=3, objective='mlogloss', reg_alpha=0.5, missing=np.nan) # fitting model # gbm.fit(x_train, y_train, eval_set=[(x_train,y_train),(x_test, y_test)], eval_metric='mlogloss', verbose=True) # print gbm.evals_result() gbm.fit(x_train, y_train) # training performance y_pred = gbm.predict(x_train) conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train) # test y_pred = gbm.predict(x_test) conf, roc_auc = calculate_confusion_matrix(y_pred, y_test) labels.append(np.unique(y_test)) confs.append(conf) aucs.append(roc_auc) print np.unique(y_test) print roc_auc_train, np.nanmean(roc_auc_train) print roc_auc, np.nanmean(roc_auc) # saving the results if save_results:
# conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

# test (layer 1)
y_pred1 = gbm1.predict(x_test1)

# test (layer 2)
# The layer-2 model is fed the layer-1 features plus one 'loc <k>' column per
# entry of location_top, filled by one_hot_encoder from the layer-1 prediction.
# The encoded rows are collected first and the frame is built once: growing a
# DataFrame row by row through .loc re-allocates on every assignment, and the
# category array does not change between rows.
loc_columns = ['loc {}'.format(j) for j in range(len(location_top))]
location_codes = np.array(location_top)
encoded_rows = [one_hot_encoder(y_pred1[j], location_codes)
                for j in range(x_test1.shape[0])]
# reshape keeps the (rows, columns) layout even when there are no test rows.
# NOTE(review): assumes one_hot_encoder returns len(location_top) values per
# call -- confirm against its definition.
y_pred1_code = pd.DataFrame(
    np.array(encoded_rows).reshape(len(encoded_rows), len(loc_columns)),
    columns=loc_columns)
# NOTE(review): concat(axis=1) aligns on the index; y_pred1_code is indexed
# 0..n-1, so x_test1 is presumably reset to a default index upstream -- confirm.
x_test2 = pd.concat([x_test1, y_pred1_code], axis=1)
y_pred = gbm2.predict(x_test2)

# test performance
conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)

# foursquare performance
#conf_fsq, roc_auc_fsq = calculate_confusion_matrix(state_fsq_all[i], y_test)

labels.append(np.unique(y_test))
confs.append(conf)
aucs.append(roc_auc)
#confs_fsq.append(conf_fsq)
#aucs_fsq.append(roc_auc_fsq)

# print 'train'
# print np.unique(y_train)
# #print conf
# print np.nanmean(roc_auc_train)
# Repeatedly draw a subset of the entries of target_all (without replacement),
# score the 'fsq' column against the 'location' column on that subset, and
# collect the confusion matrix, AUC and observed labels of every round.
labels = []
inds = np.arange(0, len(target_all), 1)

# np.random.choice requires an integer size; np.floor alone returns a float,
# which current NumPy rejects. The value does not change between rounds.
n_subsample = int(np.floor(inds.size * split))

for i in range(n_boot):
    print("------------------")
    print(i)

    # ind_boot = np.random.choice(inds, size=inds.size, replace=True)
    ind_boot = np.random.choice(inds, size=n_subsample, replace=False)

    y_report = pd.concat([target_all[j]["location"] for j in ind_boot], axis=0)
    y_fsq = pd.concat([target_all[j]["fsq"] for j in ind_boot], axis=0)

    # foursquare performance
    conf, roc_auc = calculate_confusion_matrix(y_fsq, y_report)

    labels.append(np.unique(y_report))
    confs.append(conf)
    aucs.append(roc_auc)

    print(np.unique(y_report))
    # '%s %s' reproduces the output of the two-value print statement and is
    # valid on both Python 2 and Python 3.
    print('%s %s' % (roc_auc, np.nanmean(roc_auc)))

# saving the results
if save_results:
    # Pickle data is binary, so the file is opened in 'wb'. The with-block
    # closes the file, so no explicit close() is needed.
    with open("auc_location_new_10fold_fsq2.dat", "wb") as f:
        pickle.dump([aucs, confs, labels], f)
else: feature_new = np.append(feature_new, [feature[i,:]], axis=0) state_new = np.append(state_new, state[i]) state = state_new feature = feature_new #creating train and test sets x_train, y_train, x_test, y_test = split_balanced(feature, state, 0.5) #train gbm = xgboost.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x_train, y_train) #test predictions = gbm.predict(x_test) conf, roc_auc = calculate_confusion_matrix(predictions, y_test) print np.unique(state) print roc_auc print '------------------' labels.append(np.unique(np.append(y_test, predictions))) confs.append(conf) aucs.append(roc_auc) # saving the results with open('accuracy_personal.dat','w') as f: pickle.dump([aucs, confs, labels], f) f.close() # In[35]:
# remove foursquare data
# The same ten columns are dropped from both sets, then each index is rebuilt
# as 0..n-1.
fsq_columns = [
    'fsq 0', 'fsq 1', 'fsq 2', 'fsq 3', 'fsq 4',
    'fsq 5', 'fsq 6', 'fsq 7', 'fsq 8', 'fsq distance',
]
x_train = x_train.drop(fsq_columns, axis=1).reset_index(drop=True)
x_test = x_test.drop(fsq_columns, axis=1).reset_index(drop=True)

# train (layer 1)
#eta_array = np.array([0.05]*200+[0.02]*200+[0.01]*200)
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
    nthread=12,
    subsample=0.1,
    colsample_bytree=0.5,
    max_delta_step=0,
    gamma=2,
    objective='multi:softmax',
    reg_alpha=0.9,
    missing=np.nan,
)
# gbm.fit(x_train, y_train, eval_set=[(x_train,y_train),(x_test, y_test)], eval_metric='mlogloss', verbose=True)
# print gbm.evals_result()
gbm.fit(x_train, y_train)

# training performance
y_pred = gbm.predict(x_train)
conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

# test
y_pred = gbm.predict(x_test)
conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)

# foursquare performance
#conf_fsq, roc_auc_fsq = calculate_confusion_matrix(state_fsq_all[i], y_test)

# Record this round's observed test labels, confusion matrix and AUC.
labels.append(np.unique(y_test))
confs.append(conf)
aucs.append(roc_auc)

# Report: test labels, then train AUC with its NaN-ignoring mean, then the
# same for the test AUC. '%s %s' prints the two values separated by a space.
print(np.unique(y_test))
print('%s %s' % (roc_auc_train, np.nanmean(roc_auc_train)))
print('%s %s' % (roc_auc, np.nanmean(roc_auc)))
# x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)
# x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)

# train (layer 1)
#eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
# The classifier is configured first and fitted in a separate statement.
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=100,
    learning_rate=0.01,
    nthread=12,
    subsample=1,
    max_delta_step=0,
)
gbm.fit(x_train, y_train)

# train performance
# y_pred = gbm.predict(x_train)
# conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

# test (layer 1)
y_pred = gbm.predict(x_test)

# test performance
conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)

# foursquare performance
#conf_fsq, roc_auc_fsq = calculate_confusion_matrix(state_fsq_all[i], y_test)

# Record this round's observed test labels, confusion matrix and AUC.
labels.append(np.unique(y_test))
confs.append(conf)
aucs.append(roc_auc)
#confs_fsq.append(conf_fsq)
#aucs_fsq.append(roc_auc_fsq)

# print 'train'
# print np.unique(y_train)
# #print conf
# print np.nanmean(roc_auc_train)
state_new = np.append(state_new, state[i]) state = state_new feature = feature_new #creating train and test sets x_train, y_train, x_test, y_test = split_balanced(feature, state, 0.5) #train gbm = xgboost.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x_train, y_train) #test predictions = gbm.predict(x_test) conf, roc_auc = calculate_confusion_matrix(predictions, y_test) print np.unique(state) print roc_auc print '------------------' labels.append(np.unique(np.append(y_test, predictions))) confs.append(conf) aucs.append(roc_auc) # saving the results with open('accuracy_personal.dat', 'w') as f: pickle.dump([aucs, confs, labels], f) f.close() # In[35]:
# # test set
# x_test = feature.loc[inds[split:],:]
# y_test = target.loc[inds[split:]]
# x_test = x_test.reset_index(drop=True)
# y_test = y_test.reset_index(drop=True)

# Train/test sets come from split_binary, called with 0.9 and oversample=True.
x_train, y_train, x_test, y_test = split_binary(
    feature, target, 0.9, oversample=True)

# train (layer 1)
#eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
# colsample_bytree is sqrt(n_features) / n_features, i.e. each tree samples a
# column fraction corresponding to about sqrt(n_features) columns.
n_features = feature.shape[1]
gbm = xgb.XGBClassifier(
    max_depth=20,
    n_estimators=400,
    learning_rate=0.001,
    nthread=12,
    subsample=.1,
    colsample_bytree=np.sqrt(n_features) / float(n_features),
    colsample_bylevel=1,
    max_delta_step=20,
    seed=0,
    objective='binary:logistic',
)
gbm.fit(x_train, y_train)
# TODO: cgb.cv(///)

# train performance
y_pred = gbm.predict(x_train)
conf, auc = calculate_confusion_matrix(y_pred, y_train)
# Only the first AUC entry is kept for round i.
aucs_train[i] = auc[0]

# test (layer 1)
y_pred = gbm.predict(x_test)

# test performance
conf, auc = calculate_confusion_matrix(y_pred, y_test)
aucs[i] = auc[0]
confs.append(conf)

# conf, roc_auc = calculate_confusion_matrix(y_pred, np.array(target))

# Blank line, then the NaN-ignoring mean of the train AUCs stored so far.
print('')
print('Train AUC: {}'.format(np.nanmean(aucs_train)))