def cv_model(model_list):
    print "generating cv csv files...."
    train, test = gen_data()
    label = train['signal']
    train_id = train.id
    test_id = test.id
    train_del, test_del = delete_features(train), delete_features(test)
    check_agreement = pd.read_csv('../data/check_agreement.csv')
    check_correlation = pd.read_csv('../data/check_correlation.csv')
    check_agreement = add_features(check_agreement)
    check_correlation = add_features(check_correlation)
    X, X_test = train_del.as_matrix(), test_del.as_matrix()
    print X.shape, X_test.shape
    # old sklearn.cross_validation API: KFold expects the sample count
    kf = KFold(len(label), n_folds=4)
    for j, (clf, clf_name) in enumerate(model_list):
        print "modelling model %i ...." % j
        cv_train = np.zeros(len(label))
        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate = X[train_fold, :], X[validate, :]
            label_train, label_validate = label[train_fold], label[validate]
            clf.fit(X_train, label_train)
            cv_train[validate] = clf.predict_proba(X_validate)[:, 1]
        auc_score = evaluation.roc_auc_truncated(
            label[train['min_ANNmuon'] > 0.4],
            pd.Series(cv_train)[train['min_ANNmuon'] > 0.4])
        print "the true roc_auc_truncated is %.6f" % auc_score
        clf.fit(X, label)
        test_probs = clf.predict_proba(X_test)[:, 1]
        # check if it passes the tests
        print "check if it passes the tests"
        agreement_probs = clf.predict_proba(
            delete_features(check_agreement).as_matrix())[:, 1]
        ks = evaluation.compute_ks(
            agreement_probs[check_agreement['signal'].values == 0],
            agreement_probs[check_agreement['signal'].values == 1],
            check_agreement[check_agreement['signal'] == 0]['weight'].values,
            check_agreement[check_agreement['signal'] == 1]['weight'].values)
        print ('KS metric', ks, ks <= 0.09)
        correlation_probs = clf.predict_proba(
            delete_features(check_correlation).as_matrix())[:, 1]
        print ('Checking correlation...')
        cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
        print ('CvM metric', cvm, cvm <= 0.002)
        # if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975:  # no need to check here
        if auc_score > 0.965:  # the minimum threshold
            # save the cv predictions
            cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train,
                                   "label": label})
            cv_sub.to_csv("../data/cv_folder/xgb%i.csv" % j, index=False)
            # save the test prediction
            submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
            submission.to_csv("../data/pred_folder/xgb%i.csv" % j, index=False)
            # save the agreement prediction
            submission = pd.DataFrame({"id": check_agreement['id'],
                                       "prediction": agreement_probs})
            submission.to_csv("../data/agree_folder/xgb%i.csv" % j, index=False)
            # save the correlation prediction
            submission = pd.DataFrame({"id": check_correlation['id'],
                                       "prediction": correlation_probs})
            submission.to_csv("../data/correlation_folder/xgb%i.csv" % j, index=False)
def auc_func(weights):
    # weighted average of the base-model predictions
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
        final_prediction += weight * prediction
    # final_prediction = map(lambda x: 1 if x > 0.5 else 0, final_prediction)
    # return -1.0 * accuracy_score(test_y, final_prediction)
    return -1.0 * evaluation.roc_auc_truncated(test_y, final_prediction)
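# `auc_func` is shaped as a minimization objective over ensemble weights.
# A minimal sketch of one way to drive it with scipy.optimize.minimize,
# assuming `predictions` is a list of per-model probability arrays and
# `test_y` the matching labels, both in the enclosing scope (the search
# setup below is illustrative, not the author's exact one):
import numpy as np
from scipy.optimize import minimize

starting_values = np.ones(len(predictions)) / len(predictions)
# weights constrained to be non-negative and to sum to one
cons = ({'type': 'eq', 'fun': lambda w: 1.0 - np.sum(w)},)
bounds = [(0.0, 1.0)] * len(predictions)
res = minimize(auc_func, starting_values, method='SLSQP',
               bounds=bounds, constraints=cons)
print 'Best weights:', res.x
print 'Best truncated AUC: %.5f' % -res.fun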
def cv_loop(X, y, model, N):
    mean_auc_truncated = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.02, random_state=i * SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = evaluation.roc_auc_truncated(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc_truncated += auc
    return mean_auc_truncated / N
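# A hypothetical call of cv_loop, assuming `SEED`, the competition's
# `evaluation` module, and the training matrix `X` / labels `y` are already
# in scope; the classifier choice is illustrative:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
mean_auc = cv_loop(X, y, rf, N=5)  # average truncated AUC over 5 random 98/2 splits
print "mean truncated AUC: %.5f" % mean_auc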
def objective(parameters):
    i.append(0)  # iteration counter (a list, so the closure can mutate it)
    set_weights(model, parameters)
    p = model.predict(X, batch_size=256, verbose=0)[:, 1]
    auc = roc_auc_truncated(y, p)
    pa = model.predict(Xa, batch_size=256, verbose=0)[:, 1]
    ks = compute_ks(pa[ya == 0], pa[ya == 1], wa[ya == 0], wa[ya == 1])
    pc = model.predict(Xc, batch_size=256, verbose=0)[:, 1]
    cvm = compute_cvm(pc, mc)
    ks_importance = 1  # relative KS importance
    ks_target = ks_threshold
    cvm_importance = 1  # relative CVM importance
    cvm_target = cvm_threshold
    alpha = 0.001  # LeakyReLU-style slope below the target
    ks_loss = (1 if ks > ks_target else alpha) * (ks - ks_target)
    cvm_loss = (1 if cvm > cvm_target else alpha) * (cvm - cvm_target)
    loss = -auc + ks_importance * ks_loss + cvm_importance * cvm_loss
    if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
        d.append(0)  # dump counter, used to number the saved models
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()
        auc_log.append(auc)
        message = ("iteration {:7}: Best AUC={:7.5f} achieved, "
                   "KS={:7.5f}, CVM={:7.5f}").format(len(i), auc, ks, cvm)
        logger.info(message)
    if verbose:
        print("iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, "
              "loss: {:8.5f}".format(len(i), auc, ks, cvm, loss))
    return loss
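# The closure above is built for a black-box optimizer: it scores a flat
# parameter vector and returns a scalar loss. One possible driver (a sketch,
# not the author's exact setup), assuming a `get_weights(model)` helper that
# returns the flat vector `set_weights` consumes:
from scipy.optimize import minimize

x0 = get_weights(model)  # current flat weight vector (hypothetical helper)
# derivative-free search, since AUC/KS/CvM are not differentiable
# with respect to the network parameters
result = minimize(objective, x0, method='Powell',
                  options={'maxiter': 100, 'disp': True})
set_weights(model, result.x)  # keep the best weights found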
"colsample_bytree": 0.7, "seed": 1, "eval_metric": "auc", "nthread": 2, } n_rounds = 4000 # Just a big number to trigger early stopping and best iteration # Train xgb_model = xgb.train( params, xg_train, n_rounds, [(xg_train, "train"), (xg_test, "test")], early_stopping_rounds=40 ) # Predict predictions = xgb_model.predict(xg_test) # Compute weighted AUC AUC = evaluation.roc_auc_truncated(test_y, predictions) errors.append(AUC) print "AUC", AUC # Save best iteration best_iterations.append(xgb_model.best_iteration) # Append new grid error grid_errors.append(np.mean(errors)) grid_best_iterations.append(list(best_iterations)) # Show results for i in xrange(len(params_space)): print "Params: %s, wighted AUC: %f, best iterations: %s, mean: %f" % ( str(params_space[i]), grid_errors[i],
def stacked_models(train, features, test, in_sample=True):
    """
    Build stacked generalization models; set in_sample to False to
    predict on the test set.
    """
    if in_sample:
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)
        train = train.iloc[new_indices].reset_index(drop=True).copy()
        # not used in CV testing
        del test
        cutoff = int(new_indices.shape[0] * 0.75)
        X_dev = train[:cutoff].reset_index(drop=True).copy()
        Y_dev = train[:cutoff]['signal'].reset_index(drop=True).copy()
        X_test = train[cutoff:][
            train[cutoff:]['min_ANNmuon'] > 0.4].reset_index(drop=True).copy()
        Y_test = train[cutoff:][
            train[cutoff:]['min_ANNmuon'] > 0.4]['signal'].reset_index(
                drop=True).copy()
    else:
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)
        train = train.iloc[new_indices].reset_index(drop=True).copy()
        X_dev = train.reset_index(drop=True).copy()
        Y_dev = train['signal'].reset_index(drop=True).copy()
        X_test = test.reset_index(drop=True).copy()
        Y_test = None

    n_folds = 5
    # put your parameter-tuned classifiers in this list.
    clfs = [
        RandomForestClassifier(n_estimators=200, criterion='entropy',
                               random_state=20, n_jobs=-1),
        RandomForestClassifier(n_estimators=200, criterion='entropy',
                               random_state=20, n_jobs=-1, max_depth=6),
        ExtraTreesClassifier(n_estimators=200, criterion='entropy',
                             random_state=50, n_jobs=-1),
        ExtraTreesClassifier(n_estimators=200, criterion='entropy',
                             random_state=50, n_jobs=-1, max_depth=6),
        Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
        UGradientBoostingClassifier(
            loss=BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0),
            n_estimators=150, subsample=0.1, max_depth=6, min_samples_leaf=10,
            learning_rate=0.1, train_features=features, random_state=11),
        UGradientBoostingClassifier(
            loss=KnnFlatnessLossFunction(['mass'], n_neighbours=30, uniform_label=0),
            n_estimators=150, subsample=0.1, max_depth=6, min_samples_leaf=10,
            learning_rate=0.1, train_features=features, random_state=11),
        UGradientBoostingClassifier(
            loss=BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0),
            n_estimators=100, subsample=0.8, max_depth=6, min_samples_leaf=10,
            learning_rate=0.1, train_features=features, random_state=11),
        UGradientBoostingClassifier(
            loss=KnnFlatnessLossFunction(['mass'], n_neighbours=30, uniform_label=0),
            n_estimators=100, subsample=0.8, max_depth=6, min_samples_leaf=10,
            learning_rate=0.1, train_features=features, random_state=11),
        XGBoostClassifier(eval_metric='auc', objective='binary:logistic',
                          num_class=2, nthread=4, silent=1,
                          colsample_bytree=0.6, eta=0.005, max_depth=6,
                          min_child_weight=13, seed=1337, subsample=0.7),
        NN1(len(features)), NN2(len(features)),
        NN3(len(features)), NN4(len(features))
    ]

    skf = list(StratifiedKFold(Y_dev, n_folds))
    # Number of training data x Number of classifiers
    blend_train = np.zeros((X_dev.shape[0], len(clfs)))
    # Number of testing data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs)))
    print 'X_test.shape = %s' % (str(X_test.shape))
    print 'blend_train.shape = %s' % (str(blend_train.shape))
    print 'blend_test.shape = %s' % (str(blend_test.shape))

    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print 'Training classifier [%s]' % (j)
        # Number of testing data x Number of folds; we will take the mean of
        # the predictions later
        blend_test_j = np.zeros((X_test.shape[0], len(skf)))
        for i, (train_index, cv_index) in enumerate(skf):
            print 'Fold [%s]' % (i)
            # This is the training and validation set
            X_train = X_dev.iloc[train_index].copy()
            Y_train = Y_dev.iloc[train_index].copy()
            X_cv = X_dev.iloc[cv_index].copy()
            Y_cv = Y_dev.iloc[cv_index].copy()
            # handle the hep_ml classifiers, which need the 'mass' column
            if type(clf) == type(UGradientBoostingClassifier()):
                clf.fit(X_train[features + ['mass']],
                        Y_train.values.astype(np.int32))
            else:
                clf.fit(X_train[features], Y_train.values.astype(np.int32))
            # This output will be the basis for our blended classifier to
            # train against, which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv[features])[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test[features])[:, 1]
        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)

    print 'Y_dev.shape = %s' % (Y_dev.shape)

    # blend with LR...
    bclf = LogisticRegression()
    bclf.fit(blend_train, Y_dev)

    bclf2 = GradientBoostingClassifier(n_estimators=150, learning_rate=0.02,
                                       max_depth=4, subsample=0.9, verbose=3,
                                       random_state=1337)
    bclf2.fit(blend_train, Y_dev)

    bclf3 = NeuralNet(
        layers=[('input', layers.InputLayer),
                ('hidden', layers.DenseLayer),
                ('output', layers.DenseLayer)],
        # layer parameters:
        input_shape=(None, blend_train.shape[1]),
        hidden_num_units=blend_train.shape[1],
        output_nonlinearity=nonlinearities.softmax,  # softmax over the 2 classes
        output_num_units=2,  # 2 target values
        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        regression=False,  # classification, not regression
        max_epochs=53,  # TRY 50 and 46 epochs!
        verbose=1,
        eval_size=0.10)
    bclf3.fit(blend_train.astype(np.float32), Y_dev.astype(np.int32))

    bclf4 = AdaBoostClassifier(n_estimators=400, random_state=88)
    bclf4.fit(blend_train, Y_dev)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    Y_test_predict2 = bclf2.predict_proba(blend_test)[:, 1]
    Y_test_predict3 = bclf3.predict_proba(blend_test.astype(np.float32))[:, 1]
    Y_test_predict4 = bclf4.predict_proba(blend_test)[:, 1]

    print 'Logit Coefs:', bclf.coef_

    if in_sample:
        score = evaluation.roc_auc_truncated(Y_test, Y_test_predict)
        score2 = evaluation.roc_auc_truncated(Y_test, Y_test_predict2)
        score3 = evaluation.roc_auc_truncated(Y_test, blend_test.mean(1))
        score4 = evaluation.roc_auc_truncated(
            Y_test, scipy_opt(blend_train, Y_dev, blend_test))
        score5 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2) / 2.0)
        score6 = evaluation.roc_auc_truncated(Y_test, Y_test_predict3)
        score7 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3) / 3.0)
        score8 = evaluation.roc_auc_truncated(Y_test, Y_test_predict4)
        score9 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 3.0)
        score10 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
                     Y_test_predict4) / 4.0)
        print 'LR Score = %s' % (score)
        print 'GB Score = %s' % (score2)
        print 'MEAN Score = %s' % (score3)
        print 'Scipy Score = %s' % (score4)
        print 'LR + GB Score = %s' % (score5)
        print 'ANN Score = %s' % (score6)
        print 'LR + GB + ANN Score = %s' % (score7)
        print 'ADA Score = %s' % (score8)
        print 'GB + ANN + ADA Score = %s' % (score9)
        print 'LR + GB + ANN + ADA Score = %s' % (score10)
        return blend_train, Y_dev, blend_test, Y_test

    # average of LR, GB, ANN and ADA.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
            Y_test_predict4) / 4.0
# clf2 = SVC(kernel='linear', C=1)
# X_train, X_test, y_train, y_test = train_test_split(
#     train[variables], train['signal'], test_size=0.4, random_state=0)
clf1.fit(X_train, y_train)
# X_test_eval = X_test[X_test['min_ANNmuon'] > 0.4]
# y_test_eval = y_test[X_test['min_ANNmuon'] > 0.4]
train_probs = clf1.predict_proba(X_test)[:, 1]
AUC = evaluation.roc_auc_truncated(y_test, train_probs)
print('AUC', AUC)
results.append(AUC)
# print(train_probs)
# cross_val_score(clf1, train, train['signal'], cv=10)

# Check agreement test
evaluation.check_agreement(baseline)

# Check correlation test
def score(self, X, y):
    Y = self.predict_proba(X)
    return evaluation.roc_auc_truncated(y, Y[:, 1])
def AUC(model, dataX, dataY, AUCindex):
    # dataX/dataY appear to be Theano shared variables: .eval() materializes
    # them as arrays, and model.predict() returns a compiled prediction function
    return evaluation.roc_auc_truncated(
        np.asarray(dataY.eval())[AUCindex],
        np.asarray(model.predict()(np.asarray(dataX.eval())[AUCindex])).T[0])
        return loss

    return objective  # tail of the enclosing factory that builds `objective`


Xt, yt, _, _ = load(pt.training_file, pt.train_prediction_file)  # shuffled
Xa, ya, wa, _ = load(pt.check_agreement_file,
                     pt.check_agreement_prediction_file,
                     tail=len(yt), weight=True)
Xc, yc, _, mc = load(pt.check_correlation_file,
                     pt.check_correlation_prediction_file, mass=True)

Xt, scaler = preprocess_data(Xt)
Xa = preprocess_data(Xa, scaler)[0]
Xc = preprocess_data(Xc, scaler)[0]
with open(pt.transductor_scaler_file, 'wb') as fid:
    cPickle.dump(scaler, fid)

AUC = roc_auc_truncated(yt, Xt[:, -1])  # last column holds the baseline prediction
print('AUC before transductor', AUC)

model = create_model(Xt.shape[1])
pretrain = True
if pretrain:
    # pretrain model
    print("Pretrain model")
    yt_categorical = np_utils.to_categorical(yt, nb_classes=2)
    model.fit(Xt, yt_categorical, batch_size=64, nb_epoch=1,
              validation_data=None, verbose=2, show_accuracy=True)
    print("Save pretrained model")
    with open(pt.transductor_pretrained_model_file, 'wb') as fid:
        cPickle.dump(model, fid)
else:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair (metric_name, result); xgboost minimizes, so negate the AUC.
    # preds are margins here (before the logistic transformation, cutoff at 0),
    # but AUC depends only on the ranking, so that is fine
    return 'truncated AUC', -evaluation.roc_auc_truncated(labels, preds)
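# A custom metric with this (preds, dtrain) signature plugs into xgb.train
# via the `feval` argument; a minimal sketch, assuming `params` and the
# DMatrix objects `dtrain` / `dvalid` already exist in the script:
import xgboost as xgb

bst = xgb.train(params, dtrain, num_boost_round=1000,
                evals=[(dtrain, 'train'), (dvalid, 'valid')],
                feval=evalerror, early_stopping_rounds=50)
# early stopping tracks 'truncated AUC' on the last entry of `evals`;
# lower is better because evalerror returns the negated score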
def _score_func(estimator, X, y):
    pred_probs = estimator.predict_proba(X)[:, 1]
    return evaluation.roc_auc_truncated(y, pred_probs)
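# The (estimator, X, y) signature is exactly what scikit-learn accepts as a
# callable scorer, so _score_func can be handed straight to a grid search.
# A sketch with an illustrative grid (the module path matches the 0.16-era
# sklearn.cross_validation imports used elsewhere in this repo):
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

gs = GridSearchCV(GradientBoostingClassifier(),
                  param_grid={'max_depth': [3, 6], 'n_estimators': [100, 200]},
                  scoring=_score_func, cv=3)
gs.fit(X, y)  # X, y: training matrix and labels from the surrounding script
print gs.best_params_, gs.best_score_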
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from blue.featurelist import FeatureList
from blue.pandas_utils import get_columns_in_df
from blue.estimators import HyperoptEstimator
from evaluation import roc_auc_truncated

train_file = './data/training.csv'
test_file = './data/test.csv'

flist = FeatureList(train_file, spec='features.yml', derived_list=None)

df_train = pd.read_csv(train_file, index_col='id')
df_train = get_columns_in_df(df_train, flist.universe)

df_test = pd.read_csv(test_file)
df_test = get_columns_in_df(df_test, flist.predictors)

hpest = HyperoptEstimator(RandomForestClassifier, max_evals=5, n_jobs=3,
                          metric=lambda x, y: -roc_auc_truncated(x, y))
hpest.fit(df_train[flist.predictors].values, df_train[flist.target].values)
def search(self):
    '''Main function to perform the search.'''
    # initial status
    current_cv = self.init_cv
    current_pred = self.init_pred
    current_agree = self.init_agree
    current_corr = self.init_correlation
    train = pd.read_csv("../data/training.csv")  # for cross validation purposes
    label = train['signal']
    current_ks = self.check_agreement_func(self.init_agree)
    current_cvm = self.check_corr_func(self.init_correlation)
    current_auc = evaluation.roc_auc_truncated(
        label[train['min_ANNmuon'] > 0.4],
        pd.Series(current_cv)[train['min_ANNmuon'] > 0.4])
    print "the initial test results..."
    print ('KS metric', current_ks, current_ks <= 0.09)
    print ('CvM metric', current_cvm, current_cvm <= 0.002)
    # start searching
    num_round = 0
    while current_ks > 0.09 or current_cvm > 0.002:
        num_round += 1
        print "doing round %i..." % num_round
        if num_round > 10:
            print "finished all the rounds and can't find a solution..."
            break
        random_files = random.sample(self.files, len(self.files))  # add some randomness
        for f in random_files:
            # read all the files and average the candidate into the blend
            tmp_cv = pd.read_csv(self.cv_folder + f)
            tmp_pred = pd.read_csv(self.pred_folder + f)
            tmp_agree = pd.read_csv(self.agree_folder + f)
            tmp_corr = pd.read_csv(self.correlation_folder + f)
            tmp_agree_average = (tmp_agree['prediction'].values + current_agree) * 0.5
            tmp_corr_average = (tmp_corr['prediction'].values + current_corr) * 0.5
            tmp_cv_average = (tmp_cv['prediction'].values + current_cv) * 0.5
            # score the candidate blend, not the current one
            tmp_auc = evaluation.roc_auc_truncated(
                label[train['min_ANNmuon'] > 0.4],
                pd.Series(tmp_cv_average)[train['min_ANNmuon'] > 0.4])
            if (self.check_agreement_func(tmp_agree_average) < current_ks and
                    self.check_corr_func(tmp_corr_average) <= 0.002):
                # accept the candidate
                current_ks = self.check_agreement_func(tmp_agree_average)
                current_cvm = self.check_corr_func(tmp_corr_average)
                current_cv = tmp_cv_average
                current_pred = (tmp_pred['prediction'].values + current_pred) * 0.5
                current_agree, current_corr = tmp_agree_average, tmp_corr_average
                print "found a reduced KS score %.3f..." % current_ks
                current_auc = tmp_auc
                print "the corresponding AUC score is %.5f" % current_auc
        if current_ks <= 0.09:
            print "found one that passes the test, and now start to optimize the AUC"
            print "doing 2 rounds..."
            n_r = 0
            while n_r < 2:
                n_r += 1
                print n_r
                for e2, f2 in enumerate(random_files):
                    # read all the files and average the candidate into the blend
                    tmp_cv2 = pd.read_csv(self.cv_folder + f2)
                    tmp_pred2 = pd.read_csv(self.pred_folder + f2)
                    tmp_agree2 = pd.read_csv(self.agree_folder + f2)
                    tmp_corr2 = pd.read_csv(self.correlation_folder + f2)
                    tmp_agree_average2 = (tmp_agree2['prediction'].values +
                                          current_agree) * 0.5
                    tmp_corr_average2 = (tmp_corr2['prediction'].values +
                                         current_corr) * 0.5
                    tmp_cv_average2 = (tmp_cv2['prediction'].values +
                                       current_cv) * 0.5
                    tmp_auc2 = evaluation.roc_auc_truncated(
                        label[train['min_ANNmuon'] > 0.4],
                        pd.Series(tmp_cv_average2)[train['min_ANNmuon'] > 0.4])
                    if (self.check_agreement_func(tmp_agree_average2) <= 0.09 and
                            self.check_corr_func(tmp_corr_average2) <= 0.002 and
                            tmp_auc2 > current_auc):
                        # accept only if both tests still pass and the AUC improves
                        current_ks = self.check_agreement_func(tmp_agree_average2)
                        current_cvm = self.check_corr_func(tmp_corr_average2)
                        current_cv = tmp_cv_average2
                        current_pred = (tmp_pred2['prediction'].values +
                                        current_pred) * 0.5
                        current_agree, current_corr = tmp_agree_average2, tmp_corr_average2
                        print "current KS score %.3f..." % current_ks
                        current_auc = tmp_auc2
                        print "the corresponding AUC score is %.5f" % current_auc
            print "Yeah! We've found one good prediction!"
            submission = pd.DataFrame({"id": tmp_pred['id'],
                                       "prediction": current_pred})
            submission.to_csv("../submissions/xgb_search_%.4f.csv" % current_auc,
                              index=False)
            break
    'gamma': 0.01,  # 0.005
    "min_child_weight": 5,
    "silent": 1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    'nthread': 4,
    "seed": 1}
num_trees = 600
# gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)

agreement_probs = rf.predict_proba(check_agreement[features])[:, 1]
print('Checking agreement...')
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print ('KS metric UB =', ks, ks < 0.09)

train_eval_probs1 = rf.predict_proba(train_eval[features])[:, 1]
AUC1 = evaluation.roc_auc_truncated(train_eval['signal'], train_eval_probs1)
print ('AUC UB ', AUC1)

print("Make predictions on the test set")
rfpred = rf.predict_proba(test[features])[:, 1]
test_probs = rfpred
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("ub_only_submission.csv", index=False)
def train_test_predict(classifier, classifier_name, features, features_name,
                       data_directory, training_data):
    """Fit the classifier, run the agreement/correlation checks, compute the
    weighted AUC, and write a submission archive."""
    # Fit the classifier with the training data.
    start = time.time()
    classifier.fit(training_data[features], training_data['signal'])
    end = time.time()
    print("time to fit the classifier: {} seconds".format(end - start))
    print()

    # Check the agreement test.
    start = time.time()
    check_agreement = pandas.read_csv(data_directory + 'check_agreement.csv',
                                      index_col='id')
    agreement_probs = classifier.predict_proba(check_agreement[features])[:, 1]
    ks = evaluation.compute_ks(
        agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    print('KS metric', ks, ks < 0.09)
    end = time.time()
    print("time to check the agreement test: {} seconds".format(end - start))
    print()

    # Check the correlation test.
    start = time.time()
    check_correlation = pandas.read_csv(data_directory + 'check_correlation.csv',
                                        index_col='id')
    correlation_probs = classifier.predict_proba(check_correlation[features])[:, 1]
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
    print('CvM metric', cvm, cvm < 0.002)
    end = time.time()
    print("time to check the correlation test: {} seconds".format(end - start))
    print()

    # Compute weighted AUC on the training data with min_ANNmuon > 0.4.
    start = time.time()
    train_eval = training_data[training_data['min_ANNmuon'] > 0.4]
    train_probs = classifier.predict_proba(train_eval[features])[:, 1]
    AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
    print('AUC', AUC)
    end = time.time()
    print("time to compute the weighted AUC: {} seconds".format(end - start))
    print()

    # Make predictions on the test data.
    start = time.time()
    testing_data = pandas.read_csv(data_directory + 'test.csv', index_col='id')
    result = pandas.DataFrame({'id': testing_data.index})
    result['prediction'] = classifier.predict_proba(testing_data[features])[:, 1]
    end = time.time()
    print("time to make predictions: {} seconds".format(end - start))
    print()

    predictions_name = classifier_name + '-' + features_name
    # Generate the csv file for Kaggle.
    result.to_csv(predictions_name + '.csv', index=False, sep=',')
    # Run the shell commands to generate the final archive through
    # the subprocess module calls.
    print(subprocess.check_output(['rm', '-f', predictions_name + '.7z']))
    print(subprocess.check_output(['7z', 'a', predictions_name + '.7z',
                                   predictions_name + '.csv']))
    print(subprocess.check_output(['ls', '-Ahl', predictions_name + '.csv']))
    print(subprocess.check_output(['ls', '-Ahl', predictions_name + '.7z']))
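# A hypothetical invocation of train_test_predict; the classifier and the
# feature selection are illustrative (the excluded names are the train-only
# columns of the competition's training.csv):
from sklearn.ensemble import GradientBoostingClassifier

training_data = pandas.read_csv('../data/training.csv', index_col='id')
features = [c for c in training_data.columns
            if c not in ('signal', 'mass', 'min_ANNmuon', 'production', 'weight')]
clf = GradientBoostingClassifier(n_estimators=200, max_depth=4, random_state=0)
train_test_predict(clf, 'gbc', features, 'base', '../data/', training_data)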
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print 'KS metric', ks, ks < 0.09

# Check correlation test
check_correlation = pandas.read_csv(folder + 'check_correlation.csv',
                                    index_col='id')
xg_check_correlation = xgb.DMatrix(check_correlation.values)
correlation_probs = xgb_model.predict(xg_check_correlation)
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
print 'CvM metric', cvm, cvm < 0.002

# Compute weighted AUC on the training data with min_ANNmuon > 0.4
train_eval = train[train['min_ANNmuon'] > 0.4]
train_eval_X = train_eval.drop(variables_to_drop, 1).values
xg_train_eval = xgb.DMatrix(train_eval_X)
train_probs = xgb_model.predict(xg_train_eval)
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
print 'AUC', AUC

# Predict test, create file for kaggle
test = pandas.read_csv(folder + 'test.csv', index_col='id')
test_X = test.values
xg_test = xgb.DMatrix(test_X)
result = pandas.DataFrame({'id': test.index})
result['prediction'] = xgb_model.predict(xg_test)
result.to_csv('../submissions/xgb.csv', index=False, sep=',')
# Grid search to compute the best ensemble weights
def multichoose(n, k):
    """Enumerate all length-n lists of non-negative integers summing to k."""
    if k < 0 or n < 0:
        return "Error"
    if not k:
        return [[0] * n]
    if not n:
        return []
    if n == 1:
        return [[k]]
    return [[0] + val for val in multichoose(n - 1, k)] + \
           [[val[0] + 1] + val[1:] for val in multichoose(n, k - 1)]

import sys
sys.setrecursionlimit(5000)  # the multichoose(2, k) call chain is k+1 frames deep

n = 2
k = 1000
for xs in multichoose(n, k):
    # print xs
    preds = (xs[0] * keras_pred + xs[1] * gbm_pred) / float(k)
    score = evaluation.roc_auc_truncated(train_eval_table[i]['signal'], preds)
    if score >= train_eval_score[i]:
        train_eval_score[i] = score
        print score
        best_xs = xs
print train_eval_score[i]
print best_xs

test["prediction_%i" % i] = (best_xs[0] * keras_test +
                             best_xs[1] * gbm_test) / float(k)
with open('./output/semi_strong_submission_%i.csv' % i, 'w') as f:
    f.write('id,prediction\n')
    for ID, p in zip(test['id'], test["prediction_%i" % i]):
        f.write('%s,%.8f\n' % (ID, p))
# Save best combination weight
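# For intuition: multichoose(n, k) enumerates the C(n+k-1, k) non-negative
# integer n-tuples summing to k, so n=2, k=1000 scans 1001 weight pairs on a
# 0.001 grid. A small example:
print multichoose(2, 3)       # [[0, 3], [1, 2], [2, 1], [3, 0]]
print len(multichoose(2, 5))  # 6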
def fit(self, data, data_to_predict=test, pred_cv=False):
    # pred_cv=True also predicts on data_to_predict from within the CV loop,
    # which is easier than running a separate prediction pass
    print('Fitting ' + self.model_name + ' model with ' + self.var_name +
          ' variables using ' + str(self.cv.n_splits) +
          '-fold Cross Validation\n')
    X = data[self.variables].values
    y = data['signal'].values
    trained = np.zeros(len(y))
    for i, (train_ind, test_ind) in enumerate(self.cv.split(X, y)):
        mod = self.create_model()
        # fit the scaler on the training fold only and keep the scaled data
        # in local arrays; scaling X in place would compound across folds
        scaler = StandardScaler()
        X_tr = scaler.fit_transform(X[train_ind])
        X_val = scaler.transform(X[test_ind])
        if self.train_params and self.fig_name == 'xgb':
            self.train_params['eval_set'] = [(X_val, y[test_ind])]  # for xgb models
        if self.train_params and self.fig_name == 'nn':
            self.train_params['validation_data'] = (X_val, y[test_ind])  # for nn models
        hist = mod.fit(X_tr, y[train_ind], **self.train_params)
        if pred_cv:
            X_pred = scaler.transform(data_to_predict[self.variables])
        if self.nn:
            y_pred_val = mod.predict(X_val)
            y_pred_train = mod.predict(X_tr)
            trained[test_ind] = y_pred_val.reshape((test_ind.shape[0],))
            self.val_history_nn.append(hist.history['val_loss'])
            self.train_history_nn.append(hist.history['loss'])
            if pred_cv:
                self.predicted_cv.append(mod.predict(X_pred))
        else:
            y_pred_val = mod.predict_proba(X_val)[:, 1]
            y_pred_train = mod.predict_proba(X_tr)[:, 1]
            trained[test_ind] = y_pred_val
            if pred_cv:
                self.predicted_cv.append(mod.predict_proba(X_pred)[:, 1])
        result_val = evaluation.roc_auc_truncated(y[test_ind], y_pred_val)
        result_train = evaluation.roc_auc_truncated(y[train_ind], y_pred_train)
        self.scores_val.append(result_val)
        self.scores_train.append(result_train)
        print('Iteration {} gave ROC AUC score of {} for validation set '
              'and {} for training set\n'.format(
                  i + 1, np.round(result_val, 4), np.round(result_train, 4)))
    print('Mean ROC AUC score for {}-fold CV is:\n{} for validation set\n'
          '{} for training set\n'.format(
              self.cv.n_splits,
              np.round(np.mean(self.scores_val), 4),
              np.round(np.mean(self.scores_train), 4)))
    self.trained = trained