def check_agreement_func(self, agreement_probs):
    """Return the weighted KS distance between background and signal
    prediction distributions on the stored agreement sample."""
    signal = self.check_agreement['signal']
    is_bkg = signal.values == 0
    is_sig = signal.values == 1
    bkg_weights = self.check_agreement.loc[signal == 0, 'weight'].values
    sig_weights = self.check_agreement.loc[signal == 1, 'weight'].values
    return evaluation.compute_ks(agreement_probs[is_bkg],
                                 agreement_probs[is_sig],
                                 bkg_weights,
                                 sig_weights)
def objective(parameters):
    """Loss minimized by the weight optimizer: negative truncated AUC plus
    leaky hinge penalties on the KS and CvM agreement metrics.

    Side effects: appends to the call counter ``i``; on a new best AUC that
    also passes both agreement tests, dumps the model and updates ``auc_log``.
    """
    i.append(0)  # global iteration counter (len(i) == number of calls)
    set_weights(model, parameters)

    def leaky_hinge(value, target, alpha=0.001):
        # Full slope above the target, small slope alpha below (LeakyReLU shape).
        slope = 1 if value > target else alpha
        return slope * (value - target)

    # Truncated AUC on the training sample.
    train_preds = model.predict(X, batch_size=256, verbose=0)[:, 1]
    auc = roc_auc_truncated(y, train_preds)

    # Weighted KS on the agreement sample (background vs. signal).
    agree_preds = model.predict(Xa, batch_size=256, verbose=0)[:, 1]
    ks = compute_ks(agree_preds[ya == 0], agree_preds[ya == 1],
                    wa[ya == 0], wa[ya == 1])

    # CvM of predictions vs. mass on the correlation sample.
    corr_preds = model.predict(Xc, batch_size=256, verbose=0)[:, 1]
    cvm = compute_cvm(corr_preds, mc)

    # Both penalties carry unit relative importance.
    loss = -auc + leaky_hinge(ks, ks_threshold) + leaky_hinge(cvm, cvm_threshold)

    if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
        d.append(0)  # global dump counter, numbers the model files
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()
        auc_log.append(auc)
        message = "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(len(i), auc, ks, cvm)
        logger.info(message)
    if verbose:
        print("iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}".format(len(i), auc, ks, cvm, loss))
    return loss
def cv_model(model_list):
    """Cross-validate each (clf, clf_name) pair, run the Kaggle agreement and
    correlation checks, and dump CV / test / check predictions to csv files.

    Side effects: reads ../data/check_agreement.csv and
    ../data/check_correlation.csv, and writes prediction csv files under
    ../data/{cv,pred,agree,correlation}_folder/ for models whose truncated
    AUC clears the minimum threshold.
    """
    # Fixed: Python-2 `print` statements (the rest of the file uses print())
    # and the long-deprecated DataFrame.as_matrix() (replaced by .values).
    print("generating cv csv files....")
    train, test = gen_data()
    label = train['signal']
    train_id = train.id
    test_id = test.id
    train_del, test_del = delete_features(train), delete_features(test)
    check_agreement = pd.read_csv('../data/check_agreement.csv')
    check_correlation = pd.read_csv('../data/check_correlation.csv')
    check_agreement = add_features(check_agreement)
    check_correlation = add_features(check_correlation)
    X, X_test = train_del.values, test_del.values
    print(X.shape, X_test.shape)
    # NOTE(review): old sklearn cross_validation API (KFold(y, n_folds=...));
    # kept as-is to match whatever version the file imports.
    kf = KFold(label, n_folds=4)
    for j, (clf, clf_name) in enumerate(model_list):
        print("modelling model %i ...." % j)
        cv_train = np.zeros(len(label))
        # Out-of-fold predictions over the whole training set.
        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate = X[train_fold, :], X[validate, :]
            label_train, label_validate = label[train_fold], label[validate]
            clf.fit(X_train, label_train)
            cv_train[validate] = clf.predict_proba(X_validate)[:, 1]
        # Truncated AUC is only evaluated where min_ANNmuon > 0.4.
        auc_score = evaluation.roc_auc_truncated(
            label[train['min_ANNmuon'] > 0.4],
            pd.Series(cv_train)[train['min_ANNmuon'] > 0.4])
        print("the true roc_auc_truncated is %.6f" % auc_score)
        # Refit on all data for the final test predictions.
        clf.fit(X, label)
        test_probs = clf.predict_proba(X_test)[:, 1]
        # check if it passes the tests
        print("check if it passes the tests")
        agreement_probs = clf.predict_proba(
            delete_features(check_agreement).values)[:, 1]
        ks = evaluation.compute_ks(
            agreement_probs[check_agreement['signal'].values == 0],
            agreement_probs[check_agreement['signal'].values == 1],
            check_agreement[check_agreement['signal'] == 0]['weight'].values,
            check_agreement[check_agreement['signal'] == 1]['weight'].values)
        print('KS metric', ks, ks <= 0.09)
        correlation_probs = clf.predict_proba(
            delete_features(check_correlation).values)[:, 1]
        print('Checking correlation...')
        cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
        print('CvM metric', cvm, cvm <= 0.002)
        # if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975:  # no need to check here
        if auc_score > 0.965:  # the minimum threshold
            # save the cv
            cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train, "label": label})
            cv_sub.to_csv("../data/cv_folder/xgb%i.csv" % j, index=False)
            # save the prediction
            submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
            submission.to_csv("../data/pred_folder/xgb%i.csv" % j, index=False)
            # save agreement
            submission = pd.DataFrame({"id": check_agreement['id'], "prediction": agreement_probs})
            submission.to_csv("../data/agree_folder/xgb%i.csv" % j, index=False)
            # save correlation
            submission = pd.DataFrame({"id": check_correlation['id'], "prediction": correlation_probs})
            submission.to_csv("../data/correlation_folder/xgb%i.csv" % j, index=False)
def check_a(agreement_probs):
    """Run the Kaggle agreement (KS) check on *agreement_probs*.

    Reads check_agreement.csv from the working directory, prints the KS
    metric with its pass flag (KS < 0.09), and returns the KS value.

    Fixed: Python-2 print statement; the computed KS is now also returned
    (previously the function computed it and returned None).
    """
    check_agreement = pd.read_csv('check_agreement.csv')
    signal = check_agreement['signal']
    ks = e.compute_ks(
        agreement_probs[signal.values == 0],
        agreement_probs[signal.values == 1],
        check_agreement[signal == 0]['weight'].values,
        check_agreement[signal == 1]['weight'].values)
    print('KS metric', ks, ks < 0.09)
    return ks
def check_agreement_func(self, agreement_probs):
    """Compute the agreement KS metric for *agreement_probs* against the
    background/signal split (and per-event weights) in self.check_agreement."""
    labels = self.check_agreement['signal'].values
    weights = self.check_agreement['weight'].values
    return evaluation.compute_ks(agreement_probs[labels == 0],
                                 agreement_probs[labels == 1],
                                 weights[labels == 0],
                                 weights[labels == 1])
def is_conform_ks(check_agreement, variables, model):
    """True iff *model*'s agreement-sample predictions pass the KS test
    (weighted KS distance below the 0.09 competition threshold)."""
    probs = model.predict_proba(check_agreement[variables])[:, 1]
    sig_flag = check_agreement['signal'].values
    bkg_probs, sig_probs = probs[sig_flag == 0], probs[sig_flag == 1]
    bkg_w = check_agreement.loc[check_agreement['signal'] == 0, 'weight'].values
    sig_w = check_agreement.loc[check_agreement['signal'] == 1, 'weight'].values
    ks = evaluation.compute_ks(bkg_probs, sig_probs, bkg_w, sig_w)
    return ks < 0.09
def agreement(model, variables, mu, sig, split=6):
    """Compute the agreement KS metric for *model* on check_agreement.csv.

    The feature matrix is normalized with (mu, sig) and scored in *split*
    chunks (default 6, matching the original hard-coded value) to bound
    memory use; chunk predictions are concatenated before the KS call.

    Fixed: Python-2 `xrange`; the per-chunk `np.append` loop (which re-copied
    the whole array on every iteration, accidentally O(n^2)) is replaced by a
    single `np.concatenate` over the chunk predictions — same result.

    NOTE(review): model.predict() is called with NO arguments and its result
    used as the prediction function — presumably a compiled-predictor
    factory; confirm against the model class.
    """
    check_agreement = read_csv('E:\\FlavoursOfPhysics\\flavours-of-physics-start-master\\tau_data\\' + 'check_agreement.csv', index_col='id')
    predictionFunction = model.predict()
    normalized = featureNormalization(check_agreement[variables].values, mu, sig)
    chunks = np.array_split(normalized, split, axis=0)
    agreement_probs = np.concatenate(
        [np.asarray(predictionFunction(chunk)) for chunk in chunks], axis=0)
    signal = check_agreement['signal'].values
    return evaluation.compute_ks(
        agreement_probs[signal == 0],
        agreement_probs[signal == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
def check_ks_and_cvm(self, data, check_agreement, check_correlation):
    """Train a fresh model on *data* and run the KS / CvM competition checks.

    Stores ks, cvm, the trained model and both probability arrays on self.
    For NN models (self.nn) the check samples are standard-scaled and scored
    with predict(); otherwise predict_proba()[:, 1] is used on raw values.
    """
    print('Checking KS and CVM for ' + self.model_name + ' model with ' +
          self.var_name + ' variables\n')
    mod = self.create_model()
    X = data[self.variables].values
    y = data['signal'].values
    if self.nn == True:
        # Hold out 20% for Keras-style validation during fit.
        train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2)
        # NOTE(review): validation_data captures val_X BEFORE scaling; the
        # scaler.transform below rebinds the local name only, so the model
        # validates on unscaled data — confirm this is intended.
        self.train_params['validation_data'] = (val_X, val_y)
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        val_X = scaler.transform(val_X)
        ch_agr = scaler.transform(check_agreement[self.variables])
        ch_cor = scaler.transform(check_correlation[self.variables])
        self.scaler = scaler
    # NOTE(review): in the nn branch the model is fit on the UNSCALED full X
    # while the check samples are scaled with a scaler fit on train_X only —
    # looks inconsistent; verify before relying on the nn-path metrics.
    mod.fit(X, y, **self.train_params)
    if self.nn == True:
        agreement_probs = mod.predict(ch_agr)
        # Flatten the NN output to a 1-D array for compute_cvm.
        correlation_probs = mod.predict(ch_cor).reshape((ch_cor.shape[0], ))
    else:
        agreement_probs = mod.predict_proba(
            check_agreement[self.variables].values)[:, 1]
        correlation_probs = mod.predict_proba(
            check_correlation[self.variables].values)[:, 1]
    # Weighted KS between background (signal==0) and signal (signal==1).
    ks = evaluation.compute_ks(
        agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    cvm = evaluation.compute_cvm(correlation_probs,
                                 check_correlation['mass'])
    print('KS metric = {}. Is it smaller than 0.09? {}'.format(
        ks, ks < 0.09))
    print('CVM metric = {}. Is it smaller than 0.002? {}\n'.format(
        cvm, cvm < 0.002))
    # Cache everything for later inspection / ensembling.
    self.ks = ks
    self.cvm = cvm
    self.trained_model = mod
    self.agreement_probs = agreement_probs
    self.correlation_probs = correlation_probs
def objective(parameters):
    """Objective minimized by the transductive weight optimizer.

    Scores *parameters* as model weights: loss = -AUC plus leaky-ReLU
    penalties whenever the KS / CvM agreement metrics exceed their
    competition thresholds.  Side effects: appends to the call counter
    ``i``; when both constraints pass and AUC beats the best seen, dumps
    the model and updates ``auc_log``.
    """
    i.append(0)  # global iteration counter (len(i) == number of calls)
    set_weights(model, parameters)
    # Truncated AUC on the training sample.
    p = model.predict(X, batch_size=256, verbose=0)[:, 1]
    auc = roc_auc_truncated(y, p)
    # Weighted KS on the agreement sample (background vs. signal).
    pa = model.predict(Xa, batch_size=256, verbose=0)[:, 1]
    ks = compute_ks(pa[ya == 0], pa[ya == 1], wa[ya == 0], wa[ya == 1])
    # CvM of predictions vs. mass mc on the correlation sample.
    pc = model.predict(Xc, batch_size=256, verbose=0)[:, 1]
    cvm = compute_cvm(pc, mc)
    ks_importance = 1  # relative KS importance
    ks_target = ks_threshold
    cvm_importance = 1  # relative CVM importance
    cvm_target = cvm_threshold
    alpha = 0.001  # LeakyReLU
    # Leaky hinge: full slope above the target, slope alpha below it.
    ks_loss = (1 if ks > ks_target else alpha) * (ks - ks_target)
    cvm_loss = (1 if cvm > cvm_target else alpha) * (cvm - cvm_target)
    loss = -auc + ks_importance * ks_loss + cvm_importance * cvm_loss
    # Checkpoint only when both agreement tests pass and AUC improves.
    if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
        d.append(0)  # global dump counter, numbers the model files
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()
        auc_log.append(auc)
        message = "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(
            len(i), auc, ks, cvm)
        logger.info(message)
    if verbose:
        print(
            "iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}"
            .format(len(i), auc, ks, cvm, loss))
    return loss
# --- Train the booster, then run the competition agreement/correlation checks
# (Python-2 script fragment: note the statement-form prints). ---
num_trees = 250
n_rounds = 120  # alternative round count for the commented-out run below
watchlist = [(xg_train, 'train')]
xgb_model = xgb.train(params, xg_train, num_trees, watchlist)
# xgb_model = xgb.train(params, xg_train, n_rounds, watchlist)

# Check agreement test: weighted KS between background and signal prediction
# distributions must stay below 0.09.
# NOTE(review): the DMatrix is built from ALL columns of check_agreement
# (including 'signal' and 'weight') — confirm the model was trained on the
# same column layout.
check_agreement = pandas.read_csv(folder + 'check_agreement.csv',
                                  index_col='id')
xg_check_agreement = xgb.DMatrix(check_agreement.values)
agreement_probs = xgb_model.predict(xg_check_agreement)
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print 'KS metric', ks, ks < 0.09

# Check correlation test: CvM of predictions vs. mass must stay below 0.002.
check_correlation = pandas.read_csv(folder + 'check_correlation.csv',
                                    index_col='id')
xg_check_correlation = xgb.DMatrix(check_correlation.values)
correlation_probs = xgb_model.predict(xg_check_correlation)
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
print 'CvM metric', cvm, cvm < 0.002

# Compute weighted AUC on the training data with min_ANNmuon > 0.4
train_eval = train[train['min_ANNmuon'] > 0.4]
train_eval_X = train_eval.drop(variables_to_drop, 1).values
xg_train_eval = xgb.DMatrix(train_eval_X)
# --- Fit the gradient-boosting model (gd), then run the agreement KS check on
# a 0.3*rf + 0.7*gd probability blend (Python-2 script fragment). ---
gd.fit(train[features], train["signal"])
check_agreement = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_agreement.csv', index_col='id')
agreement_probs = gd.predict_proba(check_agreement[features])[:, 1]
# Disabled single-model KS check for the GBM alone (kept as dead code):
"""
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print 'KS metric gb', ks, ks < 0.09
"""
# KS for the blended probabilities.  NOTE(review): despite the "rf" label in
# the print below, ks1 scores the 0.3*rf + 0.7*gd blend, not rf alone.
agreement_probs1 = rf.predict_proba(check_agreement[features])[:, 1]
ks1 = evaluation.compute_ks(
    0.3*agreement_probs1[check_agreement['signal'].values == 0]+
    0.7*agreement_probs[check_agreement['signal'].values == 0],
    0.3*agreement_probs1[check_agreement['signal'].values == 1]+
    0.7*agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print 'KS metric rf', ks1, ks1 < 0.09
# Correlation-sample probabilities for both models (CvM check itself is
# currently disabled below).
check_correlation = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_correlation.csv', index_col='id')
correlation_probs = gd.predict_proba(check_correlation[features])[:, 1]
#cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
#print 'CvM metric for gb', cvm, cvm < 0.002
correlation_probs1 = rf.predict_proba(check_correlation[features])[:, 1]
    # Tail of the xgboost parameter dict (its opening brace is above this
    # chunk); the booster itself is currently disabled below in favor of rf.
    'gamma': 0.01,  # 0.005
    "min_child_weight": 5,
    "silent": 1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    'nthread': 4,
    "seed": 1}
num_trees=600
#gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)

# Agreement (weighted KS < 0.09) check for the rf ("UB") model.
agreement_probs= rf.predict_proba(check_agreement[features])[:,1]
print('Checking agreement...')
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print ('KS metric UB =', ks, ks < 0.09)

# Truncated AUC on the evaluation slice.
train_eval_probs1 = rf.predict_proba(train_eval[features])[:,1]
AUC1 = evaluation.roc_auc_truncated(train_eval['signal'], train_eval_probs1)
print ('AUC UB ', AUC1)

# Score the test set and write the submission file.
print("Make predictions on the test set")
rfpred = rf.predict_proba(test[features])[:,1]
test_probs = rfpred
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("ub_only_submission.csv", index=False)
def train_test_predict(classifier, classifier_name, features, features_name,
                       data_directory, training_data):
    """Fit *classifier*, run the competition checks, and package predictions.

    Steps (each timed and printed):
      1. fit on training_data[features] vs. the 'signal' column;
      2. agreement check (weighted KS, pass if < 0.09) on check_agreement.csv;
      3. correlation check (CvM, pass if < 0.002) on check_correlation.csv;
      4. weighted truncated AUC on rows with min_ANNmuon > 0.4;
      5. predict on test.csv, write '<classifier_name>-<features_name>.csv',
         then 7-zip it via subprocess.
    """
    # Fit the classifier with the training data.
    start = time.time()
    classifier.fit(training_data[features], training_data['signal'])
    end = time.time()
    print("time to fit the classifier: {} seconds".format(end - start))
    print()
    # Check the agreement test.
    start = time.time()
    check_agreement = pandas.read_csv(data_directory + 'check_agreement.csv',
                                      index_col='id')
    agreement_probs = classifier.predict_proba(check_agreement[features])[:, 1]
    ks = evaluation.compute_ks(
        agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    print('KS metric', ks, ks < 0.09)
    end = time.time()
    print("time to check the agreement test: {} seconds".format(end - start))
    print()
    # Check the correlation test.
    start = time.time()
    check_correlation = pandas.read_csv(data_directory + 'check_correlation.csv',
                                        index_col='id')
    correlation_probs = classifier.predict_proba(check_correlation[features])[:, 1]
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
    print('CvM metric', cvm, cvm < 0.002)
    end = time.time()
    print("time to check the correlation test: {} seconds".format(end - start))
    print()
    # Compute weighted AUC on the training data with min_ANNmuon > 0.4.
    start = time.time()
    train_eval = training_data[training_data['min_ANNmuon'] > 0.4]
    train_probs = classifier.predict_proba(train_eval[features])[:, 1]
    AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
    print('AUC', AUC)
    end = time.time()
    print("time to compute the weighted AUC: {} seconds".format(end - start))
    print()
    # Make predictions on the test data.
    start = time.time()
    testing_data = pandas.read_csv(data_directory + 'test.csv', index_col='id')
    result = pandas.DataFrame({'id': testing_data.index})
    result['prediction'] = classifier.predict_proba(testing_data[features])[:, 1]
    end = time.time()
    print("time to make predictions: {} seconds".format(end - start))
    print()
    predictions_name = classifier_name + '-' + features_name
    # Generate the csv file for Kaggle.
    result.to_csv(predictions_name + '.csv', index=False, sep=',')
    # Run the shell commands to generate the final archive through
    # the subprocess module calls.
    print(subprocess.check_output(['rm', '-f', predictions_name+'.7z']))
    print(subprocess.check_output(['7z', 'a', predictions_name+'.7z',
                                   predictions_name+'.csv']))
    print(subprocess.check_output(['ls', '-Ahl', predictions_name+'.csv']))
    print(subprocess.check_output(['ls', '-Ahl', predictions_name+'.7z']))
# since preds are margin(before logistic transformation, cutoff at 0) return "truncated AUC", -evaluation.roc_auc_truncated(labels, preds) bst = xgb.train(param, dtrain, num_round, watchlist, feval=evalerror, early_stopping_rounds=10) # check agreement with noise check_agreement = all_data.ix[all_data["dataset"] == "agreement", :] dagreement = xgb.DMatrix(np.array(check_agreement[variables])) for noise_level in np.linspace(0, 0.2, 6): agreement_probs = add_noise(bst.predict(dagreement), noise_level) ks = evaluation.compute_ks( agreement_probs[check_agreement["signal"].values == 0], agreement_probs[check_agreement["signal"].values == 1], check_agreement[check_agreement["signal"] == 0]["weight"].values, check_agreement[check_agreement["signal"] == 1]["weight"].values, ) print "KS metric with noise level %f " % noise_level, ks, ks < 0.09 # Correlation probs with noise check_correlation = all_data.ix[all_data["dataset"] == "correlation", :] dcorrelation = xgb.DMatrix(np.array(check_correlation[variables])) for noise_level in np.linspace(0, 0.2, 6): correlation_probs = add_noise(bst.predict(dcorrelation), noise_level) cvm = evaluation.compute_cvm(correlation_probs, check_correlation["mass"]) print "CvM metric with noise level %f " % noise_level, cvm, cvm < 0.002 # Validation with noise train_view = all_data.ix[all_data["dataset"] == "train", :]