def objective(parameters):
    """Evaluate one candidate weight vector and return its loss.

    The weights are loaded into ``model`` with ``set_weights``; the model is
    then scored three times: ``roc_auc_truncated`` on ``(X, y)``,
    ``compute_ks`` on the agreement sample ``(Xa, ya, wa)`` and
    ``compute_cvm`` on the correlation sample ``(Xc, mc)``.  The loss is
    ``-auc`` plus one leaky penalty per check, each measured relative to
    ``ks_threshold`` / ``cvm_threshold``.

    Side effects (all on names from the enclosing scope): appends to the call
    counter ``i``; when both checks are below their thresholds and the AUC
    beats ``auc_log[0]``, appends to ``d``, dumps the model, replaces the last
    ``auc_log`` entry and logs the new best; prints a progress line when
    ``verbose`` is truthy.
    """
    i.append(0)  # len(i) doubles as the iteration number in the messages
    set_weights(model, parameters)

    def score(inputs):
        # Column 1 of the prediction matrix is used as the score.
        return model.predict(inputs, batch_size=256, verbose=0)[:, 1]

    def leaky_penalty(value, target, slope=0.001):
        # Full weight above the target, a small slope below it (leaky ReLU).
        return (1 if value > target else slope) * (value - target)

    p = score(X)
    auc = roc_auc_truncated(y, p)

    pa = score(Xa)
    background, signal = ya == 0, ya == 1
    ks = compute_ks(pa[background], pa[signal], wa[background], wa[signal])

    pc = score(Xc)
    cvm = compute_cvm(pc, mc)

    ks_importance = 1   # relative KS importance
    cvm_importance = 1  # relative CVM importance
    loss = (-auc
            + ks_importance * leaky_penalty(ks, ks_threshold)
            + cvm_importance * leaky_penalty(cvm, cvm_threshold))

    if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
        d.append(0)  # len(d) numbers the dumped model files
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()
        auc_log.append(auc)
        logger.info(
            "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(
                len(i), auc, ks, cvm))

    if verbose:
        print("iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}".format(
            len(i), auc, ks, cvm, loss))
    return loss
def cv_model(model_list):
    """Cross-validate each model, run the KS/CvM checks and save predictions.

    For every ``(clf, clf_name)`` pair the classifier is fitted fold by fold
    to build out-of-fold predictions for the training set, then refitted on
    the whole training set to score the test set and the agreement and
    correlation check samples.  When the truncated AUC of the out-of-fold
    predictions (rows with ``min_ANNmuon > 0.4``) exceeds 0.965, four CSV
    files indexed by the model position are written below ``../data/``.

    Args:
        model_list: iterable of ``(classifier, name)`` pairs; each classifier
            must provide ``fit`` and ``predict_proba``.  The name is not used.
    """
    print("generating cv csv files....")
    train, test = gen_data()
    label = train['signal']
    train_id = train.id
    test_id = test.id
    train_del, test_del = delete_features(train), delete_features(test)
    check_agreement = pd.read_csv('../data/check_agreement.csv')
    check_correlation = pd.read_csv('../data/check_correlation.csv')
    check_agreement = add_features(check_agreement)
    check_correlation = add_features(check_correlation)
    # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
    X, X_test = train_del.values, test_del.values
    print("%s %s" % (X.shape, X_test.shape))

    # Positional arrays: fold indices and masks address rows of X by position,
    # so the targets must be addressed the same way, whatever the index is.
    y = label.values
    auc_rows = (train['min_ANNmuon'] > 0.4).values

    # Masks and weights of the agreement sample do not depend on the model.
    agreement_signal = check_agreement['signal'].values
    is_background = agreement_signal == 0
    is_signal = agreement_signal == 1
    agreement_weight = check_agreement['weight'].values

    kf = KFold(label, n_folds=4)
    for j, (clf, clf_name) in enumerate(model_list):
        print("modelling model %i ...." % j)
        cv_train = np.zeros(len(label))
        for train_fold, validate in kf:
            clf.fit(X[train_fold, :], y[train_fold])
            cv_train[validate] = clf.predict_proba(X[validate, :])[:, 1]
        auc_score = evaluation.roc_auc_truncated(y[auc_rows], cv_train[auc_rows])
        print("the true roc_auc_truncated is %.6f" % auc_score)

        clf.fit(X, y)
        test_probs = clf.predict_proba(X_test)[:, 1]

        # check if it passes the tests
        print("check if it passes the tests")
        agreement_probs = clf.predict_proba(
            delete_features(check_agreement).values)[:, 1]
        ks = evaluation.compute_ks(
            agreement_probs[is_background],
            agreement_probs[is_signal],
            agreement_weight[is_background],
            agreement_weight[is_signal])
        print('KS metric', ks, ks <= 0.09)

        correlation_probs = clf.predict_proba(
            delete_features(check_correlation).values)[:, 1]
        print('Checking correlation...')
        cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
        print('CvM metric', cvm, cvm <= 0.002)

        # KS and CvM are only reported; saving is gated on the AUC alone.
        if auc_score > 0.965:  # the minimum threshold
            # save the cv
            cv_sub = pd.DataFrame(
                {"id": train_id, "prediction": cv_train, "label": label})
            cv_sub.to_csv("../data/cv_folder/xgb%i.csv" % j, index=False)
            # save the prediction
            submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
            submission.to_csv("../data/pred_folder/xgb%i.csv" % j, index=False)
            # save agreement
            submission = pd.DataFrame(
                {"id": check_agreement['id'], "prediction": agreement_probs})
            submission.to_csv("../data/agree_folder/xgb%i.csv" % j, index=False)
            # save correlation
            submission = pd.DataFrame(
                {"id": check_correlation['id'], "prediction": correlation_probs})
            submission.to_csv("../data/correlation_folder/xgb%i.csv" % j, index=False)
def check_ks_and_cvm(self, data, check_agreement, check_correlation):
    """Train a fresh model and evaluate it on the KS and CvM checks.

    A model is built with ``self.create_model()`` and fitted on the
    ``self.variables`` columns of ``data`` against ``data['signal']``.  When
    ``self.nn`` is truthy the data is split 80/20, standardised with a
    ``StandardScaler`` fitted on the training part, and the model is fitted on
    the scaled training part with the scaled holdout as validation data; the
    check samples go through the same scaler before prediction.

    Args:
        data: training frame holding ``self.variables`` and ``'signal'``.
        check_agreement: frame holding ``self.variables``, ``'signal'`` and
            ``'weight'``.
        check_correlation: frame holding ``self.variables`` and ``'mass'``.

    Side effects: prints the two metrics; sets ``self.ks``, ``self.cvm``,
    ``self.trained_model``, ``self.agreement_probs`` and
    ``self.correlation_probs``; in the NN case also ``self.scaler`` and
    ``self.train_params['validation_data']``.
    """
    print('Checking KS and CVM for ' + self.model_name + ' model with ' +
          self.var_name + ' variables\n')
    mod = self.create_model()
    X = data[self.variables].values
    y = data['signal'].values

    if self.nn:
        train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2)
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        val_X = scaler.transform(val_X)
        ch_agr = scaler.transform(check_agreement[self.variables].values)
        ch_cor = scaler.transform(check_correlation[self.variables].values)
        self.scaler = scaler
        # Train and validate in the scaled space that prediction uses below.
        # Fitting on the raw, unsplit X would leak the holdout rows and feed
        # the network inputs on a different scale from the check samples.
        self.train_params['validation_data'] = (val_X, val_y)
        mod.fit(train_X, train_y, **self.train_params)

        # NOTE(review): only correlation_probs is flattened; confirm that
        # compute_ks accepts the raw predict() output shape.
        agreement_probs = mod.predict(ch_agr)
        correlation_probs = mod.predict(ch_cor).reshape((ch_cor.shape[0], ))
    else:
        mod.fit(X, y, **self.train_params)
        agreement_probs = mod.predict_proba(
            check_agreement[self.variables].values)[:, 1]
        correlation_probs = mod.predict_proba(
            check_correlation[self.variables].values)[:, 1]

    agreement_signal = check_agreement['signal'].values
    is_background = agreement_signal == 0
    is_signal = agreement_signal == 1
    agreement_weight = check_agreement['weight'].values
    ks = evaluation.compute_ks(
        agreement_probs[is_background],
        agreement_probs[is_signal],
        agreement_weight[is_background],
        agreement_weight[is_signal])
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])

    print('KS metric = {}. Is it smaller than 0.09? {}'.format(ks, ks < 0.09))
    print('CVM metric = {}. Is it smaller than 0.002? {}\n'.format(
        cvm, cvm < 0.002))

    self.ks = ks
    self.cvm = cvm
    self.trained_model = mod
    self.agreement_probs = agreement_probs
    self.correlation_probs = correlation_probs
def objective(parameters):
    """Return the loss of the model under the given weight vector.

    Steps, in order: record the call in ``i``; install ``parameters`` with
    ``set_weights``; compute the truncated AUC on ``(X, y)``, the KS value on
    the agreement sample ``(Xa, ya, wa)`` and the CvM value on the correlation
    sample ``(Xc, mc)``; combine them as ``-auc`` plus a leaky penalty for
    each check relative to ``ks_threshold`` and ``cvm_threshold``.

    When both checks are under their thresholds and the AUC exceeds
    ``auc_log[0]`` the model is dumped (file numbered by ``len(d)``), the last
    ``auc_log`` entry is replaced and the result is logged.  A progress line
    is printed when ``verbose`` is truthy.  All of these names come from the
    enclosing scope.
    """
    i.append(0)
    set_weights(model, parameters)
    predict_options = dict(batch_size=256, verbose=0)

    train_scores = model.predict(X, **predict_options)[:, 1]
    auc = roc_auc_truncated(y, train_scores)

    agreement_scores = model.predict(Xa, **predict_options)[:, 1]
    ks = compute_ks(
        agreement_scores[ya == 0],
        agreement_scores[ya == 1],
        wa[ya == 0],
        wa[ya == 1],
    )

    correlation_scores = model.predict(Xc, **predict_options)[:, 1]
    cvm = compute_cvm(correlation_scores, mc)

    # Relative importance of each check in the loss.
    ks_importance = 1
    cvm_importance = 1
    # Slope applied while a metric is still below its target (LeakyReLU).
    alpha = 0.001

    ks_slope = 1 if ks > ks_threshold else alpha
    cvm_slope = 1 if cvm > cvm_threshold else alpha
    ks_loss = ks_slope * (ks - ks_threshold)
    cvm_loss = cvm_slope * (cvm - cvm_threshold)
    loss = -auc + ks_importance * ks_loss + cvm_importance * cvm_loss

    iteration = len(i)
    if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
        d.append(0)
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()
        auc_log.append(auc)
        best_template = "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}"
        logger.info(best_template.format(iteration, auc, ks, cvm))

    if verbose:
        progress_template = "iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}"
        print(progress_template.format(iteration, auc, ks, cvm, loss))
    return loss
# Check agreement test: KS distance between the predictions on the two
# 'signal' classes of the agreement sample, weighted by 'weight'.
check_agreement = pandas.read_csv(folder + 'check_agreement.csv', index_col='id')
# NOTE(review): every column of check_agreement, 'signal' and 'weight'
# included, is passed to the model as a feature -- confirm this matches the
# columns the model was trained on.
xg_check_agreement = xgb.DMatrix(check_agreement.values)
agreement_probs = xgb_model.predict(xg_check_agreement)

ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
# Single-argument print calls produce the same text on Python 2 and 3.
print('KS metric %s %s' % (ks, ks < 0.09))

# Check correlation test
check_correlation = pandas.read_csv(folder + 'check_correlation.csv', index_col='id')
# NOTE(review): as above, all columns ('mass' included) are used as features.
xg_check_correlation = xgb.DMatrix(check_correlation.values)
correlation_probs = xgb_model.predict(xg_check_correlation)
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
print('CvM metric %s %s' % (cvm, cvm < 0.002))

# Compute weighted AUC on the training data with min_ANNmuon > 0.4
train_eval = train[train['min_ANNmuon'] > 0.4]
# ``axis`` must be passed by keyword: pandas 2.0 removed the positional form.
train_eval_X = train_eval.drop(variables_to_drop, axis=1).values
xg_train_eval = xgb.DMatrix(train_eval_X)
train_probs = xgb_model.predict(xg_train_eval)
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
print('AUC %s' % (AUC,))

# Predict test, create file for kaggle
test = pandas.read_csv(folder + 'test.csv', index_col='id')
test_X = test.values
xg_test = xgb.DMatrix(test_X)
result = pandas.DataFrame({'id': test.index})
def _blend(strong, weak, q, exponent):
    """Mix ``strong ** exponent`` and ``weak`` with weights ``q`` and ``1 - q``."""
    return q * (strong ** exponent) + (1 - q) * weak


print("Experiment with ensembling")
# Raising the XGB prediction to some exponent was found by accident to help;
# the reason is not known.  Higher exponents seemed to work better.
for exponent in [1, 16, 256]:
    print('\nPredictions for exponent = %d:' % (exponent))
    for q in np.arange(0.1, 1.01, 0.1):
        sl_cvP = _blend(sl_cvPStrong, sl_cvPWeak, q, exponent)
        # Keep the blended prediction inside [0, 1].
        sl_cvP[sl_cvP < 0] = 0
        sl_cvP[sl_cvP > 1] = 1
        agrP = _blend(agrPStrong, agrPWeak, q, exponent)
        correlP = _blend(correlPStrong, correlPWeak, q, exponent)

        sqErr = np.mean((sl_cvP - sl_cvTarget) ** 2)
        roc = getErrROC(sl_cvP, sl_cvTarget)
        agr = getErrAGR(agrP, agrTarget, agrW)
        correl = evaluation.compute_cvm(correlP, check_correlation['mass'])
        print("mix %4.2f - sqErr: %6.4f, ROC: %6.4f, AGR: %6.4f, correl: %6.4f"
              % (q, sqErr, roc, agr, correl))

print("Train XGBoost models on full data")
gbmPcaStrong = xgb.train(xgbParams, trainInputStrong, numTreesStrong)
gbmPcaWeak = xgb.train(xgbParams, trainInputWeak, numTreesWeak)

print("Make predictions on full data")
testPStrong = gbmPcaStrong.predict(testInputStrong)
testPWeak = gbmPcaWeak.predict(testInputWeak)

# mix xgb and nn prediction on test data, save output
for q in np.arange(0.1, 1.01, 0.1):
    testP = _blend(testPStrong, testPWeak, q, 256)
    testP[testP < 0] = 0
def check_corr_func(self, correlation_probs):
    """Return ``evaluation.compute_cvm`` of the predictions against the masses.

    The masses are the ``'mass'`` column of ``self.check_correlation``.
    """
    masses = self.check_correlation['mass']
    return evaluation.compute_cvm(correlation_probs, masses)
0.3*agreement_probs1[check_agreement['signal'].values == 0]+ 0.7*agreement_probs[check_agreement['signal'].values == 0], 0.3*agreement_probs1[check_agreement['signal'].values == 1]+ 0.7*agreement_probs[check_agreement['signal'].values == 1], check_agreement[check_agreement['signal'] == 0]['weight'].values, check_agreement[check_agreement['signal'] == 1]['weight'].values) print 'KS metric rf', ks1, ks1 < 0.09 check_correlation = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_correlation.csv', index_col='id') correlation_probs = gd.predict_proba(check_correlation[features])[:, 1] #cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass']) #print 'CvM metric for gb', cvm, cvm < 0.002 correlation_probs1 = rf.predict_proba(check_correlation[features])[:, 1] cvm1 = evaluation.compute_cvm(0.3*correlation_probs1+0.7*correlation_probs, check_correlation['mass']) print 'CvM metric for rf', cvm1, cvm1 < 0.002 train_eval = train[train['min_ANNmuon'] > 0.4] train_probs = gd.predict_proba(train_eval[features])[:, 1] #AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs) #print 'AUC metric for gb', AUC #train_eval1 = train[train['min_ANNmuon'] > 0.4] train_probs1 = rf.predict_proba(train_eval[features])[:, 1] AUC = evaluation.roc_auc_truncated(train_eval['signal'], .3*train_probs1+0.7*train_probs) print 'AUC metric for rf', AUC """ print("Make predictions on the test set")
def _print_elapsed(template, start):
    """Print ``template`` filled with the seconds since ``start``, then a blank line."""
    print(template.format(time.time() - start))
    print()


def train_test_predict(classifier, classifier_name, features, features_name,
                       data_directory, training_data):
    """Fit a classifier, run the three checks and write a Kaggle submission.

    The classifier is fitted on ``training_data[features]`` against
    ``training_data['signal']``.  The KS value (agreement sample), the CvM
    value (correlation sample) and the truncated AUC on training rows with
    ``min_ANNmuon > 0.4`` are printed together with the time each step took.
    Predictions for ``test.csv`` are written to
    ``<classifier_name>-<features_name>.csv`` in the working directory and
    packed into a ``.7z`` archive of the same stem.

    Args:
        classifier: estimator providing ``fit`` and ``predict_proba``.
        classifier_name: first half of the output file stem.
        features: column names used as model inputs.
        features_name: second half of the output file stem.
        data_directory: prefix prepended as-is to the CSV file names, so it
            must end with a path separator.
        training_data: frame holding ``features``, ``'signal'`` and
            ``'min_ANNmuon'``.

    Raises:
        subprocess.CalledProcessError: if ``rm``, ``7z`` or ``ls`` fails.
    """
    # Fit the classifier with the training data.
    start = time.time()
    classifier.fit(training_data[features], training_data['signal'])
    _print_elapsed("time to fit the classifier: {} seconds", start)

    # Check the agreement test.
    start = time.time()
    check_agreement = pandas.read_csv(data_directory + 'check_agreement.csv',
                                      index_col='id')
    agreement_probs = classifier.predict_proba(check_agreement[features])[:, 1]
    agreement_signal = check_agreement['signal'].values
    is_background = agreement_signal == 0
    is_signal = agreement_signal == 1
    agreement_weight = check_agreement['weight'].values
    ks = evaluation.compute_ks(
        agreement_probs[is_background],
        agreement_probs[is_signal],
        agreement_weight[is_background],
        agreement_weight[is_signal])
    print('KS metric', ks, ks < 0.09)
    _print_elapsed("time to check the agreement test: {} seconds", start)

    # Check the correlation test.
    start = time.time()
    check_correlation = pandas.read_csv(data_directory + 'check_correlation.csv',
                                        index_col='id')
    correlation_probs = classifier.predict_proba(check_correlation[features])[:, 1]
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
    print('CvM metric', cvm, cvm < 0.002)
    _print_elapsed("time to check the correlation test: {} seconds", start)

    # Compute weighted AUC on the training data with min_ANNmuon > 0.4.
    start = time.time()
    train_eval = training_data[training_data['min_ANNmuon'] > 0.4]
    train_probs = classifier.predict_proba(train_eval[features])[:, 1]
    AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
    print('AUC', AUC)
    _print_elapsed("time to compute the weighted AUC: {} seconds", start)

    # Make predictions on the test data.
    start = time.time()
    testing_data = pandas.read_csv(data_directory + 'test.csv', index_col='id')
    result = pandas.DataFrame({'id': testing_data.index})
    result['prediction'] = classifier.predict_proba(testing_data[features])[:, 1]
    _print_elapsed("time to make predictions: {} seconds", start)

    predictions_name = classifier_name + '-' + features_name
    csv_name = predictions_name + '.csv'
    archive_name = predictions_name + '.7z'

    # Generate the csv file for Kaggle.
    result.to_csv(csv_name, index=False, sep=',')

    # Build the final archive with external tools.  universal_newlines=True
    # makes check_output return text, so print() shows the tool output itself
    # rather than a bytes repr on Python 3.
    commands = (
        ['rm', '-f', archive_name],
        ['7z', 'a', archive_name, csv_name],
        ['ls', '-Ahl', csv_name],
        ['ls', '-Ahl', archive_name],
    )
    for command in commands:
        print(subprocess.check_output(command, universal_newlines=True))
# Directory the function read from before ``data_dir`` became a parameter.
_DEFAULT_TAU_DATA_DIR = 'E:\\FlavoursOfPhysics\\flavours-of-physics-start-master\\tau_data\\'


def correlation(model, variables, mu, sig, data_dir=_DEFAULT_TAU_DATA_DIR):
    """Return ``evaluation.compute_cvm`` of the model on the correlation sample.

    ``check_correlation.csv`` is read from ``data_dir``; its ``variables``
    columns are passed through ``featureNormalization`` with ``mu`` and
    ``sig``, cast to ``theano.config.floatX`` and handed to the callable
    returned by ``model.predict()``.  The first row of the transposed output
    is used as the prediction vector.

    Args:
        model: object whose ``predict()`` returns a callable taking the
            normalised feature matrix.
        variables: column names used as model inputs.
        mu: forwarded to ``featureNormalization``.
        sig: forwarded to ``featureNormalization``.
        data_dir: directory containing ``check_correlation.csv``; defaults to
            the location that used to be hard-coded.
    """
    import os  # local import: the file's import block is outside this excerpt

    check_correlation = read_csv(
        os.path.join(data_dir, 'check_correlation.csv'), index_col='id')
    predict_fn = model.predict()
    normalised = featureNormalization(check_correlation[variables].values, mu, sig)
    inputs = np.matrix(normalised).astype(theano.config.floatX)
    correlation_probs = np.asarray(predict_fn(inputs)).T[0]
    return evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
def is_conform_correlation(check_correlation, variables, model):
    """Tell whether the model's CvM value on the sample is below 0.002.

    Column 1 of ``model.predict_proba`` on the ``variables`` columns is
    compared against the ``'mass'`` column via ``evaluation.compute_cvm``.
    """
    probabilities = model.predict_proba(check_correlation[variables])
    cvm_value = evaluation.compute_cvm(probabilities[:, 1],
                                       check_correlation['mass'])
    return cvm_value < 0.002