def objective(parameters):
    """Return the loss for one candidate set of model weights.

    ``parameters`` is loaded into ``model`` through ``set_weights`` and the
    model is then scored three ways, always using column 1 of
    ``model.predict``:

    * truncated ROC AUC on ``(X, y)``;
    * KS metric between the ``ya == 0`` and ``ya == 1`` predictions on
      ``Xa``, weighted by the matching entries of ``wa``;
    * CvM metric of the predictions on ``Xc`` against ``mc``.

    The loss is ``-AUC`` plus one leaky-ReLU-shaped term per metric: the
    distance to the threshold counts with slope 1 while the metric is above
    it and with slope 0.001 otherwise.

    Side effects: appends to the call counter ``i``.  When both metrics are
    below their thresholds and the AUC beats ``auc_log[0]``, the function
    appends to ``d``, dumps the model to
    ``transductor_model_file.format(len(d))``, replaces the last entry of
    ``auc_log`` and logs the new best.  A progress line is printed when
    ``verbose`` is truthy.
    """
    leak = 0.001  # LeakyReLU slope used at or below the target
    ks_weight = 1  # relative KS importance
    cvm_weight = 1  # relative CVM importance

    def second_column(features):
        # Column 1 of the prediction matrix for the given inputs.
        return model.predict(features, batch_size=256, verbose=0)[:, 1]

    def leaky_excess(value, target):
        # Full slope above the target, damped slope otherwise.
        slope = 1 if value > target else leak
        return slope * (value - target)

    i.append(0)
    set_weights(model, parameters)

    auc = roc_auc_truncated(y, second_column(X))

    agreement_scores = second_column(Xa)
    label_zero, label_one = ya == 0, ya == 1
    ks = compute_ks(agreement_scores[label_zero], agreement_scores[label_one],
                    wa[label_zero], wa[label_one])

    cvm = compute_cvm(second_column(Xc), mc)

    ks_penalty = leaky_excess(ks, ks_threshold)
    cvm_penalty = leaky_excess(cvm, cvm_threshold)
    loss = -auc + ks_weight * ks_penalty + cvm_weight * cvm_penalty

    within_limits = ks < ks_threshold and cvm < cvm_threshold
    if within_limits and auc > auc_log[0]:
        d.append(0)
        dump_transductor_model(model, transductor_model_file.format(len(d)))
        auc_log.pop()
        auc_log.append(auc)
        logger.info(
            "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(
                len(i), auc, ks, cvm))

    if verbose:
        progress = "iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}"
        print(progress.format(len(i), auc, ks, cvm, loss))
    return loss
Example #2
0
def cv_model(model_list):
	"""Cross-validate each classifier, run the KS/CvM checks and save its predictions.

	For every ``(clf, clf_name)`` pair in ``model_list`` the function:

	1. builds out-of-fold predictions for the training set with the 4-fold
	   ``KFold`` split and scores them with ``evaluation.roc_auc_truncated``
	   on the rows where ``min_ANNmuon > 0.4``;
	2. refits ``clf`` on the full training matrix and predicts the test set;
	3. prints the KS metric computed on ``check_agreement.csv`` and the CvM
	   metric computed on ``check_correlation.csv``;
	4. if the out-of-fold AUC is above 0.965, writes the out-of-fold, test,
	   agreement and correlation predictions to ``xgb<j>.csv`` files under
	   ``../data/``, where ``j`` is the position of the pair in ``model_list``.

	``clf_name`` is unpacked but never used; the output files are always
	named ``xgb<j>.csv``.  Nothing is returned.

	NOTE(review): the ``print`` statements use Python 2 syntax.
	"""
	print "generating cv csv files...."
	train, test = gen_data()
	label = train['signal']
	train_id = train.id
	test_id = test.id

	# delete_features is applied to both frames before they become matrices.
	train_del, test_del = delete_features(train), delete_features(test)

	# The check samples get add_features here and delete_features at
	# prediction time (see the predict_proba calls below).
	check_agreement = pd.read_csv('../data/check_agreement.csv')
	check_correlation = pd.read_csv('../data/check_correlation.csv')
	check_agreement= add_features(check_agreement)
	check_correlation  = add_features(check_correlation)

	X, X_test = train_del.as_matrix(), test_del.as_matrix()
	print X.shape, X_test.shape

	# The same 4-fold split object is re-iterated for every model.
	# NOTE(review): the labels themselves are passed as KFold's first
	# argument - confirm this matches the signature of the imported KFold.
	kf = KFold(label, n_folds = 4)
	for j, (clf, clf_name) in enumerate(model_list):
		
		print "modelling model %i ...."%j
		# Out-of-fold predictions: each row is filled by the model that did
		# not see it during fitting.
		cv_train = np.zeros(len(label))
		for i, (train_fold, validate) in enumerate(kf):
			# NOTE(review): label[...] is indexed with the fold's index arrays;
			# presumably the training frame has a default 0..n-1 index - confirm.
			X_train, X_validate, label_train, label_validate = X[train_fold,:], X[validate,:], label[train_fold], label[validate]
			clf.fit(X_train,label_train)
			cv_train[validate] = clf.predict_proba(X_validate)[:,1]
		# Truncated AUC of the out-of-fold predictions, restricted to the
		# rows with min_ANNmuon > 0.4.
		auc_score = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4], 
			pd.Series(cv_train)[train['min_ANNmuon'] > 0.4])
		print "the true roc_auc_truncated is %.6f"%auc_score

		# Refit on all training rows; this model produces the test, agreement
		# and correlation predictions below.
		clf.fit(X, label)
		test_probs = clf.predict_proba(X_test)[:,1]
		# check if it passes the tests
		print "check if it passes the tests"
		agreement_probs = clf.predict_proba(delete_features(check_agreement).as_matrix())[:,1]
		# KS between the signal == 0 and signal == 1 prediction distributions,
		# each weighted by its own 'weight' values.
		ks = evaluation.compute_ks(
			agreement_probs[check_agreement['signal'].values == 0],
			agreement_probs[check_agreement['signal'].values == 1],
			check_agreement[check_agreement['signal'] == 0]['weight'].values,
			check_agreement[check_agreement['signal'] == 1]['weight'].values)
		print ('KS metric', ks, ks <= 0.09)

		correlation_probs = clf.predict_proba(delete_features(check_correlation).as_matrix())[:,1]
		print ('Checking correlation...')
		cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
		print ('CvM metric', cvm, cvm <= 0.002)
		# The KS/CvM results are only printed; saving is gated on the AUC alone.
		#if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975: # no need to check here
		if auc_score > 0.965: # the minimum threshold
			# save the cv
			cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train, "label": label})
			cv_sub.to_csv("../data/cv_folder/xgb%i.csv"%j, index=False)
			# save the prediction
			submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
			submission.to_csv("../data/pred_folder/xgb%i.csv"%j, index=False)
			# save agreement
			submission = pd.DataFrame({"id": check_agreement['id'], "prediction": agreement_probs})
			submission.to_csv("../data/agree_folder/xgb%i.csv"%j, index=False)
			# save correlation
			submission = pd.DataFrame({"id": check_correlation['id'], "prediction": correlation_probs})
			submission.to_csv("../data/correlation_folder/xgb%i.csv"%j, index=False)
Example #3
0
    def check_ks_and_cvm(self, data, check_agreement, check_correlation):
        """Train a fresh model and score it on the agreement (KS) and correlation (CvM) checks.

        Parameters
        ----------
        data
            Training table; must provide the ``self.variables`` columns and
            a ``signal`` column.
        check_agreement
            Table with the ``self.variables`` columns plus ``signal`` and
            ``weight``.
        check_correlation
            Table with the ``self.variables`` columns plus ``mass``.

        When ``self.nn == True`` the training data is split 80/20, a
        ``StandardScaler`` is fitted on the training part only, and the model
        is fitted on the *scaled* training part with the scaled hold-out part
        supplied as ``validation_data``.  Both check samples are transformed
        with the same scaler before prediction.  Otherwise the model is
        fitted on the raw data and ``predict_proba`` column 1 is used.

        Side effects: prints both metrics and stores ``ks``, ``cvm``,
        ``trained_model``, ``agreement_probs`` and ``correlation_probs`` on
        ``self``.  The scaled branch also stores ``self.scaler`` and
        overwrites ``self.train_params['validation_data']``.  Returns ``None``.
        """
        print('Checking KS and CVM for ' + self.model_name + ' model with ' +
              self.var_name + ' variables\n')

        mod = self.create_model()

        X = data[self.variables].values
        y = data['signal'].values

        # Equality (not truthiness) is kept so that non-bool flag values
        # select the same branch as before.
        if self.nn == True:  # noqa: E712
            train_X, val_X, train_y, val_y = train_test_split(X,
                                                              y,
                                                              test_size=0.2)
            scaler = StandardScaler()
            train_X = scaler.fit_transform(train_X)
            val_X = scaler.transform(val_X)
            ch_agr = scaler.transform(check_agreement[self.variables].values)
            ch_cor = scaler.transform(check_correlation[self.variables].values)
            self.scaler = scaler

            # The model must see the same scaling at fit time as the check
            # samples get at predict time, and the hold-out rows must not be
            # part of the fitted data; hence fit on the scaled training split
            # and validate on the scaled hold-out split.
            self.train_params['validation_data'] = (val_X, val_y)
            mod.fit(train_X, train_y, **self.train_params)

            agreement_probs = mod.predict(ch_agr)
            correlation_probs = mod.predict(ch_cor).reshape(
                (ch_cor.shape[0], ))
        else:
            mod.fit(X, y, **self.train_params)

            agreement_probs = mod.predict_proba(
                check_agreement[self.variables].values)[:, 1]
            correlation_probs = mod.predict_proba(
                check_correlation[self.variables].values)[:, 1]

        # KS between the signal == 0 and signal == 1 prediction distributions,
        # each weighted by its own 'weight' values.
        ks = evaluation.compute_ks(
            agreement_probs[check_agreement['signal'].values == 0],
            agreement_probs[check_agreement['signal'].values == 1],
            check_agreement[check_agreement['signal'] == 0]['weight'].values,
            check_agreement[check_agreement['signal'] == 1]['weight'].values)
        cvm = evaluation.compute_cvm(correlation_probs,
                                     check_correlation['mass'])
        print('KS metric = {}. Is it smaller than 0.09? {}'.format(
            ks, ks < 0.09))
        print('CVM metric = {}. Is it smaller than 0.002? {}\n'.format(
            cvm, cvm < 0.002))
        self.ks = ks
        self.cvm = cvm
        self.trained_model = mod
        self.agreement_probs = agreement_probs
        self.correlation_probs = correlation_probs
Example #4
0
    def objective(parameters):
        """Return the loss for one candidate set of model weights.

        ``parameters`` is loaded into ``model`` through ``set_weights`` and
        the model is then scored three ways, always using column 1 of
        ``model.predict``:

        * truncated ROC AUC on ``(X, y)``;
        * KS metric between the ``ya == 0`` and ``ya == 1`` predictions on
          ``Xa``, weighted by the matching entries of ``wa``;
        * CvM metric of the predictions on ``Xc`` against ``mc``.

        The loss is ``-AUC`` plus one leaky-ReLU-shaped term per metric: the
        distance to the threshold counts with slope 1 while the metric is
        above it and with slope 0.001 otherwise.

        Side effects: appends to the call counter ``i``.  When both metrics
        are below their thresholds and the AUC beats ``auc_log[0]``, the
        function appends to ``d``, dumps the model to
        ``transductor_model_file.format(len(d))``, replaces the last entry
        of ``auc_log`` and logs the new best.  A progress line is printed
        when ``verbose`` is truthy.
        """
        leak = 0.001  # LeakyReLU slope used at or below the target
        ks_weight = 1  # relative KS importance
        cvm_weight = 1  # relative CVM importance

        def second_column(features):
            # Column 1 of the prediction matrix for the given inputs.
            return model.predict(features, batch_size=256, verbose=0)[:, 1]

        def leaky_excess(value, target):
            # Full slope above the target, damped slope otherwise.
            slope = 1 if value > target else leak
            return slope * (value - target)

        i.append(0)
        set_weights(model, parameters)

        auc = roc_auc_truncated(y, second_column(X))

        agreement_scores = second_column(Xa)
        label_zero, label_one = ya == 0, ya == 1
        ks = compute_ks(agreement_scores[label_zero],
                        agreement_scores[label_one],
                        wa[label_zero], wa[label_one])

        cvm = compute_cvm(second_column(Xc), mc)

        ks_penalty = leaky_excess(ks, ks_threshold)
        cvm_penalty = leaky_excess(cvm, cvm_threshold)
        loss = -auc + ks_weight * ks_penalty + cvm_weight * cvm_penalty

        within_limits = ks < ks_threshold and cvm < cvm_threshold
        if within_limits and auc > auc_log[0]:
            d.append(0)
            dump_transductor_model(model,
                                   transductor_model_file.format(len(d)))
            auc_log.pop()
            auc_log.append(auc)
            logger.info(
                "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(
                    len(i), auc, ks, cvm))

        if verbose:
            progress = "iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}"
            print(progress.format(len(i), auc, ks, cvm, loss))
        return loss
# Agreement check: score the agreement sample with the trained booster and
# compare the prediction distributions of the signal == 0 and signal == 1
# rows with the weighted KS metric.
# NOTE(review): every column of check_agreement.csv except the `id` index is
# fed to the model, including `signal` and `weight` - confirm this matches
# the feature set the booster was trained on.
# NOTE(review): the `print` statements in this script use Python 2 syntax.
check_agreement = pandas.read_csv(folder + 'check_agreement.csv', index_col='id')
xg_check_agreement = xgb.DMatrix(check_agreement.values)
agreement_probs = xgb_model.predict(xg_check_agreement)

ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
# Prints the metric and whether it is below 0.09.
print 'KS metric', ks, ks < 0.09

# Check correlation test
# NOTE(review): as above, all non-index columns (including `mass`) are passed
# to the model - confirm.
check_correlation = pandas.read_csv(folder + 'check_correlation.csv', index_col='id')
xg_check_correlation = xgb.DMatrix(check_correlation.values)
correlation_probs = xgb_model.predict(xg_check_correlation)
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
# Prints the metric and whether it is below 0.002.
print 'CvM metric', cvm, cvm < 0.002

# Compute weighted AUC on the training data with min_ANNmuon > 0.4
train_eval = train[train['min_ANNmuon'] > 0.4]
# The second positional argument of drop is the axis: 1 drops columns.
train_eval_X = train_eval.drop(variables_to_drop, 1).values
xg_train_eval = xgb.DMatrix(train_eval_X)
train_probs = xgb_model.predict(xg_train_eval)
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
print 'AUC', AUC

# Predict test, create file for kaggle
test = pandas.read_csv(folder + 'test.csv', index_col='id')
test_X = test.values
xg_test = xgb.DMatrix(test_X)
# Output frame starts with the test ids only; no prediction column is added
# in the lines shown here.
result = pandas.DataFrame({'id': test.index})
# Blend experiment: for each exponent and each blend weight q, mix the
# exponentiated "strong" predictions with the "weak" ones and report the
# squared error, ROC, agreement and correlation figures of the blend.
print("Experiment with ensembling")
# I accidentally found that raising the XGB prediction to some exponent helps.
# I don't know why :-)
# The higher the exponent, the better it seems to work.
for exponent in [1, 16, 256]:
    print('\nPredictions for exponent = %d:' % (exponent));
    # q runs over 0.1, 0.2, ..., 1.0 and is the weight of the strong model.
    for q in np.arange(0.1, 1.01, 0.1):
        sl_cvP = q * (sl_cvPStrong ** exponent) + (1 - q) * sl_cvPWeak;
        # Clamp the blended CV predictions to [0, 1]; the agreement and
        # correlation blends below are left unclamped.
        sl_cvP[sl_cvP < 0] = 0;
        sl_cvP[sl_cvP > 1] = 1;
        agrP = q * (agrPStrong ** exponent) + (1 - q) * agrPWeak;
        correlP = q * (correlPStrong ** exponent) + (1 - q) * correlPWeak;
        sqErr = np.mean((sl_cvP - sl_cvTarget) ** 2);
        roc = getErrROC(sl_cvP, sl_cvTarget);
        agr = getErrAGR(agrP, agrTarget, agrW);
        correl = evaluation.compute_cvm(correlP, check_correlation['mass']);
        print("mix %4.2f - sqErr: %6.4f, ROC: %6.4f, AGR: %6.4f, correl: %6.4f" \
                % (q, sqErr, roc, agr, correl))

# Final models: one booster per feature set, both with the same xgbParams
# but their own number of boosting rounds.
print("Train XGBoost models on full data");
gbmPcaStrong = xgb.train(xgbParams, trainInputStrong, numTreesStrong);
gbmPcaWeak = xgb.train(xgbParams, trainInputWeak, numTreesWeak);

print("Make predictions on full data");
testPStrong = gbmPcaStrong.predict(testInputStrong);
testPWeak = gbmPcaWeak.predict(testInputWeak);

# mix xgb and nn prediction on test data, save output
# The exponent is fixed at 256 here.
# NOTE(review): unlike sl_cvP above, only the lower bound is clamped and no
# output is written in the loop body shown - confirm against the full script.
for q in np.arange(0.1,1.01,0.1):
    testP = q * (testPStrong ** 256) + (1 - q) * testPWeak;
    testP[testP < 0] = 0;
	def check_corr_func(self, correlation_probs):
		"""Return the CvM metric of ``correlation_probs`` against the ``mass``
		column of ``self.check_correlation``."""
		return evaluation.compute_cvm(correlation_probs, self.check_correlation['mass'])
    0.3*agreement_probs1[check_agreement['signal'].values == 0]+
    0.7*agreement_probs[check_agreement['signal'].values == 0],
    0.3*agreement_probs1[check_agreement['signal'].values == 1]+
    0.7*agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)

# Report the KS value held in ks1 (assigned before this point) and whether
# it is below 0.09.
# NOTE(review): the `print` statements in this script use Python 2 syntax.
print 'KS metric rf', ks1, ks1 < 0.09

# Correlation check on a 0.3 * rf + 0.7 * gd blend of the two classifiers.
check_correlation = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_correlation.csv', index_col='id')
correlation_probs = gd.predict_proba(check_correlation[features])[:, 1]
#cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
#print 'CvM metric for gb', cvm, cvm < 0.002

correlation_probs1 = rf.predict_proba(check_correlation[features])[:, 1]
# NOTE(review): the printed label says "rf" but the metric is computed on
# the blended predictions.
cvm1 = evaluation.compute_cvm(0.3*correlation_probs1+0.7*correlation_probs, check_correlation['mass'])
print 'CvM metric for rf', cvm1, cvm1 < 0.002

# Truncated AUC of the same blend on the training rows with min_ANNmuon > 0.4.
train_eval = train[train['min_ANNmuon'] > 0.4]
train_probs = gd.predict_proba(train_eval[features])[:, 1]
#AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
#print 'AUC metric for gb', AUC

#train_eval1 = train[train['min_ANNmuon'] > 0.4]
train_probs1 = rf.predict_proba(train_eval[features])[:, 1]
# NOTE(review): as with the CvM line, the label says "rf" but the value
# belongs to the blend.
AUC = evaluation.roc_auc_truncated(train_eval['signal'], .3*train_probs1+0.7*train_probs)
print 'AUC metric for rf', AUC


"""
print("Make predictions on the test set")
 def check_corr_func(self, correlation_probs):
     """Return the CvM metric of ``correlation_probs`` against the ``mass``
     column of ``self.check_correlation``."""
     mass = self.check_correlation['mass']
     return evaluation.compute_cvm(correlation_probs, mass)
def train_test_predict(classifier, classifier_name, features, features_name,
                       data_directory, training_data):
    """Fit ``classifier``, run the three checks and write a submission archive.

    Steps, each followed by a printed wall-clock duration:

    1. fit ``classifier`` on ``training_data[features]`` against the
       ``signal`` column;
    2. agreement check: print the KS metric on ``check_agreement.csv`` and
       whether it is below 0.09;
    3. correlation check: print the CvM metric on ``check_correlation.csv``
       and whether it is below 0.002;
    4. print the truncated AUC on the training rows with
       ``min_ANNmuon > 0.4``;
    5. predict ``test.csv``.

    The predictions are then written to
    ``<classifier_name>-<features_name>.csv`` in the current directory and
    packed into a ``.7z`` archive of the same base name by shelling out to
    ``rm``, ``7z`` and ``ls``; the output of each command is printed.

    ``data_directory`` is prepended to the file names as-is, so it has to end
    with a path separator.  Nothing is returned.
    """
    def report_elapsed(template, started):
        # Print the seconds since ``started`` followed by an empty line.
        finished = time.time()
        print(template.format(finished - started))
        print()

    # Fit the classifier with the training data.
    tic = time.time()
    classifier.fit(training_data[features], training_data['signal'])
    report_elapsed("time to fit the classifier: {} seconds", tic)

    # Check the agreement test.
    tic = time.time()
    check_agreement = pandas.read_csv(data_directory + 'check_agreement.csv', index_col='id')
    agreement_probs = classifier.predict_proba(check_agreement[features])[:, 1]
    agreement_labels = check_agreement['signal'].values
    agreement_weights = check_agreement['weight'].values
    label_zero = agreement_labels == 0
    label_one = agreement_labels == 1
    ks = evaluation.compute_ks(
        agreement_probs[label_zero],
        agreement_probs[label_one],
        agreement_weights[label_zero],
        agreement_weights[label_one])
    print('KS metric', ks, ks < 0.09)
    report_elapsed("time to check the agreement test: {} seconds", tic)

    # Check the correlation test.
    tic = time.time()
    check_correlation = pandas.read_csv(data_directory + 'check_correlation.csv', index_col='id')
    correlation_probs = classifier.predict_proba(check_correlation[features])[:, 1]
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
    print('CvM metric', cvm, cvm < 0.002)
    report_elapsed("time to check the correlation test: {} seconds", tic)

    # Compute weighted AUC on the training data with min_ANNmuon > 0.4.
    tic = time.time()
    train_eval = training_data[training_data['min_ANNmuon'] > 0.4]
    train_probs = classifier.predict_proba(train_eval[features])[:, 1]
    AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
    print('AUC', AUC)
    report_elapsed("time to compute the weighted AUC: {} seconds", tic)

    # Make predictions on the test data.
    tic = time.time()
    testing_data = pandas.read_csv(data_directory + 'test.csv', index_col='id')
    result = pandas.DataFrame({'id': testing_data.index})
    result['prediction'] = classifier.predict_proba(testing_data[features])[:, 1]
    report_elapsed("time to make predictions: {} seconds", tic)

    predictions_name = classifier_name + '-' + features_name
    csv_file = predictions_name + '.csv'
    archive_file = predictions_name + '.7z'

    # Generate the csv file for Kaggle.
    result.to_csv(csv_file, index=False, sep=',')

    # Build the final archive with external tools: drop any stale archive,
    # pack the csv, then list both files.
    shell_commands = (
        ['rm', '-f', archive_file],
        ['7z', 'a', archive_file, csv_file],
        ['ls', '-Ahl', csv_file],
        ['ls', '-Ahl', archive_file],
    )
    for command in shell_commands:
        print(subprocess.check_output(command))
def correlation(model, variables, mu, sig):
    """Return the CvM metric of ``model`` on the correlation-check sample.

    The sample is read from a fixed local path.  Its ``variables`` columns
    are passed through ``featureNormalization`` with ``mu`` and ``sig``, cast
    to ``theano.config.floatX`` and fed to the callable returned by
    ``model.predict()``.  The first row of the transposed output is used as
    the prediction vector and compared against the ``mass`` column.
    """
    check_correlation = read_csv('E:\\FlavoursOfPhysics\\flavours-of-physics-start-master\\tau_data\\' + 'check_correlation.csv', index_col='id')
    # Obtain the prediction callable before building its input, as the
    # original single expression did.
    predict_fn = model.predict()
    normalized = featureNormalization(check_correlation[variables].values, mu, sig)
    inputs = np.matrix(normalized).astype(theano.config.floatX)
    correlation_probs = np.asarray(predict_fn(inputs)).T[0]
    return evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
def is_conform_correlation(check_correlation, variables, model):
  """Tell whether ``model`` passes the correlation check.

  Column 1 of ``model.predict_proba`` on ``check_correlation[variables]`` is
  compared against the ``mass`` column with ``evaluation.compute_cvm``; the
  result is true when that metric is strictly below 0.002.
  """
  scores = model.predict_proba(check_correlation[variables])[:, 1]
  return evaluation.compute_cvm(scores, check_correlation['mass']) < 0.002