def check_agreement_func(self, agreement_probs):
		"""Return the KS value of ``agreement_probs`` on ``self.check_agreement``.

		The predictions and the ``'weight'`` column are each split by the
		``'signal'`` column (rows equal to 0, rows equal to 1) and the four
		pieces are handed to ``evaluation.compute_ks`` in that order.
		"""
		frame = self.check_agreement
		probs_for_zero = agreement_probs[frame['signal'].values == 0]
		probs_for_one = agreement_probs[frame['signal'].values == 1]
		weights_for_zero = frame[frame['signal'] == 0]['weight'].values
		weights_for_one = frame[frame['signal'] == 1]['weight'].values
		return evaluation.compute_ks(
			probs_for_zero, probs_for_one, weights_for_zero, weights_for_one)
    def objective(parameters):
        """Score one candidate weight vector and return the loss to minimise.

        ``parameters`` is loaded into ``model`` with ``set_weights``; the model
        is then evaluated on three data sets taken from the enclosing scope:
        ``X``/``y`` (truncated AUC), ``Xa``/``ya``/``wa`` (KS) and ``Xc``/``mc``
        (CvM).  The loss is ``-auc`` plus one penalty per metric: the full
        excess over the threshold when the threshold is exceeded, and the
        excess scaled by 0.001 otherwise.

        Side effects: ``i`` grows by one entry per call.  When KS and CvM are
        both below their thresholds and the AUC beats ``auc_log[0]``, ``d``
        grows by one entry, the model is dumped to a file numbered by
        ``len(d)``, ``auc_log`` is updated and a message is logged.
        """
        i.append(0)
        set_weights(model, parameters)

        auc = roc_auc_truncated(
            y, model.predict(X, batch_size=256, verbose=0)[:, 1])

        agreement_scores = model.predict(Xa, batch_size=256, verbose=0)[:, 1]
        ks = compute_ks(agreement_scores[ya == 0], agreement_scores[ya == 1],
                        wa[ya == 0], wa[ya == 1])

        cvm = compute_cvm(
            model.predict(Xc, batch_size=256, verbose=0)[:, 1], mc)

        # Relative importance of each constraint in the loss.
        ks_weight = 1
        cvm_weight = 1

        # LeakyReLU-shaped penalty: slope 1 above the threshold, `leak` below.
        leak = 0.001
        ks_excess = ks - ks_threshold
        cvm_excess = cvm - cvm_threshold
        ks_penalty = (1 if ks > ks_threshold else leak) * ks_excess
        cvm_penalty = (1 if cvm > cvm_threshold else leak) * cvm_excess
        loss = -auc + ks_weight * ks_penalty + cvm_weight * cvm_penalty

        constraints_met = ks < ks_threshold and cvm < cvm_threshold
        if constraints_met and auc > auc_log[0]:
            d.append(0)
            dump_transductor_model(model, transductor_model_file.format(len(d)))
            auc_log.pop()
            auc_log.append(auc)
            logger.info(
                "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}".format(len(i), auc, ks, cvm))

        if verbose:
            report = "iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}"
            print(report.format(len(i), auc, ks, cvm, loss))
        return loss
Example #3
0
def cv_model(model_list):
	"""Cross-validate every model in ``model_list`` and dump its predictions.

	``model_list`` is an iterable of ``(clf, clf_name)`` pairs.  For each
	classifier the function:

	* builds out-of-fold predictions over the folds produced by ``KFold``,
	* reports the truncated ROC AUC on training rows with
	  ``min_ANNmuon > 0.4``,
	* refits on the full training matrix and predicts the test,
	  agreement and correlation sets,
	* prints the KS and CvM values together with their threshold checks,
	* when the AUC exceeds 0.965, writes four CSV files (cv, prediction,
	  agreement, correlation) named ``xgb<j>.csv`` where ``j`` is the
	  position of the model in ``model_list``.

	Returns nothing; all results are printed or written to disk.
	"""
	print("generating cv csv files....")
	train, test = gen_data()
	label = train['signal']
	train_id = train.id
	test_id = test.id

	train_del, test_del = delete_features(train), delete_features(test)

	check_agreement = pd.read_csv('../data/check_agreement.csv')
	check_correlation = pd.read_csv('../data/check_correlation.csv')
	check_agreement = add_features(check_agreement)
	check_correlation = add_features(check_correlation)

	# DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` yields the
	# same ndarray and is available in old and new pandas alike.
	X, X_test = train_del.values, test_del.values
	print("%s %s" % (X.shape, X_test.shape))

	# These do not depend on the classifier, so build them once rather than
	# once per model.
	agreement_X = delete_features(check_agreement).values
	correlation_X = delete_features(check_correlation).values
	eval_rows = train['min_ANNmuon'] > 0.4

	# NOTE(review): this passes the label Series as the first positional
	# argument and uses the ``n_folds`` keyword; confirm which KFold is
	# imported by this file and that it accepts these arguments.
	kf = KFold(label, n_folds = 4)
	for j, (clf, clf_name) in enumerate(model_list):

		print("modelling model %i ...." % j)
		cv_train = np.zeros(len(label))
		for train_fold, validate in kf:
			clf.fit(X[train_fold,:], label[train_fold])
			cv_train[validate] = clf.predict_proba(X[validate,:])[:,1]
		auc_score = evaluation.roc_auc_truncated(label[eval_rows],
			pd.Series(cv_train)[eval_rows])
		print("the true roc_auc_truncated is %.6f" % auc_score)

		clf.fit(X, label)
		test_probs = clf.predict_proba(X_test)[:,1]
		# check if it passes the tests
		print("check if it passes the tests")
		agreement_probs = clf.predict_proba(agreement_X)[:,1]
		ks = evaluation.compute_ks(
			agreement_probs[check_agreement['signal'].values == 0],
			agreement_probs[check_agreement['signal'].values == 1],
			check_agreement[check_agreement['signal'] == 0]['weight'].values,
			check_agreement[check_agreement['signal'] == 1]['weight'].values)
		print ('KS metric', ks, ks <= 0.09)

		correlation_probs = clf.predict_proba(correlation_X)[:,1]
		print ('Checking correlation...')
		cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
		print ('CvM metric', cvm, cvm <= 0.002)
		#if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975: # no need to check here
		if auc_score > 0.965: # the minimum threshold
			# save the cv
			cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train, "label": label})
			cv_sub.to_csv("../data/cv_folder/xgb%i.csv"%j, index=False)
			# save the prediction
			submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
			submission.to_csv("../data/pred_folder/xgb%i.csv"%j, index=False)
			# save agreement
			submission = pd.DataFrame({"id": check_agreement['id'], "prediction": agreement_probs})
			submission.to_csv("../data/agree_folder/xgb%i.csv"%j, index=False)
			# save correlation
			submission = pd.DataFrame({"id": check_correlation['id'], "prediction": correlation_probs})
			submission.to_csv("../data/correlation_folder/xgb%i.csv"%j, index=False)
Example #4
0
def check_a(agreement_probs):
    """Print the KS value of ``agreement_probs`` and whether it is below 0.09.

    Reads ``check_agreement.csv`` from the current working directory on
    every call, splits the predictions and the ``'weight'`` column by the
    ``'signal'`` column (0 and 1) and passes the pieces to ``e.compute_ks``.
    Returns nothing.
    """
    check_agreement = pd.read_csv('check_agreement.csv')
    ks = e.compute_ks(
        agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    # Single-argument print call: same space-separated output as the old
    # Python 2 print statement, and valid syntax on Python 3 as well.
    print('KS metric %s %s' % (ks, ks < 0.09))
 def check_agreement_func(self, agreement_probs):
     """Compute the KS value for ``agreement_probs`` against the stored table.

     ``self.check_agreement`` supplies the ``'signal'`` column used to split
     the predictions into the 0 and 1 groups and the ``'weight'`` column
     giving the per-row weights of each group; the result of
     ``evaluation.compute_ks`` on those four arrays is returned.
     """
     table = self.check_agreement
     groups = (0, 1)
     probs = [agreement_probs[table['signal'].values == g] for g in groups]
     weights = [table[table['signal'] == g]['weight'].values for g in groups]
     return evaluation.compute_ks(probs[0], probs[1], weights[0], weights[1])
Example #6
0
def check_a(agreement_probs):
    """Report the KS value for ``agreement_probs`` on standard output.

    Loads ``check_agreement.csv`` from the working directory, hands the
    predictions and weights of the ``signal == 0`` and ``signal == 1`` rows
    to ``e.compute_ks`` and prints the value with its ``< 0.09`` check.
    Nothing is returned.
    """
    check_agreement = pd.read_csv('check_agreement.csv')
    signal_values = check_agreement['signal'].values
    ks = e.compute_ks(
        agreement_probs[signal_values == 0],
        agreement_probs[signal_values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    # The Python 2 print statement is a SyntaxError on Python 3; formatting
    # into one string keeps the output identical on both interpreters.
    print('KS metric %s %s' % (ks, ks < 0.09))
def is_conform_ks(check_agreement, variables, model):
  """Tell whether ``model`` yields a KS value below 0.09 on ``check_agreement``.

  The model's ``predict_proba`` is applied to the ``variables`` columns and
  column 1 of its output is split by the ``'signal'`` column (0 and 1);
  those predictions and the matching ``'weight'`` values are given to
  ``evaluation.compute_ks``.  Returns the result of ``ks < 0.09``.
  """
  agreement_probs = model.predict_proba(check_agreement[variables])[:, 1]

  probs_zero = agreement_probs[check_agreement['signal'].values == 0]
  probs_one = agreement_probs[check_agreement['signal'].values == 1]
  weights_zero = check_agreement[check_agreement['signal'] == 0]['weight'].values
  weights_one = check_agreement[check_agreement['signal'] == 1]['weight'].values

  ks = evaluation.compute_ks(probs_zero, probs_one, weights_zero, weights_one)
  return ks < 0.09
def agreement(model, variables, mu, sig):
    """Return the KS value of ``model`` on the agreement data set.

    The agreement CSV is read from a hard-coded path, the ``variables``
    columns are normalised with ``featureNormalization(values, mu, sig)``
    and predicted in six chunks; the chunk outputs are joined along axis 0
    and split by the ``'signal'`` column before being passed, together with
    the matching ``'weight'`` values, to ``evaluation.compute_ks``.
    """
    check_agreement = read_csv('E:\\FlavoursOfPhysics\\flavours-of-physics-start-master\\tau_data\\' + 'check_agreement.csv', index_col='id')
    # NOTE(review): ``model.predict()`` is called without arguments and its
    # result is used as the prediction callable; confirm against the model class.
    predictionFunction = model.predict()
    split = 6
    splitPredictionData = np.array_split(featureNormalization(check_agreement[variables].values, mu, sig), split, axis=0)
    # One concatenate instead of growing the array with np.append per chunk
    # (each append copies everything so far); this also drops the
    # Python-2-only ``xrange``.
    agreement_probs = np.concatenate(
        [np.asarray(predictionFunction(chunk)) for chunk in splitPredictionData],
        axis=0)
    return evaluation.compute_ks(agreement_probs[check_agreement['signal'].values == 0],
                                 agreement_probs[check_agreement['signal'].values == 1],
                                 check_agreement[check_agreement['signal'] == 0]['weight'].values,
                                 check_agreement[check_agreement['signal'] == 1]['weight'].values)
Example #9
0
    def check_ks_and_cvm(self, data, check_agreement, check_correlation):
        """Train a fresh model and record its KS and CvM values.

        A model from ``self.create_model()`` is fitted on the
        ``self.variables`` columns of ``data`` with ``data['signal']`` as the
        target, then used to predict ``check_agreement`` and
        ``check_correlation``.  The KS and CvM values are printed with their
        0.09 / 0.002 threshold checks.

        When ``self.nn`` is true the data is split 80/20, a ``StandardScaler``
        fitted on the training part is applied to the training, validation,
        agreement and correlation inputs, and the scaled validation set is
        stored in ``self.train_params['validation_data']``.

        Sets ``self.ks``, ``self.cvm``, ``self.trained_model``,
        ``self.agreement_probs`` and ``self.correlation_probs`` (and
        ``self.scaler`` in the NN case).  Returns nothing.
        """
        print('Checking KS and CVM for ' + self.model_name + ' model with ' +
              self.var_name + ' variables\n')

        mod = self.create_model()

        X = data[self.variables].values
        y = data['signal'].values

        # Same strict comparison the method has always used for this flag.
        is_nn = self.nn == True

        fit_X, fit_y = X, y
        if is_nn:
            train_X, val_X, train_y, val_y = train_test_split(X,
                                                              y,
                                                              test_size=0.2)
            scaler = StandardScaler()
            train_X = scaler.fit_transform(train_X)
            val_X = scaler.transform(val_X)
            # Register the validation set only after scaling: storing it
            # earlier kept a reference to the unscaled array.
            self.train_params['validation_data'] = (val_X, val_y)
            ch_agr = scaler.transform(check_agreement[self.variables])
            ch_cor = scaler.transform(check_correlation[self.variables])
            self.scaler = scaler
            # Fit on the scaled training split.  Fitting on the raw full X
            # would train on the validation rows and on a different feature
            # scale than the one used for the predictions below.
            fit_X, fit_y = train_X, train_y

        mod.fit(fit_X, fit_y, **self.train_params)

        if is_nn:
            # NOTE(review): unlike correlation_probs this output is not
            # reshaped to 1-D; confirm compute_ks accepts its shape.
            agreement_probs = mod.predict(ch_agr)
            correlation_probs = mod.predict(ch_cor).reshape(
                (ch_cor.shape[0], ))
        else:
            agreement_probs = mod.predict_proba(
                check_agreement[self.variables].values)[:, 1]
            correlation_probs = mod.predict_proba(
                check_correlation[self.variables].values)[:, 1]

        ks = evaluation.compute_ks(
            agreement_probs[check_agreement['signal'].values == 0],
            agreement_probs[check_agreement['signal'].values == 1],
            check_agreement[check_agreement['signal'] == 0]['weight'].values,
            check_agreement[check_agreement['signal'] == 1]['weight'].values)
        cvm = evaluation.compute_cvm(correlation_probs,
                                     check_correlation['mass'])
        print('KS metric = {}. Is it smaller than 0.09? {}'.format(
            ks, ks < 0.09))
        print('CVM metric = {}. Is it smaller than 0.002? {}\n'.format(
            cvm, cvm < 0.002))
        self.ks = ks
        self.cvm = cvm
        self.trained_model = mod
        self.agreement_probs = agreement_probs
        self.correlation_probs = correlation_probs
Example #10
0
    def objective(parameters):
        """Return the optimisation loss for the weight vector ``parameters``.

        After ``set_weights(model, parameters)`` the model is scored on the
        data captured from the enclosing scope: truncated AUC on ``X``/``y``,
        KS on ``Xa`` split by ``ya`` with weights ``wa``, and CvM on ``Xc``
        against ``mc``.  The loss is the negated AUC plus a leaky penalty on
        each metric's distance from its threshold.

        Each call appends to ``i``.  A call whose KS and CvM are under their
        thresholds and whose AUC exceeds ``auc_log[0]`` also appends to
        ``d``, dumps the model, replaces the last ``auc_log`` entry and logs
        the new best.  With ``verbose`` set, every call prints its metrics.
        """
        def _column_one(features):
            # Column 1 of the model output for the given inputs.
            return model.predict(features, batch_size=256, verbose=0)[:, 1]

        def _leaky_excess(value, target, slope=0.001):
            # LeakyReLU on (value - target): slope 1 above target, `slope` below.
            factor = 1 if value > target else slope
            return factor * (value - target)

        i.append(0)
        set_weights(model, parameters)

        auc = roc_auc_truncated(y, _column_one(X))

        pa = _column_one(Xa)
        is_zero, is_one = ya == 0, ya == 1
        ks = compute_ks(pa[is_zero], pa[is_one], wa[is_zero], wa[is_one])

        cvm = compute_cvm(_column_one(Xc), mc)

        # Relative importance of the KS and CVM terms.
        ks_importance, cvm_importance = 1, 1
        loss = (-auc
                + ks_importance * _leaky_excess(ks, ks_threshold)
                + cvm_importance * _leaky_excess(cvm, cvm_threshold))

        if ks < ks_threshold and cvm < cvm_threshold and auc > auc_log[0]:
            d.append(0)
            target_file = transductor_model_file.format(len(d))
            dump_transductor_model(model, target_file)
            auc_log.pop()
            auc_log.append(auc)
            template = "iteration {:7}: Best AUC={:7.5f} achieved, KS={:7.5f}, CVM={:7.5f}"
            logger.info(template.format(len(i), auc, ks, cvm))

        if verbose:
            line = "iteration {:7}: AUC: {:7.5f}, KS: {:7.5f}, CVM: {:7.5f}, loss: {:8.5f}".format(
                len(i), auc, ks, cvm, loss)
            print(line)
        return loss
# Train the booster for `num_trees` rounds, reporting the training set.
num_trees = 250

n_rounds = 120
watchlist = [(xg_train, 'train')]

xgb_model = xgb.train(params, xg_train, num_trees, watchlist)
# xgb_model = xgb.train(params, xg_train, n_rounds, watchlist)

# Check agreement test
check_agreement = pandas.read_csv(folder + 'check_agreement.csv', index_col='id')
# NOTE(review): every column left after `index_col='id'` (including
# 'signal' and 'weight') is fed to the booster here; confirm this matches
# the columns used to build `xg_train`.
xg_check_agreement = xgb.DMatrix(check_agreement.values)
agreement_probs = xgb_model.predict(xg_check_agreement)

ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
# Single-argument print calls give the same output as the old Python 2
# print statements and are valid on Python 3 too.
print('KS metric %s %s' % (ks, ks < 0.09))

# Check correlation test
check_correlation = pandas.read_csv(folder + 'check_correlation.csv', index_col='id')
xg_check_correlation = xgb.DMatrix(check_correlation.values)
correlation_probs = xgb_model.predict(xg_check_correlation)
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
print('CvM metric %s %s' % (cvm, cvm < 0.002))

# Compute weighted AUC on the training data with min_ANNmuon > 0.4
train_eval = train[train['min_ANNmuon'] > 0.4]
# `axis` must be passed by keyword: the positional form was removed in pandas 2.0.
train_eval_X = train_eval.drop(variables_to_drop, axis=1).values
xg_train_eval = xgb.DMatrix(train_eval_X)
# Fit the `gd` model, then check agreement and correlation inputs for both
# `gd` and `rf`.
gd.fit(train[features], train["signal"])

check_agreement = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_agreement.csv', index_col='id')
agreement_probs = gd.predict_proba(check_agreement[features])[:, 1]
# Disabled: KS check for the `gd` model on its own.
# ks = evaluation.compute_ks(
#     agreement_probs[check_agreement['signal'].values == 0],
#     agreement_probs[check_agreement['signal'].values == 1],
#     check_agreement[check_agreement['signal'] == 0]['weight'].values,
#     check_agreement[check_agreement['signal'] == 1]['weight'].values)
#
# print 'KS metric gb', ks, ks < 0.09
agreement_probs1 = rf.predict_proba(check_agreement[features])[:, 1]

# KS of the blended prediction: 0.3 * rf + 0.7 * gd.
ks1 = evaluation.compute_ks(
    0.3*agreement_probs1[check_agreement['signal'].values == 0]+
    0.7*agreement_probs[check_agreement['signal'].values == 0],
    0.3*agreement_probs1[check_agreement['signal'].values == 1]+
    0.7*agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)

# The label says "rf" but the value printed is for the blend above.
# Single-argument print call: identical output to the old Python 2 print
# statement, and valid on Python 3.
print('KS metric rf %s %s' % (ks1, ks1 < 0.09))

check_correlation = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_correlation.csv', index_col='id')
correlation_probs = gd.predict_proba(check_correlation[features])[:, 1]
#cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
#print 'CvM metric for gb', cvm, cvm < 0.002

correlation_probs1 = rf.predict_proba(check_correlation[features])[:, 1]
Example #13
0
          'gamma': 0.01, # 0.005
          "min_child_weight": 5,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          'nthread': 4,
          "seed": 1}

num_trees = 600

#gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)

# Agreement check for the `rf` model: KS between the predictions of the
# signal == 0 and signal == 1 rows, using the 'weight' column as weights.
agreement_probs = rf.predict_proba(check_agreement[features])[:, 1]
print('Checking agreement...')
_probs_zero = agreement_probs[check_agreement['signal'].values == 0]
_probs_one = agreement_probs[check_agreement['signal'].values == 1]
_weights_zero = check_agreement[check_agreement['signal'] == 0]['weight'].values
_weights_one = check_agreement[check_agreement['signal'] == 1]['weight'].values
ks = evaluation.compute_ks(_probs_zero, _probs_one, _weights_zero, _weights_one)
print('KS metric UB =', ks, ks < 0.09)

# Truncated AUC of the same model on `train_eval`.
train_eval_probs1 = rf.predict_proba(train_eval[features])[:, 1]
AUC1 = evaluation.roc_auc_truncated(train_eval['signal'], train_eval_probs1)
print('AUC UB ', AUC1)

# Predict the test set and write the submission file.
print("Make predictions on the test set")
rfpred = rf.predict_proba(test[features])[:, 1]
test_probs = rfpred
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("ub_only_submission.csv", index=False)
def train_test_predict(classifier, classifier_name, features, features_name,
        data_directory, training_data):
    """Fit ``classifier``, run the three checks and archive its test predictions.

    Steps, each timed and reported on standard output:

    1. fit on ``training_data[features]`` against ``training_data['signal']``;
    2. agreement test: KS on ``check_agreement.csv``, printed with ``< 0.09``;
    3. correlation test: CvM on ``check_correlation.csv``, printed with
       ``< 0.002``;
    4. truncated AUC on training rows with ``min_ANNmuon > 0.4``;
    5. predictions for ``test.csv``.

    The CSV files are read from ``data_directory`` (joined by plain string
    concatenation).  Predictions are written to
    ``<classifier_name>-<features_name>.csv`` and packed into a ``.7z``
    archive of the same base name using the external ``rm``, ``7z`` and
    ``ls`` commands, whose output is printed.
    """
    def _report_elapsed(template, began):
        # Every stage ends with its timing line followed by an empty line.
        print(template.format(time.time() - began))
        print()

    # Fit the classifier with the training data.
    began = time.time()
    classifier.fit(training_data[features], training_data['signal'])
    _report_elapsed("time to fit the classifier: {} seconds", began)

    # Check the agreement test.
    began = time.time()
    check_agreement = pandas.read_csv(data_directory + 'check_agreement.csv', index_col='id')
    agreement_probs = classifier.predict_proba(check_agreement[features])[:, 1]
    probs_zero = agreement_probs[check_agreement['signal'].values == 0]
    probs_one = agreement_probs[check_agreement['signal'].values == 1]
    weights_zero = check_agreement[check_agreement['signal'] == 0]['weight'].values
    weights_one = check_agreement[check_agreement['signal'] == 1]['weight'].values
    ks = evaluation.compute_ks(probs_zero, probs_one, weights_zero, weights_one)
    print('KS metric', ks, ks < 0.09)
    _report_elapsed("time to check the agreement test: {} seconds", began)

    # Check the correlation test.
    began = time.time()
    check_correlation = pandas.read_csv(data_directory + 'check_correlation.csv', index_col='id')
    correlation_probs = classifier.predict_proba(check_correlation[features])[:, 1]
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
    print('CvM metric', cvm, cvm < 0.002)
    _report_elapsed("time to check the correlation test: {} seconds", began)

    # Compute weighted AUC on the training data with min_ANNmuon > 0.4.
    began = time.time()
    train_eval = training_data[training_data['min_ANNmuon'] > 0.4]
    train_probs = classifier.predict_proba(train_eval[features])[:, 1]
    AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
    print('AUC', AUC)
    _report_elapsed("time to compute the weighted AUC: {} seconds", began)

    # Make predictions on the test data.
    began = time.time()
    testing_data = pandas.read_csv(data_directory + 'test.csv', index_col='id')
    result = pandas.DataFrame({'id': testing_data.index})
    result['prediction'] = classifier.predict_proba(testing_data[features])[:, 1]
    _report_elapsed("time to make predictions: {} seconds", began)

    predictions_name = classifier_name + '-' + features_name
    csv_name = predictions_name + '.csv'
    archive_name = predictions_name + '.7z'

    # Generate the csv file for Kaggle.
    result.to_csv(csv_name, index=False, sep=',')

    # Build the final archive with external commands: remove any stale
    # archive, pack the csv, then list both files.
    shell_steps = (
        ['rm', '-f', archive_name],
        ['7z', 'a', archive_name, csv_name],
        ['ls', '-Ahl', csv_name],
        ['ls', '-Ahl', archive_name],
    )
    for command in shell_steps:
        print(subprocess.check_output(command))
    # since preds are margin(before logistic transformation, cutoff at 0)
    return "truncated AUC", -evaluation.roc_auc_truncated(labels, preds)


# Train with the custom `evalerror` metric, stopping early after 10 rounds
# without improvement.
bst = xgb.train(param, dtrain, num_round, watchlist, feval=evalerror, early_stopping_rounds=10)

# check agreement with noise
# `.ix` was removed in pandas 1.0; `.loc` performs the same boolean row selection.
check_agreement = all_data.loc[all_data["dataset"] == "agreement", :]
dagreement = xgb.DMatrix(np.array(check_agreement[variables]))

for noise_level in np.linspace(0, 0.2, 6):
    agreement_probs = add_noise(bst.predict(dagreement), noise_level)

    ks = evaluation.compute_ks(
        agreement_probs[check_agreement["signal"].values == 0],
        agreement_probs[check_agreement["signal"].values == 1],
        check_agreement[check_agreement["signal"] == 0]["weight"].values,
        check_agreement[check_agreement["signal"] == 1]["weight"].values,
    )
    # Single-argument print call; the doubled space reproduces the output
    # of the old Python 2 print statement exactly.
    print("KS metric with noise level %f  %s %s" % (noise_level, ks, ks < 0.09))

# Correlation probs with noise
check_correlation = all_data.loc[all_data["dataset"] == "correlation", :]
dcorrelation = xgb.DMatrix(np.array(check_correlation[variables]))

for noise_level in np.linspace(0, 0.2, 6):
    correlation_probs = add_noise(bst.predict(dcorrelation), noise_level)
    cvm = evaluation.compute_cvm(correlation_probs, check_correlation["mass"])
    print("CvM metric with noise level %f  %s %s" % (noise_level, cvm, cvm < 0.002))

# Validation with noise
train_view = all_data.loc[all_data["dataset"] == "train", :]