Exemple #1
0
def cv_model(model_list):
	print "generating cv csv files...."
	train, test = gen_data()
	label = train['signal']
	train_id = train.id
	test_id = test.id

	train_del, test_del = delete_features(train), delete_features(test)

	check_agreement = pd.read_csv('../data/check_agreement.csv')
	check_correlation = pd.read_csv('../data/check_correlation.csv')
	check_agreement= add_features(check_agreement)
	check_correlation  = add_features(check_correlation)

	X, X_test = train_del.as_matrix(), test_del.as_matrix()
	print X.shape, X_test.shape

	kf = KFold(label, n_folds = 4)
	for j, (clf, clf_name) in enumerate(model_list):
		
		print "modelling model %i ...."%j
		cv_train = np.zeros(len(label))
		for i, (train_fold, validate) in enumerate(kf):
			X_train, X_validate, label_train, label_validate = X[train_fold,:], X[validate,:], label[train_fold], label[validate]
			clf.fit(X_train,label_train)
			cv_train[validate] = clf.predict_proba(X_validate)[:,1]
		auc_score = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4], 
			pd.Series(cv_train)[train['min_ANNmuon'] > 0.4])
		print "the true roc_auc_truncated is %.6f"%auc_score

		clf.fit(X, label)
		test_probs = clf.predict_proba(X_test)[:,1]
		# check if it passes the tests
		print "check if it passes the tests"
		agreement_probs = clf.predict_proba(delete_features(check_agreement).as_matrix())[:,1]
		ks = evaluation.compute_ks(
			agreement_probs[check_agreement['signal'].values == 0],
			agreement_probs[check_agreement['signal'].values == 1],
			check_agreement[check_agreement['signal'] == 0]['weight'].values,
			check_agreement[check_agreement['signal'] == 1]['weight'].values)
		print ('KS metric', ks, ks <= 0.09)

		correlation_probs = clf.predict_proba(delete_features(check_correlation).as_matrix())[:,1]
		print ('Checking correlation...')
		cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
		print ('CvM metric', cvm, cvm <= 0.002)
		#if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975: # no need to check here
		if auc_score > 0.965: # the minimum threshold
			# save the cv
			cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train, "label": label})
			cv_sub.to_csv("../data/cv_folder/xgb%i.csv"%j, index=False)
			# save the prediction
			submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
			submission.to_csv("../data/pred_folder/xgb%i.csv"%j, index=False)
			# save agreement
			submission = pd.DataFrame({"id": check_agreement['id'], "prediction": agreement_probs})
			submission.to_csv("../data/agree_folder/xgb%i.csv"%j, index=False)
			# save correlation
			submission = pd.DataFrame({"id": check_correlation['id'], "prediction": correlation_probs})
			submission.to_csv("../data/correlation_folder/xgb%i.csv"%j, index=False)
Exemple #2
0
def gen_data():
	path = '../data/'
	print "loading data..."
	train = pd.read_csv(path + "training.csv")
	test  = pd.read_csv(path + "test.csv")
	train, test = add_features(train), add_features(test)

	return train, test
        print 'ANN Score= %s' % (score6)
        print 'LR + GB + ANN Score = %s' % (score7)
        print 'ADA Score = %s' % (score8)
        print 'GB + ANN + ADA Score = %s' % (score9)
        print 'LR + GB + ANN + ADA Score = %s' % (score10)
        return blend_train, Y_dev, blend_test, Y_test

    # average of ADA, ANN and GBM.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
            Y_test_predict4) / 4.0


# Read the raw training and test tables, then run add_features on both.
train = pd.read_csv('../input/training.csv')
test = pd.read_csv('../input/test.csv')
train, test = add_features(train), add_features(test)

# add SPDHITS back...

# Columns of the training table that are left out of the feature list.
filter_out = [
    'id',
    'min_ANNmuon',
    'production',
    'mass',
    'signal',
    'p0_eta',
    'p1_eta',
    'p2_eta',
    'LifeTime',
    'FlightDistanceError',
]

# Every remaining training column, in the table's own column order.
features = [column for column in train.columns if column not in filter_out]

is_test = False

res = stacked_models(train, features, test, is_test)
        print 'Scipy Score = %s' % (score4)
        print 'LR + GB score = %s' % (score5)
        print 'ANN Score= %s' % (score6)
        print 'LR + GB + ANN Score = %s' % (score7)
        print 'ADA Score = %s' % (score8)
        print 'GB + ANN + ADA Score = %s' % (score9)
        print 'LR + GB + ANN + ADA Score = %s' % (score10)
        return blend_train, Y_dev, blend_test, Y_test

    # average of ADA, ANN and GBM.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 4.0

# Load both input tables first, then add the derived features to each.
train = pd.read_csv('../input/training.csv')
test = pd.read_csv('../input/test.csv')
train, test = add_features(train), add_features(test)

# add SPDHITS back...

# Training-table columns that must not appear in the feature list.
filter_out = [
    'id', 'min_ANNmuon', 'production', 'mass', 'signal',
    'p0_eta', 'p1_eta', 'p2_eta',
    'LifeTime', 'FlightDistanceError',
]

# Keep the table's column order, dropping only the names listed above.
features = [name for name in train.columns if name not in filter_out]

# Passed through unchanged as the last argument of stacked_models.
is_test = False

res = stacked_models(train, features, test, is_test)

if not is_test:
Exemple #5
0
import evaluation
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml.uboost import uBoostClassifier
from hep_ml.gradientboosting import UGradientBoostingClassifier,LogLossFunction
from hep_ml.losses import BinFlatnessLossFunction, KnnFlatnessLossFunction

# Read the four input tables: training, test, and the two check sets.
print("Load the training/test data using pandas")
train = pd.read_csv("../input/training.csv")
test = pd.read_csv("../input/test.csv")
check_agreement = pd.read_csv('../input/check_agreement.csv')
check_correlation = pd.read_csv('../input/check_correlation.csv')

from feat import add_features

# Run the same add_features step over every table that was loaded.
print("Adding features to both training and testing")
train, test = add_features(train), add_features(test)
check_agreement, check_correlation = (
    add_features(check_agreement),
    add_features(check_correlation),
)

print("Eliminate SPDhits, which makes the agreement check fail")
from feat import filter_out

# Training columns not named in feat.filter_out, in the table's column order.
features = [column for column in train.columns if column not in filter_out]

# Subset of the training rows with min_ANNmuon above 0.4.
train_eval = train[train['min_ANNmuon'] > 0.4]

print("features:",features)
# Disabled: per-column min-max scaling of the feature columns.
#train[features] = train[features].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
#test[features] = test[features].apply(lambda x: (x - x.min()) / (x.max() - x.min()))