def my_obj(args):
    """Score one candidate cut percentile for the stacked predictions.

    Each row of the module-level ``X_train`` / ``X_valid`` arrays is collapsed
    to a single score (arithmetic or geometric mean, chosen by the
    module-level ``METHOD``).  Rows whose score exceeds the requested
    percentile of the *training* scores are labelled signal, and the inverse
    AMS of that selection is reported for both splits.

    The returned dict has the ``loss`` / ``status`` layout used with
    ``STATUS_OK`` -- presumably consumed by hyperopt's ``fmin``; confirm
    against the caller.

    Args:
        args: Sequence whose first element is the percentile (0-100) of the
            training scores to use as the cut point.  It is truncated with
            ``int``.

    Returns:
        dict with keys ``'loss'`` (inverse AMS on the training split),
        ``'valid_loss'`` (inverse AMS on the validation split) and
        ``'status'`` (``STATUS_OK``).

    Raises:
        ValueError: If ``METHOD`` is neither ``'mean'`` nor ``'geomean'``.
    """
    threshold = int(args[0])

    def inv_AMSScore(s, b):
        """Return 1 / AMS(s, b), or 1 when the AMS is undefined or zero."""
        try:
            return 1 / math.sqrt(
                2. * ((s + b + 10.) * math.log(1. + s / (b + 10.)) - s))
        except (ValueError, ArithmeticError):
            # ValueError: log/sqrt domain error; ArithmeticError covers the
            # division by a zero AMS.  Fall back to a neutral loss of 1, but
            # let unrelated errors (e.g. TypeError) surface.
            return 1

    # Apply a stacking method: one combined score per row.
    if METHOD == 'mean':
        prob_predict_train = np.mean(X_train, axis=1)
        prob_predict_valid = np.mean(X_valid, axis=1)
    elif METHOD == 'geomean':
        def geometric_mean(x):
            x = np.asarray(x)
            # Row product followed by the n-th root, n = number of columns.
            return np.power(np.prod(x, axis=1), 1.0 / x.shape[1])

        prob_predict_train = geometric_mean(X_train)
        prob_predict_valid = geometric_mean(X_valid)
    else:
        raise ValueError(
            "METHOD must be 'mean' or 'geomean', got %r" % (METHOD,))

    # Choose the cut point from the training scores only, then apply the same
    # cut to the validation scores.
    pcut = np.percentile(prob_predict_train, threshold)

    # These are the final signal (True) / background (False) predictions.
    Yhat_train = prob_predict_train > pcut
    Yhat_valid = prob_predict_valid > pcut

    # s and b for each split, as computed by higgs_lib.count_s_b from the
    # weights, true labels and predicted labels (its exact definition is not
    # visible here).
    s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
    s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

    # Report the inverse AMS scores (lower is better).
    trial_results = {}
    trial_results['loss'] = inv_AMSScore(s_train, b_train)
    trial_results['valid_loss'] = inv_AMSScore(s_valid, b_valid)
    trial_results['status'] = STATUS_OK
    return trial_results
def my_obj(args):
    """Score one candidate cut percentile for the stacked predictions.

    Each row of the module-level ``X_train`` / ``X_valid`` arrays is collapsed
    to a single score (arithmetic or geometric mean, chosen by the
    module-level ``METHOD``).  Rows whose score exceeds the requested
    percentile of the *training* scores are labelled signal, and the inverse
    AMS of that selection is reported for both splits.

    The returned dict has the ``loss`` / ``status`` layout used with
    ``STATUS_OK`` -- presumably consumed by hyperopt's ``fmin``; confirm
    against the caller.

    Args:
        args: Sequence whose first element is the percentile (0-100) of the
            training scores to use as the cut point.  It is truncated with
            ``int``.

    Returns:
        dict with keys ``'loss'`` (inverse AMS on the training split),
        ``'valid_loss'`` (inverse AMS on the validation split) and
        ``'status'`` (``STATUS_OK``).

    Raises:
        ValueError: If ``METHOD`` is neither ``'mean'`` nor ``'geomean'``.
    """
    threshold = int(args[0])

    def inv_AMSScore(s, b):
        """Return 1 / AMS(s, b), or 1 when the AMS is undefined or zero."""
        try:
            return 1 / math.sqrt(
                2. * ((s + b + 10.) * math.log(1. + s / (b + 10.)) - s))
        except (ValueError, ArithmeticError):
            # ValueError: log/sqrt domain error; ArithmeticError covers the
            # division by a zero AMS.  Fall back to a neutral loss of 1, but
            # let unrelated errors (e.g. TypeError) surface.
            return 1

    # Apply a stacking method: one combined score per row.
    if METHOD == 'mean':
        prob_predict_train = np.mean(X_train, axis=1)
        prob_predict_valid = np.mean(X_valid, axis=1)
    elif METHOD == 'geomean':
        def geometric_mean(x):
            x = np.asarray(x)
            # Row product followed by the n-th root, n = number of columns.
            return np.power(np.prod(x, axis=1), 1.0 / x.shape[1])

        prob_predict_train = geometric_mean(X_train)
        prob_predict_valid = geometric_mean(X_valid)
    else:
        raise ValueError(
            "METHOD must be 'mean' or 'geomean', got %r" % (METHOD,))

    # Choose the cut point from the training scores only, then apply the same
    # cut to the validation scores.
    pcut = np.percentile(prob_predict_train, threshold)

    # These are the final signal (True) / background (False) predictions.
    Yhat_train = prob_predict_train > pcut
    Yhat_valid = prob_predict_valid > pcut

    # s and b for each split, as computed by higgs_lib.count_s_b from the
    # weights, true labels and predicted labels (its exact definition is not
    # visible here).
    s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
    s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

    # Report the inverse AMS scores (lower is better).
    trial_results = {}
    trial_results['loss'] = inv_AMSScore(s_train, b_train)
    trial_results['valid_loss'] = inv_AMSScore(s_valid, b_valid)
    trial_results['status'] = STATUS_OK
    return trial_results
def objective(args):
    """Train one boosted model and report its inverse AMS on both splits.

    Args:
        args: Three-item sequence ``(depth, eta, threshold)``.  ``depth`` and
            ``threshold`` are truncated with ``int``; ``eta`` is passed to
            xgboost unchanged.  ``threshold`` is the percentile of the
            training scores used as the signal cut.

    Returns:
        dict with keys ``'loss'`` (``higgs_lib.inv_AMSScore`` on the training
        split), ``'valid_loss'`` (same on the validation split) and
        ``'status'`` (``STATUS_OK``).

    Reads the module-level ``dtrain``, ``dvalid``, ``sum_wneg``, ``sum_wpos``,
    ``number_threads``, ``n_boost_iter`` and the ``W_*`` / ``Y_*`` arrays.
    """
    depth, eta, threshold = args
    threshold = int(threshold)
    depth = int(depth)

    # Raw (pre-logistic) margins are enough because only the ranking of the
    # scores matters for the percentile cut below.
    booster_params = {
        'objective': 'binary:logitraw',
        # Scale the weight of positive examples.
        'scale_pos_weight': sum_wneg / sum_wpos,
        'bst:eta': eta,
        'bst:max_depth': depth,
        'eval_metric': 'auc',
        'silent': 0,
        'nthread': number_threads,
    }

    # Validation and training sets to watch while boosting.
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    booster = xgb.train(booster_params, dtrain, n_boost_iter, watchlist)

    # Raw scores of the trained model on each split.
    scores_train = booster.predict(dtrain)
    scores_valid = booster.predict(dvalid)

    # The cut is taken from the training scores and reused for validation.
    cut = np.percentile(scores_train, threshold)
    is_signal_train = scores_train > cut
    is_signal_valid = scores_valid > cut

    # s and b for each split, as computed by higgs_lib.count_s_b.
    s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, is_signal_train)
    s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, is_signal_valid)

    return {
        'loss': higgs_lib.inv_AMSScore(s_train, b_train),
        'valid_loss': higgs_lib.inv_AMSScore(s_valid, b_valid),
        'status': STATUS_OK,
    }
# Train the booster on dtrain, watching the sets listed in evallist.
bst = xgb.train(param, dtrain, n_boost_iter, evallist)

# Score both splits with the trained booster.
predict_train = bst.predict(dtrain)
predict_valid = bst.predict(dvalid)

# Cutoff for assigning signal/background labels: the `threshold`-th
# percentile of the training scores, reused unchanged for validation.
pcut = np.percentile(predict_train, threshold)

# Final signal (True) / background (False) predictions.
Yhat_train, Yhat_valid = predict_train > pcut, predict_valid > pcut

# s and b for each split, as computed by higgs_lib.count_s_b.
s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

# Report the AMS scores.  Each print takes a single string so the output is
# the same whether print is a statement or a function.
print('Calculating AMS score for a probability cutoff %s' % pcut)
print(' '.join((' - AMS based on 90% training sample:',
                str(higgs_lib.AMSScore(s_train, b_train)))))
print(' '.join((' - AMS based on 10% validation sample:',
                str(higgs_lib.AMSScore(s_valid, b_valid)))))

# <codecell>

# Optionally write a submission csv file.
# Better submissions result from applying bagging first.

# Load the testing data (header row skipped).
print('Loading testing data')
data_test = np.loadtxt('test.csv', delimiter=',', skiprows=1)
# Keep columns 1..30; column 0 is presumably the event id -- confirm
# against the csv header.
X_test = data_test[:, 1:31]
# Make predictions with the bagged models.
# Use the full training set here, not the bag-ratioed subsample.
dtrain = xgb.DMatrix(X_train, label=Y_train, weight=weight)
predict_train = bag_predict(models, dtrain)
predict_valid = bag_predict(models, dvalid)

# Cutoff for assigning signal/background labels: the best['threshold']-th
# percentile of the training scores, reused unchanged for validation.
pcut = np.percentile(predict_train, best['threshold'])

# Final signal (True) / background (False) predictions.
Yhat_train, Yhat_valid = predict_train > pcut, predict_valid > pcut

# s and b for each split, as computed by higgs_lib.count_s_b.
s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

# Report the AMS scores.  Each print takes a single string so the output is
# the same whether print is a statement or a function.
print(' '.join(('Calculating AMS score for a probability cutoff pcut=',
                str(pcut))))
print(' '.join((' - AMS based on 90% training sample:',
                str(higgs_lib.AMSScore(s_train, b_train)))))
print(' '.join((' - AMS based on 10% validation sample:',
                str(higgs_lib.AMSScore(s_valid, b_valid)))))

# <codecell>

# Now load the testing data, storing the data (X) and index (I).
print('Loading testing data')
data_test = np.loadtxt('test_log.csv', delimiter=',', skiprows=1)
X_test = data_test[:, 1:]       # every column after the first
I_test = list(data_test[:, 0])  # first column, kept as a plain list