Example #1
0
def my_obj(args):
    """Hyperopt objective: score one probability-cutoff threshold for a
    stacked set of classifier outputs.

    Parameters
    ----------
    args : sequence
        One-element sequence whose first item is the percentile (0-100)
        used as the signal/background cut point.

    Returns
    -------
    dict
        Hyperopt trial result: 'loss' (inverse AMS on the 90% training
        sample), 'valid_loss' (inverse AMS on the 10% validation sample)
        and 'status' set to STATUS_OK.

    Relies on module-level globals: METHOD, X_train, X_valid, W_train,
    Y_train, W_valid, Y_valid, higgs_lib, STATUS_OK.
    """
    threshold = int(args[0])

    # Apply a stacking method: combine per-model probabilities per row.
    if METHOD == 'mean':
        # Direct axis reduction instead of apply_along_axis (same result,
        # vectorized).
        prob_predict_train = np.mean(X_train, axis=1)
        prob_predict_valid = np.mean(X_valid, axis=1)
    elif METHOD == 'geomean':

        def p(x):
            # Geometric mean across models: nth root of the row product.
            # np.prod replaces the deprecated cumproduct/last-column trick.
            z = np.prod(x, axis=1)
            return np.power(z, 1.0 / x.shape[1])

        prob_predict_train = p(X_train)
        prob_predict_valid = p(X_valid)
    else:
        # BUG FIX: original message said "mean or prob" (wrong option name)
        # and then called an undefined stop(); raise a clear error instead.
        raise ValueError("METHOD must be 'mean' or 'geomean', got %r" % (METHOD,))

    # Choose the cut point at the requested percentile of the train scores.
    pcut = np.percentile(prob_predict_train, threshold)

    # These are the final signal (True) / background (False) predictions.
    Yhat_train = prob_predict_train > pcut
    Yhat_valid = prob_predict_valid > pcut

    # Weighted number of signal/background true positives for both samples.
    s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
    s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

    def inv_AMSScore(s, b):
        # Inverse Approximate Median Significance; hyperopt minimizes this.
        # Catch only the math failures (log of non-positive argument,
        # division by zero) instead of a bare except that hides bugs.
        try:
            return 1 / math.sqrt(
                2. * ((s + b + 10.) * math.log(1. + s / (b + 10.)) - s))
        except (ValueError, ZeroDivisionError):
            # Degenerate counts: return the worst possible score.
            return 1

    return {
        'loss': inv_AMSScore(s_train, b_train),
        'valid_loss': inv_AMSScore(s_valid, b_valid),
        'status': STATUS_OK,
    }
Example #2
0
def my_obj(args):
    """Hyperopt objective evaluating a percentile threshold for stacked
    classifier probabilities.

    Parameters
    ----------
    args : sequence
        First element is the percentile (0-100) used to place the cut.

    Returns
    -------
    dict
        {'loss': inverse AMS (train), 'valid_loss': inverse AMS (valid),
         'status': STATUS_OK} as expected by hyperopt.

    Uses module-level globals: METHOD, X_train, X_valid, W_train, Y_train,
    W_valid, Y_valid, higgs_lib, STATUS_OK.
    """
    threshold = int(args[0])

    # Stack the per-model probability columns into a single score per row.
    if METHOD == 'mean':
        prob_predict_train = np.mean(X_train, axis=1)  # vectorized row mean
        prob_predict_valid = np.mean(X_valid, axis=1)
    elif METHOD == 'geomean':
        def p(x):
            # Geometric mean: nth root of the product of each row.
            # (np.prod replaces the deprecated np.cumproduct idiom.)
            return np.power(np.prod(x, axis=1), 1.0 / x.shape[1])
        prob_predict_train = p(X_train)
        prob_predict_valid = p(X_valid)
    else:
        # BUG FIX: the original printed "mean or prob" (incorrect option)
        # and called an undefined stop(); raise a descriptive error.
        raise ValueError("METHOD must be 'mean' or 'geomean', got %r" % (METHOD,))

    # Cut point at the requested percentile of the training scores.
    pcut = np.percentile(prob_predict_train, threshold)

    # Final boolean predictions: True = signal, False = background.
    Yhat_train = prob_predict_train > pcut
    Yhat_valid = prob_predict_valid > pcut

    # Weighted counts of signal/background true positives per sample.
    s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
    s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

    def inv_AMSScore(s, b):
        # Inverse Approximate Median Significance (lower is better).
        # Catch only math domain errors / zero division, not everything.
        try:
            return 1 / math.sqrt(
                2. * ((s + b + 10.) * math.log(1. + s / (b + 10.)) - s))
        except (ValueError, ZeroDivisionError):
            return 1  # worst-case score for degenerate counts

    trial_results = {}
    trial_results['loss'] = inv_AMSScore(s_train, b_train)
    trial_results['valid_loss'] = inv_AMSScore(s_valid, b_valid)
    trial_results['status'] = STATUS_OK
    return trial_results
def objective(args):
    """Hyperopt objective for tuning an xgboost classifier.

    Parameters
    ----------
    args : sequence
        (depth, eta, threshold) triple: tree depth, learning rate, and the
        percentile used to place the signal/background cut.

    Returns
    -------
    dict
        {'loss', 'valid_loss', 'status'} hyperopt trial result based on the
        inverse AMS of the training and validation samples.

    Uses module-level globals: sum_wneg, sum_wpos, number_threads, dtrain,
    dvalid, n_boost_iter, xgb, higgs_lib, W_*, Y_*, STATUS_OK.
    """
    depth, eta, threshold = args
    depth = int(depth)
    threshold = int(threshold)

    # Booster configuration: raw logistic-regression output (only the rank
    # matters for the cut), positive-class weighting, AUC as watched metric.
    param = {
        'objective': 'binary:logitraw',
        'scale_pos_weight': sum_wneg / sum_wpos,
        'bst:eta': eta,
        'bst:max_depth': depth,
        'eval_metric': 'auc',
        'silent': 0,
        'nthread': number_threads,
    }

    # Watch performance on both splits while boosting.
    evallist = [(dvalid, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, n_boost_iter, evallist)

    # Raw scores for the 90% training and 10% validation samples.
    predict_train = bst.predict(dtrain)
    predict_valid = bst.predict(dvalid)

    # Percentile-based cutoff separating signal from background.
    pcut = np.percentile(predict_train, threshold)
    Yhat_train = predict_train > pcut
    Yhat_valid = predict_valid > pcut

    # Weighted signal/background true-positive counts per sample.
    s_train, b_train = higgs_lib.count_s_b(W_train, Y_train, Yhat_train)
    s_valid, b_valid = higgs_lib.count_s_b(W_valid, Y_valid, Yhat_valid)

    return {
        'loss': higgs_lib.inv_AMSScore(s_train, b_train),
        'valid_loss': higgs_lib.inv_AMSScore(s_valid, b_valid),
        'status': STATUS_OK,
    }
# Train the boosted model with the chosen parameters and watch list
bst = xgb.train( param, dtrain, n_boost_iter, evallist )
 
# Predict raw scores for the 90% training and 10% validation samples
predict_train = bst.predict(dtrain)
predict_valid = bst.predict(dvalid)

# Select a cutoff point for assigning signal and background labels
pcut = np.percentile(predict_train,threshold)
 
# These are the final signal (True) and background (False) predictions
Yhat_train = predict_train > pcut 
Yhat_valid = predict_valid > pcut

# Calc weighted number of signal/background true positives for both samples
s_train, b_train = higgs_lib.count_s_b(W_train,Y_train,Yhat_train)
s_valid, b_valid = higgs_lib.count_s_b(W_valid,Y_valid,Yhat_valid)
 
# Now calculate the AMS (Approximate Median Significance) scores
print 'Calculating AMS score for a probability cutoff %s' % pcut
print '   - AMS based on 90% training   sample:',higgs_lib.AMSScore(s_train,b_train)
print '   - AMS based on 10% validation sample:',higgs_lib.AMSScore(s_valid,b_valid)

# <codecell>

# Optionally write a submission csv file
# Better submissions result from applying bagging first
# Load the testing data (column 0 is the event id, columns 1-30 the features)
print 'Loading testing data'
data_test = np.loadtxt( 'test.csv', delimiter=',', skiprows=1 )
X_test = data_test[:,1:31]
Example #5
0
# Make predictions with the bagging models.
# Use the full training set here, not the bag-ratioed subsets.
dtrain = xgb.DMatrix(X_train,label=Y_train, weight = weight)
predict_train = bag_predict(models,dtrain)
predict_valid = bag_predict(models,dvalid)

# Select a cutoff point (best threshold found by the hyperopt search)
pcut = np.percentile(predict_train,best['threshold'])

# These are the final signal (True) and background (False) predictions
Yhat_train = predict_train > pcut 
Yhat_valid = predict_valid > pcut
 
# Calc weighted number of signal/background true positives for both samples
s_train, b_train = higgs_lib.count_s_b(W_train,Y_train,Yhat_train)
s_valid, b_valid = higgs_lib.count_s_b(W_valid,Y_valid,Yhat_valid)
 
# Now calculate the AMS (Approximate Median Significance) scores
print 'Calculating AMS score for a probability cutoff pcut=',pcut
print '   - AMS based on 90% training   sample:',higgs_lib.AMSScore(s_train,b_train)
print '   - AMS based on 10% validation sample:',higgs_lib.AMSScore(s_valid,b_valid)
 

# <codecell>

# Now we load the testing data, storing the features (X) and event index (I)
print 'Loading testing data'
data_test = np.loadtxt( 'test_log.csv', delimiter=',', skiprows=1 )
X_test = data_test[:,1:]
I_test = list(data_test[:,0])