Code example #1
File: TaggerMVA.py  Project: tibristo/BosonTagging
def main():
    print 'Loading training data ...'
    data_train = pd.read_csv('csv/CamKt12LCTopoSplitFilteredMu100SmallR30YCut414tev_350_500_vxp_0_99-merged.csv')
    r =np.random.rand(data_train.shape[0])
    #Algorithm = 'AKT10LCTRIM530'
    
    plt.figure(1)
    Y_train = data_train['label'][r<0.9]
#    W_train = data_train['weight'][r<0.9]
    Y_valid = data_train['label'][r>=0.9]
#    W_valid = data_train['weight'][r>=0.9]
#    data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)
    for varset in itertools.combinations(data_train.columns.values[1:-1],2):
        print list(varset)
        X_train = data_train[list(varset)][r<0.9]
        X_valid = data_train[list(varset)][r>=0.9]
    
        #gbc = Pipeline([("scale", StandardScaler()), ("gbc",GBC(n_estimators=1,verbose=1, max_depth=10,min_samples_leaf=50))])
        #gbc = GBC(n_estimators=20,verbose=1, max_depth=10,min_samples_leaf=50)
        #gbc = GaussianNB()
        dt = DC(max_depth=3,min_samples_leaf=0.05*len(X_train))
        abc = ABC(dt,algorithm='SAMME',
                         n_estimators=800,
                         learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train.values, Y_train.values)
#    sample_weight=W_train.values 
        print 'Done.. Applying to validation sample and drawing ROC' 
        prob_predict_valid = abc.predict(X_valid)
        #[:,1]
        #
        print prob_predict_valid
        Y_score = abc.decision_function(X_valid.values)
        print Y_score
        fpr, tpr, _ = roc_curve(Y_valid.values, Y_score)
#        W_valid.values
        labelstring = 'And'.join(var.replace('_','') for var in varset)
        print labelstring    
        plt.plot(tpr, (1-fpr), label=labelstring)
        plt.figure(2)       
        plt.hist(abc.decision_function(X_valid[Y_valid==1.]).ravel(),
         color='r', alpha=0.5, range=(-1.0,1.0), bins=50)
        plt.hist(abc.decision_function(X_valid[Y_valid==0.]).ravel(),
         color='b', alpha=0.5, range=(-1.0,1.0), bins=50)
        plt.xlabel("scikit-learn BDT output")
        plt.savefig(labelstring+'bdtout.pdf')        
        #labelstring = ' and '.join(var.replace(Algorithm,'') for var in varset)
    plt.figure(1)   
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1- Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title('ROC Curve')
    plt.legend(loc="lower left",prop={'size':6})
    #plt.show()
    plt.savefig('rocmva.pdf')
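
The pattern in example #1 (train an AdaBoost classifier, score the validation set with decision_function, then pass the scores to roc_curve) reduces to the short sketch below. This is only an illustrative, self-contained version: the make_classification data and the hyperparameters are assumptions, not the BosonTagging CSV or settings used above.

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# Synthetic stand-in for the signal/background sample (assumption for illustration).
X, y = make_classification(n_samples=2000, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.3, random_state=0)

abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm='SAMME', n_estimators=200, learning_rate=0.5)
abc.fit(X_tr, y_tr)

scores = abc.decision_function(X_va)   # 1-D margin-like scores in the binary case
fpr, tpr, _ = roc_curve(y_va, scores)
print('AUC = %.3f' % auc(fpr, tpr))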
Code example #2
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
Code example #3
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))
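
The shapes asserted in examples #2 and #3 follow scikit-learn's convention: for two classes decision_function returns a 1-D array of length n_samples, while for K > 2 classes it returns an (n_samples, K) array. A small self-contained sketch (the make_classification data is an assumption, not the iris or toy data used in the tests):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier

# Binary problem: one score per sample.
X2, y2 = make_classification(n_samples=200, n_classes=2, random_state=0)
clf2 = AdaBoostClassifier(algorithm='SAMME').fit(X2, y2)
print(clf2.decision_function(X2).shape)   # (200,)

# Three-class problem: one score per class per sample.
X3, y3 = make_classification(n_samples=200, n_classes=3, n_informative=4, random_state=0)
clf3 = AdaBoostClassifier(algorithm='SAMME').fit(X3, y3)
print(clf3.decision_function(X3).shape)   # (200, 3)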
Code example #4
File: nMinus1Check.py  Project: mrelich/MuonAna
def n1check(d_train, d_test, opts):

    # Load the data with no weights and put it into pandas format
    # for easier manipulation
    pd_train = pd.DataFrame(d_train.getDataNoWeight())
    pd_test  = pd.DataFrame(d_test.getDataNoWeight())

    # Holder for results
    results = {}

    # Setup classifier
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                             n_estimators = opts.ntrees,
                             learning_rate = opts.lrate)

    # Train the classifier on total data set for comparison
    clf.fit(pd_train, d_train.targets)
    results['total'] = roc_auc_score(d_test.targets, clf.decision_function(pd_test))


    # Loop over the variables and store the results in dict
    keys    = d_train.t_varnames
    for i in range(len(keys)):
        
        sub_train = pd_train.drop(i,axis=1)
        sub_test  = pd_test.drop(i,axis=1)

        clf.fit(sub_train, d_train.targets)
        results[keys[i]] = roc_auc_score(d_test.targets, clf.decision_function(sub_test))


    # Now that we have the results, print all information
    print "--------------------------------------------"
    for key in results:
        print "Leaving out ", key, "gives score: ", results[key]
    print ""
Code example #5
def test_iris():
    """Check consistency on dataset iris."""
    classes = np.unique(iris.target)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        assert_equal(clf.predict_proba(iris.data).shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)
Code example #6
File: TaggerMVA.py  Project: wbhimji/BosonTagging
def main():
    Algorithm = 'CamKt12LCTopoSplitFilteredMu67SmallR0YCut9'
    print 'Loading training data ...'

    data_train = pd.read_csv(Algorithm+'merged.csv')   
    r =np.random.rand(data_train.shape[0])
    
    #Set label and weight vectors - and drop any unwanted training ones
    Y_train = data_train['label'].values[r<0.5]
    # W_train = data_train['weight'].values[r<0.9]
    Y_valid = data_train['label'].values[r>=0.5]
    # W_valid = data_train['weight'].values[r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)

    varcombinations = itertools.combinations(data_train.columns.values[1:-1],2)
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)

    colors = plt.get_cmap('jet')(np.linspace(0, 1.0,combos(len(data_train.columns.values[1:-1]),2) ))

    for varset,color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r<0.5]
        X_valid = data_train[list(varset)].values[r>=0.5]


        dt = DC(max_depth=3,min_samples_leaf=0.05*len(X_train))
        abc = ABC(dt,algorithm='SAMME',
                 n_estimators=8,
                 learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC' 
        prob_predict_valid = abc.predict_proba(X_valid)[:,1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)
        labelstring = ' And '.join(var.replace('_','') for var in varset)
        print labelstring
        plt.plot(tpr, (1-fpr), label=labelstring, color=color)

        
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1- Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm+' ROC Curve')
    plt.legend(loc="lower left",prop={'size':6})
    plt.savefig(Algorithm+'rocmva.pdf')
Code example #7
def ada_boost(X_train, X_test, y_train, y_test, C=1):
	X1 = []
	X2 = []
	y1 = []
	y2 = []
	for x, y in zip(X_train, y_train):
		if y==1:
			y1.append(y)
			X1.append(x)
		else:
			y2.append(y)
			X2.append(x)

	print(y1.count(1))
	print(y2.count(0))
	X1 =np.asarray(X1)
	X2 =np.asarray(X2)
	y1 = np.asarray(y1)
	y2 = np.asarray(y2)
	# y = np.asarray(y)
	X = np.concatenate((X1, X2))
	y = np.concatenate((y1, y2))

	# Create and fit an AdaBoosted decision tree
	bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
	                         algorithm="SAMME",
	                         n_estimators=200)

	bdt.fit(X, y)

	# Plot the two-class decision scores
	twoclass_output = bdt.decision_function(X)

	print(type(twoclass_output))

	# import IPython
	# IPython.embed()

	y_pre = bdt.predict(X_test)

	return y_pre,classification_report(y_test, y_pre)
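
In the two-class setting of example #7, predict is just the sign of decision_function: positive scores map to clf.classes_[1] and non-positive scores to clf.classes_[0]. A minimal sketch on synthetic data (the data and hyperparameters are illustrative assumptions):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, random_state=1)
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME", n_estimators=200).fit(X, y)

scores = bdt.decision_function(X)   # shape (500,)
pred = bdt.predict(X)
# A positive score selects classes_[1], anything else classes_[0].
assert np.array_equal(pred, bdt.classes_[(scores > 0).astype(int)])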
Code example #8
File: BDT_analysis.py  Project: jpyne17/msci-hep
#################
#     2 JET     #
#################

# Create BDT object.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, min_samples_split=0.05),
                         learning_rate=0.15,
                         algorithm="SAMME",
                         n_estimators=200
                         )

# Train BDT for 2 jet.
bdt.fit(train_2jet, train_2jet_class, sample_weight=train_2jet_weights)

# Get decision scores for test set.
twoclass_output = np.array(bdt.decision_function(test_2jet))

# Plot decision histogram.
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)

plot_colors = 2*"r" + 12*"g" + "y" + 3*"b" + 3*"m"
plot_step = 0.02
class_names = ['qqZvvH125', 'qqWlvH125', 'Wbb', 'Wbc', 'Wcc', 'Wbl', 'Wcl', 'Wl',
               'Zbb', 'Zbc', 'Zcc', 'Zbl', 'Zcl', 'Zl', 'ttbar', 'stopt', 'stops',
               'stopWt', 'WW', 'ZZ', 'WZ']

for n, c in zip(class_names, plot_colors):
    this_data = twoclass_output[test_2jet_processes == n]
    this_weights = test_2jet_weights[test_2jet_processes == n] * SF_map_2jet[n]
    plt.hist(this_data,
Code example #9
clf = AdaBoostClassifier()
clf = clf.fit(X, Y)
X_test = exp_all[testing_idx]
Yprime_test = clf.predict(X_test)


print collections.Counter(health_classes[testing_idx] == Yprime_test)
print collections.Counter(zip(health_classes[testing_idx], health_classes[testing_idx] == Yprime_test))

print collections.Counter(health_classes[testing_idx] == Yprime_test)
print collections.Counter(zip(health_classes[testing_idx], health_classes[testing_idx] == Yprime_test))


# In[180]:

clf.decision_function(X_test[0:10])


# In[213]:

import operator
imp = zip(range(0, 22283), clf.feature_importances_)
imp.sort(key=operator.itemgetter(1), reverse=True)
imp[0:20]


# In[182]:

health_classes[testing_idx][0:10]

Code example #10
File: testscikit.py  Project: tibristo/hbb

# Plot the class probabilities
class_proba = ada.predict_proba(x)[:, -1]
pl.subplot(132)
for i, n, c in zip(xrange(2), class_names, plot_colors):
    pl.hist(class_proba[y == i],
            bins=20,
            range=(0, 1),
            facecolor=c,
            label='Class %s' % n)
pl.legend(loc='upper center')
pl.ylabel('Samples')
pl.xlabel('Class Probability')

# Plot the two-class decision scores
twoclass_output = ada.decision_function(x)
pl.subplot(133)
for i, n, c in zip(xrange(2), class_names, plot_colors):
    pl.hist(twoclass_output[y == i],
            bins=20,
            range=(-1, 1),
            facecolor=c,
            label='Class %s' % n)
pl.legend(loc='upper right')
pl.ylabel('Samples')
pl.xlabel('Two-class Decision Scores')

pl.subplots_adjust(wspace=0.25)
pl.show()
Code example #11
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):

    # '---------- Prepare Training ----------'

    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])

    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))

    print 'X_sig.shape: ', X_sig.shape
    print 'y_sig.shape: ', y_sig.shape
    print 'X_bkg.shape: ', X_bkg.shape
    print 'y_bkg.shape: ', y_bkg.shape
    print 'X.shape: ', X.shape
    print 'y.shape: ', y.shape

    # '---------- Prepare Testing ----------'

    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])

    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))

    print 'X_sig_test.shape: ', X_sig_test.shape
    print 'y_sig_test.shape: ', y_sig_test.shape
    print 'X_bkg_test.shape: ', X_bkg_test.shape
    print 'y_bkg_test.shape: ', y_bkg_test.shape
    print 'X_test.shape: ', X_test.shape
    print 'y_test.shape: ', y_test.shape


    # '---------- Model ----------'

    #scaler = preprocessing.StandardScaler().fit(X)
    #X = scaler.transform(X)

    #model = svm.SVC(C = 50, kernel = 'rbf', tol=0.001, gamma=0.005, probability=True)
    #model.fit(X, y)

    dt = DecisionTreeClassifier(max_depth=3,
                                min_samples_leaf=0.05*len(X))
    model = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=400,
                             learning_rate=0.5)
    
    model.fit(X, y)


    print '---------- Training/Testing info ----------'

    print 'Accuracy (training): ', model.score(X, y)
    print 'Null Error Rate (training): ', y.mean()


    #X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)

    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test

    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ",   metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)

    #'PTS','AST','REB','STL','BLK','FG_PCT','FG3_PCT','FT_PCT','MIN','EFF','WL']
    #user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2], dtype=float))
    user_input = np.array([10.15, 1.95, 6.77, 1.12, 0.28, 0.51, 0.37, 0.47, 32.5, 14.8, 0.53], dtype=float)

    score = model.decision_function(user_input)
    print 'Score (user input): ', score
    result = model.predict_proba(user_input)
    print 'Probability of 1 (user input): ', result



    # '--------- Visualization -----------'

    Classifier_training_S = model.decision_function(X[y>0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y<0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test>0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test<0.5]).ravel()

    (h_test_s, h_test_b) =  visualSigBkg("BDT", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)


    # '-------- Variable Importance ---------'
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    mpl.style.use('ggplot')
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, df_sig_train.columns[sorted_idx])
    pl.xlabel('Relative Importance', fontsize=15)
    pl.title('Variable Importance', fontsize=15)
    #pl.show()
    plt.savefig("Var_importance.pdf")
    plt.close()


    fig = plt.figure()
    ax = fig.add_subplot(111)

    model_err = np.zeros((400,))
    for i, y_pred in enumerate(model.staged_predict(X_test)):
        model_err[i] = zero_one_loss(y_pred, y_test)
    
    model_err_train = np.zeros((400,))
    for i, y_pred in enumerate(model.staged_predict(X)):
        model_err_train[i] = zero_one_loss(y_pred, y)

    ax.plot(np.arange(400) + 1, model_err,
            label='AdaBoost Test Error',
            color='orange')
    ax.plot(np.arange(400) + 1, model_err_train,
            label='AdaBoost Train Error',
            color='green')
    
    ax.set_ylim((0.25, 0.35))
    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Error Rate')
    
    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.7)

    plt.savefig("ntrees.pdf")
    plt.close()    

    ########################################################### 

    return (model, X, y, result, model.score(X, y), error_test, score, h_test_s, h_test_b)
Code example #12
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test, sr):

    # '---------- Prepare Training ----------'

    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])

    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))

    print 'X_sig.shape: ', X_sig.shape
    print 'y_sig.shape: ', y_sig.shape
    print 'X_bkg.shape: ', X_bkg.shape
    print 'y_bkg.shape: ', y_bkg.shape
    print 'X.shape: ', X.shape
    print 'y.shape: ', y.shape

    # '---------- Prepare Testing ----------'

    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])

    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))

    print 'X_sig_test.shape: ', X_sig_test.shape
    print 'y_sig_test.shape: ', y_sig_test.shape
    print 'X_bkg_test.shape: ', X_bkg_test.shape
    print 'y_bkg_test.shape: ', y_bkg_test.shape
    print 'X_test.shape: ', X_test.shape
    print 'y_test.shape: ', y_test.shape


    # '---------- Model ----------'

    #scaler = preprocessing.StandardScaler().fit(X)
    #X = scaler.transform(X)

    #model = svm.SVC(C = 50, kernel = 'rbf', tol=0.001, gamma=0.005, probability=True)
    #model.fit(X, y)

    dt = DecisionTreeClassifier(max_depth=3,
                                min_samples_leaf=0.05*len(X))
    model = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=800,
                             learning_rate=0.5)
    
    model.fit(X, y)


    print '---------- Training/Testing info ----------'

    print 'Accuracy (training): ', model.score(X, y)
    print 'Null Error Rate (training): ', y.mean()


    #X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)

    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test

    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ",   metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)

    #user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2], dtype=float))

    user_input = np.array([sr['PTS'],sr['AST'],sr['REB'],sr['STL'],sr['BLK'],sr['FG_PCT'],sr['FG3_PCT'],sr['FT_PCT'],sr['MIN'],sr['EFF'],sr['WL']], dtype=float)
    #user_input = np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float)
    print user_input
    score = model.decision_function(user_input)
    print 'Score (user input): ', score
    result = model.predict_proba(user_input)
    print 'Probability of 1 (user input): ', result



    # '--------- Visualization -----------'

    #Classifier_training_S = model.decision_function(X[y>0.5]).ravel()
    #Classifier_training_B = model.decision_function(X[y<0.5]).ravel()
    #Classifier_testing_S = model.decision_function(X_test[y_test>0.5]).ravel()
    #Classifier_testing_B = model.decision_function(X_test[y_test<0.5]).ravel()

    #(h_test_s, h_test_b) =  visualSigBkg("BDT", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)
    ########################################################### 

    #return (model, X, y, result, model.score(X, y), error_test, h_test_s, h_test_b)
    return (model, X, y, result, model.score(X, y), error_test, score)
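
A note on the single-sample scoring in examples #11 and #12: newer scikit-learn versions require 2-D input to decision_function and predict_proba, so a 1-D user_input vector has to be reshaped first, for example:

user_input = user_input.reshape(1, -1)        # shape (1, n_features) for a single sample
score = model.decision_function(user_input)   # array with one score
result = model.predict_proba(user_input)      # one row of class probabilities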
Code example #13
File: Evaluate.py  Project: mrelich/MuonAna
def evaluate(dt_eval, dt_train, opts):

    # If modelinput is specified then read in model
    bdt = None
    if len(opts.modelinput) != 0:
        bdt = joblib.load(opts.modelinput)
        print "Loaded model back ", opts.bdtname
        print bdt
    else:
        print "Model not specified..."
        print "Creating classification and training again"
        bdt = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=opts.maxdepth),
            algorithm='SAMME',
            n_estimators=opts.ntrees,
            learning_rate=opts.lrate)

        bdt.fit(dt_train.getDataNoWeight(), dt_train.targets)

    # Now get the bdt scores
    sig_scores = bdt.decision_function(
        dt_eval.getDataNoWeight()[dt_eval.targets > 0.5])
    bkg_scores = bdt.decision_function(
        dt_eval.getDataNoWeight()[dt_eval.targets < 0.5])

    # Get weights
    sig_weights = dt_eval.getDataWeights()[dt_eval.targets > 0.5] * dt_eval.sf
    bkg_weights = dt_eval.getDataWeights()[dt_eval.targets < 0.5] * dt_eval.sf

    # Print some information for a set of cuts
    cuts = np.arange(-1, 1, 0.05)
    for cut in cuts:
        print "------------------------------------------"
        print "cut: ", cut
        print "\tSignal:    ", sum(sig_weights[sig_scores > cut])
        print "\tBackground:", sum(bkg_weights[bkg_scores > cut])

    # Make figure and axis
    fig, ax = plt.subplots(ncols=1, figsize=(10, 7))

    # Set minimum and maximum for x-axis
    xmin = -1
    xmax = 1
    nbins = 100

    #plt.yscale("log")
    plt.ylim([1e-2, 1e6])

    # Add error bars
    plotErrorBars(sig_scores, sig_weights, nbins, xmin, xmax, 'r', 'signal')

    # Add error bars
    plotErrorBars(bkg_scores, bkg_weights, nbins, xmin, xmax, 'b',
                  'background')

    # Make hist for signal
    plt.hist(sig_scores,
             weights=sig_weights,
             color='r',
             range=(xmin, xmax),
             alpha=0.5,
             bins=nbins,
             log=True,
             histtype='stepfilled')

    # Make hist for bkg
    plt.hist(bkg_scores,
             weights=bkg_weights,
             color='b',
             range=(xmin, xmax),
             alpha=0.5,
             bins=nbins,
             log=True,
             histtype='stepfilled')

    # Miscellanous
    plt.xlabel("BDT output")
    plt.ylabel("Events / year / bin")
    plt.legend(loc='best')
    plt.grid()
    plt.xticks(np.arange(-1, 1.1, 0.1))
    plt.tight_layout()
    #ax.set_yscale("log")

    plt.savefig("plots/evaluate/WeightedResult_" + opts.bdtname +
                "_fromModel.png")
Code example #14
File: bdtTest.py  Project: aminnj/makers
            signalScore)
        print "- When we predict that we have a signal event, it is actually signal %.1f%% of the time (%i out of %i)" % (
            100.0 * fcorrect, int(fcorrect * len(predictionsForSignal)),
            len(predictionsForSignal))

        ### PLOT

        # plot feature distributions
        if first:
            first = False
            for idx, indicator in enumerate(whichIndicators):
                featureDistributions(Xtrain, Ytrain, indicator, idx)

        # shamelessly stolen from https://dbaumgartel.wordpress.com/2014/03/14/machine-learning-examples-scikit-learn-versus-tmva-cern-root/

        Classifier_training_S = alg.decision_function(
            Xtrain[Ytrain > 0.5]).ravel()
        Classifier_training_B = alg.decision_function(
            Xtrain[Ytrain < 0.5]).ravel()
        Classifier_testing_S = alg.decision_function(
            Xtest[Ytest > 0.5]).ravel()
        Classifier_testing_B = alg.decision_function(
            Xtest[Ytest < 0.5]).ravel()

        # This will be the min/max of our plots
        c_max = 1.5
        c_min = -1.5

        # Get histograms of the classifiers
        Histo_training_S = np.histogram(
            Classifier_training_S, bins=40, range=(c_min, c_max))
        Histo_training_B = np.histogram(
Code example #15
def main():

    args = sys.argv[1:]
    if len(args) < 2:
        return usage()

    print('part1')

    # get root files and convert them to array
    #branch_names = """Px_Z,Py_Z,Pz_Z,E_Z,Px_H,Py_H,Pz_H,E_H,Px_H_Z,Py_H_Z,Pz_H_Z,E_H_Z,Px_H_Zs,Py_H_Zs,Pz_H_Zs,E_H_Zs,Px_Z_Mup,Py_Z_Mup,Pz_Z_Mup,E_Z_Mup,Px_Z_Mum,Py_Z_Mum,Pz_Z_Mum,E_Z_Mum,Px_H_Z_Mup,Py_H_Z_Mup,Pz_H_Z_Mup,E_H_Z_Mup,Px_H_Z_Mum,Py_H_Z_Mum,Pz_H_Z_Mum,E_H_Z_Mum""".split(",")
    #branch_names = """Px_Z,Py_Z,Pz_Z,E_Z,Px_H,Py_H,Pz_H,E_H""".split(",")
    #branch_names = """costheta1,costheta2,phi,M_H,M_Z,M_H_Z,M_H_Zs,M_Z_Mup,M_Z_Mum""".split(",")
    #branch_names = """costheta1,costheta2,phi,phi1,costheta1_H,costheta2_H,phi_H""".split(",")
    #branch_names = """costheta1,costheta2,phi,P_H_Z,P_H_Zs,P_Z_Mup,P_Z_Mum,Px_Z,Py_Z,Pz_Z,Px_H,Py_H,Pz_H""".split(",")  #The last selected feature for training the truth value
    branch_names = """costheta1,costheta2,Px_H,Py_H,Pz_H,Px_Z,Py_Z,Pz_Z,Px_Z_Mup,Py_Z_Mup,Pz_Z_Mup,Pz_Z_Mup,E_Z_Mup,Px_Z_Mum,Py_Z_Mum,Pz_Z_Mum,E_Z_Mum""".split(
        ",")  # new sample
    #    branch_names = """Px_Beamp,Py_Beamp,Pz_Beamp,E_Beamp,Px_Beamm,Py_Beamm,Pz_Beamm,E_Beamm,Px_Z,Py_Z,Pz_Z,E_Z,Px_H,Py_H,Pz_H,E_H,Px_H_Z,Py_H_Z,Pz_H_Z,E_H_Z,Px_H_Zs,Py_H_Zs,Pz_H_Zs,E_H_Zs,Px_Z_Mup,Py_Z_Mup,Pz_Z_Mup,E_Z_Mup,Px_Z_Mum,Py_Z_Mum,Pz_Z_Mum,E_Z_Mum,Px_H_Z_Mup,Py_H_Z_Mup,Pz_H_Z_Mup,E_H_Z_Mup,Px_H_Z_Mum,Py_H_Z_Mum,Pz_H_Z_Mum,E_H_Z_Mum""".split(",")

    fin1 = ROOT.TFile(args[0])
    fin2 = ROOT.TFile(args[1])

    tree1 = fin1.Get("trialTree")  #truth's root tree
    #tree1 = fin1.Get("fancy_tree") #Reconstruction's root  tree
    signal0 = tree1.AsMatrix(columns=branch_names)
    signal = signal0[:100000, :]
    #signal = signal0[:100000,:]
    tree2 = fin2.Get("trialTree")  #truth's root tree
    #tree2 = fin2.Get("fancy_tree") #Reconstruction's root  tree
    backgr0 = tree2.AsMatrix(columns=branch_names)
    backgr = backgr0[:100000, :]
    #backgr = backgr0[:100000,:]

    signal = np.insert(signal, 3, np.full(len(signal), 1), axis=1)
    backgr = np.insert(backgr, 3, np.full(len(backgr), 10), axis=1)

    # for sklearn data is usually organised into one 2D array of shape (n_samples * n_features)
    # containing all the data and one array of categories of length n_samples
    X_raw = np.concatenate((signal, backgr))
    y_raw = np.concatenate(
        (np.ones(signal.shape[0]), np.zeros(backgr.shape[0])))
    print(len(signal))
    print(len(backgr))

    print('part2')

    #imbalanced learn
    n_sig = len(y_raw[y_raw == 1])
    n_bkg = len(y_raw[y_raw == 0])
    print(n_sig)
    print(n_bkg)
    sb_ratio = len(y_raw[y_raw == 1]) / (1.0 * len(y_raw[y_raw == 0]))
    if (sb_ratio > 0.2 and sb_ratio < 0.5):
        smote = SMOTE(ratio=0.5)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]), ' background: ',
              len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]), ' background: ',
              len(y[y == 0]))
    elif (n_sig > 1000 and sb_ratio < 0.2 and sb_ratio > 0.1):
        smote = SMOTE(ratio=0.2)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]), ' background: ',
              len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]), ' background: ',
              len(y[y == 0]))
    elif (n_sig < 1000 and sb_ratio < 0.2 and sb_ratio > 0.1):
        smote = SMOTE(ratio=0.4)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]), ' background: ',
              len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]), ' background: ',
              len(y[y == 0]))
    elif (sb_ratio < 0.1 and sb_ratio > 0.05):
        smote = SMOTE(ratio=0.4)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]), ' background: ',
              len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]), ' background: ',
              len(y[y == 0]))
    elif (sb_ratio < 0.05 and sb_ratio > 0.01):
        smote = SMOTE(ratio=0.1)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]), ' background: ',
              len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]), ' background: ',
              len(y[y == 0]))
    elif (sb_ratio < 0.01):
        smote = SMOTE(ratio=0.03)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]), ' background: ',
              len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]), ' background: ',
              len(y[y == 0]))
    else:
        X = X_raw
        y = y_raw
        print('Number of events: ')
        print('signal: ', len(y[y == 1]), ' background: ', len(y[y == 0]))
    """
    Training Part
    """
    # Train and test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.50,
                                                        random_state=3543)
    weights = X_train[:, 3]
    X_train = np.delete(X_train, 3, 1)
    X_test = np.delete(X_test, 3, 1)

    #dt = DecisionTreeClassifier(max_depth=51, min_samples_leaf=20, min_samples_split=40)

    #bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=250, learning_rate=0.03)
    dt = DecisionTreeClassifier(max_depth=5,
                                min_samples_leaf=100,
                                min_samples_split=10)

    bdt = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=200,
                             learning_rate=0.2)
    bdt.fit(X_train, y_train, sample_weight=weights)

    importances = bdt.feature_importances_
    f = open('bdt_results/output_importance_New.txt', 'w')
    f.write("%-25s%-15s\n" % ('Variable Name', 'Output Importance'))
    #for i in range (32):
    for i in range(17):
        f.write("%-25s%-15s\n" % (branch_names[i], importances[i]))
        print("%-25s%-15s\n" % (branch_names[i], importances[i]), file=f)
    f.close()

    y_predicted = bdt.predict(X_train)
    print(
        classification_report(y_train,
                              y_predicted,
                              target_names=["background", "signal"]))
    print("Area under ROC curve: %.4f" %
          (roc_auc_score(y_train, bdt.decision_function(X_train))))
    y_trainacc = accuracy_score(y_train, y_predicted)
    print("Area under ACC curve: %.4f" % y_trainacc)

    y_predicted = bdt.predict(X_test)
    print(
        classification_report(y_test,
                              y_predicted,
                              target_names=["background", "signal"]))
    print("Area under ROC curve: %.4f" %
          (roc_auc_score(y_test, bdt.decision_function(X_test))))
    y_trainacc = accuracy_score(y_test, y_predicted)
    print("Area under ACC curve: %.4f" % y_trainacc)

    decisions1 = bdt.decision_function(X_train)
    decisions2 = bdt.decision_function(X_test)

    filepath = 'SM-vs-BSM-CPeven'

    # Compute ROC curve and area under the curve
    fpr1, tpr1, thresholds1 = roc_curve(y_train, decisions1)
    fpr2, tpr2, thresholds2 = roc_curve(y_test, decisions2)
    roc_auc1 = auc(fpr1, tpr1)
    roc_auc2 = auc(fpr2, tpr2)
    fig = plt.figure(figsize=(8, 6))
    fig.patch.set_color('white')
    plt.plot(fpr1,
             tpr1,
             lw=1.2,
             label='train:ROC (area = %0.4f)' % (roc_auc1),
             color="r")
    plt.plot(fpr2,
             tpr2,
             lw=1.2,
             label='test: ROC (area = %0.4f)' % (roc_auc2),
             color="b")
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating  characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.savefig('./bdt_results/' + filepath + '/ROC_Hbb.png')
    #    plt.show()

    compare_train_test(bdt, X_train, y_train, X_test, y_test, filepath)

    joblib.dump(bdt, './bdt_results/' + filepath + '/bdt_model_New.pkl')
Code example #16
File: BDT_tune.py  Project: jpyne17/msci-hep
                         n_estimators=200
                         )

bdt_k2_2jet = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, min_samples_split=0.05),
                         learning_rate=0.15,
                         algorithm="SAMME",
                         n_estimators=200
                         )

# Train BDT for 2 jet.
bdt_k1_2jet.fit(X_k1_2jet, Y_k1_2jet, sample_weight=weights_k1_2jet)
bdt_k2_2jet.fit(X_k2_2jet, Y_k2_2jet, sample_weight=weights_k2_2jet)

# Get decision scores.
# K1 BDT tests on K2 data, and vice-versa.
output_k1_2jet = np.array(bdt_k1_2jet.decision_function(X_k2_2jet))
output_k2_2jet = np.array(bdt_k2_2jet.decision_function(X_k1_2jet))
output_2jet = np.append(output_k2_2jet, output_k1_2jet) # IMPORTANT: order reversal


# In[32]:




# ### Hyperparameter Scan

# In[33]:

param_grid = {"n_estimators": np.arange(100,350,50),
              "learning_rate": np.arange(0.1,0.4,0.1),
Code example #17
def classifier(data, label):
    binary_data = np.zeros((21, data.shape[1]))
    notsure_data = np.zeros((7, data.shape[1]))
    new_label = []
    ind = 0
    ind_1 = 0
    for i, l in enumerate(label):
        if l == 2:
            notsure_data[ind_1] = data[i]
            ind_1 = ind_1 + 1
            continue
        binary_data[ind] = data[i]
        new_label.append(l)
        ind = ind + 1
    binary_data = preprocessing.normalize(binary_data)
    #pca = decomposition.PCA(n_components=256)
    #binary_data = pca.fit_transform(binary_data)
    new_label = np.array(new_label)
    X_train, X_test, y_train, y_test = train_test_split(
        binary_data,
        new_label,
        test_size=.3,
        random_state=np.random.RandomState(0))
    '''X_train = np.vstack((binary_data[0:7,:],binary_data[7:12,:]))
    y_train = np.array([1,1,1,1,1,1,1,
                               0,0,0,0,0])
    X_test = np.vstack((binary_data[16:21,:],binary_data[12:16,:]))
    y_test = np.array([1,1,1,1,1,
                              0,0,0,0])'''
    clf_1 = MLPClassifier()
    clf_1.fit(X_train, y_train)
    pre_1 = clf_1.predict(X_test)
    p_1 = clf_1.predict_proba(notsure_data)
    pnotsure_1 = clf_1.predict(notsure_data)
    #t_score_1 = clf_1.score(X_test,y_test)
    #t_score_1 = clf_1.decision_function(X_test)
    print(
        metrics.classification_report(y_test,
                                      pre_1,
                                      target_names=['Fake', 'True']))

    clf_2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                               algorithm="SAMME",
                               n_estimators=25)
    clf_2.fit(X_train, y_train)
    t_score_2 = clf_2.decision_function(X_test)
    pre_2 = clf_2.predict(X_test)
    print(
        metrics.classification_report(y_test,
                                      pre_2,
                                      target_names=['Fake', 'True']))
    C_range = 2.**np.arange(-5, 15)
    gamma_range = 2.**np.arange(-15, 3)
    param_grid = dict(gamma=gamma_range, C=C_range)
    clf_3 = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid)
    clf_3.fit(X_train, y_train)
    t_score_3 = clf_3.decision_function(X_test)
    pre_3 = clf_3.predict(X_test)
    draw_pre_recall(t_score_2, t_score_3, y_test)
    '''scores = clf.cv_results_['mean_test_score'].reshape(len(C_range),
                                                     len(gamma_range))
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.show()'''

    print(
        metrics.classification_report(y_test,
                                      pre_3,
                                      target_names=['Fake', 'True']))
    '''loo = LeaveOneOut()
    acc = []
    for train_index, test_index in loo.split(binary_data):
        X_train, X_test = binary_data[train_index], binary_data[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(X_train, y_train)
        pre = clf.predict(X_test)
        pre_tr = clf.predict(X_train)
        print('===================================')
        print('Real: ',y_test)
        print('Pre: ',pre)
        print('Real_tr: ', y_train)
        print('Pre_tr: ',pre_tr)
        print((sum(y_train==pre_tr)+sum(y_test==pre))/21.0)
        acc.append((sum(y_train==pre_tr)+sum(y_test==pre))/21.0)
    print('Summary: ', sum(acc)/21.0)'''
    p_2 = clf_2.predict_proba(notsure_data)
    pnotsure_2 = clf_2.predict(notsure_data)
    p_3 = clf_3.predict_proba(notsure_data)
    pnotsure_3 = clf_3.predict(notsure_data)
    return (p_1, p_2, p_3, pnotsure_1, pnotsure_2, pnotsure_3)
Code example #18
File: credit4.py  Project: atullegendx/FraudDetection
pred = clf.predict(testX)
print "Confusion Matrix of AdaBoost is :-\n"
print confusion_matrix(testY, pred)
print "\n\nClassification report for AdaBoost:-"
print classification_report(testY, pred)

#Isolation Forest

clf = IsolationForest(contamination=outlier_fraction,
                      random_state=state,
                      n_jobs=4)
testX = testX.drop([
    'errorBalanceOrig',
], axis=1)
clf.fit(testX)
scores_pred = clf.decision_function(testX)
y_pred = clf.predict(testX)

#reshape the prediction values to 0 for valid and 1 for fraud

y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

n_errors = (y_pred != testY).sum()
#run classification metrics

#print ('{}'.format(clf_name,n_errors))
#print(accuracy_score(y,y_pred))  #since it's an unbalanced class problem the accurancy score will be inappropriate
print "Classification report for Isolation Forest:-"
print(classification_report(testY, y_pred))
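
Unlike the AdaBoost classifiers elsewhere on this page, IsolationForest.decision_function in example #18 returns an anomaly score: lower (negative) values are more anomalous, and predict returns -1 exactly where the score is negative. A small self-contained sketch with synthetic data (the data and contamination value are assumptions):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (200, 2)),    # mostly inliers
               rng.uniform(-6, 6, (10, 2))])  # a handful of outliers
clf = IsolationForest(contamination=0.05, random_state=0).fit(X)

scores = clf.decision_function(X)   # negative -> more anomalous
pred = clf.predict(X)               # -1 = outlier, 1 = inlier
assert np.array_equal(pred == -1, scores < 0)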
Code example #19
    X_deti_test = min_max_scaler.fit_transform(X_deti_test)
    X_dech_train = min_max_scaler.fit_transform(X_dech_train)
    X_dech_test = min_max_scaler.fit_transform(X_dech_test)
    X_deca_train = min_max_scaler.fit_transform(X_deca_train)
    X_deca_test = min_max_scaler.fit_transform(X_deca_test)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf),
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate)

    #demo
    classifier.fit(X_demo_train, y_demo_train)
    y_demo_test_pred = classifier.decision_function(
        X_demo_test)  # could add a weight of 0.5
    basemodelperc = np.percentile(y_demo_test_pred, [95, 90, 80, 70, 60, 50])
    base_rej_perc_5 = basemodelperc[0]
    base_rej_perc_10 = basemodelperc[1]
    base_rej_perc_20 = basemodelperc[2]
    base_rej_perc_30 = basemodelperc[3]
    base_rej_perc_40 = basemodelperc[4]
    base_rej_perc_50 = basemodelperc[5]
    print("baseline model rejection rate[5,10,20,30,40,50]: %s" %
          basemodelperc)  # get percentile of array y_test_pred
    # record the base model's default rate in this loop at rejection rates of 5%, 10%, 20%, 30%, 40% and 50%
    df_demo = np.vstack((y_test, y_demo_test_pred))
    df_demo = pd.DataFrame(df_demo)
    df_demo = df_demo.transpose()
    df_demo.columns = ["label", "pred_prob"]
    def_rate_5_demo = df_demo[
Code example #20
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")

# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(Ydf_train['default_Yes'] == i)
    plt.scatter(Xdf_train[Xdf_train['student']==1.0].ix[idx].ix[:,0], Xdf_train[Xdf_train['student']==1.0].ix[idx].ix[:,2],c=c, cmap=plt.cm.Paired,label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel("Decision Boundary")
plt.show()


# Plot the two-class decision scores
twoclass_output = clf.decision_function(Xdf_train)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(132)
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(Ydf_train['default_Yes'] == i)
    plt.hist(twoclass_output[idx],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper left')
plt.ylabel('Samples')
plt.xlabel('Decision Scores')
Code example #21
g_w_train_sum_bsm = 0.0

#Create the tree
dt_k = DecisionTreeClassifier(max_depth= args.max_depth, criterion=kldc)
dt_g = DecisionTreeClassifier(max_depth= args.max_depth, criterion='gini')

# Create and fit an AdaBoosted decision tree
bdt_k = AdaBoostClassifier(dt_k, algorithm= args.boost_algorithm,n_estimators= args.est_num)
bdt_k.fit(X_train, y_train, w_train)

# Create and fit an AdaBoosted decision tree
bdt_g = AdaBoostClassifier(dt_g, algorithm= args.boost_algorithm,n_estimators= args.est_num)
bdt_g.fit(X_train, y_train, w_train)

#setup the decision functions, which will be used by the histograms as well
k_test_decision_function = bdt_k.decision_function(X_test)
k_train_decision_function = bdt_k.decision_function(X_train)

#setup the decision functions, which will be used by the histograms as well
g_test_decision_function = bdt_g.decision_function(X_test)
g_train_decision_function = bdt_g.decision_function(X_train)

ende_training = time.time()
logger.info('Time to train the tree ' +  '{:5.3f}s'.format(ende_training-start))

#get the directory for data
output_dir = os.path.join(tmp_directory, args.data_version)
if not os.path.exists(output_dir):
    os.makedirs( output_dir)

#get the output directory for plots
Code example #22
File: classify.py  Project: mrelich/MuonAna
                           rnd_state=42,
                           name="trainsplit")

print len(d_trn.data), len(d_tst.data)

# Make BDT
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),    
                         algorithm = 'SAMME',
                         n_estimators=opts.ntrees,
                         learning_rate=opts.lrate)

print "Fitting data"
bdt.fit(d_trn.getDataNoWeight(), d_trn.targets)

print "Evaluating"
pred = bdt.decision_function(d_tst.getDataNoWeight())
#pred_eval = bdt.decision_function(d_eval.getDataNoWeight())

# Import ROOT stuff and save
from ROOT import TH1F, TFile
h_sig = TH1F("h_sig","h",100,-1,1)
h_bkg = TH1F("h_bkg","h",100,-1,1)
h_sig_eval = TH1F("h_sig_eval","h",100,-1,1)
h_bkg_eval = TH1F("h_bkg_eval","h",100,-1,1)

# fill hist
weights = d_tst.getDataWeights()
for i in range(len(pred)):
    if d_tst.targets[i]: 
        h_sig.Fill(pred[i],weights[i])
    else:
Code example #23
def AdaBoost_model(X_train, X_test, y_train, y_test):
    #standar
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    X = X_train
    y = y_train.reshape(1, -1)[0]

    # Create and fit an AdaBoosted decision tree
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=200)

    bdt.fit(X, y)
    predict = bdt.predict(X_test)

    plot_colors = "br"
    plot_step = 0.02
    class_names = "AB"

    plt.figure(figsize=(10, 5))

    # Plot the decision boundaries
    plt.subplot(121)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    print(x_min, x_max, y_min, y_max)
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis("tight")

    # Plot the training points
    for i, n, c in zip(range(2), class_names, plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0],
                    X[idx, 1],
                    c=c,
                    cmap=plt.cm.Paired,
                    s=20,
                    edgecolor='k',
                    label="Class %s" % n)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.legend(loc='upper right')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Decision Boundary')

    # Plot the two-class decision scores
    twoclass_output = bdt.decision_function(X)
    plot_range = (twoclass_output.min(), twoclass_output.max())
    plt.subplot(122)
    for i, n, c in zip(range(2), class_names, plot_colors):
        plt.hist(twoclass_output[y == i],
                 bins=10,
                 range=plot_range,
                 facecolor=c,
                 label='Class %s' % n,
                 alpha=.5,
                 edgecolor='k')
    x1, x2, y1, y2 = plt.axis()
    plt.axis((x1, x2, y1, y2 * 1.2))
    plt.legend(loc='upper right')
    plt.ylabel('Samples')
    plt.xlabel('Score')
    plt.title('Decision Scores')

    plt.tight_layout()
    plt.subplots_adjust(wspace=0.35)
    plt.show()

    return predict
Code example #24
    plot_colors = "br"
    plot_step = 0.2
    class_names = ["W Jets","QCD"]
    
    plt.figure(figsize=(10, 5))
    
    # Plot the decision boundaries
    plt.subplot(121)
    x_min, x_max = X[:, 0].min() , X[:, 0].max()
    y_min, y_max = X[:, 1].min() , X[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max-x_min)/10000),
                         np.arange(y_min, y_max, (y_max-y_min)/10000))
    print 'made mesh'

    Z = abc.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu)
    plt.axis("tight")
    print 'Drawing fancy plots - train points'

    # Plot the training points
    for i, n, c in zip(range(2), class_names, plot_colors):
        idx = np.where(Y_valid == i)
        plt.scatter(X_valid[idx, 0], X_valid[idx, 1],
                    c=c, cmap=plt.cm.Paired,
                    label="%s" % n)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.legend(loc='upper right')
    plt.ylabel(list(varset)[1])
Code example #25
X_train_sig = df_cheat.query(
    hlt2_cut_string)[features][int(0.2*n_events):n_events]
X_train = X_train_bkg.append(X_train_sig, ignore_index=True).values

# DEFINE WHICH PARTS OF TEST AND TRAINING SAMPLES CONTAIN SIGNAL OR BACKGROUND
y_test = int(0.2*n_events)*[0]+int(0.2*n_events)*[1]
y_train = int(0.8*n_events)*[0]+int(0.8*n_events)*[1]

# DEFINE BDT ALGORITHM
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=0.05*len(X_train))
bdt = AdaBoostClassifier(dt,
                         algorithm='SAMME',
                         n_estimators=800,
                         learning_rate=0.5)

# RUN BDT TRAINING AND SHOW RESULTS
bdt.fit(X_train, y_train)
sk_y_predicted = bdt.predict(X_test)
print classification_report(y_test, sk_y_predicted,
                            target_names=["background", "signal"])
print "Area under ROC curve: %.4f" % (roc_auc_score(y_test, sk_y_predicted))

plt.hist(bdt.decision_function(X_test_bkg).ravel(), color='r', alpha=0.5,
         range=(-0.4, 0.4), bins=30)
plt.hist(bdt.decision_function(X_test_sig).ravel(), color='b', alpha=0.5,
         range=(-0.4, 0.4), bins=30)
plt.xlabel("scikit-learn BDT output")

plt.savefig('BDT.pdf')
Code example #26
# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1],
                c=c, cmap=plt.cm.Paired,
                s=20, edgecolor='k',
                label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Decision Boundary')

# Plot the two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
    plt.hist(twoclass_output[y == i],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5,
             edgecolor='k')
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
Code example #27
        trainingDataTemp, [trainingDataTemp.shape[1] - 1], axis=1)
    testingVarsTemp, testingTargetTemp = np.split(
        testingDataTemp, [testingDataTemp.shape[1] - 1], axis=1)

    ## scale variables. Map all variables to values between 0 and 1. This is to prevent large numbers from dominating in the testing.
    min_max_scaler = preprocessing.MinMaxScaler()
    trainingVarsTemp = min_max_scaler.fit_transform(trainingVarsTemp)
    testingVarsTemp = min_max_scaler.transform(testingVarsTemp)

    #build and train bdt
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth),
                             n_estimators=trees)
    bdt.fit(trainingVarsTemp, np.ravel(trainingTargetTemp))

    #bdt score for testing sample
    output_test = bdt.decision_function(testingVarsTemp)

    #calculate area under curve
    auc = roc_auc_score(testingTargetTemp, output_test)
    if verbose: print "Area under ROC = ", auc

    #append AUC and size to lists for plotting after loop
    sizes.append(trainingSize)
    AUC.append(auc)

    #update Size (if while loop)
#    trainingSize += updateSize

######################################
# Information Sheet for the Plot PDF #
######################################
Code example #28
File: TaggerMVA.py  Project: tibristo/BosonTagger
def main():
    
    Algorithm = 'AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_matchedL_ranged_v2_1000_1500'
    pca(Algorithm)
    #plotVars(Algorithm)
    return
    #Algorithm = sys.argv[1]
    #Algorithm = 'CamKt12LCTopoSplitFilteredMu100SmallR30YCut414tev_350_500_vxp_0_99'
    print 'Loading training data ...'

    data_train = pd.read_csv('csv/'+Algorithm+'_merged.csv')   
    #standardise data
    for t in trainvars:
        minx = np.amin(data_train[t])
        maxx = np.amax(data_train[t])
        data_train[t] = (data_train[t] - minx)/(maxx-minx)
        #data_train[t] = (data_train[t] - np.mean(data_train[t]))/np.std(data_train[t])
    r =np.random.rand(data_train.shape[0])
    
    #Set label and weight vectors - and drop any unwanted training ones
    Y_train = data_train['label'].values[r<0.5]
    # W_train = data_train['weight'].values[r<0.9]
    Y_valid = data_train['label'].values[r>=0.5]
    # W_valid = data_train['weight'].values[r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit',axis=1, inplace=True)
    print data_train.columns.values[1:-1]
    #varcombinations = itertools.combinations(data_train.columns.values[1:-1],2)
    varcombinations = itertools.combinations(trainvars[:],26)
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)

    #colors = plt.get_cmap('jet')(np.linspace(0, 1.0,combos(len(data_train.columns.values[1:-1]),2) ))
    colors = plt.get_cmap('jet')(np.linspace(0, 1.0,combos(len(trainvars),2) ))

    for varset,color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r<0.5]
        X_valid = data_train[list(varset)].values[r>=0.5]


        dt = DC(max_depth=3,min_samples_leaf=0.05*len(X_train))
        abc = ABC(dt,algorithm='SAMME',
                 n_estimators=8,
                 learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC' 
        prob_predict_valid = abc.predict_proba(X_valid)[:,1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)

        # if we want to compare directly with the cut-based method we need to calculate 1/(1-roc(0.5)).
        # however, this is what we do when we've already applied the mass window. This does not do so.
        labelstring = ' And '.join(var.replace('_','') for var in varset)
        print labelstring
        plt.plot(tpr, (1-fpr), label=labelstring, color=color)
        print abc.feature_importances_

        
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1- Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm+' ROC Curve')
    plt.legend(loc="lower left",prop={'size':6})
    plt.savefig(Algorithm+'rocmva.pdf')
Code example #29
    #create the n_estimators to loop over
    parameters = np.linspace(args.n_est_start,
                             args.n_est_end,
                             num=args.est_num,
                             dtype=np.int32)

    for para in parameters:
        # Create and fit an AdaBoosted decision tree for the selected criterion
        bdt = AdaBoostClassifier(dt,
                                 algorithm=args.boost_algorithm,
                                 n_estimators=para)
        bdt.fit(X_train, y_train, w_train)

        #get the decision functions from the kule tree
        test_dec_fct = bdt.decision_function(X_test)
        train_dec_fct = bdt.decision_function(X_train)

        #get the histograms for kule
        h_dis_train_SM, h_dis_train_BSM, h_dis_test_SM, h_dis_test_BSM = get_histograms(
            X_test, X_train, y_test, y_train, w_test, w_train, test_dec_fct,
            train_dec_fct)

        #compute the Kule divergence and the Gini
        kule_test, k_error_test = kl.kule_div(h_dis_test_SM, h_dis_test_BSM)
        kule_train, k_error_train = kl.kule_div(h_dis_train_SM,
                                                h_dis_train_BSM)
        gini_test, g_error_test = gi.gini(h_dis_test_SM, h_dis_test_BSM)
        gini_train, g_error_train = gi.gini(h_dis_train_SM, h_dis_train_BSM)

        #reset the histogramms to fill them again next iteration
Code example #30
depth = 3  #1 = stumps

print "Declaring Classifier"
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth),
                         n_estimators=trees)

print "Training BDT"
bdt.fit(trainingVars, np.ravel(trainingTarget))

#######################################################################################
# Calculate Rate of Correct and Incorrect Classification in Training and Testing Data #
#######################################################################################

pred_train = bdt.predict(trainingVars)
pred_test = bdt.predict(testingVars)
output_train = bdt.decision_function(trainingVars)
output_test = bdt.decision_function(testingVars)

train_SS = 0
train_SB = 0
train_BS = 0
train_BB = 0

for number, entry in enumerate(trainingTarget):
    if entry == 1:
        if pred_train[number] == 1:
            train_SS += 1
        elif pred_train[number] == 0:
            train_SB += 1
    elif entry == 0:
        if pred_train[number] == 1:
Code example #31
def train_kfold(clf_type,
                X,
                y,
                folds=6,
                show_plots=False,
                write_decisions=False,
                state=0,
                **kwargs):
    """Uses kFolding to train a certain classifier.

    Keyword arguments:
        clf_type: classifier type as string, currently supported:
                 ['AdaBoostClassifier', 'GradientBoostingClassifier']
        X: complete dataset (note: you don't need to split your dataset into a train and test
           dataset using kFolding!)
        y: corresponding flags
        folds: number of folds (default: 6)
        show_plots: if True, shows probability distributions from training and testing dataset,
                    using the 'plot_train_test_comparison' function (default: False)
        write_decisions: if True, appends decision columns to given DataFrame X (default: False)
        state: random_state passed to the created classifier (default: 0)
        kwargs: keyword arguments passed through to KFold

    Returns:
        list of trained classifiers
    """
    if clf_type not in ['AdaBoostClassifier', 'GradientBoostingClassifier']:
        raise ValueError(
            'Classifier type {} is not supported for kfolding right now!'.
            format(clf_type))

    decision_col_name = clf_type + '_decision'
    clfs = []

    kf = KFold(len(X), n_folds=folds, **kwargs)

    for i, (train_index, test_index) in tqdm_notebook(enumerate(kf, start=1),
                                                      total=len(kf)):
        train_cols = list(X.columns)

        if write_decisions and decision_col_name in X.columns:
            train_cols.remove(decision_col_name)

        X_train, X_test = X[train_cols].iloc[train_index], X[train_cols].iloc[
            test_index]
        y_train, y_test = y[train_index], y[test_index]

        if clf_type == 'AdaBoostClassifier':
            clf = AdaBoostClassifier(random_state=state)
        elif clf_type == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(random_state=state)

        clf.fit(X_train.as_matrix(), y_train)

        if show_plots:
            plot_classifier_output(clf,
                                   X_train,
                                   y_train,
                                   X_test,
                                   y_test,
                                   title='Classifier iteration {}'.format(i))

        if write_decisions:
            X.set_value(test_index, decision_col_name,
                        clf.decision_function(X_test))

        clfs.append(clf)

    return clfs
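# Added sketch: a minimal, hedged usage example for train_kfold. The toy DataFrame and labels
# below are hypothetical stand-ins, not data from the original project; 'shuffle' is simply
# forwarded to KFold via **kwargs.
import numpy as np
import pandas as pd

toy_X = pd.DataFrame(np.random.rand(100, 3), columns=['feat_a', 'feat_b', 'feat_c'])
toy_y = np.random.randint(0, 2, size=100)
toy_clfs = train_kfold('AdaBoostClassifier', toy_X, toy_y,
                       folds=5, write_decisions=True, shuffle=True)
# One fitted classifier is returned per fold; with write_decisions=True the out-of-fold
# decision scores are written back into toy_X['AdaBoostClassifier_decision'].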
Code example #32
X_test_ref_ae = min_max_scaler.transform(X_test_ref)
X_test_new_ae = min_max_scaler.transform(X_test_new)

#############################
# Assembly, training & testing
#############################

# Boosted decision tree classifier
if runBDT:
    print "Building and training BDT"
    bdt = AdaBoostClassifier(n_estimators=100,base_estimator=DecisionTreeClassifier(max_depth=1))
    bdt.fit(X_train,Y)
    # Testing
    pred_train_bdt = bdt.predict(X_train)
    pred_test_bdt = bdt.predict(X_test)
    output_train_bdt = bdt.decision_function(X_train)
    output_test_bdt = bdt.decision_function(X_test)
    # Results print-out
    print "BDT classifier esults...."
    printResults(pred_train_bdt,pred_test_bdt,nReferenceEvents,nNewEvents)

# Neural network classifier
if runNN:
    # Training
    if runTraining:
        print "Building and training neural network"
        nn = Sequential()
        from keras.layers import Dense, Activation
        nn.add(Dense(71, input_dim=71))
        nn.add(Activation("relu"))
        nn.add(Dense(1))
Code example #33
File: bdtTest.py Project: aminnj/makers
            signalScore)
        print "- When we predict that we have a signal event, it is actually signal %.1f%% of the time (%i out of %i)" % (
            100.0 * fcorrect, int(fcorrect * len(predictionsForSignal)),
            len(predictionsForSignal))

        ### PLOT

        # plot feature distributions
        if first:
            first = False
            for idx, indicator in enumerate(whichIndicators):
                featureDistributions(Xtrain, Ytrain, indicator, idx)

        # shamelessly stolen from https://dbaumgartel.wordpress.com/2014/03/14/machine-learning-examples-scikit-learn-versus-tmva-cern-root/

        Classifier_training_S = alg.decision_function(
            Xtrain[Ytrain > 0.5]).ravel()
        Classifier_training_B = alg.decision_function(
            Xtrain[Ytrain < 0.5]).ravel()
        Classifier_testing_S = alg.decision_function(
            Xtest[Ytest > 0.5]).ravel()
        Classifier_testing_B = alg.decision_function(
            Xtest[Ytest < 0.5]).ravel()

        # This will be the min/max of our plots
        c_max = 1.5
        c_min = -1.5

        # Get histograms of the classifiers
        Histo_training_S = np.histogram(Classifier_training_S,
                                        bins=40,
                                        range=(c_min, c_max))
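        # Added sketch: the listing is cut off here. A minimal, hedged illustration of the usual
        # next step, overlaying the training-sample classifier outputs to check separation
        # (matplotlib is assumed to be imported as plt in this file):
        plt.hist(Classifier_training_S, bins=40, range=(c_min, c_max),
                 color='r', alpha=0.5, label='Signal (train)', normed=True)
        plt.hist(Classifier_training_B, bins=40, range=(c_min, c_max),
                 color='b', alpha=0.5, label='Background (train)', normed=True)
        plt.legend(loc='upper right')
        plt.xlabel('Classifier output')
        plt.savefig('classifier_output_overlay.png')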
Code example #34
File: PlotEffArea.py Project: mrelich/MuonAna
def ploteffarea(dt_eval, dt_train, opts, dt_LowE):

    # If modelinput is specified then read in model
    bdt = None
    if opts.modelinput != "":
        bdt = joblib.load(opts.modelinput)
        print "Loaded model back"
        print bdt        
    else:
        print "Model not specified..."
        print "Creating classification and training again"
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                                 algorithm = 'SAMME',
                                 n_estimators=opts.ntrees,
                                 learning_rate=opts.lrate)

        bdt.fit(dt_train.getDataNoWeight(), dt_train.targets)

    
    # Now get the bdt scores
    sig_scores = bdt.decision_function(dt_eval.getDataNoWeight()[dt_eval.targets > 0.5])
    sig_data   = dt_eval.data[dt_eval.targets > 0.5]

    # Also for low energy
    le_sig_scores = bdt.decision_function(dt_LowE.getDataNoWeight())

    # Specify the number of bins and the range
    #nbins = int(30)
    #xmin  = float(5)
    #xmax  = float(8)
    # Copying the bins from leif and sebastian for now
    xmin  = 3
    xmax  = 9
    nbins = 20.
    bins = np.arange(3,9.1,0.3)

    # Some constants
    #solidangle = 4*pi
    solidangle = 2 * (1 + cos(85*pi/180)) * pi
    ebins_per_decade = float(nbins/(xmax-xmin))

    # Some stuff from the data
    oneweightloc = len(dt_eval.t_varnames) + dt_eval.w_varnames.index('OneWeight') 
    Eloc         = len(dt_eval.t_varnames) + dt_eval.w_varnames.index('nuE') 

    NEvents = sig_data[0][ len(dt_eval.t_varnames) + dt_eval.w_varnames.index('NEvents') ]


    # Basic methods
    def mcLogEBin(E):
        return int(log10(E)*ebins_per_decade)
    def mcEMin(mc_log_ebin):
        return pow(10,mc_log_ebin/ebins_per_decade)
    def mcEMax(mc_log_ebin):
        return pow(10,(1+mc_log_ebin)/ebins_per_decade)

    # Calculate effective area
    def getEffA(data, sf):
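        # Added comment (hedged interpretation of the formula below): OneWeight is the per-event
        # generation weight in cm^2; scaling it by nfiles (which already divides by NEvents) and
        # dividing by the solid angle times the width of the true-energy bin gives an effective
        # area per event, and the 1e-4 factor converts cm^2 to m^2.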
        effA = np.zeros(len(data),dtype=float)
        energy = np.empty(len(data),dtype=float)        
        nfiles  = sf / (NEvents*961)         
        for i in range(len(effA)):

            E = data[i][Eloc]
            OneWeight = data[i][oneweightloc]
            
            mclogebin = mcLogEBin(E)
            mcemin = mcEMin(mclogebin)
            mcemax = mcEMax(mclogebin)
            
            effA[i] = 1e-4 * OneWeight * nfiles * 1/(solidangle*(mcemax-mcemin))
            energy[i] = log10(E)

        return effA, energy

    
    effA, energy = getEffA(sig_data,dt_eval.sf)
    le_effA, le_energy = getEffA(dt_LowE.data,dt_LowE.sf)

    # Now all scale factor info has been added
    # combine the data for ease of plotting
    effA = np.concatenate((effA, le_effA))
    energy = np.concatenate((energy, le_energy))
    sig_scores = np.concatenate((sig_scores,le_sig_scores))

    # Draw eff area
    fig, ax = plt.subplots(ncols=1, figsize=(10,7))
    bdtcut = 0.6
    h, g, v = plt.hist(energy[sig_scores > bdtcut], 
                       weights=effA[sig_scores > bdtcut],
                       color='b', label='NuGen (bdt > %0.2f)'%bdtcut,
                       range=(xmin,xmax),
                       bins=nbins,
                       log=True,
                       histtype='step')

    #hle, gle, vle = plt.hist(le_energy[le_sig_scores > bdtcut], 
    #                         weights=le_effA[le_sig_scores > bdtcut],
    #                         color='r', label='NuGen low# (bdt > %0.2f)'%bdtcut,
    #                         range=(xmin,xmax),
    #                         bins=nbins,
    #                         log=True,
    #                         histtype='step')
    
    plt.ylim([1.e-3, 1.e4])
    plt.xlabel('log$_{10}$(E/GeV)')
    plt.ylabel('Effective Area [m$^2$]')
    plt.grid()
    plt.tight_layout()

    # Dump output
    output = {'logebins': bins,
              'effA': h}
    pickle.dump(output,open('myeffaDump.pkl','w'))



    # Save figure
    #plt.savefig("plots/EffArea/EffArea_bdtcut%0.2f_sep3best.png"%bdtcut)
    plt.show()
Code example #35
File: trainBDT.py Project: jpyne17/msci-hep
                               algorithm="SAMME",
                               n_estimators=200
                               )

    bdt_B = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.01),
                               learning_rate=0.15,
                               algorithm="SAMME",
                               n_estimators=200
                               )

    bdt_A.fit(X_A, Y_A, sample_weight=w_A)
    bdt_B.fit(X_B, Y_B, sample_weight=w_B)
    print "BDT training completed."

    # Get scores of X_A for BDT_B and vice-versa.
    scores_A = bdt_B.decision_function(X_A).tolist()
    scores_B = bdt_A.decision_function(X_B).tolist()
    print "Non-normalised decision function scores processed."

    # Normalise decision scores between -1 and 1.
    max_score = max(scores_A + scores_B)
    min_score = min(scores_A + scores_B)
    score_range = max_score - min_score
    score_midpoint = min_score + score_range / 2
    # Translate and shrink.
    scores_A = map(lambda a: (a - score_midpoint) / (score_range / 2 + 0.000001), scores_A)  # small epsilon keeps the rescaled scores strictly inside (-1, 1)
    scores_B = map(lambda a: (a - score_midpoint) / (score_range / 2 + 0.000001), scores_B)


    print "Updating event objects with decision scores..."
Code example #36
File: adaBoost.py Project: tibristo/mva
class adaBoost:
    __all__=['run','plotFeatureRanking','plotScores']

    def __init__(self, foundVariables, trainingData, trainingClasses, trainingWeights, testingData, testingClasses, adaName, bkg_name):
        """Build a forest and compute the feature importances.
        
        Keyword args:
        foundVariables -- The list of the names of found variabes, can get using Sample_x.returnFoundVariables()
        trainingData -- The training data
        trainingClasses -- The training data classes
        testingData -- the testing data
        testingClasses -- the testing data classes
        adaName -- the name of the object (eg. sig+bkg_name)
        """
        self.ada = AdaBoostClassifier(
            DecisionTreeClassifier(compute_importances=True, max_depth=4, min_samples_split=2, min_samples_leaf=100),
            n_estimators=400, learning_rate=0.5, algorithm="SAMME", compute_importances=True)
        #class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_density=0.10000000000000001, max_features=None, compute_importances=False, random_state=None)
        self.foundVariables = foundVariables
        self.trainingData = trainingData
        self.trainingClasses = trainingClasses
        self.testingData = testingData
        self.testingClasses = testingClasses
        self.trainingWeights = trainingWeights
        self.name = adaName
        self.bkg_name = bkg_name
        self.elapsed = 0.0

    def returnName(self):
        return self.name

    def run(self):
        """Run the fitting and testing."""

    #start the fitting and time it
        start = clock()
        print 'starting training on AdaBoostClassifier'
        self.ada.fit(self.trainingData, self.trainingClasses, self.trainingWeights)
        self.elapsed = clock()-start
        print 'time taken for training: ' + str(self.elapsed)
    #set up the arrays for testing/ eval
        #xtA_C = copy.deepcopy(self.testingData)
        #pred = self.ada.predict(xtA_C)
        #import createHists
        #createHists.drawSigBkgDistrib(xtA_C, pred, self.foundVariables) # draw the signal and background distributions together

    # list the importances of each variable in the bdt, get the score on the test data
        self.importancesada = self.ada.feature_importances_
        print 'importances'
        print self.importancesada
        self.score= self.ada.score(self.testingData,self.testingClasses)
        self.params = self.ada.get_params()
        self.std_mat = np.std([tree.feature_importances_ for tree in self.ada.estimators_],
                           axis=0)
        self.indicesada = np.argsort(self.importancesada)[::-1]
        self.variableNamesSorted = []
        for i in self.indicesada:
            self.variableNamesSorted.append(self.foundVariables[i])

# Print the feature ranking
        print "Feature ranking:"

        for f in xrange(12):
            print "%d. feature %d (%f)" % (f + 1, self.indicesada[f], self.importancesada[self.indicesada[f]]) + " " +self.variableNamesSorted[f]
        self.twoclass_output = self.ada.decision_function(self.testingData)
        self.twoclass_output_train = self.ada.decision_function(self.trainingData)
        self.class_proba = self.ada.predict_proba(self.testingData)[:, -1]



    def plotFeatureRanking(self):
        # We need this to run in batch because it complains about not being able to open display
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        #plot the feature ranking
        pl.figure()
        pl.title("Feature importances Ada")
        pl.bar(xrange(len(self.variableNamesSorted)), self.importancesada[self.indicesada],
               color="r", yerr=self.std_mat[self.indicesada], align="center")
        pl.xticks(xrange(12), self.variableNamesSorted)#indicesada)
        pl.xlim([-1, 12])
        pl.show()

    def plotScores(self, returnROC = False, rocInput = []):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        from sklearn.metrics import roc_curve, auc

        plot_colors = "rb"
        plot_step = 1000.0
        class_names = "AB"
    # Plot the training points 
        pl.subplot(131)
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            idx = np.where(self.trainingClasses == i)
            pl.scatter(self.trainingData[idx, 0], self.trainingData[idx, 1],
                       c=c, cmap=pl.cm.Paired,
                       label="Class %s" % n)
        pl.axis("tight")
        pl.legend(loc='upper right')
        pl.xlabel("Decision Boundary")

    # Plot the class probabilities


        for i, n, c in zip(xrange(2), class_names, plot_colors):
            pl.hist(self.class_proba[self.testingClasses == i],
                    bins=50,
                    range=(0, 1),
                    facecolor=c,
                    label='Class %s' % n)
        pl.legend(loc='upper center')
        pl.ylabel('Samples')
        pl.xlabel('Class Probability')
    # Plot the two-class decision scores/ bdt scores
        pl.subplot(133)
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i],
                    bins=50,
                    range=(-1, 1),
                    facecolor=c,
                    label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('Two-class Decision Scores')
    
        pl.subplots_adjust(wspace=0.25)
        mean_tpr = 0.0
        mean_fpr = pl.linspace(0, 1, 100)
    
        pl.subplot(132)
        beginIdx = 0
        endIdx = len(self.testingData)#/2

        fpr_arr = []
        tpr_arr = []
        roc_auc_arr = []
        rej_arr = []

        for i in range(1):
            probas_ = self.ada.predict_proba(self.testingData[beginIdx:endIdx])
            #probas_ = self.ada.predict_proba(self.testingData[self.testingClasses == i])
    # Compute ROC curve and area the curve
            fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[beginIdx:endIdx], probas_[:,1])
            #fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[self.testingClasses == i], probas_[:,1],i)
    #mean_tpr += interp(mean_fpr, fpr, tpr)
    #mean_tpr[0] = 0.0
            roc_auc = auc(tpr,rej)#auc(fpr, tpr)
            fpr_arr.append(fpr)
            tpr_arr.append(tpr)
            roc_auc_arr.append(roc_auc)
            rej_arr.append(rej)
            pl.plot(tpr_arr[i], rej_arr[i], lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc_arr[i]), color=plot_colors[i])
            beginIdx = endIdx
            endIdx = len(self.testingData)
        if len(rocInput)>0:
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1, label='ROC fold %d (area = %0.2f)' % (2, rocInput[2][0]), color=plot_colors[1])
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr]

        pl.show()

    def plotBDTScores(self):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"
        plot_step = 1000.0
        alpha_h = [1.0, 0.7]
        class_names = ['Background', 'Signal']
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i],
                    bins=50,
                    range=(-1, 1),
                    facecolor=c,
                    alpha=alpha_h[i],
                    label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('BDT Scores')        
        pl.savefig('BDTScores'+self.name+'.png')

    def plotROC(self, returnROC = False, rocInput = []):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        from sklearn.metrics import roc_curve, auc

        beginIdx = 0
        endIdx = len(self.testingData)#/2
        plot_colors = "rb"
        plot_step = 1000.0
        class_names = "AB"
        fpr_arr = []
        tpr_arr = []
        roc_auc_arr = []
        rej_arr = []
        names = []

        pl.xlabel("Signal Efficiency")
        pl.ylabel("Background Rejection") 
        pl.title("ROC curves")

        for i in range(1):
            probas_ = self.ada.predict_proba(self.testingData[beginIdx:endIdx])
            #probas_ = self.ada.predict_proba(self.testingData[self.testingClasses == i])
    # Compute ROC curve and area the curve
            fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[beginIdx:endIdx], probas_[:,1])
            #fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[self.testingClasses == i], probas_[:,1],i)
    #mean_tpr += interp(mean_fpr, fpr, tpr)
    #mean_tpr[0] = 0.0
            roc_auc = auc(tpr,rej)#auc(fpr, tpr)
            fpr_arr.append(fpr)
            tpr_arr.append(tpr)
            roc_auc_arr.append(roc_auc)
            rej_arr.append(rej)
            names.append(self.name)

            beginIdx = endIdx
            endIdx = len(self.testingData)
        if len(rocInput)>0:
            label_bkg = rocInput[4][0]
            if '_A' in rocInput[4][0]:
                label_bkg = 'even event number'
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1, label='ROC %s (area = %0.2f)' % (label_bkg, rocInput[2][0]), color=plot_colors[1])
    
        if not returnROC:
            label_bkg = self.name
            if '_B' in self.name:
                label_bkg = 'odd event number'
            pl.plot(tpr_arr[i], rej_arr[i], lw=1, label='ROC %s (area = %0.2f)' % (label_bkg, roc_auc_arr[i]), color=plot_colors[i])
        pl.legend(loc='lower left')
        pl.savefig("roc_combined_"+self.name+".png")
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr, names]
        pl.show()
        
    def plotDecisionBoundaries(self):
        import numpy as np
        import pylab as pl
        from matplotlib.colors import ListedColormap
        from sklearn.preprocessing import StandardScaler
        #from sklearn.cross_validation import train_test_split
         # just plot the dataset first
        cm = pl.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        #self.trainingData = StandardScaler().fit_transform(self.trainingData)
        #self.testingData = StandardScaler().fit_transform(self.testingData)
        #X_train = StandardScaler().fit_transform(self.twoclass_output_train)
        h = 0.1
        h2 = 0.01
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
        # get most important variable indices
        idx1 = self.foundVariables.index(self.variableNamesSorted[0])
        idx2 = self.foundVariables.index(self.variableNamesSorted[1])
        
        x_min, x_max = self.trainingData[np.argmin(self.trainingData[:, idx1])][idx1] - .1, self.trainingData[np.argmax(self.trainingData[:, idx1])][idx1] + .1
        y_min, y_max = self.trainingData[np.argmin(self.trainingData[:, idx2])][idx2]- .01, self.trainingData[np.argmax(self.trainingData[:,idx2])][idx2] + .01
        x_min2, x_max2 = self.testingData[np.argmin(self.testingData[:, idx1])][idx1] - .1, self.testingData[np.argmax(self.testingData[:, idx1])][idx1] + .1
        y_min2, y_max2 = self.testingData[np.argmin(self.testingData[:, idx2])][idx2] - .01, self.testingData[np.argmax(self.testingData[:, idx2])][idx2] + .01

        xmin = min(x_min,x_min2)
        xmax = max(x_max,x_max2)
        ymin = min(y_min, y_min2)
        ymax = max(y_max,y_max2)
        xx, yy = np.meshgrid(np.arange(xmin, xmax, float((xmax-xmin)/25.0)),
                             np.arange(ymin, ymax, float((ymax-ymin)/25.0)))

        # get mean values for other variables
        means = np.mean(self.testingData, axis=0)
        means = np.tile(means, (xx.shape[1]*xx.shape[0],1))
        for j in xrange(xx.shape[0]):
            for k in xrange(xx.shape[1]):
                # flat row index must follow row-major order so that Z.reshape(xx.shape) below lines up
                means[j*xx.shape[1] + k][idx1] = xx[j][k]
                means[j*xx.shape[1] + k][idx2] = yy[j][k]
        #print 'shape X: '
        #print X.shape
        print 'shape xx: '
        print xx.shape
        print 'shape yy: '
        print yy.shape

        #rav = np.c_[xx.ravel(), yy.ravel()]
        print 'shape means: '
        print means.shape
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        #if hasattr(clf, "decision_function"):
        #    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        #else:
        Z = self.ada.predict_proba(means)[:, 1]
        print 'Z shape:'
        print Z.shape
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        figure = pl.figure()
        ax = pl.axes()
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        #for i, n in zip(xrange(2), class_names):
        #    idx = np.where(self.trainingClasses == i)
        ax.scatter(self.trainingData[:, idx1], self.trainingData[:, idx2],
                   c=self.trainingClasses[:], cmap=cm_bright)
        #for i, n in zip(xrange(2), class_names):
        #    idx = np.where(self.testingClasses == i)
        ax.scatter(self.testingData[:, idx1], self.testingData[:, idx2],
                       c=self.testingClasses[:], cmap=cm_bright, alpha=0.6)

        #ax.scatter(X_train[:, 0], X_training[:, 1], c=self.trainingClasses, cmap=cm_bright)
        # and testing points
        #ax.scatter(X[:, 0], X[:, 1], c=self.testingClasses, cmap=cm_bright,
        #           alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title("adaBoost")
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % self.score).lstrip('0'),
                size=15, horizontalalignment='right')
        pl.savefig("adaBoostDecisionBoundaries"+self.name+".png")
        pl.show()
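# Added sketch: a minimal, hedged usage example of the adaBoost wrapper above. The variable
# names (found_vars, X_tr, y_tr, w_tr, X_te, y_te) are hypothetical stand-ins for data
# prepared elsewhere in the project.
#   booster = adaBoost(found_vars, X_tr, y_tr, w_tr, X_te, y_te, 'sig_bkgA', 'bkgA')
#   booster.run()                     # fit, rank the features and fill the BDT score arrays
#   booster.plotBDTScores()           # saves BDTScores<name>.png
#   roc = booster.plotROC(returnROC=True)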
Code example #37
#log
precision_l, recall_l, thresholds_l = precision_recall_curve(test["los"], log.decision_function(test_variables))
pl.plot(recall_l, precision_l)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("LogisticRegression")
pl.show()
#cart
precision_c, recall_c, thresholds_c = precision_recall_curve(test["los"], test_cart_prob[::,1])
pl.plot(recall_c, precision_c)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("CART")
pl.show()
#ad
precision_ad, recall_ad, thresholds_ad = precision_recall_curve(test["los"], ad.decision_function(test_variables))
pl.plot(recall_ad, precision_ad)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("AdBoosting")
pl.show()
#Naive
precision_n, recall_n, thresholds_n = precision_recall_curve(test["los"], test_naive_prob[::,1])
pl.plot(recall_n, precision_n)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("NaiveBayes")
pl.show()
#integral
plt.plot(recall_l, precision_l)
plt.plot(recall_c, precision_c)
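# Added sketch: the listing is cut off here; a minimal, hedged guess at how the combined
# precision-recall overlay might be finished (labels, legend and the remaining two curves
# are assumptions, not part of the original source):
plt.plot(recall_ad, precision_ad)
plt.plot(recall_n, precision_n)
plt.xlabel("recall")
plt.ylabel("precision")
plt.title("Precision-recall overview")
plt.legend(["LogisticRegression", "CART", "AdaBoost", "NaiveBayes"], loc="best")
plt.show()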
Code example #38
File: ada_adj.py Project: hlibe/FinTech-of-Networks
                    X_test = min_max_scaler.fit_transform(X_test)
                    #sc = StandardScaler()
                    #X_train = sc.fit_transform(X_train)
                    #X_test = sc.fit_transform(X_test)
                    classifier = AdaBoostClassifier(
                        DecisionTreeClassifier(
                            max_depth=max_depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf),
                        n_estimators=n_estimators,
                        learning_rate=learning_rate)
                    classifier.fit(X_train, y_train)
                    list_feaimp.append(classifier.feature_importances_)
                    print(classifier.feature_importances_)

                    y_train_pred = classifier.decision_function(X_train)
                    y_test_pred = classifier.decision_function(X_test)

                    train_fpr, train_tpr, tr_thresholds = roc_curve(
                        y_train, y_train_pred)
                    test_fpr, test_tpr, te_thresholds = roc_curve(
                        y_test, y_test_pred)
                    print(auc(train_fpr, train_tpr))
                    print(auc(test_fpr, test_tpr))

                    plt.grid()
                    plt.plot(train_fpr,
                             train_tpr,
                             label=" AUC TRAIN =" +
                             str(auc(train_fpr, train_tpr)))
                    plt.plot(test_fpr,
Code example #39
                                argParser.prog.split('.')[0], vversion)
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
logger.info('Save to %s directory', output_directory)

#pyplot settings
class_names = ["SM Test", "BSM Test", "SM Train", "BSM Train"]
plot_colors = ["#000cff", "#ff0000", "#9ba0ff", "#ff8d8d"]
plt.figure(figsize=(18, 8)).suptitle(
    "Decision Boundaries for test- (top) and trainings-dataset (bottom) \n n: "
    + str(args.n_est),
    fontsize=18)
plot_step = 0.075

#setup the decision functions, which will also be used by the histograms
test_decision_function = bdt.decision_function(X_test)
train_decision_function = bdt.decision_function(X_train)

#show the decision shape for test data
plt.subplot(2, 1, 1)
if args.ptz_only:
    #generate the 1-d grid of values on which the decision function (and the cut values) will be evaluated
    x_min, x_max = X_test.min() - 1, X_test.max() + 1
    xx = np.arange(x_min, x_max, plot_step)
    xx = np.reshape(xx, (-1, 1))

    #map the decision function
    Z = bdt.decision_function(xx)
    Z = np.reshape(Z, (-1, 1))

    #get the limits and plot the function
Code example #40
Y = np.concatenate((Y_ref,Y_new),0)

# Feature scaling
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# BDT TRAINING AND TESTING
print "Building and training BDT"
clf = AdaBoostClassifier(n_estimators=100,base_estimator=DecisionTreeClassifier(max_depth=1))
clf.fit(X_train,Y)

# Testing
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
output_train = clf.decision_function(X_train)
output_test = clf.decision_function(X_test)
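# Added comment: the percentage printouts below assume the samples were stacked as
# [reference events (background, label 0) first, then new events (signal, label 1)],
# matching Y = np.concatenate((Y_ref, Y_new)) above.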

print "Training sample...."
print "  Signal identified as signal (%)        : ",100.0*np.sum(pred_train[nReferenceEvents:nReferenceEvents+nNewEvents]==1.0)/nNewEvents
print "  Signal identified as background (%)    : ",100.0*np.sum(pred_train[nReferenceEvents:nReferenceEvents+nNewEvents]==0.0)/nNewEvents
print "  Background identified as signal (%)    : ",100.0*np.sum(pred_train[0:nReferenceEvents]==1.0)/nReferenceEvents
print "  Background identified as background (%): ",100.0*np.sum(pred_train[0:nReferenceEvents]==0.0)/nReferenceEvents
print ""
print "Testing sample...."
print "  Signal identified as signal (%)        : ",100.0*np.sum(pred_test[nReferenceEvents:nReferenceEvents+nNewEvents]==1.0)/nNewEvents
print "  Signal identified as background (%)    : ",100.0*np.sum(pred_test[nReferenceEvents:nReferenceEvents+nNewEvents]==0.0)/nNewEvents
print "  Background identified as signal (%)    : ",100.0*np.sum(pred_test[0:nReferenceEvents]==1.0)/nReferenceEvents
print "  Background identified as background (%): ",100.0*np.sum(pred_test[0:nReferenceEvents]==0.0)/nReferenceEvents

# Plotting - probabilities
Code example #41
def main():
    #load data
    train_data, test_data = load_data('spambase/spambase.data')

    #Using Adaboost Classifier with 200 classifiers
    print('Using AdaBoost ...')
    clf = AdaBoostClassifier(n_estimators=200, learning_rate=1)
    clf.fit(train_data.X, train_data.y)

    #Training and Testing Accuracy
    print('Training Accuracy: ', clf.score(train_data.X, train_data.y))
    print('Testing Accuracy: ', clf.score(test_data.X, test_data.y))

    #Creating Confusion Matrix
    prediction = clf.predict(test_data.X)
    confusion_matrix = np.zeros((2, 2))
    accuracy = 0
    for i in range(len(prediction)):
        if prediction[i] == 0 and test_data.y[i] == 0:
            confusion_matrix[0][0] += 1
            accuracy += 1
        elif prediction[i] == 1 and test_data.y[i] == 1:
            confusion_matrix[1][1] += 1
            accuracy += 1
        elif prediction[i] == 0 and test_data.y[i] == 1:
            confusion_matrix[1][0] += 1

        elif prediction[i] == 1 and test_data.y[i] == 0:
            confusion_matrix[0][1] += 1

    #Outputting confusion matrix
    print('\n')
    print('Confusion Matrix')
    print(' prediction')
    print('    0  1')
    print('   -----')
    print(' 0| ' + str(int(confusion_matrix[0][0])) + '  ' +
          str(int(confusion_matrix[0][1])))
    print(' 1| ' + str(int(confusion_matrix[1][0])) + '  ' +
          str(int(confusion_matrix[1][1])))
    print('\n')

    #Creating ROC curves for different numbers of estimators
    clf_200 = AdaBoostClassifier(n_estimators=200, learning_rate=1)
    clf_200.fit(train_data.X, train_data.y)
    y_score_200 = clf_200.decision_function(test_data.X)
    fpr_200, tpr_200, thresholds_200 = roc_curve(test_data.y, y_score_200)

    clf_500 = AdaBoostClassifier(n_estimators=500, learning_rate=1)
    clf_500.fit(train_data.X, train_data.y)
    y_score_500 = clf_500.decision_function(test_data.X)
    fpr_500, tpr_500, thresholds_500 = roc_curve(test_data.y, y_score_500)

    clf_20 = AdaBoostClassifier(n_estimators=20, learning_rate=1)
    clf_20.fit(train_data.X, train_data.y)
    y_score_20 = clf_20.decision_function(test_data.X)
    fpr_20, tpr_20, thresholds_20 = roc_curve(test_data.y, y_score_20)

    #Plotting Roc Curve for T = 20,200 and 500
    plt.plot(fpr_200, tpr_200, 'r-', label='T= 200')
    plt.plot(fpr_500, tpr_500, 'g-', label='T= 500')
    plt.plot(fpr_20, tpr_20, 'b-', label='T= 20')
    plt.legend()
    plt.title("ROC curve for AdaBoost Classifier")
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.savefig('adaboost.png')
    plt.show()

    #Finding the features' importance
    feature_names =['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over' , 'word_freq_remove', 'word_freq_internet','word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report',\
    'word_freq_addresses','word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',\
    'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(',\
    'char_freq_[', 'char_freq_!','char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total']
    feature_imp = clf_200.feature_importances_
    print('\n')
    print('Feature names ...')
    print(feature_names)
    print('\n')
    print('Feature importance ...')
    print(feature_imp)
    print('\n')
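    # Added sketch: the raw importance array above is hard to read; a minimal, hedged way to
    # print the top-ranked features, reusing feature_names and feature_imp from above:
    ranked = sorted(zip(feature_names, feature_imp), key=lambda pair: pair[1], reverse=True)
    print('Top 10 features by importance ...')
    for name, imp in ranked[:10]:
        print('  {}: {:.4f}'.format(name, imp))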

    #Using the email parser can we detect if the email is spam ?
    print('\nParsing spam email...')
    email = Data()
    email.X, email.y = parse_email('antispamSpam.txt', 1)
    prediction = clf.predict(email.X)[0]
    true_label = email.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected the spam.')
    else:
        print('Failed to detect the spam.')

    #Using the email parser can we detect if the email is not spam ?
    print('\nParsing Sara\'s email...')
    not_spam = Data()
    not_spam.X, not_spam.y = parse_email('saraEmail.txt', 0)
    prediction = clf.predict(not_spam.X)[0]
    true_label = not_spam.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected that the email is safe.')
    else:
        print('Misclassified the email as spam.')

    print('\n')
    print('\n')
    #Using RandomForest Classifier with 20 classifiers
    print('Using RandomForest ...')
    clf = RandomForestClassifier(n_estimators=20, criterion='gini')
    clf.fit(train_data.X, train_data.y)

    #Training and Testing Accuracy
    print('Training Accuracy: ', clf.score(train_data.X, train_data.y))
    print('Testing Accuracy: ', clf.score(test_data.X, test_data.y))

    prediction = clf.predict(test_data.X)

    #Finding the features' importance
    feature_names =['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over' , 'word_freq_remove', 'word_freq_internet','word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report',\
    'word_freq_addresses','word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',\
    'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(',\
    'char_freq_[', 'char_freq_!','char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total']
    feature_imp = clf.feature_importances_
    print('\n')
    print('Feature names ...')
    print(feature_names)
    print('\n')
    print('Feature importance ...')
    print(feature_imp)
    print('\n')

    #Creating Confusion Matrix
    confusion_matrix = np.zeros((2, 2))
    accuracy = 0
    for i in range(len(prediction)):
        if prediction[i] == 0 and test_data.y[i] == 0:
            confusion_matrix[0][0] += 1
            accuracy += 1
        elif prediction[i] == 1 and test_data.y[i] == 1:
            confusion_matrix[1][1] += 1
            accuracy += 1
        elif prediction[i] == 0 and test_data.y[i] == 1:
            confusion_matrix[1][0] += 1

        elif prediction[i] == 1 and test_data.y[i] == 0:
            confusion_matrix[0][1] += 1

    #Outputting confusion matrix
    print('\n')
    print('Confusion Matrix')
    print(' prediction')
    print('    0  1')
    print('   -----')
    print(' 0| ' + str(int(confusion_matrix[0][0])) + '  ' +
          str(int(confusion_matrix[0][1])))
    print(' 1| ' + str(int(confusion_matrix[1][0])) + '  ' +
          str(int(confusion_matrix[1][1])))
    print('\n')

    #Using the email parser can we detect if the email is spam ?
    print('\nParsing spam email...')
    email = Data()
    email.X, email.y = parse_email('antispamSpam.txt', 1)
    prediction = clf.predict(email.X)[0]
    true_label = email.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected the spam.')
    else:
        print('Failed to detect the spam.')

    #Using the email parser can we detect if the email is not spam ?
    print('\nParsing Sara\'s email...')
    not_spam = Data()
    not_spam.X, not_spam.y = parse_email('saraEmail.txt', 0)
    prediction = clf.predict(not_spam.X)[0]
    true_label = not_spam.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected that the email is safe.')
    else:
        print('Misclassified the email as spam.')
Code example #42
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")

# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1],
                c=c, cmap=plt.cm.Paired,
                label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel("Decision Boundary")

# Plot the two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
    plt.hist(twoclass_output[y == i],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Decision Scores')
Code example #43
    def twoClassDemo(self):
        import numpy as np
        import matplotlib.pyplot as plt

        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.datasets import make_gaussian_quantiles

        # Construct dataset
        X1, y1 = make_gaussian_quantiles(cov=2.,
                                         n_samples=200,
                                         n_features=2,
                                         n_classes=2,
                                         random_state=1)
        X2, y2 = make_gaussian_quantiles(mean=(3, 3),
                                         cov=1.5,
                                         n_samples=300,
                                         n_features=2,
                                         n_classes=2,
                                         random_state=1)
        X = np.concatenate((X1, X2))
        y = np.concatenate((y1, -y2 + 1))

        # Create and fit an AdaBoosted decision tree
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME",
                                 n_estimators=200)

        bdt.fit(X, y)

        plot_colors = "br"
        plot_step = 0.02
        class_names = "AB"

        plt.figure(figsize=(10, 5))

        # Plot the decision boundaries
        plt.subplot(121)
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))

        Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
        plt.axis("tight")

        # Plot the training points
        for i, n, c in zip(range(2), class_names, plot_colors):
            idx = np.where(y == i)
            plt.scatter(X[idx, 0],
                        X[idx, 1],
                        c=c,
                        cmap=plt.cm.Paired,
                        s=20,
                        edgecolor='k',
                        label="Class %s" % n)
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.legend(loc='upper right')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('Decision Boundary')

        # Plot the two-class decision scores
        twoclass_output = bdt.decision_function(X)
        plot_range = (twoclass_output.min(), twoclass_output.max())
        plt.subplot(122)
        for i, n, c in zip(range(2), class_names, plot_colors):
            plt.hist(twoclass_output[y == i],
                     bins=10,
                     range=plot_range,
                     facecolor=c,
                     label='Class %s' % n,
                     alpha=.5,
                     edgecolor='k')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, y1, y2 * 1.2))
        plt.legend(loc='upper right')
        plt.ylabel('Samples')
        plt.xlabel('Score')
        plt.title('Decision Scores')

        plt.tight_layout()
        plt.subplots_adjust(wspace=0.35)
        plt.show()

        import numpy as np
        import matplotlib.pyplot as plt

        from sklearn import datasets
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.metrics import zero_one_loss
        from sklearn.ensemble import AdaBoostClassifier

        n_estimators = 400
        # A learning rate of 1. may not be optimal for both SAMME and SAMME.R
        learning_rate = 1.

        X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

        X_test, y_test = X[2000:], y[2000:]
        X_train, y_train = X[:2000], y[:2000]

        dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
        dt_stump.fit(X_train, y_train)
        dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

        dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)
        dt.fit(X_train, y_train)
        dt_err = 1.0 - dt.score(X_test, y_test)

        ada_discrete = AdaBoostClassifier(base_estimator=dt_stump,
                                          learning_rate=learning_rate,
                                          n_estimators=n_estimators,
                                          algorithm="SAMME")
        ada_discrete.fit(X_train, y_train)

        ada_real = AdaBoostClassifier(base_estimator=dt_stump,
                                      learning_rate=learning_rate,
                                      n_estimators=n_estimators,
                                      algorithm="SAMME.R")
        ada_real.fit(X_train, y_train)

        fig = plt.figure()
        ax = fig.add_subplot(111)

        ax.plot([1, n_estimators], [dt_stump_err] * 2,
                'k-',
                label='Decision Stump Error')
        ax.plot([1, n_estimators], [dt_err] * 2,
                'k--',
                label='Decision Tree Error')

        ada_discrete_err = np.zeros((n_estimators, ))
        for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
            ada_discrete_err[i] = zero_one_loss(y_pred, y_test)

        ada_discrete_err_train = np.zeros((n_estimators, ))
        for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):
            ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)

        ada_real_err = np.zeros((n_estimators, ))
        for i, y_pred in enumerate(ada_real.staged_predict(X_test)):
            ada_real_err[i] = zero_one_loss(y_pred, y_test)

        ada_real_err_train = np.zeros((n_estimators, ))
        for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
            ada_real_err_train[i] = zero_one_loss(y_pred, y_train)

        ax.plot(np.arange(n_estimators) + 1,
                ada_discrete_err,
                label='Discrete AdaBoost Test Error',
                color='red')
        ax.plot(np.arange(n_estimators) + 1,
                ada_discrete_err_train,
                label='Discrete AdaBoost Train Error',
                color='blue')
        ax.plot(np.arange(n_estimators) + 1,
                ada_real_err,
                label='Real AdaBoost Test Error',
                color='orange')
        ax.plot(np.arange(n_estimators) + 1,
                ada_real_err_train,
                label='Real AdaBoost Train Error',
                color='green')

        ax.set_ylim((0.0, 0.5))
        ax.set_xlabel('n_estimators')
        ax.set_ylabel('error rate')

        leg = ax.legend(loc='upper right', fancybox=True)
        leg.get_frame().set_alpha(0.7)

        plt.show()
Code example #44
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
Code example #45
File: Evaluate.py Project: mrelich/MuonAna
def evaluate(dt_eval, dt_train, opts):

    # If modelinput is specified then read in model
    bdt = None
    if len(opts.modelinput) != 0:
        bdt = joblib.load(opts.modelinput)
        print "Loaded model back ", opts.bdtname
        print bdt
    else:
        print "Model not specified..."
        print "Creating classification and training again"
        bdt = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=opts.maxdepth),
            algorithm="SAMME",
            n_estimators=opts.ntrees,
            learning_rate=opts.lrate,
        )

        bdt.fit(dt_train.getDataNoWeight(), dt_train.targets)

    # Now get the bdt scores
    sig_scores = bdt.decision_function(dt_eval.getDataNoWeight()[dt_eval.targets > 0.5])
    bkg_scores = bdt.decision_function(dt_eval.getDataNoWeight()[dt_eval.targets < 0.5])

    # Get weights
    sig_weights = dt_eval.getDataWeights()[dt_eval.targets > 0.5] * dt_eval.sf
    bkg_weights = dt_eval.getDataWeights()[dt_eval.targets < 0.5] * dt_eval.sf

    # Print some information for a set of cuts
    cuts = np.arange(-1, 1, 0.05)
    for cut in cuts:
        print "------------------------------------------"
        print "cut: ", cut
        print "\tSignal:    ", sum(sig_weights[sig_scores > cut])
        print "\tBackground:", sum(bkg_weights[bkg_scores > cut])

    # Make figure and axis
    fig, ax = plt.subplots(ncols=1, figsize=(10, 7))

    # Set minimum and maximum for x-axis
    xmin = -1
    xmax = 1
    nbins = 100

    # plt.yscale("log")
    plt.ylim([1e-2, 1e6])

    # Add error bars
    plotErrorBars(sig_scores, sig_weights, nbins, xmin, xmax, "r", "signal")

    # Add error bars
    plotErrorBars(bkg_scores, bkg_weights, nbins, xmin, xmax, "b", "background")

    # Make hist for signal
    plt.hist(
        sig_scores,
        weights=sig_weights,
        color="r",
        range=(xmin, xmax),
        alpha=0.5,
        bins=nbins,
        log=True,
        histtype="stepfilled",
    )

    # Make hist for bkg
    plt.hist(
        bkg_scores,
        weights=bkg_weights,
        color="b",
        range=(xmin, xmax),
        alpha=0.5,
        bins=nbins,
        log=True,
        histtype="stepfilled",
    )

    # Miscellanous
    plt.xlabel("BDT output")
    plt.ylabel("Events / year / bin")
    plt.legend(loc="best")
    plt.grid()
    plt.xticks(np.arange(-1, 1.1, 0.1))
    plt.tight_layout()
    # ax.set_yscale("log")

    plt.savefig("plots/evaluate/WeightedResult_" + opts.bdtname + "_fromModel.png")
Code example #46
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
Code example #47
File: read_tree.py Project: alexshires/ml
    # label signal (first in the concatenation below) as 1 and background as 0
    os = np.ones(len(sigtrain))
    zs = np.zeros(len(bkgtrain))
    print "adding samples together"
    X_train = pandas.concat([sigtrain, bkgtrain])
    y_train = np.append(os, zs)
    print "training"
    base_ada.fit(X=X_train, y=y_train)

    os = np.ones(len(sigtest))
    zs = np.zeros(len(bkgtest))
    print "adding samples together"
    X_test = pandas.concat([sigtest, bkgtest])
    y_test = np.append(os, zs)


    sigoutput = base_ada.decision_function(X=sigtest)
    bkgoutput = base_ada.decision_function(X=bkgtest)
    from sklearn.metrics import accuracy_score
    test_errors = []
    for te in base_ada.staged_predict(X_test):
        test_errors.append(1.0 - accuracy_score(y_test, te))
    ntrees = len(test_errors)
    estimator_errors = base_ada.estimator_errors_[:ntrees]
    estimator_weights = base_ada.estimator_weights_[:ntrees]

    from matplotlib.ticker import LinearLocator

    with PdfPages("bdtplots.pdf") as pdf:
        xs, xe, ys, ye = get_hist(bkgoutput)
        plt.errorbar(xs, ys, xerr=xe, yerr=ye,
                     color='red', fmt='.')
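        # (The original example is truncated at this point.  What follows is a
        #  hedged sketch, not the project's code: plot the staged test error
        #  computed above against the number of boosting iterations.)
        plt.figure()
        plt.plot(range(1, ntrees + 1), test_errors, label='test error')
        plt.xlabel('number of boosting iterations')
        plt.ylabel('1 - accuracy')
        plt.legend(loc='best')
        pdf.savefig()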
コード例 #48
0
ファイル: functionsBDT.py プロジェクト: lecdawson/BDTAnalysis
def train_bdt():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    # print("Sampling 10% of the data for training")
    # #Create smaller samples, 10% of the size
    # signal = np.asarray(random.sample(signal, int((len(signal))*0.1)))
    # bkg2nu = np.asarray(random.sample(bkg2nu, int((len(bkg2nu))*0.1)))
    # bkg214Bi = np.asarray(random.sample(bkg214Bi, int((len(bkg214Bi))*0.1)))
    # bkg208Tl = np.asarray(random.sample(bkg208Tl, int((len(bkg208Tl))*0.1)))
    # bkgRn = np.asarray(random.sample(bkgRn, int((len(bkgRn))*0.1)))

    print("Creating arrays...")
    # X = Features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))

    # y = Labels (i.e. what it is, signal / background)
    y = np.concatenate(
        (np.ones(signal.shape[0]), np.zeros(bkg2nu.shape[0]),
         np.zeros(bkg214Bi.shape[0]), np.zeros(bkg208Tl.shape[0]),
         np.zeros(bkgRn.shape[0])))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=48)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
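
    # Note: the dev/eval and train/test splits above are drawn independently
    # from the same X and y (different random_state), so the subsets overlap.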

    # print("Oversampling...")
    # # Oversample to improve representation of backgrounds
    # ros = RandomOverSampler(random_state=0)
    # X_resampled, y_resampled = ros.fit_sample(X_train, y_train)
    # X_test_resampled, y_test_resampled = ros.fit_sample(X_test, y_test)
    # X_dev_resampled, y_dev_resampled = ros.fit_sample(X_dev, y_dev)
    # X_eval_resampled, y_eval_resampled = ros.fit_sample(X_eval, y_eval)
    # print(sorted(Counter(y_resampled).items()))

    print("Removing weights..")
    # Remove weights on backgrounds (will be passed in to the BDT later)
    # 30/09/19 - removed re sampling
    X_train_weights = X_train[:, 6]
    X_train_new = np.delete(X_train, 6, axis=1)
    X_test_new = np.delete(X_test, 6, axis=1)

    X_dev_weights = X_dev[:, 6]
    X_dev_new = np.delete(X_dev, 6, axis=1)
    X_eval_new = np.delete(X_eval, 6, axis=1)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12,
                                min_samples_split=0.5,
                                min_samples_leaf=400)

    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - pass in weights from earlier
    fitted_tree = bdt.fit(X_train_new, y_train, sample_weight=X_train_weights)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train_new)

    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test_new)

    # target_names follow the sorted label order (0 = background, 1 = signal)
    print(
        classification_report(y_train,
                              y_predicted_train,
                              target_names=["background", "signal"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train, bdt.decision_function(X_train_new))))

    print(
        classification_report(y_test,
                              y_predicted_test,
                              target_names=["background", "signal"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test, bdt.decision_function(X_test_new))))

    plot_roc_curve(bdt, X_test_new, y_test)
    compare_train_test(bdt, X_train_new, y_train, X_test_new, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/weight/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train_new, save_path + 'bdt_X_train_new.joblib')
    dump(X_test_new, save_path + 'bdt_X_test_new.joblib')
    dump(X_dev_new, save_path + 'bdt_X_dev_new.joblib')
    dump(X_dev_weights, save_path + 'bdt_X_dev_weights.joblib')
    dump(X_eval_new, save_path + 'bdt_X_eval_new.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')

    print("Finished Training.")
コード例 #49
0
roc_auc_s = auc(fpr_s, tpr_s)
cm_s = confusion_matrix(y_test, s_pre_y)
print(cm_s)
print('s accuracy:', accuracy_score(y_test, s_pre_y))
print('s recall:', recall_score(y_test, s_pre_y))
endtimes = datetime.datetime.now()
print(endtimes - starttime)

ada = AdaBoostClassifier(DecisionTreeClassifier(min_samples_leaf=6,
                                                min_samples_split=10),
                         n_estimators=300,
                         learning_rate=2)
ada.fit(x_train, y_train)
a_pre_y = list(ada.predict(x_test))

y_a2_score = ada.decision_function(x_test)
fpr_a2, tpr_a2, threshold_a2 = roc_curve(y_test, y_a2_score)
roc_auc_a2 = auc(fpr_a2, tpr_a2)
cm_a = confusion_matrix(y_test, a_pre_y)
print(cm_a)
print('a accuracy:', accuracy_score(y_test, a_pre_y))
print('a recall:', recall_score(y_test, a_pre_y))
endtimea = datetime.datetime.now()
print(endtimea - starttime)
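
The ROC points and AUC values for the two classifiers are computed above but not plotted in this excerpt; a hedged sketch of a comparison plot (an addition, not part of the original script) might look like:

import matplotlib.pyplot as plt

plt.plot(fpr_s, tpr_s, label='s (AUC = %.3f)' % roc_auc_s)
plt.plot(fpr_a2, tpr_a2, label='AdaBoost (AUC = %.3f)' % roc_auc_a2)
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # random-guess line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()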
"""
ada1 = AdaBoostClassifier(DecisionTreeClassifier(min_samples_leaf=5), n_estimators=200, learning_rate=1)
ada1.fit(x_train,y_train)
a1_pre_y=list(ada1.predict(x_test))
print('a accuracy:',  accuracy_score(y_test, a1_pre_y))
print('a recall:', recall_score(y_test, a1_pre_y))
endtimea = datetime.datetime.now()