def main():
    print 'Loading training data ...'
    data_train = pd.read_csv('csv/CamKt12LCTopoSplitFilteredMu100SmallR30YCut414tev_350_500_vxp_0_99-merged.csv')
    r = np.random.rand(data_train.shape[0])
    #Algorithm = 'AKT10LCTRIM530'
    plt.figure(1)
    Y_train = data_train['label'][r < 0.9]
    # W_train = data_train['weight'][r<0.9]
    Y_valid = data_train['label'][r >= 0.9]
    # W_valid = data_train['weight'][r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)

    for varset in itertools.combinations(data_train.columns.values[1:-1], 2):
        print list(varset)
        X_train = data_train[list(varset)][r < 0.9]
        X_valid = data_train[list(varset)][r >= 0.9]
        #gbc = Pipeline([("scale", StandardScaler()), ("gbc", GBC(n_estimators=1, verbose=1, max_depth=10, min_samples_leaf=50))])
        #gbc = GBC(n_estimators=20, verbose=1, max_depth=10, min_samples_leaf=50)
        #gbc = GaussianNB()
        dt = DC(max_depth=3, min_samples_leaf=0.05 * len(X_train))
        abc = ABC(dt, algorithm='SAMME', n_estimators=800, learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train.values, Y_train.values)  # sample_weight=W_train.values
        print 'Done.. Applying to validation sample and drawing ROC'

        prob_predict_valid = abc.predict(X_valid)  #[:,1]
        # print prob_predict_valid
        Y_score = abc.decision_function(X_valid.values)
        print Y_score
        fpr, tpr, _ = roc_curve(Y_valid.values, Y_score)  # W_valid.values
        labelstring = 'And'.join(var.replace('_', '') for var in varset)
        print labelstring
        plt.plot(tpr, (1 - fpr), label=labelstring)

        plt.figure(2)
        plt.hist(abc.decision_function(X_valid[Y_valid == 1.]).ravel(),
                 color='r', alpha=0.5, range=(-1.0, 1.0), bins=50)
        plt.hist(abc.decision_function(X_valid[Y_valid == 0.]).ravel(),
                 color='b', alpha=0.5, range=(-1.0, 1.0), bins=50)
        plt.xlabel("scikit-learn BDT output")
        plt.savefig(labelstring + 'bdtout.pdf')
        # labelstring = ' and '.join(var.replace(Algorithm, '') for var in varset)
        plt.figure(1)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1 - Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title('ROC Curve')
    plt.legend(loc="lower left", prop={'size': 6})
    #plt.show()
    plt.savefig('rocmva.pdf')
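# A minimal, self-contained sketch of the pattern above, on synthetic data
# (make_classification stands in for the jet-substructure CSV): for a
# two-class problem, AdaBoostClassifier.decision_function returns one score
# per sample, and that score (or predict_proba[:, 1]) is what roc_curve
# expects as its ranking variable.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier

X_all, y_all = make_classification(n_samples=2000, n_features=4, random_state=0)
mask = np.random.rand(len(y_all)) < 0.9  # same random train/valid split idea
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm='SAMME', n_estimators=50, learning_rate=0.5)
clf.fit(X_all[mask], y_all[mask])
scores = clf.decision_function(X_all[~mask])   # shape (n_valid,) for two classes
fpr, tpr, _ = roc_curve(y_all[~mask], scores)  # rank by score, not by predict()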
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))
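# The two tests above pin down the shape contract of
# AdaBoostClassifier.decision_function: (n_samples, n_classes) for multiclass
# problems and a flat (n_samples,) for binary ones. A standalone sketch of
# both cases (synthetic binary data as a stand-in):
from sklearn.datasets import load_iris, make_classification
from sklearn.ensemble import AdaBoostClassifier

iris_data = load_iris()
clf = AdaBoostClassifier().fit(iris_data.data, iris_data.target)
assert clf.decision_function(iris_data.data).shape == (len(iris_data.data), 3)

Xb, yb = make_classification(n_samples=100, random_state=0)
clf = AdaBoostClassifier().fit(Xb, yb)
assert clf.decision_function(Xb).shape == (len(Xb),)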
def n1check(d_train, d_test, opts):
    # Load the data with no weights and put it into panda format
    # for easier manipulation
    pd_train = pd.DataFrame(d_train.getDataNoWeight())
    pd_test = pd.DataFrame(d_test.getDataNoWeight())

    # Holder for results
    results = {}

    # Setup classifier
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                             n_estimators=opts.ntrees,
                             learning_rate=opts.lrate)

    # Train the classifier on total data set for comparison
    clf.fit(pd_train, d_train.targets)
    results['total'] = roc_auc_score(d_test.targets,
                                     clf.decision_function(pd_test))

    # Loop over the variables and store the results in dict
    keys = d_train.t_varnames
    for i in range(len(keys)):
        sub_train = pd_train.drop(i, axis=1)
        sub_test = pd_test.drop(i, axis=1)
        clf.fit(sub_train, d_train.targets)
        results[keys[i]] = roc_auc_score(d_test.targets,
                                         clf.decision_function(sub_test))

    # Now that we have the results, print all information
    print "--------------------------------------------"
    for key in results:
        print "Leaving out ", key, "gives score: ", results[key]
    print ""
def test_iris(): """Check consistency on dataset iris.""" classes = np.unique(iris.target) for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg) clf.fit(iris.data, iris.target) assert_array_equal(classes, clf.classes_) assert_equal(clf.predict_proba(iris.data).shape[1], len(classes)) assert_equal(clf.decision_function(iris.data).shape[1], len(classes)) score = clf.score(iris.data, iris.target) assert score > 0.9, "Failed with algorithm %s and score = %f" % \ (alg, score)
def main():
    Algorithm = 'CamKt12LCTopoSplitFilteredMu67SmallR0YCut9'
    print 'Loading training data ...'
    data_train = pd.read_csv(Algorithm + 'merged.csv')
    r = np.random.rand(data_train.shape[0])

    # Set label and weight vectors - and drop any unwanted training ones
    Y_train = data_train['label'].values[r < 0.5]
    # W_train = data_train['weight'].values[r<0.9]
    Y_valid = data_train['label'].values[r >= 0.5]
    # W_valid = data_train['weight'].values[r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)

    varcombinations = itertools.combinations(data_train.columns.values[1:-1], 2)
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)
    colors = plt.get_cmap('jet')(
        np.linspace(0, 1.0, combos(len(data_train.columns.values[1:-1]), 2)))

    for varset, color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r < 0.5]
        X_valid = data_train[list(varset)].values[r >= 0.5]

        dt = DC(max_depth=3, min_samples_leaf=0.05 * len(X_train))
        abc = ABC(dt, algorithm='SAMME', n_estimators=8, learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC'

        prob_predict_valid = abc.predict_proba(X_valid)[:, 1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)
        labelstring = ' And '.join(var.replace('_', '') for var in varset)
        print labelstring
        plt.plot(tpr, (1 - fpr), label=labelstring, color=color)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1 - Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm + ' ROC Curve')
    plt.legend(loc="lower left", prop={'size': 6})
    plt.savefig(Algorithm + 'rocmva.pdf')
def ada_boost(X_train, X_test, y_train, y_test, C=1):
    X1 = []
    X2 = []
    y1 = []
    y2 = []
    for x, y in zip(X_train, y_train):
        if y == 1:
            y1.append(y)
            X1.append(x)
        else:
            y2.append(y)
            X2.append(x)
    print(y1.count(1))
    print(y2.count(0))
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    # y = np.asarray(y)
    X = np.concatenate((X1, X2))
    y = np.concatenate((y1, y2))

    # Create and fit an AdaBoosted decision tree
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=200)
    bdt.fit(X, y)

    # Plot the two-class decision scores
    twoclass_output = bdt.decision_function(X)
    print(type(twoclass_output))
    # import IPython
    # IPython.embed()
    y_pre = bdt.predict(X_test)
    return y_pre, classification_report(y_test, y_pre)
#################
#     2 JET     #
#################

# Create BDT object.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3,
                                                min_samples_split=0.05),
                         learning_rate=0.15,
                         algorithm="SAMME",
                         n_estimators=200)

# Train BDT for 2 jet.
bdt.fit(train_2jet, train_2jet_class, sample_weight=train_2jet_weights)

# Get decision scores for test set.
twoclass_output = np.array(bdt.decision_function(test_2jet))

# Plot decision histogram.
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
plot_colors = 2 * "r" + 12 * "g" + "y" + 3 * "b" + 3 * "m"
plot_step = 0.02
class_names = ['qqZvvH125', 'qqWlvH125', 'Wbb', 'Wbc', 'Wcc', 'Wbl', 'Wcl',
               'Wl', 'Zbb', 'Zbc', 'Zcc', 'Zbl', 'Zcl', 'Zl', 'ttbar',
               'stopt', 'stops', 'stopWt', 'WW', 'ZZ', 'WZ']

for n, c in zip(class_names, plot_colors):
    this_data = twoclass_output[test_2jet_processes == n]
    this_weights = test_2jet_weights[test_2jet_processes == n] * SF_map_2jet[n]
    plt.hist(this_data,
clf = AdaBoostClassifier()
clf = clf.fit(X, Y)

X_test = exp_all[testing_idx]
Yprime_test = clf.predict(X_test)

print collections.Counter(health_classes[testing_idx] == Yprime_test)
print collections.Counter(zip(health_classes[testing_idx],
                              health_classes[testing_idx] == Yprime_test))

# In[180]:

clf.decision_function(X_test[0:10])

# In[213]:

import operator
imp = zip(range(0, 22283), clf.feature_importances_)
imp.sort(key=operator.itemgetter(1), reverse=True)
imp[0:20]

# In[182]:

health_classes[testing_idx][0:10]
# Plot the class probabilities
class_proba = ada.predict_proba(x)[:, -1]
pl.subplot(132)
for i, n, c in zip(xrange(2), class_names, plot_colors):
    pl.hist(class_proba[y == i],
            bins=20,
            range=(0, 1),
            facecolor=c,
            label='Class %s' % n)
pl.legend(loc='upper center')
pl.ylabel('Samples')
pl.xlabel('Class Probability')

# Plot the two-class decision scores
twoclass_output = ada.decision_function(x)
pl.subplot(133)
for i, n, c in zip(xrange(2), class_names, plot_colors):
    pl.hist(twoclass_output[y == i],
            bins=20,
            range=(-1, 1),
            facecolor=c,
            label='Class %s' % n)
pl.legend(loc='upper right')
pl.ylabel('Samples')
pl.xlabel('Two-class Decision Scores')
pl.subplots_adjust(wspace=0.25)
pl.show()
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):
    # '---------- Prepare Training ----------'
    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])

    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])

    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))

    print 'X_sig.shape: ', X_sig.shape
    print 'y_sig.shape: ', y_sig.shape
    print 'X_bkg.shape: ', X_bkg.shape
    print 'y_bkg.shape: ', y_bkg.shape
    print 'X.shape: ', X.shape
    print 'y.shape: ', y.shape

    # '---------- Prepare Testing ----------'
    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])

    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])

    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))

    print 'X_sig_test.shape: ', X_sig_test.shape
    print 'y_sig_test.shape: ', y_sig_test.shape
    print 'X_bkg_test.shape: ', X_bkg_test.shape
    print 'y_bkg_test.shape: ', y_bkg_test.shape
    print 'X_test.shape: ', X_test.shape
    print 'y_test.shape: ', y_test.shape

    # '---------- Model ----------'
    #scaler = preprocessing.StandardScaler().fit(X)
    #X = scaler.transform(X)
    #model = svm.SVC(C = 50, kernel = 'rbf', tol=0.001, gamma=0.005, probability=True)
    #model.fit(X, y)
    dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.05 * len(X))
    model = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=400,
                               learning_rate=0.5)
    model.fit(X, y)

    print '---------- Training/Testing info ----------'
    print 'Accuracy (training): ', model.score(X, y)
    print 'Null Error Rate (training): ', y.mean()

    #X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)
    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test
    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ", metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)

    #'PTS','AST','REB','STL','BLK','FG_PCT','FG3_PCT','FT_PCT','MIN','EFF','WL']
    #user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2], dtype=float))
    user_input = np.array([10.15, 1.95, 6.77, 1.12, 0.28, 0.51, 0.37, 0.47,
                           32.5, 14.8, 0.53], dtype=float)

    score = model.decision_function(user_input)
    print 'Score (user input): ', score
    result = model.predict_proba(user_input)
    print 'Probability of 1 (user input): ', result

    # '--------- Visualization -----------'
    Classifier_training_S = model.decision_function(X[y > 0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y < 0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test > 0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test < 0.5]).ravel()
    (h_test_s, h_test_b) = visualSigBkg("BDT",
                                        Classifier_training_S,
                                        Classifier_training_B,
                                        Classifier_testing_S,
                                        Classifier_testing_B)

    # '-------- Variable Importance ---------'
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    mpl.style.use('ggplot')
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, df_sig_train.columns[sorted_idx])
    pl.xlabel('Relative Importance', fontsize=15)
    pl.title('Variable Importance', fontsize=15)
    #pl.show()
    plt.savefig("Var_importance.pdf")
    plt.close()

    fig = plt.figure()
    ax = fig.add_subplot(111)

    model_err = np.zeros((400,))
    for i, y_pred in enumerate(model.staged_predict(X_test)):
        model_err[i] = zero_one_loss(y_pred, y_test)

    model_err_train = np.zeros((400,))
    for i, y_pred in enumerate(model.staged_predict(X)):
        model_err_train[i] = zero_one_loss(y_pred, y)

    ax.plot(np.arange(400) + 1, model_err,
            label='AdaBoost Test Error',
            color='orange')
    ax.plot(np.arange(400) + 1, model_err_train,
            label='AdaBoost Train Error',
            color='green')

    ax.set_ylim((0.25, 0.35))
    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Error Rate')

    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.7)

    plt.savefig("ntrees.pdf")
    plt.close()

    ###########################################################
    return (model, X, y, result, model.score(X, y), error_test, score,
            h_test_s, h_test_b)
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test, sr):
    # '---------- Prepare Training ----------'
    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])

    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])

    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))

    print 'X_sig.shape: ', X_sig.shape
    print 'y_sig.shape: ', y_sig.shape
    print 'X_bkg.shape: ', X_bkg.shape
    print 'y_bkg.shape: ', y_bkg.shape
    print 'X.shape: ', X.shape
    print 'y.shape: ', y.shape

    # '---------- Prepare Testing ----------'
    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])

    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])

    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))

    print 'X_sig_test.shape: ', X_sig_test.shape
    print 'y_sig_test.shape: ', y_sig_test.shape
    print 'X_bkg_test.shape: ', X_bkg_test.shape
    print 'y_bkg_test.shape: ', y_bkg_test.shape
    print 'X_test.shape: ', X_test.shape
    print 'y_test.shape: ', y_test.shape

    # '---------- Model ----------'
    #scaler = preprocessing.StandardScaler().fit(X)
    #X = scaler.transform(X)
    #model = svm.SVC(C = 50, kernel = 'rbf', tol=0.001, gamma=0.005, probability=True)
    #model.fit(X, y)
    dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.05 * len(X))
    model = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=800,
                               learning_rate=0.5)
    model.fit(X, y)

    print '---------- Training/Testing info ----------'
    print 'Accuracy (training): ', model.score(X, y)
    print 'Null Error Rate (training): ', y.mean()

    #X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)
    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test
    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ", metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)

    #user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2], dtype=float))
    user_input = np.array([sr['PTS'], sr['AST'], sr['REB'], sr['STL'],
                           sr['BLK'], sr['FG_PCT'], sr['FG3_PCT'],
                           sr['FT_PCT'], sr['MIN'], sr['EFF'], sr['WL']],
                          dtype=float)
    #user_input = np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float)
    print user_input

    score = model.decision_function(user_input)
    print 'Score (user input): ', score
    result = model.predict_proba(user_input)
    print 'Probability of 1 (user input): ', result

    # '--------- Visualization -----------'
    #Classifier_training_S = model.decision_function(X[y>0.5]).ravel()
    #Classifier_training_B = model.decision_function(X[y<0.5]).ravel()
    #Classifier_testing_S = model.decision_function(X_test[y_test>0.5]).ravel()
    #Classifier_testing_B = model.decision_function(X_test[y_test<0.5]).ravel()
    #(h_test_s, h_test_b) = visualSigBkg("BDT", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)

    ###########################################################
    #return (model, X, y, result, model.score(X, y), error_test, h_test_s, h_test_b)
    return (model, X, y, result, model.score(X, y), error_test, score)
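# Both bdtModel variants above pass a 1-D user_input array straight to
# decision_function and predict_proba. Older scikit-learn accepted that;
# current releases require a 2-D (n_samples, n_features) array, so a single
# row needs an explicit reshape. A toy sketch (random data standing in for
# the fitted model above):
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X_toy = np.random.rand(200, 11)
y_toy = (X_toy[:, 0] > 0.5).astype(int)
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                           algorithm='SAMME', n_estimators=50).fit(X_toy, y_toy)

user_input = np.array([10.15, 1.95, 6.77, 1.12, 0.28, 0.51,
                       0.37, 0.47, 32.5, 14.8, 0.53], dtype=float)
score = model.decision_function(user_input.reshape(1, -1))  # one row, 2-D
result = model.predict_proba(user_input.reshape(1, -1))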
def evaluate(dt_eval, dt_train, opts):
    # If modelinput is specified then read in model
    bdt = None
    if len(opts.modelinput) != 0:
        bdt = joblib.load(opts.modelinput)
        print "Loaded model back ", opts.bdtname
        print bdt
    else:
        print "Model not specified..."
        print "Creating classification and training again"
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                                 algorithm='SAMME',
                                 n_estimators=opts.ntrees,
                                 learning_rate=opts.lrate)
        bdt.fit(dt_train.getDataNoWeight(), dt_train.targets)

    # Now get the bdt scores
    sig_scores = bdt.decision_function(
        dt_eval.getDataNoWeight()[dt_eval.targets > 0.5])
    bkg_scores = bdt.decision_function(
        dt_eval.getDataNoWeight()[dt_eval.targets < 0.5])

    # Get weights
    sig_weights = dt_eval.getDataWeights()[dt_eval.targets > 0.5] * dt_eval.sf
    bkg_weights = dt_eval.getDataWeights()[dt_eval.targets < 0.5] * dt_eval.sf

    # Print some information for a set of cuts
    cuts = np.arange(-1, 1, 0.05)
    for cut in cuts:
        print "------------------------------------------"
        print "cut: ", cut
        print "\tSignal: ", sum(sig_weights[sig_scores > cut])
        print "\tBackground:", sum(bkg_weights[bkg_scores > cut])

    # Make figure and axis
    fig, ax = plt.subplots(ncols=1, figsize=(10, 7))

    # Set minimum and maximum for x-axis
    xmin = -1
    xmax = 1
    nbins = 100
    #plt.yscale("log")
    plt.ylim([1e-2, 1e6])

    # Add error bars
    plotErrorBars(sig_scores, sig_weights, nbins, xmin, xmax, 'r', 'signal')
    # Add error bars
    plotErrorBars(bkg_scores, bkg_weights, nbins, xmin, xmax, 'b', 'background')

    # Make hist for signal
    plt.hist(sig_scores,
             weights=sig_weights,
             color='r', range=(xmin, xmax),
             alpha=0.5, bins=nbins,
             log=True, histtype='stepfilled')

    # Make hist for bkg
    plt.hist(bkg_scores,
             weights=bkg_weights,
             color='b', range=(xmin, xmax),
             alpha=0.5, bins=nbins,
             log=True, histtype='stepfilled')

    # Miscellaneous
    plt.xlabel("BDT output")
    plt.ylabel("Events / year / bin")
    plt.legend(loc='best')
    plt.grid()
    plt.xticks(np.arange(-1, 1.1, 0.1))
    plt.tight_layout()
    #ax.set_yscale("log")
    plt.savefig("plots/evaluate/WeightedResult_" + opts.bdtname + "_fromModel.png")
        signalScore)
    print "- When we predict that we have a signal event, it is actually signal %.1f%% of the time (%i out of %i)" % (
        100.0 * fcorrect, int(fcorrect * len(predictionsForSignal)),
        len(predictionsForSignal))

    ### PLOT
    # plot feature distributions
    if first:
        first = False
        for idx, indicator in enumerate(whichIndicators):
            featureDistributions(Xtrain, Ytrain, indicator, idx)

    # shamelessly stolen from https://dbaumgartel.wordpress.com/2014/03/14/machine-learning-examples-scikit-learn-versus-tmva-cern-root/
    Classifier_training_S = alg.decision_function(Xtrain[Ytrain > 0.5]).ravel()
    Classifier_training_B = alg.decision_function(Xtrain[Ytrain < 0.5]).ravel()
    Classifier_testing_S = alg.decision_function(Xtest[Ytest > 0.5]).ravel()
    Classifier_testing_B = alg.decision_function(Xtest[Ytest < 0.5]).ravel()

    # This will be the min/max of our plots
    c_max = 1.5
    c_min = -1.5

    # Get histograms of the classifiers
    Histo_training_S = np.histogram(Classifier_training_S, bins=40,
                                    range=(c_min, c_max))
    Histo_training_B = np.histogram(
def main():
    args = sys.argv[1:]
    if len(args) < 2:
        return usage()

    print('part1')

    # get root files and convert them to array
    #branch_names = """Px_Z,Py_Z,Pz_Z,E_Z,Px_H,Py_H,Pz_H,E_H,Px_H_Z,Py_H_Z,Pz_H_Z,E_H_Z,Px_H_Zs,Py_H_Zs,Pz_H_Zs,E_H_Zs,Px_Z_Mup,Py_Z_Mup,Pz_Z_Mup,E_Z_Mup,Px_Z_Mum,Py_Z_Mum,Pz_Z_Mum,E_Z_Mum,Px_H_Z_Mup,Py_H_Z_Mup,Pz_H_Z_Mup,E_H_Z_Mup,Px_H_Z_Mum,Py_H_Z_Mum,Pz_H_Z_Mum,E_H_Z_Mum""".split(",")
    #branch_names = """Px_Z,Py_Z,Pz_Z,E_Z,Px_H,Py_H,Pz_H,E_H""".split(",")
    #branch_names = """costheta1,costheta2,phi,M_H,M_Z,M_H_Z,M_H_Zs,M_Z_Mup,M_Z_Mum""".split(",")
    #branch_names = """costheta1,costheta2,phi,phi1,costheta1_H,costheta2_H,phi_H""".split(",")
    #branch_names = """costheta1,costheta2,phi,P_H_Z,P_H_Zs,P_Z_Mup,P_Z_Mum,Px_Z,Py_Z,Pz_Z,Px_H,Py_H,Pz_H""".split(",")
    # The feature list finally selected for training on the truth values
    branch_names = """costheta1,costheta2,Px_H,Py_H,Pz_H,Px_Z,Py_Z,Pz_Z,Px_Z_Mup,Py_Z_Mup,Pz_Z_Mup,Pz_Z_Mup,E_Z_Mup,Px_Z_Mum,Py_Z_Mum,Pz_Z_Mum,E_Z_Mum""".split(",")
    # new sample
    #branch_names = """Px_Beamp,Py_Beamp,Pz_Beamp,E_Beamp,Px_Beamm,Py_Beamm,Pz_Beamm,E_Beamm,Px_Z,Py_Z,Pz_Z,E_Z,Px_H,Py_H,Pz_H,E_H,Px_H_Z,Py_H_Z,Pz_H_Z,E_H_Z,Px_H_Zs,Py_H_Zs,Pz_H_Zs,E_H_Zs,Px_Z_Mup,Py_Z_Mup,Pz_Z_Mup,E_Z_Mup,Px_Z_Mum,Py_Z_Mum,Pz_Z_Mum,E_Z_Mum,Px_H_Z_Mup,Py_H_Z_Mup,Pz_H_Z_Mup,E_H_Z_Mup,Px_H_Z_Mum,Py_H_Z_Mum,Pz_H_Z_Mum,E_H_Z_Mum""".split(",")

    fin1 = ROOT.TFile(args[0])
    fin2 = ROOT.TFile(args[1])

    tree1 = fin1.Get("trialTree")    # truth root tree
    #tree1 = fin1.Get("fancy_tree")  # reconstruction root tree
    signal0 = tree1.AsMatrix(columns=branch_names)
    signal = signal0[:100000, :]

    tree2 = fin2.Get("trialTree")    # truth root tree
    #tree2 = fin2.Get("fancy_tree")  # reconstruction root tree
    backgr0 = tree2.AsMatrix(columns=branch_names)
    backgr = backgr0[:100000, :]

    signal = np.insert(signal, 3, np.full(len(signal), 1), axis=1)
    backgr = np.insert(backgr, 3, np.full(len(backgr), 10), axis=1)

    # for sklearn, data is usually organised into one 2D array of shape
    # (n_samples x n_features) containing all the data, plus one array of
    # categories of length n_samples
    X_raw = np.concatenate((signal, backgr))
    y_raw = np.concatenate((np.ones(signal.shape[0]),
                            np.zeros(backgr.shape[0])))
    print(len(signal))
    print(len(backgr))

    print('part2')

    # imbalanced learn: pick a SMOTE oversampling ratio from the
    # signal/background balance
    n_sig = len(y_raw[y_raw == 1])
    n_bkg = len(y_raw[y_raw == 0])
    print(n_sig)
    print(n_bkg)
    sb_ratio = len(y_raw[y_raw == 1]) / (1.0 * len(y_raw[y_raw == 0]))

    if 0.2 < sb_ratio < 0.5:
        ratio = 0.5
    elif n_sig > 1000 and 0.1 < sb_ratio < 0.2:
        ratio = 0.2
    elif n_sig < 1000 and 0.1 < sb_ratio < 0.2:
        ratio = 0.4
    elif 0.05 < sb_ratio < 0.1:
        ratio = 0.4
    elif 0.01 < sb_ratio < 0.05:
        ratio = 0.1
    elif sb_ratio < 0.01:
        ratio = 0.03
    else:
        ratio = None

    if ratio is not None:
        smote = SMOTE(ratio=ratio)
        X, y = smote.fit_sample(X_raw, y_raw)
        print('Number of events: ')
        print('before: signal: ', len(y_raw[y_raw == 1]),
              ' background: ', len(y_raw[y_raw == 0]))
        print('after: signal: ', len(y[y == 1]),
              ' background: ', len(y[y == 0]))
    else:
        X = X_raw
        y = y_raw
        print('Number of events: ')
        print('signal: ', len(y[y == 1]), ' background: ', len(y[y == 0]))

    """ Training Part """
    # Train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50,
                                                        random_state=3543)
    weights = X_train[:, 3]
    X_train = np.delete(X_train, 3, 1)
    X_test = np.delete(X_test, 3, 1)

    #dt = DecisionTreeClassifier(max_depth=51, min_samples_leaf=20, min_samples_split=40)
    #bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=250, learning_rate=0.03)
    dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=100,
                                min_samples_split=10)
    bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=200,
                             learning_rate=0.2)
    bdt.fit(X_train, y_train, sample_weight=weights)

    importances = bdt.feature_importances_
    f = open('bdt_results/output_importance_New.txt', 'w')
    f.write("%-25s%-15s\n" % ('Variable Name', 'Output Importance'))
    #for i in range(32):
    for i in range(17):
        f.write("%-25s%-15s\n" % (branch_names[i], importances[i]))
        print("%-25s%-15s\n" % (branch_names[i], importances[i]), file=f)
    f.close()

    y_predicted = bdt.predict(X_train)
    print(classification_report(y_train, y_predicted,
                                target_names=["background", "signal"]))
    print("Area under ROC curve: %.4f" %
          (roc_auc_score(y_train, bdt.decision_function(X_train))))
    y_trainacc = accuracy_score(y_train, y_predicted)
    print("Area under ACC curve: %.4f" % y_trainacc)

    y_predicted = bdt.predict(X_test)
    print(classification_report(y_test, y_predicted,
                                target_names=["background", "signal"]))
    print("Area under ROC curve: %.4f" %
          (roc_auc_score(y_test, bdt.decision_function(X_test))))
    y_trainacc = accuracy_score(y_test, y_predicted)
    print("Area under ACC curve: %.4f" % y_trainacc)

    decisions1 = bdt.decision_function(X_train)
    decisions2 = bdt.decision_function(X_test)
    filepath = 'SM-vs-BSM-CPeven'

    # Compute ROC curve and area under the curve
    fpr1, tpr1, thresholds1 = roc_curve(y_train, decisions1)
    fpr2, tpr2, thresholds2 = roc_curve(y_test, decisions2)
    roc_auc1 = auc(fpr1, tpr1)
    roc_auc2 = auc(fpr2, tpr2)

    fig = plt.figure(figsize=(8, 6))
    fig.patch.set_color('white')
    plt.plot(fpr1, tpr1, lw=1.2,
             label='train:ROC (area = %0.4f)' % (roc_auc1), color="r")
    plt.plot(fpr2, tpr2, lw=1.2,
             label='test: ROC (area = %0.4f)' % (roc_auc2), color="b")
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.savefig('./bdt_results/' + filepath + '/ROC_Hbb.png')
    # plt.show()

    compare_train_test(bdt, X_train, y_train, X_test, y_test, filepath)
    joblib.dump(bdt, './bdt_results/' + filepath + '/bdt_model_New.pkl')
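# The SMOTE calls above use the older imbalanced-learn API; in more recent
# releases the ratio keyword has become sampling_strategy and fit_sample has
# become fit_resample (exact version cutoffs are an assumption here, so check
# the installed imblearn). A minimal sketch of the same oversampling step:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X_raw, y_raw = make_classification(n_samples=2000, weights=[0.9, 0.1],
                                   random_state=0)
smote = SMOTE(sampling_strategy=0.5, random_state=0)  # was SMOTE(ratio=0.5)
X_res, y_res = smote.fit_resample(X_raw, y_raw)       # was smote.fit_sample(...)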
                                 n_estimators=200)
bdt_k2_2jet = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3,
                                                        min_samples_split=0.05),
                                 learning_rate=0.15,
                                 algorithm="SAMME",
                                 n_estimators=200)

# Train BDT for 2 jet.
bdt_k1_2jet.fit(X_k1_2jet, Y_k1_2jet, sample_weight=weights_k1_2jet)
bdt_k2_2jet.fit(X_k2_2jet, Y_k2_2jet, sample_weight=weights_k2_2jet)

# Get decision scores.
# K1 BDT tests on K2 data, and vice-versa.
output_k1_2jet = np.array(bdt_k1_2jet.decision_function(X_k2_2jet))
output_k2_2jet = np.array(bdt_k2_2jet.decision_function(X_k1_2jet))
output_2jet = np.append(output_k2_2jet, output_k1_2jet)  # IMPORTANT: order reversal

# In[32]:

# ### Hyperparameter Scan

# In[33]:

param_grid = {"n_estimators": np.arange(100, 350, 50),
              "learning_rate": np.arange(0.1, 0.4, 0.1),
def classifier(data, label):
    binary_data = np.zeros((21, data.shape[1]))
    notsure_data = np.zeros((7, data.shape[1]))
    new_label = []
    ind = 0
    ind_1 = 0
    for i, l in enumerate(label):
        if l == 2:
            notsure_data[ind_1] = data[i]
            ind_1 = ind_1 + 1
            continue
        binary_data[ind] = data[i]
        new_label.append(l)
        ind = ind + 1

    binary_data = preprocessing.normalize(binary_data)
    #pca = decomposition.PCA(n_components=256)
    #binary_data = pca.fit_transform(binary_data)
    new_label = np.array(new_label)
    X_train, X_test, y_train, y_test = train_test_split(
        binary_data, new_label, test_size=.3,
        random_state=np.random.RandomState(0))
    '''X_train = np.vstack((binary_data[0:7,:],binary_data[7:12,:]))
    y_train = np.array([1,1,1,1,1,1,1, 0,0,0,0,0])
    X_test = np.vstack((binary_data[16:21,:],binary_data[12:16,:]))
    y_test = np.array([1,1,1,1,1, 0,0,0,0])'''

    clf_1 = MLPClassifier()
    clf_1.fit(X_train, y_train)
    pre_1 = clf_1.predict(X_test)
    p_1 = clf_1.predict_proba(notsure_data)
    pnotsure_1 = clf_1.predict(notsure_data)
    #t_score_1 = clf_1.score(X_test,y_test)
    #t_score_1 = clf_1.decision_function(X_test)
    print(metrics.classification_report(y_test, pre_1,
                                        target_names=['Fake', 'True']))

    clf_2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                               algorithm="SAMME",
                               n_estimators=25)
    clf_2.fit(X_train, y_train)
    t_score_2 = clf_2.decision_function(X_test)
    pre_2 = clf_2.predict(X_test)
    print(metrics.classification_report(y_test, pre_2,
                                        target_names=['Fake', 'True']))

    C_range = 2.**np.arange(-5, 15)
    gamma_range = 2.**np.arange(-15, 3)
    param_grid = dict(gamma=gamma_range, C=C_range)
    clf_3 = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid)
    clf_3.fit(X_train, y_train)
    t_score_3 = clf_3.decision_function(X_test)
    pre_3 = clf_3.predict(X_test)
    draw_pre_recall(t_score_2, t_score_3, y_test)
    '''scores = clf.cv_results_['mean_test_score'].reshape(len(C_range), len(gamma_range))
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.show()'''
    print(metrics.classification_report(y_test, pre_3,
                                        target_names=['Fake', 'True']))
    '''loo = LeaveOneOut()
    acc = []
    for train_index, test_index in loo.split(binary_data):
        X_train, X_test = binary_data[train_index], binary_data[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(X_train, y_train)
        pre = clf.predict(X_test)
        pre_tr = clf.predict(X_train)
        print('===================================')
        print('Real: ', y_test)
        print('Pre: ', pre)
        print('Real_tr: ', y_train)
        print('Pre_tr: ', pre_tr)
        print((sum(y_train==pre_tr)+sum(y_test==pre))/21.0)
        acc.append((sum(y_train==pre_tr)+sum(y_test==pre))/21.0)
    print('Summary: ', sum(acc)/21.0)'''

    p_2 = clf_2.predict_proba(notsure_data)
    pnotsure_2 = clf_2.predict(notsure_data)
    p_3 = clf_3.predict_proba(notsure_data)
    pnotsure_3 = clf_3.predict(notsure_data)

    return (p_1, p_2, p_3, pnotsure_1, pnotsure_2, pnotsure_3)
pred = clf.predict(testX)
print "Confusion Matrix of AdaBoost is :-\n"
print confusion_matrix(testY, pred)
print "\n\nClassification report for AdaBoost:-"
print classification_report(testY, pred)

# Isolation Forest
clf = IsolationForest(contamination=outlier_fraction,
                      random_state=state,
                      n_jobs=4)
testX = testX.drop(['errorBalanceOrig', ], axis=1)
clf.fit(testX)
scores_pred = clf.decision_function(testX)
y_pred = clf.predict(testX)

# reshape the prediction values to 0 for valid and 1 for fraud
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
n_errors = (y_pred != testY).sum()

# run classification metrics
#print ('{}'.format(clf_name, n_errors))
#print(accuracy_score(y, y_pred))
# since it's an unbalanced class problem the accuracy score will be inappropriate
print "Classification report for Isolation Forest:-"
print(classification_report(testY, y_pred))
X_deti_test = min_max_scaler.fit_transform(X_deti_test)
X_dech_train = min_max_scaler.fit_transform(X_dech_train)
X_dech_test = min_max_scaler.fit_transform(X_dech_test)
X_deca_train = min_max_scaler.fit_transform(X_deca_train)
X_deca_test = min_max_scaler.fit_transform(X_deca_test)

classifier = AdaBoostClassifier(DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf),
    n_estimators=n_estimators,
    learning_rate=learning_rate)

# demo
classifier.fit(X_demo_train, y_demo_train)
y_demo_test_pred = classifier.decision_function(X_demo_test)  # a weight of 0.5 could be added here

basemodelperc = np.percentile(y_demo_test_pred, [95, 90, 80, 70, 60, 50])
base_rej_perc_5 = basemodelperc[0]
base_rej_perc_10 = basemodelperc[1]
base_rej_perc_20 = basemodelperc[2]
base_rej_perc_30 = basemodelperc[3]
base_rej_perc_40 = basemodelperc[4]
base_rej_perc_50 = basemodelperc[5]
print("baseline model rejection rate[5,10,20,30,40,50]: %s" % basemodelperc)
# get percentile of array y_test_pred
# record the base model's default rate in this loop at rejection rates of
# 5%, 10%, 20%, 30%, 40% and 50%
df_demo = np.vstack((y_test, y_demo_test_pred))
df_demo = pd.DataFrame(df_demo)
df_demo = df_demo.transpose()
df_demo.columns = ["label", "pred_prob"]
def_rate_5_demo = df_demo[
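# One caveat in the scaling block above: calling fit_transform on each test
# set refits the MinMaxScaler to the test data, so train and test end up on
# different scales. The training-size scan further down fits on the training
# sample only and reuses that scaling; a minimal sketch of that pattern
# (random arrays standing in for the X_de* splits):
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_tr = np.random.rand(100, 5)
X_te = np.random.rand(40, 5)

min_max_scaler = MinMaxScaler()
X_tr = min_max_scaler.fit_transform(X_tr)  # learn min/max on training data only
X_te = min_max_scaler.transform(X_te)      # apply the same scaling to test data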
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")

# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(Ydf_train['default_Yes'] == i)
    plt.scatter(Xdf_train[Xdf_train['student'] == 1.0].ix[idx].ix[:, 0],
                Xdf_train[Xdf_train['student'] == 1.0].ix[idx].ix[:, 2],
                c=c, cmap=plt.cm.Paired, label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel("Decision Boundary")
plt.show()

# Plot the two-class decision scores
twoclass_output = clf.decision_function(Xdf_train)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(132)
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(Ydf_train['default_Yes'] == i)
    plt.hist(twoclass_output[idx],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper left')
plt.ylabel('Samples')
plt.xlabel('Decision Scores')
g_w_train_sum_bsm = 0.0

# Create the tree
dt_k = DecisionTreeClassifier(max_depth=args.max_depth, criterion=kldc)
dt_g = DecisionTreeClassifier(max_depth=args.max_depth, criterion='gini')

# Create and fit an AdaBoosted decision tree
bdt_k = AdaBoostClassifier(dt_k, algorithm=args.boost_algorithm,
                           n_estimators=args.est_num)
bdt_k.fit(X_train, y_train, w_train)

# Create and fit an AdaBoosted decision tree
bdt_g = AdaBoostClassifier(dt_g, algorithm=args.boost_algorithm,
                           n_estimators=args.est_num)
bdt_g.fit(X_train, y_train, w_train)

# setup the decision functions, which will be used by the histograms as well
k_test_decision_function = bdt_k.decision_function(X_test)
k_train_decision_function = bdt_k.decision_function(X_train)

# setup the decision functions, which will be used by the histograms as well
g_test_decision_function = bdt_g.decision_function(X_test)
g_train_decision_function = bdt_g.decision_function(X_train)

ende_training = time.time()
logger.info('Time to train the tree ' + '{:5.3f}s'.format(ende_training - start))

# get the directory for data
output_dir = os.path.join(tmp_directory, args.data_version)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# get the output directory for plots
                     rnd_state=42,
                     name="trainsplit")
print len(d_trn.data), len(d_tst.data)

# Make BDT
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                         algorithm='SAMME',
                         n_estimators=opts.ntrees,
                         learning_rate=opts.lrate)

print "Fitting data"
bdt.fit(d_trn.getDataNoWeight(), d_trn.targets)

print "Evaluating"
pred = bdt.decision_function(d_tst.getDataNoWeight())
#pred_eval = bdt.decision_function(d_eval.getDataNoWeight())

# Import ROOT stuff and save
from ROOT import TH1F, TFile
h_sig = TH1F("h_sig", "h", 100, -1, 1)
h_bkg = TH1F("h_bkg", "h", 100, -1, 1)
h_sig_eval = TH1F("h_sig_eval", "h", 100, -1, 1)
h_bkg_eval = TH1F("h_bkg_eval", "h", 100, -1, 1)

# fill hist
weights = d_tst.getDataWeights()
for i in range(len(pred)):
    if d_tst.targets[i]:
        h_sig.Fill(pred[i], weights[i])
    else:
def AdaBoost_model(X_train, X_test, y_train, y_test):
    # standardise
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    X = X_train
    y = y_train.reshape(1, -1)[0]

    # Create and fit an AdaBoosted decision tree
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=200)
    bdt.fit(X, y)
    predict = bdt.predict(X_test)

    plot_colors = "br"
    plot_step = 0.02
    class_names = "AB"

    plt.figure(figsize=(10, 5))

    # Plot the decision boundaries
    plt.subplot(121)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    print(x_min, x_max, y_min, y_max)
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis("tight")

    # Plot the training points
    for i, n, c in zip(range(2), class_names, plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1],
                    c=c, cmap=plt.cm.Paired,
                    s=20, edgecolor='k',
                    label="Class %s" % n)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.legend(loc='upper right')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Decision Boundary')

    # Plot the two-class decision scores
    twoclass_output = bdt.decision_function(X)
    plot_range = (twoclass_output.min(), twoclass_output.max())
    plt.subplot(122)
    for i, n, c in zip(range(2), class_names, plot_colors):
        plt.hist(twoclass_output[y == i],
                 bins=10,
                 range=plot_range,
                 facecolor=c,
                 label='Class %s' % n,
                 alpha=.5,
                 edgecolor='k')
    x1, x2, y1, y2 = plt.axis()
    plt.axis((x1, x2, y1, y2 * 1.2))
    plt.legend(loc='upper right')
    plt.ylabel('Samples')
    plt.xlabel('Score')
    plt.title('Decision Scores')

    plt.tight_layout()
    plt.subplots_adjust(wspace=0.35)
    plt.show()

    return predict
plot_colors = "br" plot_step = 0.2 class_names = ["W Jets","QCD"] plt.figure(figsize=(10, 5)) # Plot the decision boundaries plt.subplot(121) x_min, x_max = X[:, 0].min() , X[:, 0].max() y_min, y_max = X[:, 1].min() , X[:, 1].max() xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max-x_min)/10000), np.arange(y_min, y_max, (y_max-y_min)/10000)) print 'made mesh' Z = abc.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu) plt.axis("tight") print 'Drawing fancy plots - train points' # Plot the training points for i, n, c in zip(range(2), class_names, plot_colors): idx = np.where(Y_valid == i) plt.scatter(X_valid[idx, 0], X_valid[idx, 1], c=c, cmap=plt.cm.Paired, label="%s" % n) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.legend(loc='upper right') plt.ylabel(list(varset)[1])
X_train_sig = df_cheat.query(hlt2_cut_string)[features][int(0.2 * n_events):n_events]
X_train = X_train_bkg.append(X_train_sig, ignore_index=True).values

# DEFINE WHICH PARTS OF TEST AND TRAINING SAMPLES CONTAIN SIGNAL OR BACKGROUND
y_test = int(0.2 * n_events) * [0] + int(0.2 * n_events) * [1]
y_train = int(0.8 * n_events) * [0] + int(0.8 * n_events) * [1]

# DEFINE BDT ALGORITHM
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=0.05 * len(X_train))
bdt = AdaBoostClassifier(dt,
                         algorithm='SAMME',
                         n_estimators=800,
                         learning_rate=0.5)

# RUN BDT TRAINING AND SHOW RESULTS
bdt.fit(X_train, y_train)
sk_y_predicted = bdt.predict(X_test)
print classification_report(y_test, sk_y_predicted,
                            target_names=["background", "signal"])
print "Area under ROC curve: %.4f" % (roc_auc_score(y_test, sk_y_predicted))

plt.hist(bdt.decision_function(X_test_bkg).ravel(), color='r', alpha=0.5,
         range=(-0.4, 0.4), bins=30)
plt.hist(bdt.decision_function(X_test_sig).ravel(), color='b', alpha=0.5,
         range=(-0.4, 0.4), bins=30)
plt.xlabel("scikit-learn BDT output")
plt.savefig('BDT.pdf')
# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1],
                c=c, cmap=plt.cm.Paired,
                s=20, edgecolor='k',
                label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Decision Boundary')

# Plot the two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
    plt.hist(twoclass_output[y == i],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5,
             edgecolor='k')
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
        trainingDataTemp, [trainingDataTemp.shape[1] - 1], axis=1)
    testingVarsTemp, testingTargetTemp = np.split(
        testingDataTemp, [testingDataTemp.shape[1] - 1], axis=1)

    ## scale variables. Map all variables to values between 0 and 1.
    ## This is to prevent large numbers from dominating in the testing.
    min_max_scaler = preprocessing.MinMaxScaler()
    trainingVarsTemp = min_max_scaler.fit_transform(trainingVarsTemp)
    testingVarsTemp = min_max_scaler.transform(testingVarsTemp)

    # build and train bdt
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth),
                             n_estimators=trees)
    bdt.fit(trainingVarsTemp, np.ravel(trainingTargetTemp))

    # bdt score for testing sample
    output_test = bdt.decision_function(testingVarsTemp)

    # calculate area under curve
    auc = roc_auc_score(testingTargetTemp, output_test)
    if verbose:
        print "Area under ROC = ", auc

    # append AUC and size to lists for plotting after loop
    sizes.append(trainingSize)
    AUC.append(auc)

    # update Size (if while loop)
    # trainingSize += updateSize

######################################
# Information Sheet for the Plot PDF #
######################################
def main():
    Algorithm = 'AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_matchedL_ranged_v2_1000_1500'
    pca(Algorithm)
    #plotVars(Algorithm)
    return
    #Algorithm = sys.argv[1]
    #Algorithm = 'CamKt12LCTopoSplitFilteredMu100SmallR30YCut414tev_350_500_vxp_0_99'
    print 'Loading training data ...'
    data_train = pd.read_csv('csv/' + Algorithm + '_merged.csv')

    # standardise data
    for t in trainvars:
        minx = np.amin(data_train[t])
        maxx = np.amax(data_train[t])
        data_train[t] = (data_train[t] - minx) / (maxx - minx)
        #data_train[t] = (data_train[t] - np.mean(data_train[t]))/np.std(data_train[t])

    r = np.random.rand(data_train.shape[0])

    # Set label and weight vectors - and drop any unwanted training ones
    Y_train = data_train['label'].values[r < 0.5]
    # W_train = data_train['weight'].values[r<0.9]
    Y_valid = data_train['label'].values[r >= 0.5]
    # W_valid = data_train['weight'].values[r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)

    print data_train.columns.values[1:-1]
    #varcombinations = itertools.combinations(data_train.columns.values[1:-1], 2)
    varcombinations = itertools.combinations(trainvars[:], 26)
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)
    #colors = plt.get_cmap('jet')(np.linspace(0, 1.0, combos(len(data_train.columns.values[1:-1]), 2)))
    colors = plt.get_cmap('jet')(np.linspace(0, 1.0, combos(len(trainvars), 2)))

    for varset, color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r < 0.5]
        X_valid = data_train[list(varset)].values[r >= 0.5]

        dt = DC(max_depth=3, min_samples_leaf=0.05 * len(X_train))
        abc = ABC(dt, algorithm='SAMME', n_estimators=8, learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC'

        prob_predict_valid = abc.predict_proba(X_valid)[:, 1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)
        # if we want to compare directly with the cut-based method we need to
        # calculate 1/(1-roc(0.5)). However, that is what we do when we've
        # already applied the mass window. This does not do so.
        labelstring = ' And '.join(var.replace('_', '') for var in varset)
        print labelstring
        plt.plot(tpr, (1 - fpr), label=labelstring, color=color)
        print abc.feature_importances_

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1 - Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm + ' ROC Curve')
    plt.legend(loc="lower left", prop={'size': 6})
    plt.savefig(Algorithm + 'rocmva.pdf')
# create the n_estimators to loop over
parameters = np.linspace(args.n_est_start, args.n_est_end,
                         num=args.est_num, dtype=np.int32)

for para in parameters:
    # Create and fit an AdaBoosted decision tree for the selected criterion
    bdt = AdaBoostClassifier(dt,
                             algorithm=args.boost_algorithm,
                             n_estimators=para)
    bdt.fit(X_train, y_train, w_train)

    # get the decision functions from the kule tree
    test_dec_fct = bdt.decision_function(X_test)
    train_dec_fct = bdt.decision_function(X_train)

    # get the histograms for kule
    h_dis_train_SM, h_dis_train_BSM, h_dis_test_SM, h_dis_test_BSM = get_histograms(
        X_test, X_train, y_test, y_train, w_test, w_train,
        test_dec_fct, train_dec_fct)

    # compute the Kule divergence and the Gini
    kule_test, k_error_test = kl.kule_div(h_dis_test_SM, h_dis_test_BSM)
    kule_train, k_error_train = kl.kule_div(h_dis_train_SM, h_dis_train_BSM)
    gini_test, g_error_test = gi.gini(h_dis_test_SM, h_dis_test_BSM)
    gini_train, g_error_train = gi.gini(h_dis_train_SM, h_dis_train_BSM)

    # reset the histograms to fill them again next iteration
depth = 3  # 1 = stumps

print "Declaring Classifier"
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth),
                         n_estimators=trees)

print "Training BDT"
bdt.fit(trainingVars, np.ravel(trainingTarget))

#######################################################################################
# Calculate Rate of Correct and Incorrect Classification in Training and Testing Data #
#######################################################################################

pred_train = bdt.predict(trainingVars)
pred_test = bdt.predict(testingVars)

output_train = bdt.decision_function(trainingVars)
output_test = bdt.decision_function(testingVars)

train_SS = 0
train_SB = 0
train_BS = 0
train_BB = 0

for number, entry in enumerate(trainingTarget):
    if entry == 1:
        if pred_train[number] == 1:
            train_SS += 1
        elif pred_train[number] == 0:
            train_SB += 1
    elif entry == 0:
        if pred_train[number] == 1:
def train_kfold(clf_type, X, y, folds=6, show_plots=False,
                write_decisions=False, state=0, **kwargs):
    """Uses kFolding to train a certain classifier.

    Keyword arguments:
    clf_type: classifier type as string, currently supported:
        ['AdaBoostClassifier', 'GradientBoostingClassifier']
    X: complete dataset (note: you don't need to split your dataset into a
        train and test dataset using kFolding!)
    y: corresponding flags
    folds: number of folds (default: 6)
    show_plots: if True, shows probability distributions from training and
        testing dataset, using the 'plot_train_test_comparison' function
        (default: False)
    write_decisions: if True, appends decision columns to given DataFrame X
        (default: False)
    kwargs: key word arguments for KFold

    Returns:
    list of trained classifiers
    """
    if clf_type not in ['AdaBoostClassifier', 'GradientBoostingClassifier']:
        raise ValueError(
            'Classifier type {} is not supported for kfolding right now!'.format(clf_type))

    decision_col_name = clf_type + '_decision'
    clfs = []
    kf = KFold(len(X), n_folds=folds, **kwargs)
    for i, (train_index, test_index) in tqdm_notebook(enumerate(kf, start=1),
                                                      total=len(kf)):
        train_cols = list(X.columns)
        if write_decisions and decision_col_name in X.columns:
            train_cols.remove(decision_col_name)
        X_train, X_test = X[train_cols].iloc[train_index], X[train_cols].iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        if clf_type == 'AdaBoostClassifier':
            clf = AdaBoostClassifier(random_state=state)
        elif clf_type == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(random_state=state)
        clf.fit(X_train.as_matrix(), y_train)
        if show_plots:
            plot_classifier_output(clf, X_train, y_train, X_test, y_test,
                                   title='Classifier iteration {}'.format(i))
        if write_decisions:
            X.set_value(test_index, decision_col_name,
                        clf.decision_function(X_test))
        clfs.append(clf)
    return clfs
X_test_ref_ae = min_max_scaler.transform(X_test_ref)
X_test_new_ae = min_max_scaler.transform(X_test_new)

###############################
# Assembly, training & testing
###############################

# Boosted decision tree classifier
if runBDT:
    print "Building and training BDT"
    bdt = AdaBoostClassifier(n_estimators=100,
                             base_estimator=DecisionTreeClassifier(max_depth=1))
    bdt.fit(X_train, Y)

    # Testing
    pred_train_bdt = bdt.predict(X_train)
    pred_test_bdt = bdt.predict(X_test)
    output_train_bdt = bdt.decision_function(X_train)
    output_test_bdt = bdt.decision_function(X_test)

    # Results print-out
    print "BDT classifier results...."
    printResults(pred_train_bdt, pred_test_bdt, nReferenceEvents, nNewEvents)

# Neural network classifier
if runNN:
    # Training
    if runTraining:
        print "Building and training neural network"
        nn = Sequential()
        from keras.layers import Dense, Activation
        nn.add(Dense(71, input_dim=71))
        nn.add(Activation("relu"))
        nn.add(Dense(1))
def ploteffarea(dt_eval, dt_train, opts, dt_LowE):
    # If modelinput is specified then read in model
    bdt = None
    if opts.modelinput != "":
        bdt = joblib.load(opts.modelinput)
        print "Loaded model back"
        print bdt
    else:
        print "Model not specified..."
        print "Creating classification and training again"
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                                 algorithm='SAMME',
                                 n_estimators=opts.ntrees,
                                 learning_rate=opts.lrate)
        bdt.fit(dt_train.getDataNoWeight(), dt_train.targets)

    # Now get the bdt scores
    sig_scores = bdt.decision_function(
        dt_eval.getDataNoWeight()[dt_eval.targets > 0.5])
    sig_data = dt_eval.data[dt_eval.targets > 0.5]

    # Also for low energy
    le_sig_scores = bdt.decision_function(dt_LowE.getDataNoWeight())

    # Specify the number of bins and the range
    #nbins = int(30)
    #xmin = float(5)
    #xmax = float(8)
    # Copying the bins from leif and sebastian for now
    xmin = 3
    xmax = 9
    nbins = 20.
    bins = np.arange(3, 9.1, 0.3)

    # Some constants
    #solidangle = 4*pi
    solidangle = 2 * (1 + cos(85 * pi / 180)) * pi
    ebins_per_decade = float(nbins / (xmax - xmin))

    # Some stuff from the data
    oneweightloc = len(dt_eval.t_varnames) + dt_eval.w_varnames.index('OneWeight')
    Eloc = len(dt_eval.t_varnames) + dt_eval.w_varnames.index('nuE')
    NEvents = sig_data[0][len(dt_eval.t_varnames) + dt_eval.w_varnames.index('NEvents')]

    # Basic methods
    def mcLogEBin(E):
        return int(log10(E) * ebins_per_decade)

    def mcEMin(mc_log_ebin):
        return pow(10, mc_log_ebin / ebins_per_decade)

    def mcEMax(mc_log_ebin):
        return pow(10, (1 + mc_log_ebin) / ebins_per_decade)

    # Calculate effective area
    def getEffA(data, sf):
        effA = np.zeros(len(data), dtype=float)
        energy = np.empty(len(data), dtype=float)
        nfiles = sf / (NEvents * 961)
        for i in range(len(effA)):
            E = data[i][Eloc]
            OneWeight = data[i][oneweightloc]
            mclogebin = mcLogEBin(E)
            mcemin = mcEMin(mclogebin)
            mcemax = mcEMax(mclogebin)
            effA[i] = 1e-4 * OneWeight * nfiles * 1 / (solidangle * (mcemax - mcemin))
            energy[i] = log10(E)
        return effA, energy

    effA, energy = getEffA(sig_data, dt_eval.sf)
    le_effA, le_energy = getEffA(dt_LowE.data, dt_LowE.sf)

    # Now all scale factor info has been added;
    # combine the data for ease of plotting
    effA = np.concatenate((effA, le_effA))
    energy = np.concatenate((energy, le_energy))
    sig_scores = np.concatenate((sig_scores, le_sig_scores))

    # Draw eff area
    fig, ax = plt.subplots(ncols=1, figsize=(10, 7))
    bdtcut = 0.6
    h, g, v = plt.hist(energy[sig_scores > bdtcut],
                       weights=effA[sig_scores > bdtcut],
                       color='b', label='NuGen (bdt > %0.2f)' % bdtcut,
                       range=(xmin, xmax),
                       bins=nbins,
                       log=True,
                       histtype='step')
    #hle, gle, vle = plt.hist(le_energy[le_sig_scores > bdtcut],
    #                         weights=le_effA[le_sig_scores > bdtcut],
    #                         color='r', label='NuGen low (bdt > %0.2f)' % bdtcut,
    #                         range=(xmin, xmax),
    #                         bins=nbins,
    #                         log=True,
    #                         histtype='step')
    plt.ylim([1.e-3, 1.e4])
    plt.xlabel('log$_{10}$(E/GeV)')
    plt.ylabel('Effective Area [m$^2$]')
    plt.grid()
    plt.tight_layout()

    # Dump output
    output = {'logebins': bins, 'effA': h}
    pickle.dump(output, open('myeffaDump.pkl', 'w'))

    # Save figure
    #plt.savefig("plots/EffArea/EffArea_bdtcut%0.2f_sep3best.png" % bdtcut)
    plt.show()
algorithm="SAMME", n_estimators=200 ) bdt_B = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.01), learning_rate=0.15, algorithm="SAMME", n_estimators=200 ) bdt_A.fit(X_A, Y_A, sample_weight=w_A) bdt_B.fit(X_B, Y_B, sample_weight=w_B) print "BDT training completed." # Get scores of X_A for BDT_B and vice-versa. scores_A = bdt_B.decision_function(X_A).tolist() scores_B = bdt_A.decision_function(X_B).tolist() print "Non-normalised decision function scores processed." # Normalise decision scores between -1 and 1. max_score = max([a for a in scores_A + scores_B]) min_score = min([a for a in scores_A + scores_B]) score_range = max_score - min_score score_midpoint = min_score + score_range / 2 # Translate and shrink. scores_A = map(lambda a: (a - score_midpoint) / (score_range / 2 + 0.000001), scores_A) # .001 added for bounding scores_B = map(lambda a: (a - score_midpoint) / (score_range / 2 + 0.000001), scores_B) print "Updating event objects with decision scores..."
class adaBoost:
    __all__ = ['run', 'plotFeatureRanking', 'plotScores']

    def __init__(self, foundVariables, trainingData, trainingClasses,
                 trainingWeights, testingData, testingClasses, adaName, bkg_name):
        """Build a forest and compute the feature importances.

        Keyword args:
        foundVariables -- The list of the names of found variables, can get using Sample_x.returnFoundVariables()
        trainingData -- The training data
        trainingClasses -- The training data classes
        trainingWeights -- Per-event training weights
        testingData -- the testing data
        testingClasses -- the testing data classes
        adaName -- the name of the object (eg. sig+bkg_name)
        bkg_name -- the name of the background sample
        """
        # Note: compute_importances is a (long-deprecated) DecisionTreeClassifier
        # option; AdaBoostClassifier never accepted it, so it is not passed there.
        self.ada = AdaBoostClassifier(DecisionTreeClassifier(compute_importances=True,
                                                             max_depth=4,
                                                             min_samples_split=2,
                                                             min_samples_leaf=100),
                                      n_estimators=400,
                                      learning_rate=0.5,
                                      algorithm="SAMME")
        #class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_density=0.10000000000000001, max_features=None, compute_importances=False, random_state=None)
        self.foundVariables = foundVariables
        self.trainingData = trainingData
        self.trainingClasses = trainingClasses
        self.testingData = testingData
        self.testingClasses = testingClasses
        self.trainingWeights = trainingWeights
        self.name = adaName
        self.bkg_name = bkg_name
        self.elapsed = 0.0

    def returnName(self):
        return self.name

    def run(self):
        """Run the fitting and testing."""
        # start the fitting and time it
        start = clock()
        print 'starting training on AdaBoostClassifier'
        self.ada.fit(self.trainingData, self.trainingClasses, self.trainingWeights)
        self.elapsed = clock() - start
        print 'time taken for training: ' + str(self.elapsed)
        # set up the arrays for testing/eval
        #xtA_C = copy.deepcopy(self.testingData)
        #pred = self.ada.predict(xtA_C)
        #import createHists
        #createHists.drawSigBkgDistrib(xtA_C, pred, self.foundVariables)  # draw the signal and background distributions together
        # list the importances of each variable in the bdt, get the score on the test data
        self.importancesada = self.ada.feature_importances_
        print 'importances'
        print self.importancesada
        self.score = self.ada.score(self.testingData, self.testingClasses)
        self.params = self.ada.get_params()
        self.std_mat = np.std([tree.feature_importances_ for tree in self.ada.estimators_], axis=0)
        self.indicesada = np.argsort(self.importancesada)[::-1]
        self.variableNamesSorted = []
        for i in self.indicesada:
            self.variableNamesSorted.append(self.foundVariables[i])

        # Print the feature ranking
        print "Feature ranking:"
        for f in xrange(12):
            print "%d. feature %d (%f)" % (f + 1, self.indicesada[f], self.importancesada[self.indicesada[f]]) + " " + self.variableNamesSorted[f]
        self.twoclass_output = self.ada.decision_function(self.testingData)
        self.twoclass_output_train = self.ada.decision_function(self.trainingData)
        self.class_proba = self.ada.predict_proba(self.testingData)[:, -1]

    def plotFeatureRanking(self):
        # We need this to run in batch because it complains about not being able to open display
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        # plot the feature ranking
        pl.figure()
        pl.title("Feature importances Ada")
        pl.bar(xrange(len(self.variableNamesSorted)), self.importancesada[self.indicesada],
               color="r", yerr=self.std_mat[self.indicesada], align="center")
        pl.xticks(xrange(12), self.variableNamesSorted)  # indicesada
        pl.xlim([-1, 12])
        pl.show()

    def plotScores(self, returnROC=False, rocInput=[]):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        from sklearn.metrics import roc_curve, auc
        plot_colors = "rb"
        plot_step = 1000.0
        class_names = "AB"

        # Plot the training points
        pl.subplot(131)
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            idx = np.where(self.trainingClasses == i)
            pl.scatter(self.trainingData[idx, 0], self.trainingData[idx, 1],
                       c=c, cmap=pl.cm.Paired, label="Class %s" % n)
        pl.axis("tight")
        pl.legend(loc='upper right')
        pl.xlabel("Decision Boundary")

        # Plot the class probabilities
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            pl.hist(self.class_proba[self.testingClasses == i],
                    bins=50, range=(0, 1), facecolor=c, label='Class %s' % n)
        pl.legend(loc='upper center')
        pl.ylabel('Samples')
        pl.xlabel('Class Probability')

        # Plot the two-class decision scores / bdt scores
        pl.subplot(133)
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i],
                    bins=50, range=(-1, 1), facecolor=c,
                    label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('Two-class Decision Scores')
        pl.subplots_adjust(wspace=0.25)

        mean_tpr = 0.0
        mean_fpr = pl.linspace(0, 1, 100)
        pl.subplot(132)
        beginIdx = 0
        endIdx = len(self.testingData)  # /2
        fpr_arr = []
        tpr_arr = []
        roc_auc_arr = []
        rej_arr = []
        for i in range(1):
            probas_ = self.ada.predict_proba(self.testingData[beginIdx:endIdx])
            #probas_ = self.ada.predict_proba(self.testingData[self.testingClasses == i])
            # Compute ROC curve and area under the curve
            fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[beginIdx:endIdx], probas_[:, 1])
            #fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[self.testingClasses == i], probas_[:, 1], i)
            #mean_tpr += interp(mean_fpr, fpr, tpr)
            #mean_tpr[0] = 0.0
            roc_auc = auc(tpr, rej)  # auc(fpr, tpr)
            fpr_arr.append(fpr)
            tpr_arr.append(tpr)
            roc_auc_arr.append(roc_auc)
            rej_arr.append(rej)
            pl.plot(tpr_arr[i], rej_arr[i], lw=1,
                    label='ROC fold %d (area = %0.2f)' % (i, roc_auc_arr[i]),
                    color=plot_colors[i])
            beginIdx = endIdx
            endIdx = len(self.testingData)
        if len(rocInput) > 0:
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1,
                    label='ROC fold %d (area = %0.2f)' % (2, rocInput[2][0]),
                    color=plot_colors[1])
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr]
        pl.show()

    def plotBDTScores(self):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        plot_colors = "rb"
        plot_step = 1000.0
        alpha_h = [1.0, 0.7]
        class_names = ['Background', 'Signal']
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i],
                    bins=50, range=(-1, 1), facecolor=c, alpha=alpha_h[i],
                    label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('BDT Scores')
        pl.savefig('BDTScores' + self.name + '.png')

    def plotROC(self, returnROC=False, rocInput=[]):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        from sklearn.metrics import roc_curve, auc
        beginIdx = 0
        endIdx = len(self.testingData)  # /2
        plot_colors = "rb"
        plot_step = 1000.0
        class_names = "AB"
        fpr_arr = []
        tpr_arr = []
        roc_auc_arr = []
        rej_arr = []
        names = []
        pl.xlabel("Signal Efficiency")
        pl.ylabel("Background Rejection")
        pl.title("ROC curves")
        for i in range(1):
            probas_ = self.ada.predict_proba(self.testingData[beginIdx:endIdx])
            #probas_ = self.ada.predict_proba(self.testingData[self.testingClasses == i])
            # Compute ROC curve and area under the curve
            fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[beginIdx:endIdx], probas_[:, 1])
            #fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[self.testingClasses == i], probas_[:, 1], i)
            #mean_tpr += interp(mean_fpr, fpr, tpr)
            #mean_tpr[0] = 0.0
            roc_auc = auc(tpr, rej)  # auc(fpr, tpr)
            fpr_arr.append(fpr)
            tpr_arr.append(tpr)
            roc_auc_arr.append(roc_auc)
            rej_arr.append(rej)
            names.append(self.name)
            beginIdx = endIdx
            endIdx = len(self.testingData)
        if len(rocInput) > 0:
            label_bkg = rocInput[4][0]
            if '_A' in rocInput[4][0]:
                label_bkg = 'even event number'
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1,
                    label='ROC %s (area = %0.2f)' % (label_bkg, rocInput[2][0]),
                    color=plot_colors[1])
        if not returnROC:
            label_bkg = self.name
            if '_B' in self.name:
                label_bkg = 'odd event number'
            pl.plot(tpr_arr[i], rej_arr[i], lw=1,
                    label='ROC %s (area = %0.2f)' % (label_bkg, roc_auc_arr[i]),
                    color=plot_colors[i])
            pl.legend(loc='lower left')
            pl.savefig("roc_combined_" + self.name + ".png")
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr, names]
        pl.show()

    def plotDecisionBoundaries(self):
        import numpy as np
        import pylab as pl
        from matplotlib.colors import ListedColormap
        from sklearn.preprocessing import StandardScaler
        #from sklearn.cross_validation import train_test_split
        # just plot the dataset first
        cm = pl.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        #self.trainingData = StandardScaler().fit_transform(self.trainingData)
        #self.testingData = StandardScaler().fit_transform(self.testingData)
        #X_train = StandardScaler().fit_transform(self.twoclass_output_train)
        h = 0.1
        h2 = 0.01
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
        # get most important variable indices
        idx1 = self.foundVariables.index(self.variableNamesSorted[0])
        idx2 = self.foundVariables.index(self.variableNamesSorted[1])
        x_min = self.trainingData[np.argmin(self.trainingData[:, idx1])][idx1] - .1
        x_max = self.trainingData[np.argmax(self.trainingData[:, idx1])][idx1] + .1
        y_min = self.trainingData[np.argmin(self.trainingData[:, idx2])][idx2] - .01
        y_max = self.trainingData[np.argmax(self.trainingData[:, idx2])][idx2] + .01
        x_min2 = self.testingData[np.argmin(self.testingData[:, idx1])][idx1] - .1
        x_max2 = self.testingData[np.argmax(self.testingData[:, idx1])][idx1] + .1
        y_min2 = self.testingData[np.argmin(self.testingData[:, idx2])][idx2] - .01
        y_max2 = self.testingData[np.argmax(self.testingData[:, idx2])][idx2] + .01
        xmin = min(x_min, x_min2)
        xmax = max(x_max, x_max2)
        ymin = min(y_min, y_min2)
        ymax = max(y_max, y_max2)
        xx, yy = np.meshgrid(np.arange(xmin, xmax, float((xmax - xmin) / 25.0)),
                             np.arange(ymin, ymax, float((ymax - ymin) / 25.0)))
        # get mean values for other variables
        means = np.mean(self.testingData, axis=0)
        means = np.tile(means, (xx.shape[1] * xx.shape[0], 1))
        # fill the two scanned columns; the row-major index j*ncols + k matches
        # the reshape back to xx.shape below (the original (j+1)*(k+1)-1 index
        # collided for different (j, k) pairs and mixed up rows)
        for j in xrange(xx.shape[0]):
            for k in xrange(xx.shape[1]):
                means[j * xx.shape[1] + k][idx1] = xx[j][k]
                means[j * xx.shape[1] + k][idx2] = yy[j][k]
        #print 'shape X: '
        #print X.shape
        print 'shape xx: '
        print xx.shape
        print 'shape yy: '
        print yy.shape
        #rav = np.c_[xx.ravel(), yy.ravel()]
        print 'shape means: '
        print means.shape
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        #if hasattr(clf, "decision_function"):
        #    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        #else:
        Z = self.ada.predict_proba(means)[:, 1]
        print 'Z shape:'
        print Z.shape
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        figure = pl.figure()
        ax = pl.axes()
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # Plot also the training points
        #for i, n in zip(xrange(2), class_names):
        #    idx = np.where(self.trainingClasses == i)
        ax.scatter(self.trainingData[:, idx1], self.trainingData[:, idx2],
                   c=self.trainingClasses[:], cmap=cm_bright)
        # and testing points
        #for i, n in zip(xrange(2), class_names):
        #    idx = np.where(self.testingClasses == i)
        ax.scatter(self.testingData[:, idx1], self.testingData[:, idx2],
                   c=self.testingClasses[:], cmap=cm_bright, alpha=0.6)
        #ax.scatter(X_train[:, 0], X_training[:, 1], c=self.trainingClasses, cmap=cm_bright)
        #ax.scatter(X[:, 0], X[:, 1], c=self.testingClasses, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title("adaBoost")
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % self.score).lstrip('0'),
                size=15, horizontalalignment='right')
        pl.savefig("adaBoostDecisionBoundaries" + self.name + ".png")
        pl.show()
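# A compact version of the grid-evaluation idea in plotDecisionBoundaries
# above: hold every feature at its mean, sweep the two most important ones
# over a meshgrid, and call predict_proba once. A sketch only; clf, X, idx1
# and idx2 stand for the trained classifier, the feature matrix and the two
# chosen columns.
import numpy as np

def decision_surface(clf, X, idx1, idx2, steps=25):
    xx, yy = np.meshgrid(np.linspace(X[:, idx1].min(), X[:, idx1].max(), steps),
                         np.linspace(X[:, idx2].min(), X[:, idx2].max(), steps))
    grid = np.tile(X.mean(axis=0), (xx.size, 1))
    grid[:, idx1] = xx.ravel()  # row-major ravel matches the reshape below
    grid[:, idx2] = yy.ravel()
    return xx, yy, clf.predict_proba(grid)[:, 1].reshape(xx.shape)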
#log
precision_l, recall_l, thresholds_l = precision_recall_curve(test["los"], log.decision_function(test_variables))
pl.plot(recall_l, precision_l)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("LogisticRegression")
pl.show()

#cart
precision_c, recall_c, thresholds_c = precision_recall_curve(test["los"], test_cart_prob[::, 1])
pl.plot(recall_c, precision_c)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("CART")
pl.show()

#ad
precision_ad, recall_ad, thresholds_ad = precision_recall_curve(test["los"], ad.decision_function(test_variables))
pl.plot(recall_ad, precision_ad)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("AdaBoost")
pl.show()

#Naive
precision_n, recall_n, thresholds_n = precision_recall_curve(test["los"], test_naive_prob[::, 1])
pl.plot(recall_n, precision_n)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("NaiveBayes")
pl.show()

#integral
plt.plot(recall_l, precision_l)
plt.plot(recall_c, precision_c)
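# The four curves above can also be overlaid in one labelled figure; a sketch
# reusing the arrays already computed in this snippet:
curves = [(recall_l, precision_l, 'LogisticRegression'),
          (recall_c, precision_c, 'CART'),
          (recall_ad, precision_ad, 'AdaBoost'),
          (recall_n, precision_n, 'NaiveBayes')]
for recall, precision, name in curves:
    pl.plot(recall, precision, label=name)
pl.xlabel("recall")
pl.ylabel("precision")
pl.legend(loc='lower left')
pl.show()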
# use transform (not fit_transform) so the test set reuses the train-fitted
# scaling; refitting on the test set leaks test-set statistics
X_test = min_max_scaler.transform(X_test)
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.fit_transform(X_test)
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf),
    n_estimators=n_estimators,
    learning_rate=learning_rate)
classifier.fit(X_train, y_train)
list_feaimp.append(classifier.feature_importances_)
print(classifier.feature_importances_)
y_train_pred = classifier.decision_function(X_train)
y_test_pred = classifier.decision_function(X_test)
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
print(auc(train_fpr, train_tpr))
print(auc(test_fpr, test_tpr))
plt.grid()
plt.plot(train_fpr, train_tpr,
         label=" AUC TRAIN =" + str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr,
                                argParser.prog.split('.')[0], vversion)
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
logger.info('Save to %s directory', output_directory)

#pyplot settings
class_names = ["SM Test", "BSM Test", "SM Train", "BSM Train"]
plot_colors = ["#000cff", "#ff0000", "#9ba0ff", "#ff8d8d"]
plt.figure(figsize=(18, 8)).suptitle(
    "Decision Boundaries for the test (top) and training (bottom) datasets \n n: " + str(args.n_est),
    fontsize=18)
plot_step = 0.075

#setup the decision functions, which will also be used by the histograms
test_decision_function = bdt.decision_function(X_test)
train_decision_function = bdt.decision_function(X_train)

#show the decision shape for test data
plt.subplot(2, 1, 1)
if args.ptz_only:
    #generate the 1-d grid of values which will be used to generate the cut values
    x_min, x_max = X_test.min() - 1, X_test.max() + 1
    xx = np.arange(x_min, x_max, plot_step)
    xx = np.reshape(xx, (-1, 1))
    #map the decision function
    Z = bdt.decision_function(xx)
    Z = np.reshape(Z, (-1, 1))
    #get the limits and plot the function
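    # A sketch of the step the truncated comment above announces: draw the
    # one-dimensional decision function over the scanned range (names as above;
    # the zero line marks where the BDT switches class).
    plt.plot(xx.ravel(), Z.ravel(), color='k', label='decision function')
    plt.axhline(0.0, color='grey', linestyle='--')
    plt.legend(loc='best')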
Y = np.concatenate((Y_ref, Y_new), 0)

# Feature scaling
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# BDT TRAINING AND TESTING
print "Building and training BDT"
clf = AdaBoostClassifier(n_estimators=100,
                         base_estimator=DecisionTreeClassifier(max_depth=1))
clf.fit(X_train, Y)

# Testing
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
output_train = clf.decision_function(X_train)
output_test = clf.decision_function(X_test)

print "Training sample...."
print " Signal identified as signal (%)        : ", 100.0*np.sum(pred_train[nReferenceEvents:nReferenceEvents+nNewEvents] == 1.0)/nNewEvents
print " Signal identified as background (%)    : ", 100.0*np.sum(pred_train[nReferenceEvents:nReferenceEvents+nNewEvents] == 0.0)/nNewEvents
print " Background identified as signal (%)    : ", 100.0*np.sum(pred_train[0:nReferenceEvents] == 1.0)/nReferenceEvents
print " Background identified as background (%): ", 100.0*np.sum(pred_train[0:nReferenceEvents] == 0.0)/nReferenceEvents
print ""
print "Testing sample...."
print " Signal identified as signal (%)        : ", 100.0*np.sum(pred_test[nReferenceEvents:nReferenceEvents+nNewEvents] == 1.0)/nNewEvents
print " Signal identified as background (%)    : ", 100.0*np.sum(pred_test[nReferenceEvents:nReferenceEvents+nNewEvents] == 0.0)/nNewEvents
print " Background identified as signal (%)    : ", 100.0*np.sum(pred_test[0:nReferenceEvents] == 1.0)/nReferenceEvents
print " Background identified as background (%): ", 100.0*np.sum(pred_test[0:nReferenceEvents] == 0.0)/nReferenceEvents

# Plotting - probabilities
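# A sketch of the step the trailing comment announces: overlay the train and
# test decision-function distributions (output_train/output_test from above;
# the usual matplotlib.pyplot import as plt is assumed):
plt.hist(output_train, bins=50, alpha=0.5, label='train')
plt.hist(output_test, bins=50, alpha=0.5, label='test')
plt.xlabel('BDT decision function')
plt.ylabel('Events')
plt.legend(loc='best')
plt.show()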
def main():
    #load data
    train_data, test_data = load_data('spambase/spambase.data')

    #Using AdaBoost Classifier with 200 classifiers
    print('Using AdaBoost ...')
    clf = AdaBoostClassifier(n_estimators=200, learning_rate=1)
    clf.fit(train_data.X, train_data.y)

    #Training and Testing Accuracy
    print('Training Accuracy: ', clf.score(train_data.X, train_data.y))
    print('Testing Accuracy: ', clf.score(test_data.X, test_data.y))

    #Creating Confusion Matrix
    prediction = clf.predict(test_data.X)
    confusion_matrix = np.zeros((2, 2))
    accuracy = 0
    for i in range(len(prediction)):
        if prediction[i] == 0 and test_data.y[i] == 0:
            confusion_matrix[0][0] += 1
            accuracy += 1
        elif prediction[i] == 1 and test_data.y[i] == 1:
            confusion_matrix[1][1] += 1
            accuracy += 1
        elif prediction[i] == 0 and test_data.y[i] == 1:
            confusion_matrix[1][0] += 1
        elif prediction[i] == 1 and test_data.y[i] == 0:
            confusion_matrix[0][1] += 1

    #Outputting confusion matrix (class labels are 0/1, matching the data)
    print('\n')
    print('Confusion Matrix')
    print('    prediction')
    print('     0    1')
    print('     -----')
    print(' 0| ' + str(int(confusion_matrix[0][0])) + '  ' + str(int(confusion_matrix[0][1])))
    print(' 1| ' + str(int(confusion_matrix[1][0])) + '  ' + str(int(confusion_matrix[1][1])))
    print('\n')

    #Creating a roc curve for different numbers of classifiers
    clf_200 = AdaBoostClassifier(n_estimators=200, learning_rate=1)
    clf_200.fit(train_data.X, train_data.y)
    y_score_200 = clf_200.decision_function(test_data.X)
    fpr_200, tpr_200, thresholds_200 = roc_curve(test_data.y, y_score_200)

    clf_500 = AdaBoostClassifier(n_estimators=500, learning_rate=1)  # was 50, but the plot label below says T = 500
    clf_500.fit(train_data.X, train_data.y)
    y_score_500 = clf_500.decision_function(test_data.X)
    fpr_500, tpr_500, thresholds_500 = roc_curve(test_data.y, y_score_500)

    clf_20 = AdaBoostClassifier(n_estimators=20, learning_rate=1)
    clf_20.fit(train_data.X, train_data.y)
    y_score_20 = clf_20.decision_function(test_data.X)
    fpr_20, tpr_20, thresholds_20 = roc_curve(test_data.y, y_score_20)

    #Plotting Roc Curve for T = 20, 200 and 500
    plt.plot(fpr_200, tpr_200, 'r-', label='T= 200')
    plt.plot(fpr_500, tpr_500, 'g-', label='T= 500')
    plt.plot(fpr_20, tpr_20, 'b-', label='T= 20')
    plt.legend()
    plt.title("ROC curve for AdaBoost Classifier")
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.savefig('adaboost.png')
    plt.show()

    #Finding the features' importance
    feature_names = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
                     'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
                     'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
                     'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
                     'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
                     'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
                     'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
                     'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
                     'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
                     'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
                     'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
                     'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
                     'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$',
                     'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest',
                     'capital_run_length_total']
    feature_imp = clf_200.feature_importances_
    print('\n')
    print('Feature names ...')
    print(feature_names)
    print('\n')
    print('Feature importance ...')
    print(feature_imp)
    print('\n')

    #Using the email parser can we detect if the email is spam ?
    print('\nParsing spam email...')
    email = Data()
    email.X, email.y = parse_email('antispamSpam.txt', 1)
    prediction = clf.predict(email.X)[0]
    true_label = email.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected the spam.')
    else:
        print('Failed to detect the spam.')

    #Using the email parser can we detect if the email is not spam ?
    print('\nParsing Sara\'s email...')
    not_spam = Data()
    not_spam.X, not_spam.y = parse_email('saraEmail.txt', 0)
    prediction = clf.predict(not_spam.X)[0]
    true_label = not_spam.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected that the email is safe.')
    else:
        print('Misclassified the email as spam.')
    print('\n')
    print('\n')

    #Using RandomForest Classifier with 20 classifiers
    print('Using RandomForest ...')
    clf = RandomForestClassifier(n_estimators=20, criterion='gini')
    clf.fit(train_data.X, train_data.y)

    #Training and Testing Accuracy
    print('Training Accuracy: ', clf.score(train_data.X, train_data.y))
    print('Testing Accuracy: ', clf.score(test_data.X, test_data.y))
    prediction = clf.predict(test_data.X)

    #Finding the features' importance
    feature_imp = clf.feature_importances_
    print('\n')
    print('Feature names ...')
    print(feature_names)
    print('\n')
    print('Feature importance ...')
    print(feature_imp)
    print('\n')

    #Creating Confusion Matrix
    confusion_matrix = np.zeros((2, 2))
    accuracy = 0
    for i in range(len(prediction)):
        if prediction[i] == 0 and test_data.y[i] == 0:
            confusion_matrix[0][0] += 1
            accuracy += 1
        elif prediction[i] == 1 and test_data.y[i] == 1:
            confusion_matrix[1][1] += 1
            accuracy += 1
        elif prediction[i] == 0 and test_data.y[i] == 1:
            confusion_matrix[1][0] += 1
        elif prediction[i] == 1 and test_data.y[i] == 0:
            confusion_matrix[0][1] += 1

    #Outputting confusion matrix (class labels are 0/1, matching the data)
    print('\n')
    print('Confusion Matrix')
    print('    prediction')
    print('     0    1')
    print('     -----')
    print(' 0| ' + str(int(confusion_matrix[0][0])) + '  ' + str(int(confusion_matrix[0][1])))
    print(' 1| ' + str(int(confusion_matrix[1][0])) + '  ' + str(int(confusion_matrix[1][1])))
    print('\n')

    #Using the email parser can we detect if the email is spam ?
    print('\nParsing spam email...')
    email = Data()
    email.X, email.y = parse_email('antispamSpam.txt', 1)
    prediction = clf.predict(email.X)[0]
    true_label = email.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected the spam.')
    else:
        print('Failed to detect the spam.')

    #Using the email parser can we detect if the email is not spam ?
    print('\nParsing Sara\'s email...')
    not_spam = Data()
    not_spam.X, not_spam.y = parse_email('saraEmail.txt', 0)
    prediction = clf.predict(not_spam.X)[0]
    true_label = not_spam.y[0]
    print(prediction, true_label)
    if prediction == true_label:
        print('Successfully detected that the email is safe.')
    else:
        print('Misclassified the email as spam.')
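# The hand-rolled confusion matrices above (rows = truth, columns = prediction)
# match scikit-learn's helper one to one; a sketch, shown standalone because the
# snippet shadows the name confusion_matrix with a local array:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
print(sk_confusion_matrix(test_data.y, prediction, labels=[0, 1]))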
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")

# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1],
                c=c, cmap=plt.cm.Paired,
                label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel("Decision Boundary")

# Plot the two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
    plt.hist(twoclass_output[y == i],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Decision Scores')
def twoClassDemo(self):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.datasets import make_gaussian_quantiles

    # Construct dataset
    X1, y1 = make_gaussian_quantiles(cov=2.,
                                     n_samples=200, n_features=2,
                                     n_classes=2, random_state=1)
    X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
                                     n_samples=300, n_features=2,
                                     n_classes=2, random_state=1)
    X = np.concatenate((X1, X2))
    y = np.concatenate((y1, -y2 + 1))

    # Create and fit an AdaBoosted decision tree
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=200)
    bdt.fit(X, y)

    plot_colors = "br"
    plot_step = 0.02
    class_names = "AB"

    plt.figure(figsize=(10, 5))

    # Plot the decision boundaries
    plt.subplot(121)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis("tight")

    # Plot the training points
    for i, n, c in zip(range(2), class_names, plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1],
                    c=c, cmap=plt.cm.Paired,
                    s=20, edgecolor='k',
                    label="Class %s" % n)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.legend(loc='upper right')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Decision Boundary')

    # Plot the two-class decision scores
    twoclass_output = bdt.decision_function(X)
    plot_range = (twoclass_output.min(), twoclass_output.max())
    plt.subplot(122)
    for i, n, c in zip(range(2), class_names, plot_colors):
        plt.hist(twoclass_output[y == i],
                 bins=10,
                 range=plot_range,
                 facecolor=c,
                 label='Class %s' % n,
                 alpha=.5,
                 edgecolor='k')
    x1, x2, y1, y2 = plt.axis()
    plt.axis((x1, x2, y1, y2 * 1.2))
    plt.legend(loc='upper right')
    plt.ylabel('Samples')
    plt.xlabel('Score')
    plt.title('Decision Scores')

    plt.tight_layout()
    plt.subplots_adjust(wspace=0.35)
    plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import zero_one_loss
from sklearn.ensemble import AdaBoostClassifier

n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

X_test, y_test = X[2000:], y[2000:]
X_train, y_train = X[:2000], y[:2000]

dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)
dt.fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)

ada_discrete = AdaBoostClassifier(base_estimator=dt_stump,
                                  learning_rate=learning_rate,
                                  n_estimators=n_estimators,
                                  algorithm="SAMME")
ada_discrete.fit(X_train, y_train)

ada_real = AdaBoostClassifier(base_estimator=dt_stump,
                              learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              algorithm="SAMME.R")
ada_real.fit(X_train, y_train)

fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',
        label='Decision Stump Error')
ax.plot([1, n_estimators], [dt_err] * 2, 'k--',
        label='Decision Tree Error')

ada_discrete_err = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)

ada_discrete_err_train = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):
    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)

ada_real_err = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_real.staged_predict(X_test)):
    ada_real_err[i] = zero_one_loss(y_pred, y_test)

ada_real_err_train = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)

ax.plot(np.arange(n_estimators) + 1, ada_discrete_err,
        label='Discrete AdaBoost Test Error',
        color='red')
ax.plot(np.arange(n_estimators) + 1, ada_discrete_err_train,
        label='Discrete AdaBoost Train Error',
        color='blue')
ax.plot(np.arange(n_estimators) + 1, ada_real_err,
        label='Real AdaBoost Test Error',
        color='orange')
ax.plot(np.arange(n_estimators) + 1, ada_real_err_train,
        label='Real AdaBoost Train Error',
        color='green')

ax.set_ylim((0.0, 0.5))
ax.set_xlabel('n_estimators')
ax.set_ylabel('error rate')

leg = ax.legend(loc='upper right', fancybox=True)
leg.get_frame().set_alpha(0.7)

plt.show()
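# The staged error curves above double as a cheap model-selection rule: pick
# the boosting iteration with the lowest test error. A sketch using the arrays
# just filled:
best_n = int(np.argmin(ada_real_err)) + 1
print('best n_estimators (SAMME.R): %d, test error %.4f'
      % (best_n, ada_real_err[best_n - 1]))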
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit carries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
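# A minimal usage sketch of the behaviour this test pins down: AdaBoost with a
# probability-enabled SVC accepts scipy sparse input directly (toy data and
# names local to this sketch):
from scipy.sparse import csr_matrix
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

Xd, yd = make_classification(n_samples=40, n_features=5, random_state=0)
clf_sparse = AdaBoostClassifier(base_estimator=SVC(probability=True),
                                algorithm="SAMME", random_state=1)
clf_sparse.fit(csr_matrix(Xd), yd)
print(clf_sparse.decision_function(csr_matrix(Xd))[:3])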
def evaluate(dt_eval, dt_train, opts):
    # If modelinput is specified then read in model
    bdt = None
    if len(opts.modelinput) != 0:
        bdt = joblib.load(opts.modelinput)
        print "Loaded model back ", opts.bdtname
        print bdt
    else:
        print "Model not specified..."
        print "Creating classification and training again"
        bdt = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=opts.maxdepth),
            algorithm="SAMME",
            n_estimators=opts.ntrees,
            learning_rate=opts.lrate,
        )
        bdt.fit(dt_train.getDataNoWeight(), dt_train.targets)

    # Now get the bdt scores
    sig_scores = bdt.decision_function(dt_eval.getDataNoWeight()[dt_eval.targets > 0.5])
    bkg_scores = bdt.decision_function(dt_eval.getDataNoWeight()[dt_eval.targets < 0.5])

    # Get weights
    sig_weights = dt_eval.getDataWeights()[dt_eval.targets > 0.5] * dt_eval.sf
    bkg_weights = dt_eval.getDataWeights()[dt_eval.targets < 0.5] * dt_eval.sf

    # Print some information for a set of cuts
    cuts = np.arange(-1, 1, 0.05)
    for cut in cuts:
        print "------------------------------------------"
        print "cut: ", cut
        print "\tSignal:    ", sum(sig_weights[sig_scores > cut])
        print "\tBackground:", sum(bkg_weights[bkg_scores > cut])

    # Make figure and axis
    fig, ax = plt.subplots(ncols=1, figsize=(10, 7))

    # Set minimum and maximum for x-axis
    xmin = -1
    xmax = 1
    nbins = 100

    # plt.yscale("log")
    plt.ylim([1e-2, 1e6])

    # Add error bars for signal
    plotErrorBars(sig_scores, sig_weights, nbins, xmin, xmax, "r", "signal")
    # Add error bars for background
    plotErrorBars(bkg_scores, bkg_weights, nbins, xmin, xmax, "b", "background")

    # Make hist for signal
    plt.hist(
        sig_scores,
        weights=sig_weights,
        color="r",
        range=(xmin, xmax),
        alpha=0.5,
        bins=nbins,
        log=True,
        histtype="stepfilled",
    )
    # Make hist for bkg
    plt.hist(
        bkg_scores,
        weights=bkg_weights,
        color="b",
        range=(xmin, xmax),
        alpha=0.5,
        bins=nbins,
        log=True,
        histtype="stepfilled",
    )

    # Miscellaneous
    plt.xlabel("BDT output")
    plt.ylabel("Events / year / bin")
    plt.legend(loc="best")
    plt.grid()
    plt.xticks(np.arange(-1, 1.1, 0.1))
    plt.tight_layout()
    # ax.set_yscale("log")
    plt.savefig("plots/evaluate/WeightedResult_" + opts.bdtname + "_fromModel.png")
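# The cut scan in evaluate() prints raw yields; a common follow-up is to pick
# the cut maximising s/sqrt(b). A sketch, assuming the score and weight arrays
# computed above:
def best_cut(sig_scores, sig_weights, bkg_scores, bkg_weights,
             cuts=np.arange(-1, 1, 0.05)):
    signif = [sum(sig_weights[sig_scores > c]) /
              max(np.sqrt(sum(bkg_weights[bkg_scores > c])), 1e-9)
              for c in cuts]
    return cuts[int(np.argmax(signif))]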
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit carries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
# Label signal 1 and background 0, in the same order as the concatenated X
# (the original sized the ones array by the background sample and the zeros
# array by the signal sample, which only lined up for balanced samples; it
# also shadowed the os module).
ones = np.ones(len(sigtrain))
zeros = np.zeros(len(bkgtrain))
print "adding samples together"
X_train = pandas.concat([sigtrain, bkgtrain])
y_train = np.append(ones, zeros)
print "training"
base_ada.fit(X=X_train, y=y_train)

ones = np.ones(len(sigtest))
zeros = np.zeros(len(bkgtest))
print "adding samples together"
X_test = pandas.concat([sigtest, bkgtest])
y_test = np.append(ones, zeros)

sigoutput = base_ada.decision_function(X=sigtest)
bkgoutput = base_ada.decision_function(X=bkgtest)

from sklearn.metrics import accuracy_score
test_errors = []
for te in base_ada.staged_predict(X_test):
    test_errors.append(1. - accuracy_score(te, y_test))
ntrees = len(test_errors)

estimator_errors = base_ada.estimator_errors_[:ntrees]
estimator_weights = base_ada.estimator_weights_[:ntrees]

from matplotlib.ticker import LinearLocator

with PdfPages("bdtplots.pdf") as pdf:
    xs, xe, ys, ye = get_hist(bkgoutput)
    plt.errorbar(xs, ys, xerr=xe, yerr=ye, color='red', fmt='.',
def train_bdt():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    # print("Sampling 10% of the data for training")
    # # Create smaller samples, 10% of the size
    # signal = np.asarray(random.sample(signal, int((len(signal))*0.1)))
    # bkg2nu = np.asarray(random.sample(bkg2nu, int((len(bkg2nu))*0.1)))
    # bkg214Bi = np.asarray(random.sample(bkg214Bi, int((len(bkg214Bi))*0.1)))
    # bkg208Tl = np.asarray(random.sample(bkg208Tl, int((len(bkg208Tl))*0.1)))
    # bkgRn = np.asarray(random.sample(bkgRn, int((len(bkgRn))*0.1)))

    print("Creating arrays...")
    # X = Features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))
    # y = Labels (i.e. what it is, signal / background)
    y = np.concatenate(
        (np.ones(signal.shape[0]),
         np.zeros(bkg2nu.shape[0]),
         np.zeros(bkg214Bi.shape[0]),
         np.zeros(bkg208Tl.shape[0]),
         np.zeros(bkgRn.shape[0])))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X, y, test_size=0.33, random_state=48)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # print("Oversampling...")
    # # Oversample to improve representation of backgrounds
    # ros = RandomOverSampler(random_state=0)
    # X_resampled, y_resampled = ros.fit_sample(X_train, y_train)
    # X_test_resampled, y_test_resampled = ros.fit_sample(X_test, y_test)
    # X_dev_resampled, y_dev_resampled = ros.fit_sample(X_dev, y_dev)
    # X_eval_resampled, y_eval_resampled = ros.fit_sample(X_eval, y_eval)
    # print(sorted(Counter(y_resampled).items()))

    print("Removing weights..")
    # Remove weights on backgrounds (will be passed in to the BDT later)
    # 30/09/19 - removed resampling
    X_train_weights = X_train[:, 6]
    X_train_new = np.delete(X_train, 6, axis=1)
    X_test_new = np.delete(X_test, 6, axis=1)
    X_dev_weights = X_dev[:, 6]
    X_dev_new = np.delete(X_dev, 6, axis=1)
    X_eval_new = np.delete(X_eval, 6, axis=1)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12,
                                min_samples_split=0.5,
                                min_samples_leaf=400)
    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - pass in weights from earlier
    fitted_tree = bdt.fit(X_train_new, y_train, sample_weight=X_train_weights)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train_new)
    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test_new)

    # target_names follow sorted label order: label 0 = background, 1 = signal
    print(classification_report(y_train, y_predicted_train,
                                target_names=["background", "signal"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train, bdt.decision_function(X_train_new))))

    print(classification_report(y_test, y_predicted_test,
                                target_names=["background", "signal"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test, bdt.decision_function(X_test_new))))

    plot_roc_curve(bdt, X_test_new, y_test)
    compare_train_test(bdt, X_train_new, y_train, X_test_new, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/weight/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train_new, save_path + 'bdt_X_train_new.joblib')
    dump(X_test_new, save_path + 'bdt_X_test_new.joblib')
    dump(X_dev_new, save_path + 'bdt_X_dev_new.joblib')
    dump(X_dev_weights, save_path + 'bdt_X_dev_weights.joblib')
    dump(X_eval_new, save_path + 'bdt_X_eval_new.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')
    print("Finished Training.")
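# The joblib dumps above can be reloaded later to evaluate without retraining;
# a sketch assuming the same save_path and the roc_auc_score import used above:
from joblib import load

bdt = load(save_path + 'bdt_classifier.joblib')
X_test_new = load(save_path + 'bdt_X_test_new.joblib')
y_test = load(save_path + 'bdt_y_test.joblib')
print("Reloaded AUC: {0:.4f}".format(
    roc_auc_score(y_test, bdt.decision_function(X_test_new))))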
roc_auc_s = auc(fpr_s, tpr_s)
cm_s = confusion_matrix(y_test, s_pre_y)
print(cm_s)
print('s accuracy:', accuracy_score(y_test, s_pre_y))
print('s recall:', recall_score(y_test, s_pre_y))
endtimes = datetime.datetime.now()
print(endtimes - starttime)

ada = AdaBoostClassifier(DecisionTreeClassifier(min_samples_leaf=6,
                                                min_samples_split=10),
                         n_estimators=300, learning_rate=2)
ada.fit(x_train, y_train)
a_pre_y = list(ada.predict(x_test))
y_a2_score = ada.decision_function(x_test)
fpr_a2, tpr_a2, threshold_a2 = roc_curve(y_test, y_a2_score)
roc_auc_a2 = auc(fpr_a2, tpr_a2)
cm_a = confusion_matrix(y_test, a_pre_y)
print(cm_a)
print('a accuracy:', accuracy_score(y_test, a_pre_y))
print('a recall:', recall_score(y_test, a_pre_y))
endtimea = datetime.datetime.now()
print(endtimea - starttime)
"""
ada1 = AdaBoostClassifier(DecisionTreeClassifier(min_samples_leaf=5),
                          n_estimators=200, learning_rate=1)
ada1.fit(x_train, y_train)
a1_pre_y = list(ada1.predict(x_test))
print('a accuracy:', accuracy_score(y_test, a1_pre_y))
print('a recall:', recall_score(y_test, a1_pre_y))
endtimea = datetime.datetime.now()