def TrainDNN(setupClient):
    """Build, compile and fit the DNN configured by ``setupClient``.

    The training sample is obtained from ``LoadData(setupClient)``. Depending
    on ``setupClient.runMode`` either a multi-class model
    (``ModelBuilder.BuildDNNMulti``, sparse categorical cross-entropy) or a
    binary model (``ModelBuilder.BuildDNN``, binary cross-entropy) is built.
    Training uses a 20% validation split, early stopping on ``val_loss`` and a
    checkpoint that keeps the best-``val_loss`` model in
    ``<ModelSavePath>/model.h5``. The ``setupClient`` object itself is pickled
    to ``<ModelSavePath>/DNN_Setup``.

    Args:
        setupClient: configuration object; this function reads
            ``ModelSavePath``, ``useEqualSizeSandB``, ``runMode`` and the
            ``Params`` entries ``Width``, ``Depth``, ``Optimizer``,
            ``LearningRate``, ``Epochs`` and ``BatchSize``.

    Returns:
        Whatever ``model.fit`` returns (presumably a Keras ``History``; the
        binary equal-size branch reads its ``history['val_loss']``).

    Raises:
        SystemExit: if equal-size signal/background was requested but the
            training sample does not contain equal numbers of each.
    """
    from collections import Counter

    # Only the training features/labels are used here; the test sample,
    # event weights and indices returned by LoadData are ignored.
    (X_train, y_train), _, _, _ = LoadData(setupClient)
    n_dim = X_train.shape[1]
    modelpath = setupClient.ModelSavePath
    use_equal_size = bool(setupClient.useEqualSizeSandB)

    Nsig = (y_train == 1).sum()
    Nbkg = (y_train != 1).sum()

    # TODO: Check this is working or still needed
    if use_equal_size and Nsig != Nbkg:
        print(
            'You have selected to use equal portions of signal and background events but the numbers are not equal'
        )
        print(Nsig, Nbkg)
        # Abort with a non-zero status so calling scripts can detect the failure.
        raise SystemExit(1)

    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + " TRAINING...! ")
    print(Fore.BLUE + "--------------------------")
    print("Number of input variables : ", X_train.shape[1])

    cls_ytrain_count = Counter(y_train)
    Nclass = len(cls_ytrain_count)
    is_multi = setupClient.runMode == 'multi'

    print(Fore.GREEN + 'Number of events per class in Train Sample:')
    if is_multi:
        for channel in channelDic:
            print('{:<15}{:<15}'.format(channel, cls_ytrain_count[channelDic[channel]]))
        lossFunc = 'sparse_categorical_crossentropy'
        model = ModelBuilder.BuildDNNMulti(setupClient, Nclass, n_dim,
                                           setupClient.Params['Width'],
                                           setupClient.Params['Depth'])
    else:
        print('{:<15}{:<15}'.format('Background', cls_ytrain_count[0]))
        print('{:<15}{:<15}'.format('Signal', cls_ytrain_count[1]))
        lossFunc = 'binary_crossentropy'
        model = ModelBuilder.BuildDNN(setupClient, n_dim,
                                      setupClient.Params['Width'],
                                      setupClient.Params['Depth'])

    model.compile(loss=lossFunc,
                  optimizer=setupClient.Params['Optimizer'],
                  metrics=['accuracy'])
    # The learning rate is overridden after compile() so the optimizer can be
    # selected by name while still honouring the configured rate.
    K.set_value(model.optimizer.lr, setupClient.Params['LearningRate'])
    print(model.get_config())

    callbacks = [
        # Stop training once val_loss has not improved for 3 epochs.
        EarlyStopping(verbose=True, patience=3, monitor='val_loss'),
        # Always make sure that we're saving the model weights with the best val loss.
        ModelCheckpoint(modelpath + '/model.h5',
                        monitor='val_loss',
                        verbose=True,
                        save_best_only=True),
    ]

    # Store the configuration of the training to disk; the context manager
    # guarantees the file is closed even if pickling fails.
    with open(modelpath + '/DNN_Setup', 'wb') as outfile:
        pickle.dump(setupClient, outfile)

    # Options common to every fit() call below.
    fit_options = dict(epochs=setupClient.Params['Epochs'],
                       batch_size=setupClient.Params['BatchSize'],
                       validation_split=0.2,
                       callbacks=callbacks)

    if is_multi:
        # TODO: make the number of classes be found and treated automatically by the flow
        # Each background class is weighted by (signal count / class count);
        # the signal class (label 1) keeps unit weight.
        Nsig = float(cls_ytrain_count[1])
        wZjets = round(Nsig / float(cls_ytrain_count[0]), 3)
        wDiboson = round(Nsig / float(cls_ytrain_count[2]), 3)
        wTop = round(Nsig / float(cls_ytrain_count[3]), 3)
        wsig = 1.0

        print(Fore.GREEN + 'Weights to apply:')
        print('{:<15}{:<15}'.format('Zjets', wZjets))
        print('{:<15}{:<15}'.format('Signal', wsig))
        print('{:<15}{:<15}'.format('Diboson', wDiboson))
        print('{:<15}{:<15}'.format('Top', wTop))

        modelMetricsHistory = model.fit(
            X_train,
            y_train,
            class_weight={
                0: wZjets,
                1: wsig,  ## Signal
                2: wDiboson,
                3: wTop,
            },
            verbose=1,
            **fit_options)
    elif use_equal_size:
        # Balanced sample: no class weights needed.
        modelMetricsHistory = model.fit(X_train, y_train, verbose=0, **fit_options)
        print(modelMetricsHistory.history['val_loss'])
    else:
        print('{:<25}'.format(
            Fore.BLUE + 'Training with class_weights because of unbalance classes !!'))
        nsignal = cls_ytrain_count[1]
        nbackground = cls_ytrain_count[0]
        print('Signal=', nsignal, 'Background=', nbackground)

        # Down-weight the larger class so both contribute equally to the loss.
        # float() keeps the ratio from truncating to 0 under integer division.
        wbkg = float(nsignal) / nbackground
        wsig = 1.0
        if nsignal > nbackground:
            wbkg = 1.0
            wsig = float(nbackground) / nsignal

        print(Fore.GREEN + 'Weights to apply:')
        print('{:<15}{:<15}'.format('Background', round(wbkg, 3)))
        print('{:<15}{:<15}'.format('Signal', wsig))

        modelMetricsHistory = model.fit(X_train,
                                        y_train,
                                        class_weight={
                                            0: wbkg,
                                            1: wsig
                                        },
                                        verbose=1,
                                        **fit_options)

    return modelMetricsHistory
def doKFold(setupClient):
    """Run a 5-fold stratified cross-validation and plot the per-fold ROC curves.

    The train and test pickled DataFrames
    (``<PDPath><MixPD_TrainTestTag>_Train.pkl`` / ``_Test.pkl``) are merged
    into one sample, which is then split with ``StratifiedKFold(n_splits=5)``.
    For every fold a fresh model is built and trained; its evaluation metrics,
    ROC thresholds, TPR and FPR are saved as ``.npy`` files under
    ``setupClient.ModelSavePath``, and the best-``val_loss`` model is
    checkpointed to ``model_kfold<i>.h5``. Finally the per-fold ROC curves,
    their mean and a +/- 1 standard deviation band are written to
    ``<ModelSavePath>/KFold_ROC.png``.

    Args:
        setupClient: configuration object; this function reads ``PDPath``,
            ``MixPD_TrainTestTag``, ``InputDNNVariables``, ``VarSet``,
            ``runMode``, ``ModelSavePath`` and the ``Params`` entries
            ``Width``, ``Depth``, ``Optimizer``, ``LearningRate``, ``Epochs``
            and ``BatchSize``.

    Returns:
        None.

    Raises:
        SystemExit: if either input pickle file does not exist.
    """
    from collections import Counter

    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + " K-Fold Cross Validation ")
    print(Fore.BLUE + "--------------------------")

    pdtoLoad_Train = setupClient.PDPath + setupClient.MixPD_TrainTestTag + '_Train.pkl'
    pdtoLoad_Test = setupClient.PDPath + setupClient.MixPD_TrainTestTag + '_Test.pkl'
    print('{:<45}{:<25}'.format("Train sample", Fore.GREEN + pdtoLoad_Train))
    print('{:<45}{:<25}'.format("Test sample", Fore.GREEN + pdtoLoad_Test))

    for pd_path in (pdtoLoad_Train, pdtoLoad_Test):
        if not os.path.isfile(pd_path):
            print("PD file", pd_path, " not found!")
            # Abort with a non-zero status so calling scripts can detect the failure.
            raise SystemExit(1)

    df_Train = pd.read_pickle(pdtoLoad_Train)
    df_Test = pd.read_pickle(pdtoLoad_Test)
    ## Add them together:
    df_tot = pd.concat([df_Train, df_Test], ignore_index=True)

    VariablesSet = setupClient.InputDNNVariables[setupClient.VarSet]
    print('{:<45}{:<25}'.format(
        "Variable set",
        Fore.GREEN + str(setupClient.VarSet) + ' ' + str(VariablesSet)))

    # `.values` is available in both old and current pandas releases
    # (DataFrame.as_matrix() no longer exists in pandas >= 1.0).
    X = df_tot[VariablesSet].values
    le = LabelEncoder()
    Y = le.fit_transform(df_tot['isSignal'])
    Nclass = len(le.classes_)

    kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    is_multi = setupClient.runMode == 'multi'

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    for ii, (train, test) in enumerate(kfold.split(X, Y)):
        print('Doing Fold', ii)
        Y_train = Y[train]
        Y_test = Y[test]

        cls_ytrain_count = Counter(Y_train)
        print(Fore.GREEN + 'Number of events per class in Train Sample:')
        print('{:<15}{:<15}'.format('Background', cls_ytrain_count[0]))
        print('{:<15}{:<15}'.format('Signal', cls_ytrain_count[1]))

        # Fit the scaler on the training fold only and apply the same
        # transformation to the test fold. The scaled data goes into new
        # arrays so that X stays untouched for the following folds.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train])
        X_test = scaler.transform(X[test])
        n_dim = X_train.shape[1]

        if is_multi:
            # NOTE(review): the class weights and the ROC computation below
            # only handle labels 0/1, so multi mode is presumably not fully
            # supported by this function - confirm before relying on it.
            lossFunc = 'sparse_categorical_crossentropy'
            model = ModelBuilder.BuildDNNMulti(setupClient, Nclass, n_dim,
                                               setupClient.Params['Width'],
                                               setupClient.Params['Depth'])
        else:
            lossFunc = 'binary_crossentropy'
            model = ModelBuilder.BuildDNN(setupClient, n_dim,
                                          setupClient.Params['Width'],
                                          setupClient.Params['Depth'])

        model.compile(loss=lossFunc,
                      optimizer=setupClient.Params['Optimizer'],
                      metrics=['accuracy'])
        K.set_value(model.optimizer.lr, setupClient.Params['LearningRate'])

        callbacks = [
            EarlyStopping(verbose=True, patience=5, monitor='val_loss'),
            ModelCheckpoint(setupClient.ModelSavePath + '/model_kfold' + str(ii) + '.h5',
                            monitor='val_loss',
                            verbose=True,
                            save_best_only=True),
        ]

        # Background is weighted by the signal/background ratio; float()
        # keeps the ratio from truncating to 0 under integer division.
        wbkg = float(cls_ytrain_count[1]) / cls_ytrain_count[0]
        wsig = 1.0
        print(Fore.GREEN + 'Weights to apply:')
        print('{:<15}{:<15}'.format('Background', round(wbkg, 3)))
        print('{:<15}{:<15}'.format('Signal', wsig))

        model.fit(X_train,
                  Y_train,
                  class_weight={
                      0: wbkg,
                      1: wsig
                  },
                  epochs=setupClient.Params['Epochs'],
                  batch_size=setupClient.Params['BatchSize'],
                  validation_split=0.2,
                  callbacks=callbacks,
                  verbose=1)

        kf_scores = model.evaluate(X_test, Y_test, verbose=1)
        print("%s: %.3f%%" % (model.metrics_names[1], kf_scores[1] * 100))

        kf_yhat_test = model.predict(X_test)

        # Get 'Receiver operating characteristic' (ROC)
        fpr, tpr, thresholds = roc_curve(Y_test, kf_yhat_test)
        # Resample every fold's curve on a common FPR grid so they can be averaged.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 alpha=0.3,
                 label='ROC fold %d (AUC = %0.3f)' % (ii, roc_auc))

        fold_outputs = (
            ('cv_metrics_fold', kf_scores),
            ('cv_thresholds_fold', thresholds),
            ('cv_tpr_fold', tpr),
            ('cv_fpr_fold', fpr),
        )
        for prefix, values in fold_outputs:
            np.save(
                os.path.join(setupClient.ModelSavePath, prefix + str(ii) + '.npy'),
                values)

    plt.plot([0, 1], [0, 1],
             linestyle='--',
             lw=2,
             color='r',
             label='Luck',
             alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr,
             mean_tpr,
             color='b',
             label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
             lw=1,
             alpha=.7)

    # Shade the +/- 1 standard deviation band, clipped to the valid [0, 1] range.
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr,
                     tprs_lower,
                     tprs_upper,
                     color='grey',
                     alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xticks(np.arange(0.0, 1.1, 0.1))
    plt.yticks(np.arange(0.0, 1.1, 0.1))
    plt.title('ROC curves for Signal vs Background')
    plt.legend(loc="lower right", fontsize='x-small')
    plt.savefig(setupClient.ModelSavePath + "/KFold_ROC.png")
    plt.clf()