def bootstrapMeanAMS(data, wFactor=250000. / 50000., N=512, br=10):
    """Bootstrap the AMS and its cut, then evaluate the AMS at the mean cut.

    Starts ``N`` processes, each running ``mpAMS`` on ``len(data)`` rows drawn
    from ``data`` with replacement. Each worker is expected to put a dict on
    the shared queue; values whose key contains ``'ams'`` are collected as AMS
    values and values whose key contains ``'cut'`` as cuts.

    Arguments:
        data: pandas DataFrame. The columns ``pred_class``, ``gen_target`` and
            ``gen_weight`` are read here.
        wFactor: factor applied to the summed ``gen_weight`` of the selected
            rows; also forwarded to ``mpAMS``.
        N: number of bootstrap resamples (one process per resample).
        br: forwarded unchanged to ``mpAMS``.

    Returns:
        Tuple ``(meanAMS[0], meanCut[0])``: the first elements returned by
        ``uncertRound`` for the AMS values and for the cuts.
    """
    procs = []
    out_q = mp.Queue()
    nRows = len(data)
    for i in range(N):
        # Draw row *positions*, not index labels: the sample is taken with
        # .iloc, so labels from data.index are only valid when the index
        # happens to be 0..n-1.
        positions = np.random.choice(nRows, nRows, replace=True)
        p = mp.Process(target=mpAMS,
                       args=(data.iloc[positions], i, wFactor, br, out_q))
        procs.append(p)
        p.start()

    # Read every result before joining the workers.
    resultdict = {}
    for _ in range(N):
        resultdict.update(out_q.get())
    for p in procs:
        p.join()

    amss = np.array([resultdict[x] for x in resultdict if 'ams' in x])
    cuts = np.array([resultdict[x] for x in resultdict if 'cut' in x])
    meanAMS = uncertRound(np.mean(amss), np.std(amss))
    meanCut = uncertRound(np.mean(cuts), np.std(cuts))

    # AMS on the full sample at the unrounded mean cut.
    exactCut = np.mean(cuts)
    passCut = data.pred_class >= exactCut
    ams = AMS(
        wFactor * np.sum(data.loc[passCut & (data.gen_target == 1), 'gen_weight']),
        wFactor * np.sum(data.loc[passCut & (data.gen_target == 0), 'gen_weight']))

    print('\nMean AMS={}+-{}, at mean cut of {}+-{}'.format(
        meanAMS[0], meanAMS[1], meanCut[0], meanCut[1]))
    print('Exact mean cut {}, corresponds to AMS of {}'.format(exactCut, ams))
    return (meanAMS[0], meanCut[0])
def bootstrapSKFoldMeanAMS(data, size=250000., N=10, nFolds=500, br=10):
    """Average the AMS and cut reported by ``N`` ``mpSKFoldAMS`` processes.

    Each process receives the full ``data`` together with its job number,
    ``size``, ``nFolds``, ``br`` and the shared result queue. Values whose key
    contains ``'ams'`` are collected as AMS values, those whose key contains
    ``'cut'`` as cuts. The spreads passed to ``uncertRound`` are the standard
    deviations divided by ``sqrt(N*nFolds)``.

    Arguments:
        data: pandas DataFrame. The columns ``pred_class``, ``gen_target`` and
            ``gen_weight`` are read here.
        size: the summed weights are scaled by ``size/len(data)``; also
            forwarded to ``mpSKFoldAMS``.
        N: number of processes to run.
        nFolds: forwarded to ``mpSKFoldAMS`` and used in the error scaling.
        br: forwarded unchanged to ``mpSKFoldAMS``.

    Returns:
        Tuple ``(meanAMS[0], meanCut[0])``: the first elements returned by
        ``uncertRound`` for the AMS values and for the cuts.
    """
    print("Warning, this method might not be trustworthy: cut decreases with nFolds")

    queue = mp.Queue()
    workers = []
    for job in range(N):
        # NOTE(review): this resample is never handed to the worker (the full
        # `data` is passed instead). The call is kept because it advances
        # NumPy's global random state between process launches; confirm
        # whether the workers rely on that before removing it.
        indeces = np.random.choice(data.index, len(data), replace=True)
        worker = mp.Process(target=mpSKFoldAMS,
                            args=(data, job, size, nFolds, br, queue))
        workers.append(worker)
        worker.start()

    # Read every result before joining the workers.
    collected = {}
    for _ in range(N):
        collected.update(queue.get())
    for worker in workers:
        worker.join()

    amss = np.array([value for key, value in collected.items() if 'ams' in key])
    cuts = np.array([value for key, value in collected.items() if 'cut' in key])

    meanAMS = uncertRound(np.mean(amss), np.std(amss)/np.sqrt(N*nFolds))
    meanCut = uncertRound(np.mean(cuts), np.std(cuts)/np.sqrt(N*nFolds))

    # AMS on the full sample at the unrounded mean cut.
    scale = size/len(data)
    signalRows = (data.pred_class >= np.mean(cuts)) & (data.gen_target == 1)
    backgroundRows = (data.pred_class >= np.mean(cuts)) & (data.gen_target == 0)
    signalWeight = scale*np.sum(data.loc[signalRows, 'gen_weight'])
    backgroundWeight = scale*np.sum(data.loc[backgroundRows, 'gen_weight'])
    ams = AMS(signalWeight, backgroundWeight)

    summary = '\nMean AMS={}+-{}, at mean cut of {}+-{}'
    print(summary.format(meanAMS[0], meanAMS[1], meanCut[0], meanCut[1]))
    print('Exact mean cut {}, corresponds to AMS of {}'.format(np.mean(cuts), ams))
    return (meanAMS[0], meanCut[0])
def rocPlot(inData=None, curves=None, predName='pred_class', targetName='gen_target', weightName=None,
            labels=None, aucs=None, bootstrap=False, log=False, baseline=True, params=[{}]):
    """Plot ROC curves, either computed from data or supplied ready-made.

    Exactly one of ``inData`` and ``curves`` must be given; otherwise a
    message is printed and ``-1`` is returned.

    Arguments:
        inData: sequence of datasets, indexed positionally; each is indexed by
            column name and must expose ``.index`` (presumably pandas
            DataFrames). Curves and AUC scores are computed from them.
        curves: sequence of ready-made curves, each unpacked into
            ``plt.plot(*curve)``. No scores are computed in this mode.
        predName: name of the prediction column in each dataset.
        targetName: name of the target column in each dataset.
        weightName: optional name of a weight column; when given, the weights
            are passed as ``sample_weight`` (or as ``'weights'`` when
            bootstrapping).
        labels: one label per curve, used as dictionary key and legend text.
        aucs: not read; it is overwritten when ``bootstrap`` is true.
        bootstrap: if true, scores come from ``mpRun(aucArgs, rocauc)`` and
            their mean and standard deviation are reported; otherwise a single
            ``roc_auc_score`` is computed per dataset.
        log: if true, the x axis uses a log scale with non-positive values
            clipped.
        baseline: if true, draw the diagonal 'No discrimination' line.
        params: list of keyword dicts forwarded to ``plt.plot``, one per
            curve. A list shorter than the number of curves is padded with
            empty dicts.

    Returns:
        ``-1`` when the inputs are invalid, otherwise ``None`` after showing
        the figure.
    """
    if (inData is None) == (curves is None):
        print("Must pass either targets and preds, or curves")
        return -1

    buildCurves = curves is None
    if buildCurves:
        curves = {}
        if bootstrap:
            aucArgs = []
            for i in range(len(inData)):
                aucArgs.append({
                    'labels': inData[i][targetName],
                    'preds': inData[i][predName],
                    'name': labels[i],
                    'indeces': inData[i].index.tolist()
                })
                if weightName is not None:
                    aucArgs[-1]['weights'] = inData[i][weightName]
            aucs = mpRun(aucArgs, rocauc)
            meanScores = {}
            for i in labels:
                meanScores[i] = (np.mean(aucs[i]), np.std(aucs[i]))
                print(str(i) + ' ROC AUC, Mean = {} +- {}'.format(
                    meanScores[i][0], meanScores[i][1]))
        else:
            meanScores = {}
            for i in range(len(inData)):
                if weightName is None:
                    meanScores[labels[i]] = roc_auc_score(
                        inData[i][targetName].values, inData[i][predName])
                else:
                    meanScores[labels[i]] = roc_auc_score(
                        inData[i][targetName].values, inData[i][predName],
                        sample_weight=inData[i][weightName])
                # NOTE(review): this prints the positional index, whereas the
                # bootstrap branch prints the label.
                print(str(i) + ' ROC AUC: {}'.format(meanScores[labels[i]]))

        for i in range(len(inData)):
            # Only the first two arrays returned by roc_curve are kept.
            if weightName is None:
                curves[labels[i]] = roc_curve(inData[i][targetName].values,
                                              inData[i][predName].values)[:2]
            else:
                curves[labels[i]] = roc_curve(
                    inData[i][targetName].values, inData[i][predName].values,
                    sample_weight=inData[i][weightName].values)[:2]

    # The default params holds a single dict; pad so that every curve has a
    # style entry instead of raising IndexError from the second curve on.
    nCurves = len(curves)
    plotParams = list(params) + [{}] * max(0, nCurves - len(params))

    plt.figure(figsize=[8, 8])
    for i in range(nCurves):
        if buildCurves:
            if bootstrap:
                meanScore = uncertRound(*meanScores[labels[i]])
                plt.plot(*curves[labels[i]],
                         label=labels[i] + r', AUC$={}\pm{}$'.format(meanScore[0], meanScore[1]),
                         **plotParams[i])
            else:
                plt.plot(*curves[labels[i]],
                         label=labels[i] + r', AUC$={:.5f}$'.format(meanScores[labels[i]]),
                         **plotParams[i])
        else:
            plt.plot(*curves[i], label=labels[i], **plotParams[i])

    if baseline:
        plt.plot([0, 1], [0, 1], 'k--', label='No discrimination')
    plt.xlabel('Background acceptance', fontsize=24, color='black')
    plt.ylabel('Signal acceptance', fontsize=24, color='black')
    if len(labels):
        plt.legend(loc='best', fontsize=16)
    if log:
        try:
            plt.xscale('log', nonposx='clip')
        except (TypeError, ValueError):
            # Newer Matplotlib releases renamed the keyword to `nonpositive`.
            plt.xscale('log', nonpositive='clip')
        plt.grid(True, which="both")
    plt.xticks(fontsize=16, color='black')
    plt.yticks(fontsize=16, color='black')
    plt.show()
def batchTrainClassifier(batchYielder, nSplits, modelGen, modelGenParams, trainParams,
                         cosAnnealMult=0, reverseAnneal=False, plotLR=False, reduxDecay=False,
                         annealMomentum=False, reverseAnnealMomentum=False, plotMomentum=False,
                         oneCycle=False, ratio=0.25, reverse=False, lrScale=10, momScale=0.1,
                         plotOneCycle=False, scale=30, mode='sgd',
                         swaStart=-1, swaRenewal=-1, sgdReplacement=False,
                         trainOnWeights=True,
                         saveLoc='train_weights/', patience=10, maxEpochs=10000,
                         verbose=False, logoutput=False, amsSize=0, plot=True,
                         binary=None, stopIfStallingTest=-1):
    """Train one classifier per fold, with early stopping on the held-out fold.

    For each of ``nSplits`` folds a fresh model is built and trained one
    training fold at a time (a "sub-epoch"). After every sub-epoch the loss on
    the held-out fold is evaluated; the best weights are written to
    ``saveLoc + 'best.h5'`` and reloaded once training for the fold stops.

    Arguments:
        batchYielder: a ``BatchYielder``; anything else is wrapped in
            ``BatchYielder(batchYielder)``.
        nSplits: number of folds; fold indices come from
            ``getFolds(fold, nSplits)``.
        modelGen, modelGenParams: the model is built as
            ``modelGen(**modelGenParams)``.
        trainParams: keyword arguments forwarded to ``model.fit``;
            ``trainParams['batch_size']`` is also read directly.
        cosAnnealMult, reverseAnneal: if ``cosAnnealMult`` is truthy a
            ``CosAnneal`` callback is created with these values, and the
            patience counter only advances when ``cosAnneal.cycle_end`` is
            true.
        plotLR: call ``cosAnneal.plot_lr()`` after each fold (only when a
            ``CosAnneal`` callback exists).
        reduxDecay: with cosine annealing, on running out of patience the best
            weights are reloaded, the learning rate is set to the best recorded
            one, all callbacks are dropped, and the learning rate is then
            multiplied by 0.8 on every non-improving sub-epoch (patience 10).
        annealMomentum, reverseAnnealMomentum: create a ``CosAnnealMomentum``
            callback.
        plotMomentum, plotOneCycle: currently unused.
        oneCycle, ratio, reverse, lrScale, momScale, scale, mode: if
            ``oneCycle`` is truthy a ``OneCycle`` callback is created with the
            other values as keyword arguments.
        swaStart, swaRenewal, sgdReplacement: if ``swaStart >= 0`` an ``SWA``
            callback is created. While ``swa.active`` is true its losses are
            used, and the lower of the SWA and base losses decides which
            weights are saved.
        trainOnWeights: pass ``sample_weight`` to fit/evaluate; otherwise
            ``class_weight='auto'`` is passed to fit.
        saveLoc: path prefix for all output files. It is created if missing,
            and matching ``*.h5``, ``*.json``, ``*.pkl``, ``*.png`` and
            ``*.log`` files are deleted first.
        patience: number of non-improving checks before early stopping.
        maxEpochs: maximum number of passes over the training folds.
        verbose: print progress dots and best-loss messages.
        logoutput: redirect ``sys.stdout`` to
            ``saveLoc + 'training_log.log'`` for the duration of the call.
        amsSize: if truthy (and in binary mode) ``amsScanQuick`` is run on the
            held-out fold with ``wFactor=amsSize/len(prediction)``.
        plot: if callable, called with the fold's loss history after each
            fold; non-callable values are ignored.
        binary: classification mode; if ``None`` it is set from the number of
            unique targets in the first training batch.
        stopIfStallingTest: if positive, stop when the relative improvement of
            the validation loss over that many entries is below 1e-7.

    Returns:
        Tuple ``(results, histories)``: one result dict and one loss-history
        dict per fold.
    """
    import glob
    import time

    # Prepare the output location without shelling out, so that paths with
    # spaces or shell metacharacters behave and the code is portable.
    if saveLoc and not os.path.isdir(saveLoc):
        os.makedirs(saveLoc)
    for ext in ('h5', 'json', 'pkl', 'png', 'log'):
        for stale in glob.glob(saveLoc + '*.' + ext):
            if os.path.isfile(stale):
                os.remove(stale)

    old_stdout = sys.stdout
    log_file = None
    if logoutput:
        log_file = open(saveLoc + 'training_log.log', 'w')
        sys.stdout = log_file

    try:
        start = timeit.default_timer()
        results = []
        histories = []
        bestPath = saveLoc + "best.h5"

        if not isinstance(batchYielder, BatchYielder):
            print("HDF5 as input is depreciated, converting to BatchYielder")
            batchYielder = BatchYielder(batchYielder)

        if cosAnnealMult:
            print("Using cosine annealing")
        if trainOnWeights:
            print("Training using weights")

        for fold in range(nSplits):
            foldStart = timeit.default_timer()
            print("Running fold", fold+1, "/", nSplits)
            if os.path.isfile(bestPath):
                os.remove(bestPath)
            best = 1000000
            bestLR = 1000000
            reduxDecayActive = False
            tmpPatience = patience
            epochCounter = 0
            subEpoch = 0
            stop = False
            lossHistory = {
                'val_loss': [],
                'swa_val_loss': [],
                'val_train_loss': [],
                'AUC': [],
                'wAUC': [],
                'ACC': [],
                'wACC': [],
            }
            trainID, testID = getFolds(fold, nSplits)  # Fold indices for training and testing

            model = modelGen(**modelGenParams)
            # NOTE(review): attribute access only, the method is never called.
            model.reset_states

            testbatch = batchYielder.getBatch(testID)  # Load testing fold

            callbacks = []
            if cosAnnealMult:
                cosAnneal = CosAnneal(
                    math.ceil(len(batchYielder.source['fold_0/targets'])/trainParams['batch_size']),
                    cosAnnealMult, reverseAnneal)
                callbacks.append(cosAnneal)

            if annealMomentum:
                cosAnnealMomentum = CosAnnealMomentum(
                    math.ceil(len(batchYielder.source['fold_0/targets'])/trainParams['batch_size']),
                    cosAnnealMult, reverseAnnealMomentum)
                callbacks.append(cosAnnealMomentum)

            if oneCycle:
                # Kept under its own name so the `oneCycle` flag is not
                # overwritten by the callback instance.
                oneCycleCallback = OneCycle(
                    math.ceil(len(batchYielder.source['fold_0/targets'])/trainParams['batch_size']),
                    ratio=ratio, reverse=reverse, lrScale=lrScale, momScale=momScale,
                    scale=scale, mode=mode)
                callbacks.append(oneCycleCallback)

            if swaStart >= 0:
                if cosAnnealMult:
                    swa = SWA(swaStart, testbatch, modelGen(**modelGenParams), verbose, swaRenewal,
                              cosAnneal, trainOnWeights=trainOnWeights, sgdReplacement=sgdReplacement)
                else:
                    swa = SWA(swaStart, testbatch, modelGen(**modelGenParams), verbose, swaRenewal,
                              trainOnWeights=trainOnWeights, sgdReplacement=sgdReplacement)
                callbacks.append(swa)
            useSWA = False

            def _store():
                # Save whichever model produced the loss that was just accepted.
                if swaStart >= 0 and swa.active and useSWA:
                    swa.test_model.save_weights(bestPath)
                else:
                    model.save_weights(bestPath)

            for epoch in range(maxEpochs):
                for n in trainID:  # Loop through training folds
                    trainbatch = batchYielder.getBatch(n)  # Load fold data
                    subEpoch += 1
                    if verbose:
                        print('.', sep='', end='')

                    if binary is None:  # First run, check classification mode
                        binary = True
                        nClasses = len(np.unique(trainbatch['targets']))
                        if nClasses > 2:
                            print(nClasses, "classes found, running in multiclass mode\n")
                            trainbatch['targets'] = utils.to_categorical(trainbatch['targets'],
                                                                         num_classes=nClasses)
                            binary = False
                        else:
                            print(nClasses, "classes found, running in binary mode\n")

                    # Train on this training fold
                    if trainOnWeights:
                        train_history = model.fit(trainbatch['inputs'], trainbatch['targets'],
                                                  sample_weight=trainbatch['weights'],
                                                  callbacks=callbacks, **trainParams)
                    else:
                        train_history = model.fit(trainbatch['inputs'], trainbatch['targets'],
                                                  class_weight='auto',
                                                  callbacks=callbacks, **trainParams)

                    swaActive = swaStart >= 0 and swa.active
                    if swaActive:
                        losses = swa.get_losses()
                        print('{} swa loss {}, default loss {}'.format(subEpoch, losses['swa'], losses['base']))
                        if losses['swa'] < losses['base']:
                            loss = losses['swa']
                            useSWA = True
                        else:
                            loss = losses['base']
                            useSWA = False
                    elif trainOnWeights:
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'],
                                              sample_weight=testbatch['weights'], verbose=0,
                                              batch_size=trainParams['batch_size'])
                    else:
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'], verbose=0,
                                              batch_size=trainParams['batch_size'])

                    if swaActive and cosAnnealMult > 1:
                        print('{} SWA loss: {}'.format(subEpoch, loss))

                    if swaActive:
                        lossHistory['swa_val_loss'].append(losses['swa'])
                        lossHistory['val_loss'].append(losses['base'])
                    elif swaStart >= 0:
                        lossHistory['swa_val_loss'].append(loss)
                        lossHistory['val_loss'].append(loss)
                    else:
                        lossHistory['val_loss'].append(loss)
                    lossHistory['val_train_loss'].append(train_history.history['loss'])

                    if binary:
                        testbatch = batchYielder.getBatch(testID)  # Load testing fold
                        prediction = model.predict(testbatch['inputs'], verbose=0)
                        prediction4acc = (prediction > 0.5)*1
                        targets = testbatch.get('orig_targets', testbatch['targets'])
                        # NOTE(review): these assignments replace the lists
                        # created above with the latest scalar rather than
                        # appending; confirm which is intended.
                        if testbatch['weights'] is not None:
                            lossHistory['wAUC'] = 1-roc_auc_score(targets, prediction,
                                                                  sample_weight=testbatch['weights'])
                            lossHistory['wACC'] = accuracy_score(targets, prediction4acc,
                                                                 sample_weight=testbatch['weights'])
                        lossHistory['AUC'] = 1-roc_auc_score(targets, prediction)
                        lossHistory['ACC'] = accuracy_score(targets, prediction4acc)

                    _lh = lossHistory['val_loss']
                    if (stopIfStallingTest > 0
                            and len(_lh) > stopIfStallingTest+1
                            and (_lh[-stopIfStallingTest] - _lh[-1])/_lh[-1] < 1e-7):
                        print('Learning process stalled at %s. Stopping...' % _lh[-1])
                        # Takes effect once the current pass over the training
                        # folds has finished.
                        stop = True

                    if loss < best:  # Save best
                        best = loss
                        if cosAnnealMult:
                            if cosAnneal.lrs[-1] > 0:
                                bestLR = cosAnneal.lrs[-1]
                            else:
                                bestLR = cosAnneal.lrs[-2]
                        epochCounter = 0
                        try:
                            _store()
                        except RuntimeError:
                            # Sleep a little and try once more
                            print("RuntimeError while saving. Trying again.")
                            time.sleep(0.5)
                            try:
                                _store()
                            except RuntimeError:
                                print("RuntimeError while saving again!!! Maybe next time then.")
                        if reduxDecayActive:
                            cosAnneal.lrs.append(float(K.get_value(model.optimizer.lr)))
                        if verbose:
                            print('\n{} New best found: {}'.format(subEpoch, best))
                    elif cosAnnealMult and not reduxDecayActive:
                        if cosAnneal.cycle_end:
                            epochCounter += 1
                    else:
                        epochCounter += 1
                        if reduxDecayActive:
                            lr = 0.8*float(K.get_value(model.optimizer.lr))
                            cosAnneal.lrs.append(lr)
                            K.set_value(model.optimizer.lr, lr)

                    if epochCounter >= tmpPatience:  # Early stopping
                        if cosAnnealMult and reduxDecay and not reduxDecayActive:
                            print('CosineAnneal stalling after {} epochs, entering redux decay at LR={}'.format(subEpoch, bestLR))
                            model.load_weights(bestPath)
                            cosAnneal.lrs.append(bestLR)
                            K.set_value(model.optimizer.lr, bestLR)
                            tmpPatience = 10
                            epochCounter = 0
                            callbacks = []
                            reduxDecayActive = True
                        else:
                            if verbose:
                                print('Early stopping after {} epochs'.format(subEpoch))
                            stop = True
                            break

                if stop:
                    break

            model.load_weights(bestPath)

            histories.append(lossHistory.copy())
            results.append({})
            results[-1]['loss'] = best
            if binary:
                testbatch = batchYielder.getBatch(testID)  # Load testing fold
                prediction = model.predict(testbatch['inputs'], verbose=0)
                targets = testbatch.get('orig_targets', testbatch['targets'])
                if testbatch['weights'] is not None:
                    results[-1]['wAUC'] = 1-roc_auc_score(targets, prediction,
                                                          sample_weight=testbatch['weights'])
                results[-1]['AUC'] = 1-roc_auc_score(targets, prediction)

                if amsSize:
                    results[-1]['AMS'], results[-1]['cut'] = amsScanQuick(
                        batchYielder.getBatchDF(testID, preds=prediction, weightName='orig_weights'),
                        wFactor=amsSize/len(prediction))
            print("Score is:", results[-1])

            # cosAnneal only exists when cosine annealing was requested.
            if plotLR and cosAnnealMult:
                cosAnneal.plot_lr()
            if callable(plot):
                plot(lossHistory)

            print("Fold took {:.3f}s\n".format(timeit.default_timer() - foldStart))

            model.save(saveLoc + 'train_' + str(fold) + '.h5')
            with open(saveLoc + 'resultsFile.pkl', 'wb') as fout:  # Save results
                pickle.dump(results, fout)

        print("\n______________________________________")
        print("Training finished")
        print("Cross-validation took {:.3f}s ".format(timeit.default_timer() - start))
        if results:
            for score in results[0]:
                scores = [x[score] for x in results]
                mean = uncertRound(np.mean(scores), np.std(scores)/np.sqrt(len(results)))
                print("Mean", score, "= {} +- {}".format(mean[0], mean[1]))
        print("______________________________________\n")
    finally:
        # Always give stdout back and close the log, even if training failed.
        if log_file is not None:
            sys.stdout = old_stdout
            log_file.close()

    return results, histories
def batchTrainRegressor(data, nSplits, modelGen, modelGenParams, trainParams,
                        cosAnnealMult=0, trainOnWeights=True, getBatch=getBatch,
                        extraMetrics=None, monitorData=None,
                        saveLoc='train_weights/', patience=10, maxEpochs=10000,
                        verbose=False, logoutput=False):
    """Train one regressor per fold, with early stopping on a validation loss.

    For each of ``nSplits`` folds a fresh model is built and trained one
    training fold at a time (a "sub-epoch"). After every sub-epoch the loss on
    the held-out fold is evaluated. The best weights, judged on the monitor
    sample if one is given and on the held-out fold otherwise, are written to
    ``saveLoc + 'best.h5'`` and reloaded once training for the fold stops.

    Arguments:
        data: fold container passed to ``getBatch``; ``data['fold_0/targets']``
            is read when cosine annealing is used.
        nSplits: number of folds; fold indices come from
            ``getFolds(fold, nSplits)``.
        modelGen, modelGenParams: the model is built as
            ``modelGen(**modelGenParams)``.
        trainParams: keyword arguments forwarded to ``model.fit``;
            ``trainParams['batch_size']`` is read when cosine annealing is
            used.
        cosAnnealMult: if truthy a ``CosAnneal`` callback is created, and the
            patience counter only advances when ``cosAnneal.cycle_end`` is
            true.
        trainOnWeights: pass ``sample_weight`` to fit and evaluate.
        getBatch: callable ``getBatch(foldIndex, data)`` returning a mapping
            with ``'inputs'``, ``'targets'`` and ``'weights'``.
        extraMetrics: optional callable
            ``extraMetrics(predictions, targets, weights)`` returning a
            mapping that is merged into the fold's result.
        monitorData: optional mapping with ``'inputs'`` and ``'targets'``;
            when given, its loss decides the best model instead of the
            held-out fold's.
        saveLoc: path prefix for all output files. It is created if missing,
            and matching ``*.h5``, ``*.json``, ``*.pkl``, ``*.png`` and
            ``*.log`` files are deleted first.
        patience: number of non-improving checks before early stopping.
        maxEpochs: maximum number of passes over the training folds.
        verbose: print best-loss and early-stopping messages.
        logoutput: redirect ``sys.stdout`` to
            ``saveLoc + 'training_log.log'`` for the duration of the call.

    Returns:
        Tuple ``(results, histories)``: one result dict and one history dict
        (``'val_loss'`` and ``'mon_loss'``) per fold.
    """
    import glob

    # Prepare the output location without shelling out, so that paths with
    # spaces or shell metacharacters behave and the code is portable.
    if saveLoc and not os.path.isdir(saveLoc):
        os.makedirs(saveLoc)
    for ext in ('h5', 'json', 'pkl', 'png', 'log'):
        for stale in glob.glob(saveLoc + '*.' + ext):
            if os.path.isfile(stale):
                os.remove(stale)

    old_stdout = sys.stdout
    log_file = None
    if logoutput:
        log_file = open(saveLoc + 'training_log.log', 'w')
        sys.stdout = log_file

    try:
        start = timeit.default_timer()
        results = []
        histories = []
        bestPath = saveLoc + "best.h5"

        if cosAnnealMult:
            print("Using cosine annealing")

        monitor = monitorData is not None
        if monitor:
            monitorInputs = monitorData['inputs']
            monitorTargets = monitorData['targets']
            print("Using a monitor sample to judge convergence")

        for fold in range(nSplits):
            foldStart = timeit.default_timer()
            print("Running fold", fold+1, "/", nSplits)
            if os.path.isfile(bestPath):
                os.remove(bestPath)
            # Start from +inf so that the first evaluated loss is always
            # accepted. The previous start value of -1 could never be beaten
            # by a non-negative loss, so no weights were ever saved.
            best = float('inf')
            epochCounter = 0
            subEpoch = 0
            stop = False
            lossHistory = []
            monitorHistory = []
            trainID, testID = getFolds(fold, nSplits)  # Fold indices for training and testing
            testbatch = getBatch(testID, data)  # Load testing fold

            model = modelGen(**modelGenParams)
            # NOTE(review): attribute access only, the method is never called.
            model.reset_states

            callbacks = []
            if cosAnnealMult:
                cosAnneal = CosAnneal(
                    math.ceil(len(data['fold_0/targets'])/trainParams['batch_size']),
                    cosAnnealMult)
                callbacks.append(cosAnneal)

            for epoch in range(maxEpochs):
                for n in trainID:  # Loop through training folds
                    trainbatch = getBatch(n, data)  # Load fold data
                    subEpoch += 1

                    # Train on this training fold, then score the held-out fold
                    if trainOnWeights:
                        model.fit(trainbatch['inputs'], trainbatch['targets'],
                                  sample_weight=trainbatch['weights'],
                                  callbacks=callbacks, **trainParams)
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'],
                                              sample_weight=testbatch['weights'], verbose=0)
                    else:
                        model.fit(trainbatch['inputs'], trainbatch['targets'],
                                  callbacks=callbacks, **trainParams)
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'], verbose=0)

                    lossHistory.append(loss)

                    monLoss = loss
                    if monitor:
                        monLoss = model.evaluate(monitorInputs, monitorTargets, verbose=0)
                        monitorHistory.append(monLoss)

                    if monLoss < best:  # Save best
                        best = monLoss
                        epochCounter = 0
                        model.save_weights(bestPath)
                        if verbose:
                            print('{} New best found: {}'.format(subEpoch, best))
                    elif cosAnnealMult:
                        if cosAnneal.cycle_end:
                            epochCounter += 1
                    else:
                        epochCounter += 1

                    if epochCounter >= patience:  # Early stopping
                        if verbose:
                            print('Early stopping after {} epochs'.format(subEpoch))
                        stop = True
                        break

                if stop:
                    break

            model.load_weights(bestPath)

            histories.append({})
            histories[-1]['val_loss'] = lossHistory
            histories[-1]['mon_loss'] = monitorHistory

            results.append({})
            results[-1]['loss'] = best
            if extraMetrics is not None:
                metrics = extraMetrics(model.predict(testbatch['inputs'], verbose=0),
                                       testbatch['targets'], testbatch['weights'])
                for metric in metrics:
                    results[-1][metric] = metrics[metric]
            print("Score is:", results[-1])

            print("Fold took {:.3f}s\n".format(timeit.default_timer() - foldStart))

            model.save(saveLoc + 'train_' + str(fold) + '.h5')
            with open(saveLoc + 'resultsFile.pkl', 'wb') as fout:  # Save results
                pickle.dump(results, fout)

        print("\n______________________________________")
        print("Training finished")
        print("Cross-validation took {:.3f}s ".format(timeit.default_timer() - start))
        plotTrainingHistory(histories, save=saveLoc + 'loss_history.png')
        if results:
            for score in results[0]:
                scores = [x[score] for x in results]
                mean = uncertRound(np.mean(scores), np.std(scores)/np.sqrt(len(results)))
                print("Mean", score, "= {} +- {}".format(mean[0], mean[1]))
        print("______________________________________\n")
    finally:
        # Always give stdout back and close the log, even if training failed.
        if log_file is not None:
            sys.stdout = old_stdout
            log_file.close()

    return results, histories