Ejemplo n.º 1
0
def bootstrapMeanAMS(data, wFactor=250000. / 50000., N=512, br=10):
    """Estimate the mean AMS and mean cut from N bootstrap resamples of data.

    Every resample has the same length as data and is drawn with replacement.
    Each one is passed to ``mpAMS`` in its own process together with the
    resample number, wFactor, br and a shared queue. Each worker is expected
    to put one dict on the queue; values whose key contains 'ams' are treated
    as AMS scores and values whose key contains 'cut' as cuts.

    Arguments:
        data: DataFrame with columns 'pred_class', 'gen_target' and
            'gen_weight'
        wFactor: factor applied to the summed weights before computing AMS
        N: number of bootstrap resamples (one process is started per resample)
        br: forwarded unchanged to mpAMS

    Returns:
        Tuple of (rounded mean AMS, rounded mean cut) as given by uncertRound
    """
    procs = []
    out_q = mp.Queue()
    nRows = len(data)
    for i in range(N):
        # Sample row *positions*: the rows are selected with .iloc, so drawing
        # labels from data.index would pick wrong rows (or fail) whenever the
        # index is not the default 0..n-1 range.
        positions = np.random.choice(nRows, nRows, replace=True)
        p = mp.Process(target=mpAMS,
                       args=(data.iloc[positions], i, wFactor, br, out_q))
        procs.append(p)
        p.start()

    # Collect one result per worker before joining the processes.
    resultdict = {}
    for _ in range(N):
        resultdict.update(out_q.get())
    for p in procs:
        p.join()

    amss = np.array([resultdict[x] for x in resultdict if 'ams' in x])
    cuts = np.array([resultdict[x] for x in resultdict if 'cut' in x])

    exactMeanCut = np.mean(cuts)
    meanAMS = uncertRound(np.mean(amss), np.std(amss))
    meanCut = uncertRound(exactMeanCut, np.std(cuts))

    # AMS of the full (non-resampled) data at the unrounded mean cut
    passed = data.pred_class >= exactMeanCut
    ams = AMS(
        wFactor * np.sum(data.loc[passed & (data.gen_target == 1), 'gen_weight']),
        wFactor * np.sum(data.loc[passed & (data.gen_target == 0), 'gen_weight']))

    print('\nMean AMS={}+-{}, at mean cut of {}+-{}'.format(
        meanAMS[0], meanAMS[1], meanCut[0], meanCut[1]))
    print('Exact mean cut {}, corresponds to AMS of {}'.format(
        exactMeanCut, ams))
    return (meanAMS[0], meanCut[0])
Ejemplo n.º 2
0
def bootstrapSKFoldMeanAMS(data, size=250000., N=10, nFolds=500, br=10):
    """Run mpSKFoldAMS in N parallel processes and summarise the AMS and cut.

    Each worker receives the full data, its job number, size, nFolds, br and
    a shared queue, and is expected to put one dict on that queue. Values
    whose key contains 'ams' are treated as AMS scores, values whose key
    contains 'cut' as cuts. The spreads passed to uncertRound are the
    standard deviations divided by sqrt(N*nFolds).

    Arguments:
        data: DataFrame with columns 'pred_class', 'gen_target' and
            'gen_weight'
        size: target sample size; weights are scaled by size/len(data)
        N: number of worker processes
        nFolds: forwarded to mpSKFoldAMS and used in the uncertainty scaling
        br: forwarded unchanged to mpSKFoldAMS

    Returns:
        Tuple of (rounded mean AMS, rounded mean cut) as given by uncertRound
    """
    print("Warning, this method might not be trustworthy: cut decreases with nFolds")
    queue = mp.Queue()
    workers = []
    for jobID in range(N):
        # NOTE(review): the drawn indices are never used - every worker gets
        # the full data. The draw is kept because it advances NumPy's global
        # random state before each worker is started; whether the workers
        # rely on that is not visible from here.
        np.random.choice(data.index, len(data), replace=True)
        worker = mp.Process(target=mpSKFoldAMS,
                            args=(data, jobID, size, nFolds, br, queue))
        workers.append(worker)
        worker.start()

    # One result dict per worker is read before the processes are joined.
    collected = {}
    for _ in range(N):
        collected.update(queue.get())
    for worker in workers:
        worker.join()

    amss = np.array([value for key, value in collected.items() if 'ams' in key])
    cuts = np.array([value for key, value in collected.items() if 'cut' in key])

    norm = np.sqrt(N*nFolds)
    meanAMS = uncertRound(np.mean(amss), np.std(amss)/norm)
    meanCut = uncertRound(np.mean(cuts), np.std(cuts)/norm)

    # AMS of the full data at the unrounded mean cut
    scale = size/len(data)
    exactCut = np.mean(cuts)
    passed = data.pred_class >= exactCut
    sigWeight = np.sum(data.loc[passed & (data.gen_target == 1), 'gen_weight'])
    bkgWeight = np.sum(data.loc[passed & (data.gen_target == 0), 'gen_weight'])
    ams = AMS(scale*sigWeight, scale*bkgWeight)

    print('\nMean AMS={}+-{}, at mean cut of {}+-{}'.format(meanAMS[0], meanAMS[1], meanCut[0], meanCut[1]))
    print('Exact mean cut {}, corresponds to AMS of {}'.format(exactCut, ams))
    return (meanAMS[0], meanCut[0])
Ejemplo n.º 3
0
def rocPlot(inData=None,
            curves=None,
            predName='pred_class',
            targetName='gen_target',
            weightName=None,
            labels=None,
            aucs=None,
            bootstrap=False,
            log=False,
            baseline=True,
            params=None):
    """Plot one or more ROC curves, built from data or supplied pre-computed.

    Exactly one of inData and curves must be given.

    Arguments:
        inData: integer-indexable collection of DataFrame-like samples; a ROC
            curve (and its AUC) is computed for each one
        curves: pre-computed curves, indexed by position; each entry is
            unpacked into plt.plot
        predName: column holding the predictions
        targetName: column holding the true targets
        weightName: optional column of sample weights
        labels: one legend label per sample/curve. Required when curves are
            built from inData (labels are used as keys); optional for
            pre-computed curves
        aucs: ignored on input; overwritten with the mpRun result when
            bootstrap is set
        bootstrap: if set, AUC mean and spread are taken from
            mpRun(aucArgs, rocauc) rather than a single roc_auc_score call
        log: use a logarithmic x axis
        baseline: draw the diagonal 'No discrimination' line
        params: list of keyword dicts for plt.plot, one per curve; missing
            entries default to {}

    Returns:
        -1 if the inData/curves arguments are invalid, otherwise None
    """
    if (inData is None) == (curves is None):
        print("Must pass either targets and preds, or curves")
        return -1
    buildCurves = curves is None
    weighted = weightName is not None

    if buildCurves:
        curves = {}
        meanScores = {}
        if bootstrap:
            aucArgs = []
            for i in range(len(inData)):
                sample = inData[i]
                args = {
                    'labels': sample[targetName],
                    'preds': sample[predName],
                    'name': labels[i],
                    'indeces': sample.index.tolist()
                }
                if weighted:
                    args['weights'] = sample[weightName]
                aucArgs.append(args)
            aucs = mpRun(aucArgs, rocauc)
            for label in labels:
                meanScores[label] = (np.mean(aucs[label]), np.std(aucs[label]))
                print(
                    str(label) + ' ROC AUC, Mean = {} +- {}'.format(
                        meanScores[label][0], meanScores[label][1]))
        else:
            for i in range(len(inData)):
                sample = inData[i]
                scoreArgs = {'sample_weight': sample[weightName]} if weighted else {}
                meanScores[labels[i]] = roc_auc_score(
                    sample[targetName].values, sample[predName], **scoreArgs)
                # Report under the curve's label, matching the bootstrap
                # branch (the loop index was printed here before).
                print(str(labels[i]) + ' ROC AUC: {}'.format(meanScores[labels[i]]))
        for i in range(len(inData)):
            sample = inData[i]
            curveArgs = {'sample_weight': sample[weightName].values} if weighted else {}
            # Keep only the first two arrays returned by roc_curve
            curves[labels[i]] = roc_curve(sample[targetName].values,
                                          sample[predName].values,
                                          **curveArgs)[:2]

    # One plotting-kwargs dict per curve; pad with empty dicts so that the
    # default (or a short list) does not fail for several curves.
    plotParams = [{}] if params is None else list(params)
    plotParams.extend({} for _ in range(len(curves) - len(plotParams)))

    plt.figure(figsize=[8, 8])
    for i in range(len(curves)):
        if not buildCurves:
            curveLabel = labels[i] if labels is not None else None
            plt.plot(*curves[i], label=curveLabel, **plotParams[i])
            continue
        name = labels[i]
        if bootstrap:
            meanScore = uncertRound(*meanScores[name])
            legendText = name + r', AUC$={}\pm{}$'.format(meanScore[0], meanScore[1])
        else:
            legendText = name + r', AUC$={:.5f}$'.format(meanScores[name])
        plt.plot(*curves[name], label=legendText, **plotParams[i])

    if baseline:
        plt.plot([0, 1], [0, 1], 'k--', label='No discrimination')
    plt.xlabel('Background acceptance', fontsize=24, color='black')
    plt.ylabel('Signal acceptance', fontsize=24, color='black')
    if labels is not None and len(labels):
        plt.legend(loc='best', fontsize=16)
    if log:
        try:
            plt.xscale('log', nonpositive='clip')
        except (TypeError, ValueError):
            # Older Matplotlib only knows the previous keyword name.
            # NOTE(review): the exception types raised for an unknown scale
            # keyword should be confirmed for the oldest supported version.
            plt.xscale('log', nonposx='clip')
        plt.grid(True, which="both")
    plt.xticks(fontsize=16, color='black')
    plt.yticks(fontsize=16, color='black')
    plt.show()
0
def batchTrainClassifier(batchYielder, nSplits, modelGen, modelGenParams, trainParams,
                         cosAnnealMult=0, reverseAnneal=False, plotLR=False, reduxDecay=False,
                         annealMomentum=False, reverseAnnealMomentum=False, plotMomentum=False,
                         oneCycle=False, ratio=0.25, reverse=False, lrScale=10, momScale=0.1,
                         plotOneCycle=False, scale=30, mode='sgd',
                         swaStart=-1, swaRenewal=-1, sgdReplacement=False,
                         trainOnWeights=True,
                         saveLoc='train_weights/', patience=10, maxEpochs=10000,
                         verbose=False, logoutput=False, amsSize=0, plot=True, binary=None,
                         stopIfStallingTest=-1):
    """Train one classifier per fold, with early stopping on the test-fold loss.

    For every fold a new model is built with modelGen(**modelGenParams) and
    fitted on the training folds one at a time. After each training fold (a
    "sub-epoch") the loss on the test fold is computed; the best weights are
    written to saveLoc + 'best.h5' and reloaded once training of the fold
    stops.

    Side effects: creates saveLoc, deletes existing *.h5, *.json, *.pkl,
    *.png and *.log files there, and writes best.h5, train_<fold>.h5 and
    resultsFile.pkl (plus training_log.log when logoutput is set).

    Arguments:
        batchYielder: BatchYielder instance; anything else is wrapped in
            BatchYielder(...)
        nSplits: number of folds; getFolds(fold, nSplits) supplies the
            training-fold IDs and the test-fold ID
        modelGen, modelGenParams: model factory and its keyword arguments
        trainParams: keyword arguments forwarded to model.fit; must contain
            'batch_size'
        cosAnnealMult: if non-zero, adds a CosAnneal callback built with this
            value
        reverseAnneal: forwarded to CosAnneal
        plotLR: call cosAnneal.plot_lr() after each fold (only when cosine
            annealing is in use)
        reduxDecay: when cosine annealing stalls, reload the best weights and
            continue at the best learning rate, multiplying it by 0.8 after
            each non-improving sub-epoch, instead of stopping
        annealMomentum, reverseAnnealMomentum: add a CosAnnealMomentum callback
        plotMomentum, plotOneCycle: currently unused
        oneCycle, ratio, reverse, lrScale, momScale, scale, mode: add a
            OneCycle callback configured with these values
        swaStart, swaRenewal, sgdReplacement: if swaStart >= 0, add an SWA
            callback configured with these values
        trainOnWeights: fit/evaluate with per-sample weights; otherwise fit
            with class_weight='auto'
        saveLoc: output location prefix
        patience: number of non-improving steps tolerated before stopping
        maxEpochs: maximum number of passes over the training folds
        verbose: print progress information
        logoutput: redirect stdout to saveLoc + 'training_log.log'
        amsSize: if non-zero, also compute AMS and cut with amsScanQuick
        plot: if callable, called with the fold's loss history
        binary: classification mode; if None it is detected from the number
            of distinct targets in the first training batch
        stopIfStallingTest: if > 0, stop when the relative improvement of the
            test loss over that many sub-epochs is below 1e-7

    Returns:
        (results, histories): one dict of scores and one loss-history dict
        per fold
    """
    import time  # only needed for the pause before retrying a failed save

    os.system("mkdir " + saveLoc)
    os.system("rm " + saveLoc + "*.h5")
    os.system("rm " + saveLoc + "*.json")
    os.system("rm " + saveLoc + "*.pkl")
    os.system("rm " + saveLoc + "*.png")
    os.system("rm " + saveLoc + "*.log")

    old_stdout = sys.stdout
    log_file = None
    if logoutput:
        log_file = open(saveLoc + 'training_log.log', 'w')
        sys.stdout = log_file

    # try/finally so that stdout is restored and the log file closed even if
    # training raises.
    try:
        start = timeit.default_timer()
        results = []
        histories = []

        if not isinstance(batchYielder, BatchYielder):
            print("HDF5 as input is depreciated, converting to BatchYielder")
            batchYielder = BatchYielder(batchYielder)

        if cosAnnealMult: print("Using cosine annealing")
        if trainOnWeights: print("Training using weights")

        for fold in range(nSplits):
            foldStart = timeit.default_timer()
            print("Running fold", fold+1, "/", nSplits)
            os.system("rm " + saveLoc + "best.h5")
            best = 1000000
            bestLR = 1000000
            reduxDecayActive = False
            tmpPatience = patience
            epochCounter = 0
            subEpoch = 0
            stop = False
            lossHistory = {
                'val_loss': [],
                'swa_val_loss': [],
                'val_train_loss': [],
                'AUC': [],
                'wAUC': [],
                'ACC': [],
                'wACC': [],
            }
            trainID, testID = getFolds(fold, nSplits)  # Fold IDs for training and testing

            model = None  # drop the reference to the previous fold's model first
            model = modelGen(**modelGenParams)
            # NOTE(review): attribute access only - reset_states is not called,
            # so this line has no effect. Left as in the original.
            model.reset_states

            testbatch = batchYielder.getBatch(testID)  # Load testing fold

            callbacks = []
            if cosAnnealMult or annealMomentum or oneCycle:
                # Number of batches in fold 0, used as the schedule length
                nBatches = math.ceil(len(batchYielder.source['fold_0/targets'])/trainParams['batch_size'])

            if cosAnnealMult:
                cosAnneal = CosAnneal(nBatches, cosAnnealMult, reverseAnneal)
                callbacks.append(cosAnneal)

            if annealMomentum:
                cosAnnealMomentum = CosAnnealMomentum(nBatches, cosAnnealMult, reverseAnnealMomentum)
                callbacks.append(cosAnnealMomentum)

            if oneCycle:
                # Separate name: rebinding the oneCycle flag would make later
                # folds test the callback object instead of the caller's flag.
                oneCycleCallback = OneCycle(nBatches, ratio=ratio, reverse=reverse, lrScale=lrScale,
                                            momScale=momScale, scale=scale, mode=mode)
                callbacks.append(oneCycleCallback)

            if swaStart >= 0:
                if cosAnnealMult:
                    swa = SWA(swaStart, testbatch, modelGen(**modelGenParams), verbose, swaRenewal, cosAnneal,
                              trainOnWeights=trainOnWeights, sgdReplacement=sgdReplacement)
                else:
                    swa = SWA(swaStart, testbatch, modelGen(**modelGenParams), verbose, swaRenewal,
                              trainOnWeights=trainOnWeights, sgdReplacement=sgdReplacement)
                callbacks.append(swa)

            useSWA = False
            for epoch in range(maxEpochs):
                for n in trainID:  # Loop through training folds
                    trainbatch = batchYielder.getBatch(n)  # Load fold data
                    subEpoch += 1
                    if verbose:
                        print('.', sep='', end='')

                    if binary is None:  # First run, check classification mode
                        binary = True
                        nClasses = len(np.unique(trainbatch['targets']))
                        if nClasses > 2:
                            print(nClasses, "classes found, running in multiclass mode\n")
                            # NOTE(review): only this first batch is converted
                            # here; confirm later batches arrive one-hot encoded.
                            trainbatch['targets'] = utils.to_categorical(trainbatch['targets'], num_classes=nClasses)
                            binary = False
                        else:
                            print(nClasses, "classes found, running in binary mode\n")

                    if trainOnWeights:
                        fitWeighting = {'sample_weight': trainbatch['weights']}
                    else:
                        fitWeighting = {'class_weight': 'auto'}
                    train_history = model.fit(
                        trainbatch['inputs'],
                        trainbatch['targets'],
                        callbacks=callbacks,
                        **fitWeighting,
                        **trainParams
                    )  # Train on one fold

                    swaActive = swaStart >= 0 and swa.active
                    if swaActive:
                        # Use whichever of the averaged and the plain model is better
                        losses = swa.get_losses()
                        print('{} swa loss {}, default loss {}'.format(subEpoch, losses['swa'], losses['base']))
                        if losses['swa'] < losses['base']:
                            loss = losses['swa']
                            useSWA = True
                        else:
                            loss = losses['base']
                            useSWA = False
                    elif trainOnWeights:
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'],
                                              sample_weight=testbatch['weights'], verbose=0,
                                              batch_size=trainParams['batch_size'])
                    else:
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'], verbose=0,
                                              batch_size=trainParams['batch_size'])

                    if swaActive and cosAnnealMult > 1:
                        print('{} SWA loss: {}'.format(subEpoch, loss))

                    if swaActive:
                        lossHistory['swa_val_loss'].append(losses['swa'])
                        lossHistory['val_loss'].append(losses['base'])
                    else:
                        if swaStart >= 0:
                            lossHistory['swa_val_loss'].append(loss)
                        lossHistory['val_loss'].append(loss)

                    lossHistory['val_train_loss'].append(train_history.history['loss'])

                    if binary:
                        testbatch = batchYielder.getBatch(testID)  # Load testing fold
                        prediction = model.predict(testbatch['inputs'], verbose=0)
                        prediction4acc = (prediction > 0.5)*1
                        targets = testbatch.get('orig_targets', testbatch['targets'])

                        # Append, so the metrics form per-sub-epoch histories
                        # like the loss entries (they were overwritten by a
                        # single scalar before).
                        if testbatch['weights'] is not None:
                            lossHistory['wAUC'].append(1-roc_auc_score(targets,
                                                                       prediction,
                                                                       sample_weight=testbatch['weights']))
                            lossHistory['wACC'].append(accuracy_score(targets,
                                                                      prediction4acc,
                                                                      sample_weight=testbatch['weights']))
                        lossHistory['AUC'].append(1-roc_auc_score(targets, prediction))
                        lossHistory['ACC'].append(accuracy_score(targets, prediction4acc))

                    _lh = lossHistory['val_loss']
                    if (stopIfStallingTest > 0
                            and len(_lh) > stopIfStallingTest+1
                            and (_lh[-stopIfStallingTest] - _lh[-1])/_lh[-1] < 1e-7):
                        # Only flags the stop: the remaining training folds of
                        # this epoch are still processed.
                        print('Learning process stalled at %s. Stopping...' % _lh[-1])
                        stop = True

                    if loss < best:  # Save best
                        best = loss
                        if cosAnnealMult:
                            if cosAnneal.lrs[-1] > 0:
                                bestLR = cosAnneal.lrs[-1]
                            else:
                                bestLR = cosAnneal.lrs[-2]
                        epochCounter = 0
                        # Saving may raise RuntimeError; retry once after a
                        # short pause, then carry on without saving.
                        for attempt in range(2):
                            try:
                                if swaStart >= 0 and swa.active and useSWA:
                                    swa.test_model.save_weights(saveLoc + "best.h5")
                                else:
                                    model.save_weights(saveLoc + "best.h5")
                                break
                            except RuntimeError:
                                if attempt == 0:
                                    print("RuntimeError while saving. Trying again.")
                                    time.sleep(0.5)
                                else:
                                    print("RuntimeError while saving again!!! Maybe next time then.")
                        if reduxDecayActive:
                            cosAnneal.lrs.append(float(K.get_value(model.optimizer.lr)))
                        if verbose:
                            print('\n{} New best found: {}'.format(subEpoch, best))
                    elif cosAnnealMult and not reduxDecayActive:
                        # With cosine annealing, patience counts cycle ends
                        if cosAnneal.cycle_end:
                            epochCounter += 1
                    else:
                        epochCounter += 1
                        if reduxDecayActive:
                            lr = 0.8*float(K.get_value(model.optimizer.lr))
                            cosAnneal.lrs.append(lr)
                            K.set_value(model.optimizer.lr, lr)

                    if epochCounter >= tmpPatience:  # Early stopping
                        if cosAnnealMult and reduxDecay and not reduxDecayActive:
                            print('CosineAnneal stalling after {} epochs, entering redux decay at LR={}'.format(subEpoch, bestLR))
                            model.load_weights(saveLoc + "best.h5")
                            cosAnneal.lrs.append(bestLR)
                            K.set_value(model.optimizer.lr, bestLR)
                            tmpPatience = 10
                            epochCounter = 0
                            callbacks = []
                            reduxDecayActive = True
                        else:
                            if verbose:
                                print('Early stopping after {} epochs'.format(subEpoch))
                            stop = True
                            break

                if stop:
                    break

            model.load_weights(saveLoc + "best.h5")

            histories.append(lossHistory.copy())

            results.append({})
            results[-1]['loss'] = best
            if binary:
                testbatch = batchYielder.getBatch(testID)  # Load testing fold
                prediction = model.predict(testbatch['inputs'], verbose=0)
                targets = testbatch.get('orig_targets', testbatch['targets'])
                if testbatch['weights'] is not None:
                    results[-1]['wAUC'] = 1-roc_auc_score(targets,
                                                          prediction,
                                                          sample_weight=testbatch['weights'])
                results[-1]['AUC'] = 1-roc_auc_score(targets, prediction)

                if amsSize:
                    results[-1]['AMS'], results[-1]['cut'] = amsScanQuick(
                        batchYielder.getBatchDF(testID, preds=prediction, weightName='orig_weights'),
                        wFactor=amsSize/len(prediction))
            print("Score is:", results[-1])

            # cosAnneal only exists when cosine annealing was requested
            if plotLR and cosAnnealMult:
                cosAnneal.plot_lr()
            if callable(plot):
                plot(lossHistory)

            print("Fold took {:.3f}s\n".format(timeit.default_timer() - foldStart))

            model.save(saveLoc + 'train_' + str(fold) + '.h5')
            with open(saveLoc + 'resultsFile.pkl', 'wb') as fout:  # Save results
                pickle.dump(results, fout)

        print("\n______________________________________")
        print("Training finished")
        print("Cross-validation took {:.3f}s ".format(timeit.default_timer() - start))
        if results:
            for score in results[0]:
                values = [x[score] for x in results]
                mean = uncertRound(np.mean(values), np.std(values)/np.sqrt(len(results)))
                print("Mean", score, "= {} +- {}".format(mean[0], mean[1]))
        print("______________________________________\n")

        return results, histories
    finally:
        if log_file is not None:
            sys.stdout = old_stdout
            log_file.close()
Ejemplo n.º 5
0
def batchTrainRegressor(data, nSplits,
                        modelGen, modelGenParams,
                        trainParams, cosAnnealMult=0, trainOnWeights=True, getBatch=getBatch,
                        extraMetrics=None, monitorData=None,
                        saveLoc='train_weights/', patience=10, maxEpochs=10000, verbose=False, logoutput=False):
    """Train one regressor per fold, with early stopping on a monitored loss.

    For every fold a new model is built with modelGen(**modelGenParams) and
    fitted on the training folds one at a time. After each training fold (a
    "sub-epoch") the loss on the test fold is computed. The loss used to
    judge convergence is that test loss, or the loss on monitorData when it
    is given. The best weights are written to saveLoc + 'best.h5' and
    reloaded once training of the fold stops.

    Side effects: creates saveLoc, deletes existing *.h5, *.json, *.pkl,
    *.png and *.log files there, and writes best.h5, train_<fold>.h5,
    resultsFile.pkl and loss_history.png (plus training_log.log when
    logoutput is set).

    Arguments:
        data: fold container passed to getBatch; data['fold_0/targets'] is
            read when cosine annealing is used
        nSplits: number of folds; getFolds(fold, nSplits) supplies the
            training-fold IDs and the test-fold ID
        modelGen, modelGenParams: model factory and its keyword arguments
        trainParams: keyword arguments forwarded to model.fit; must contain
            'batch_size' when cosine annealing is used
        cosAnnealMult: if non-zero, adds a CosAnneal callback built with this
            value
        trainOnWeights: fit and evaluate with per-sample weights
        getBatch: callable (foldID, data) -> dict with 'inputs', 'targets'
            and 'weights'
        extraMetrics: optional callable (predictions, targets, weights) ->
            dict of extra scores added to each fold's results
        monitorData: optional dict with 'inputs' and 'targets' used to judge
            convergence
        saveLoc: output location prefix
        patience: number of non-improving steps tolerated before stopping
        maxEpochs: maximum number of passes over the training folds
        verbose: print progress information
        logoutput: redirect stdout to saveLoc + 'training_log.log'

    Returns:
        (results, histories): one dict of scores and one dict of loss
        histories ('val_loss', 'mon_loss') per fold
    """
    os.system("mkdir " + saveLoc)
    os.system("rm " + saveLoc + "*.h5")
    os.system("rm " + saveLoc + "*.json")
    os.system("rm " + saveLoc + "*.pkl")
    os.system("rm " + saveLoc + "*.png")
    os.system("rm " + saveLoc + "*.log")

    old_stdout = sys.stdout
    log_file = None
    if logoutput:
        log_file = open(saveLoc + 'training_log.log', 'w')
        sys.stdout = log_file

    # try/finally so that stdout is restored and the log file closed even if
    # training raises.
    try:
        start = timeit.default_timer()
        results = []
        histories = []

        if cosAnnealMult: print("Using cosine annealing")

        monitor = monitorData is not None
        if monitor:
            monitorInputs = monitorData['inputs']
            monitorTargets = monitorData['targets']
            print("Using a monitor sample to judge convergence")

        for fold in range(nSplits):
            foldStart = timeit.default_timer()
            print("Running fold", fold+1, "/", nSplits)
            os.system("rm " + saveLoc + "best.h5")
            # None marks "no loss seen yet". The previous start value of -1
            # combined with a strict '<' test meant a non-negative loss was
            # never accepted as best, so best.h5 was never written.
            best = None
            epochCounter = 0
            subEpoch = 0
            stop = False
            lossHistory = []
            monitorHistory = []
            trainID, testID = getFolds(fold, nSplits)  # Fold IDs for training and testing
            testbatch = getBatch(testID, data)  # Load testing fold

            model = None  # drop the reference to the previous fold's model first
            model = modelGen(**modelGenParams)
            # NOTE(review): attribute access only - reset_states is not called,
            # so this line has no effect. Left as in the original.
            model.reset_states

            callbacks = []
            if cosAnnealMult:
                cosAnneal = CosAnneal(math.ceil(len(data['fold_0/targets'])/trainParams['batch_size']), cosAnnealMult)
                callbacks.append(cosAnneal)

            for epoch in range(maxEpochs):
                for n in trainID:  # Loop through training folds
                    trainbatch = getBatch(n, data)  # Load fold data
                    subEpoch += 1

                    if trainOnWeights:
                        model.fit(trainbatch['inputs'], trainbatch['targets'],
                                  sample_weight=trainbatch['weights'],
                                  callbacks=callbacks, **trainParams)  # Train on one fold
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'],
                                              sample_weight=testbatch['weights'], verbose=0)
                    else:
                        model.fit(trainbatch['inputs'], trainbatch['targets'],
                                  callbacks=callbacks, **trainParams)  # Train on one fold
                        loss = model.evaluate(testbatch['inputs'], testbatch['targets'], verbose=0)

                    lossHistory.append(loss)

                    # Convergence is judged on the monitor sample when given,
                    # otherwise on the test-fold loss.
                    monLoss = loss
                    if monitor:
                        monLoss = model.evaluate(monitorInputs, monitorTargets, verbose=0)
                        monitorHistory.append(monLoss)

                    if best is None or monLoss < best:  # Save best
                        best = monLoss
                        epochCounter = 0
                        model.save_weights(saveLoc + "best.h5")
                        if verbose:
                            print('{} New best found: {}'.format(subEpoch, best))
                    elif cosAnnealMult:
                        # With cosine annealing, patience counts cycle ends
                        if cosAnneal.cycle_end:
                            epochCounter += 1
                    else:
                        epochCounter += 1

                    if epochCounter >= patience:  # Early stopping
                        if verbose:
                            print('Early stopping after {} epochs'.format(subEpoch))
                        stop = True
                        break

                if stop:
                    break

            model.load_weights(saveLoc + "best.h5")

            histories.append({})
            histories[-1]['val_loss'] = lossHistory
            histories[-1]['mon_loss'] = monitorHistory

            results.append({})
            results[-1]['loss'] = best

            if extraMetrics is not None:
                metrics = extraMetrics(model.predict(testbatch['inputs'], verbose=0),
                                       testbatch['targets'], testbatch['weights'])
                for metric in metrics:
                    results[-1][metric] = metrics[metric]

            print("Score is:", results[-1])

            print("Fold took {:.3f}s\n".format(timeit.default_timer() - foldStart))

            model.save(saveLoc + 'train_' + str(fold) + '.h5')
            with open(saveLoc + 'resultsFile.pkl', 'wb') as fout:  # Save results
                pickle.dump(results, fout)

        print("\n______________________________________")
        print("Training finished")
        print("Cross-validation took {:.3f}s ".format(timeit.default_timer() - start))
        plotTrainingHistory(histories, save=saveLoc + 'loss_history.png')
        if results:
            for score in results[0]:
                values = [x[score] for x in results]
                mean = uncertRound(np.mean(values), np.std(values)/np.sqrt(len(results)))
                print("Mean", score, "= {} +- {}".format(mean[0], mean[1]))
        print("______________________________________\n")

        return results, histories
    finally:
        if log_file is not None:
            sys.stdout = old_stdout
            log_file.close()