Example #1
def loadModel(inputDir,
              trainData,
              model,
              LoadModel,
              sampleDatasets=None,
              removedVars=None):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir
    # inputModel = '%s/KERAS_model.h5'%inputDir
    inputWeights = '%s/KERAS_check_best_model_weights.h5' % inputDir

    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)  # NBINS assumed defined at module level
    print(traind.getNRegressionTargets())

    if LoadModel:
        # load_model and global_loss_list assumed imported from Keras and the project's loss module
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()

    else:
        shapes = traind.getInputShapes()
        train_inputs = []
        for s in shapes:
            train_inputs.append(keras.layers.Input(shape=s))
        evalModel = model(train_inputs, traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(), sampleDatasets,
                          removedVars)
        evalModel.load_weights(inputWeights)

    return evalModel
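
A minimal usage sketch (paths and the model-builder function myModelBuilder are hypothetical; NBINS and global_loss_list must be defined by the surrounding project):

# hypothetical call; LoadModel=True restores the full Keras model from inputDir
evalModel = loadModel('trainDir', 'trainDir/dataCollection.dc',
                      myModelBuilder, LoadModel=True)
evalModel.summary()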
Example #2
import h5py
import numpy as np
import pandas as pd
from DeepJetCore.DataCollection import DataCollection


def dcToDf(dc_file, df_out):
    dc = DataCollection()
    dc.readFromFile(dc_file)

    NENT = 1  # Can skip some events
    filelist = []
    i = 0
    storeInputs = True
    count = 0

    feature_names = dc.dataclass.branches[1]
    spectator_names = dc.dataclass.branches[0]
    labels_names = dc.getUsedTruth()
    labels_names = ['truth' + l for l in labels_names]

    for s in dc.samples:
        if count > 1000000: break
        spath = dc.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        features_val_i = [
            h5File['x%i' % j][()]
            for j in range(0, h5File['x_listlength'][()][0])
        ]
        features_val_i = features_val_i[0][::NENT, 0, :]
        #predict_test_i = model.predict(features_val)
        weights_val_i = h5File['w0'][()]
        labels_val_i = h5File['y0'][()][::NENT, :]
        spectators_val_i = h5File['z0'][()][::NENT, 0, :]
        if storeInputs: raw_features_val_i = h5File['z1'][()][::NENT, 0, :]
        if i == 0:
            #predict_test = predict_test_i
            weights_val = weights_val_i
            labels_val = labels_val_i
            spectators_val = spectators_val_i
            features_val = features_val_i
            if storeInputs: raw_features_val = raw_features_val_i
        else:
            #predict_test = np.concatenate((predict_test,predict_test_i))
            weights_val = np.concatenate((weights_val, weights_val_i))
            labels_val = np.concatenate((labels_val, labels_val_i))
            features_val = np.concatenate((features_val, features_val_i))
            spectators_val = np.concatenate((spectators_val, spectators_val_i))
            if storeInputs:
                raw_features_val = np.concatenate(
                    (raw_features_val, raw_features_val_i))
        i += 1
        count += labels_val_i.shape[0]  # count only this file's entries

    entries = np.hstack((raw_features_val, spectators_val, labels_val,
                         weights_val.reshape((len(weights_val), 1))))
    df = pd.DataFrame(entries,
                      columns=feature_names + spectator_names + labels_names +
                      ['weight'])
    #df = pd.DataFrame(raw_features_val+spectators_val , columns = feature_names+spectator_names)
    #print df
    if df_out is not None:
        df.to_pickle(df_out)
        print("Saved df to", df_out)
Example #3
import os
import pickle
from DeepJetCore.DataCollection import DataCollection


class TrainingInfo:

    def __init__( self, directory ):

        filename = os.path.join( directory, 'dataCollection.dc')
        file_    = open( filename, 'rb')

        self.samples    =   pickle.load(file_)
        sampleentries   =   pickle.load(file_)
        originRoots     =   pickle.load(file_)
        nsamples        =   pickle.load(file_)
        useweights      =   pickle.load(file_)
        batchsize       =   pickle.load(file_)
        dataclass       =   pickle.load(file_)
        weighter        =   pickle.load(file_)
        self._means     =   pickle.load(file_)
        file_.close()


        # Get means dictionary
        self.means = {name : (self._means[0][i], self._means[1][i]) for i, name in enumerate( self._means.dtype.names) }

        # Get DeepJetCore DataCollection
        self.dataCollection = DataCollection()
        self.dataCollection.readFromFile(filename) 

        # Reading first sample & get branch structure
        fullpath = self.dataCollection.getSamplePath(self.samples[0])
        self.dataCollection.dataclass.readIn(fullpath)
        self.branches = self.dataCollection.dataclass.branches

        print "Branches:"
        for i in range(len(self.branches)):
            print "Collection", i
            for i_b, b in enumerate(self.branches[i]):
                print "  branch %2i/%2i %40s   mean %8.5f var %8.5f" %( i, i_b, b, self.means[b][0], self.means[b][1])
            print

    def dump(self, filename):
        with open(filename, 'wb') as f:  # pickle needs binary mode; the file() builtin no longer exists in Python 3
            pickle.dump([self.branches, self.means], f)
        print("Written", filename)
Example #4
def loadModel(inputDir,
              trainData,
              model,
              LoadModel,
              sampleDatasets=None,
              removedVars=None,
              adv=False):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir

    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())

    if LoadModel:
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()

    else:
        shapes = traind.getInputShapes()
        train_inputs = []
        for s in shapes:
            train_inputs.append(keras.layers.Input(shape=s))
        modelargs = {}
        if adv:
            modelargs.update({
                'nRegTargets': NBINS,
                'discTrainable': True,
                'advTrainable': True
            })
        evalModel = model(train_inputs, traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(), sampleDatasets,
                          removedVars, **modelargs)
        evalModel.load_weights(inputModel)

    return evalModel
Example #5
# encoding: utf-8

from argparse import ArgumentParser
from DeepJetCore.DataCollection import DataCollection

parser = ArgumentParser(
    'convert a data collection to a single set of numpy arrays. Warning, this can produce a large output'
)
parser.add_argument('inputDataCollection')
parser.add_argument('outputFilePrefix')
args = parser.parse_args()

print('reading data collection')

dc = DataCollection()
dc.readFromFile(args.inputDataCollection)

print('producing feature array')
feat = dc.getAllFeatures()

print('producing truth array')
truth = dc.getAllLabels()

print('producing weight array')
weight = dc.getAllWeights()

print('producing means and norms array')
means = dc.means

from numpy import save
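
The snippet is truncated after the import; presumably the script ends by writing the arrays out under the given prefix, along these lines (file naming is a guess; the getAll* calls return one array per input collection):

# hypothetical write-out: one .npy file per array under the output prefix
for name, arrs in [('features', feat), ('truth', truth), ('weights', weight)]:
    for i, arr in enumerate(arrs):
        save('%s_%s_%d.npy' % (args.outputFilePrefix, name, i), arr)
save(args.outputFilePrefix + '_means.npy', means)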
Example #6
LoadModel = False
removedVars = None
forceNClasses = False
signals = [1]
sigNames = ['Hbb']
backgrounds = [0]
backNames = ['QCD']
NClasses = len(signals) + len(backgrounds)

if True:
    evalModel = loadModel(trainDir, inputTrainDataCollection, trainingModel,
                          LoadModel, forceNClasses, NClasses, inputDataset,
                          removedVars)

    evalDir = opts.o

    from DeepJetCore.DataCollection import DataCollection
    testd = DataCollection()
    testd.readFromFile(inputTestDataCollection)

    if os.path.isdir(evalDir):
        raise Exception('output directory %s must not exist yet' % evalDir)
    else:
        os.mkdir(evalDir)

    df, features_val = makePlots(testd, evalModel, evalDir)
    makeLossPlot(trainDir, evalDir)

    #df = evaluate(testd, inputTrainDataCollection, evalModel, evalDir)
    #make_plots(evalDir, savedir='Plots')
Example #7
from DeepJetCore.DataCollection import DataCollection
from pprint import pprint

dc = DataCollection()
dc.readFromFile('dc/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepCSV/train/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepFlavour_FT_reg/train/dataCollection.dc')
#pprint (dc.means[0])
#print '-'*100
#pprint (dc.means[1])
#print '-'*100
#pprint (dc.means.dtype.names)
#pprint (dc.means[0][0].dtype)
#pprint (dc.useweights)
#pprint (dc.weighter)
#pprint (dc.samples)
#pprint (dc.sampleentries)
#pprint (dc.originRoots)
#pprint (dc.nsamples)
#pprint (dc.useweights)
##pprint (dc.__batchsize)
pprint(dc.dataclass)
#pprint (dc.weighter)
#pprint (dc.means)

six_times = [
    'TagVarCSVTrk_trackJetDistVal', 'TagVarCSVTrk_trackPtRel',
    'TagVarCSVTrk_trackDeltaR', 'TagVarCSVTrk_trackPtRatio',
    'TagVarCSVTrk_trackSip3dSig', 'TagVarCSVTrk_trackSip2dSig',
    'TagVarCSVTrk_trackDecayLenVal'
]
Example #8
    raise Exception('output directory must not exist yet')

custom_objs = {}
custom_objs.update(global_loss_list)
custom_objs.update(global_layers_list)
model = load_model(args.inputModel, custom_objects=custom_objs)


td = testDescriptor()
if args.use:
    td.use_only = [int(i) for i in args.use.split(',')]

from DeepJetCore.DataCollection import DataCollection

testd = DataCollection()
testd.readFromFile(args.inputDataCollection)


os.mkdir(args.outputDir)

td.makePrediction(
    model, testd, args.outputDir,
    store_labels=args.labels,
    monkey_class=args.monkey_class
)

td.writeToTextFile(args.outputDir+'/tree_association.txt')

#    make the file reading entirely C++
#    then it can be used for other studies
Example #9
def evaluate(testd, trainData, model, outputDir, storeInputs=False, adv=False):
    NENT = 1  # Can skip some events
    filelist = []
    i = 0
    for s in testd.samples:
        #for s in testd.samples[0:1]:
        spath = testd.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        #features_val = [h5File['x%i_shape'%j][()] for j in range(0, h5File['x_listlength'][()][0])]
        features_val = [
            h5File['x%i' % j][()]
            for j in range(0, h5File['x_listlength'][()][0])
        ]
        #features_val=testd.getAllFeatures()
        predict_test_i = model.predict(features_val)
        labels_val_i = h5File['y0'][()][::NENT, :]
        spectators_val_i = h5File['z0'][()][::NENT, 0, :]
        if storeInputs: raw_features_val_i = h5File['z1'][()][::NENT, 0, :]
        if i == 0:
            predict_test = predict_test_i
            labels_val = labels_val_i
            spectators_val = spectators_val_i
            if storeInputs: raw_features_val = raw_features_val_i
        else:
            predict_test = np.concatenate((predict_test, predict_test_i))
            labels_val = np.concatenate((labels_val, labels_val_i))
            spectators_val = np.concatenate((spectators_val, spectators_val_i))
            if storeInputs:
                raw_features_val = np.concatenate(
                    (raw_features_val, raw_features_val_i))
        i += 1

    # Value (alternative: load everything through the DataCollection API)
    #labels_val=testd.getAllLabels()[0][::NENT,:]
    #features_val=testd.getAllFeatures()[0][::NENT,0,:]
    #spectators_val = testd.getAllSpectators()[0][::NENT,0,:]
    #if storeInputs: raw_features_val = testd.getAllSpectators()[-1][::NENT,0,:]

    # Labels
    print(testd.dataclass.branches)
    feature_names = testd.dataclass.branches[1]
    spectator_names = testd.dataclass.branches[0]
    #truthnames = testd.getUsedTruth()

    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    truthnames = traind.getUsedTruth()
    # Store features
    print "Coulmns", spectator_names
    df = pd.DataFrame(spectators_val, columns=spectator_names)

    if storeInputs:
        for i, tname in enumerate(feature_names):
            df[tname] = raw_features_val[:, i]

    # Add predictions
    print(truthnames)
    print(predict_test.shape)
    for i, tname in enumerate(truthnames):
        df['truth' + tname] = labels_val[:, i]
        #print "Mean 0th label predict predict of ", tname, np.mean(predict_test[:,0]), ", Stats:", np.sum(labels_val[:,i]), "/", len(labels_val[:,i])
        if adv:
            df['predict' + tname] = predict_test[:, NBINS + i]
            for j in range(NBINS):
                df['predict_massbin%i' % j] = predict_test[:, j + i]
        else:
            df['predict' + tname] = predict_test[:, i]

    print "Testing prediction:"
    print "Total: ", len(predict_test[:, 0])
    for lab in truthnames:
        print lab, ":", sum(df['truth' + lab].values)

    df.to_pickle(outputDir + '/output.pkl')  # save the dataframe
    print("Finished storing dataframe")
    return df
Example #10
def evaluate(testd, trainData, model, outputDir):
    NENT = 1  # Can skip some events
    filelist = []
    i = 0
    for s in testd.samples:
        spath = testd.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        #features_val = [h5File['x%i_shape'%j][()] for j in range(0, h5File['x_listlength'][()][0])]
        features_val = [
            h5File['x%i' % j][()]
            for j in range(0, h5File['x_listlength'][()][0])
        ]
        #features_val=testd.getAllFeatures()
        predict_test_i = model.predict(features_val)
        if i == 0:
            predict_test = predict_test_i
        else:
            predict_test = np.concatenate((predict_test, predict_test_i))
        i += 1

    # Value
    labels_val = testd.getAllLabels()[0][::NENT, :]
    features_val = testd.getAllFeatures()[0][::NENT, 0, :]
    spectators_val = testd.getAllSpectators()[0][::NENT, 0, :]
    raw_features_val = testd.getAllSpectators()[-1][::NENT, 0, :]
    # Labels
    print(testd.dataclass.branches)
    feature_names = testd.dataclass.branches[1]
    spectator_names = testd.dataclass.branches[0]
    #truthnames = testd.getUsedTruth()

    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    truthnames = traind.getUsedTruth()
    # Store features
    df = pd.DataFrame(spectators_val, columns=spectator_names)

    for i, tname in enumerate(feature_names):
        df[tname] = raw_features_val[:, i]

    # Add predictions
    print(truthnames)
    print(predict_test.shape)
    for i, tname in enumerate(truthnames):
        df['truth' + tname] = labels_val[:, i]
        df['predict' + tname] = predict_test[:, i]

    df.to_pickle(outputDir + '/output.pkl')  # save the dataframe
    print(df)
    dt = pd.read_pickle(outputDir + '/output.pkl')
    print(dt)

    def dists(xdf, truthnames):
        truths = truthnames
        print(truths)

        def distribution(xdf, predict="Hcc"):
            plt.figure(figsize=(10, 7))
            bins = np.linspace(0, 1, 70)
            trus = []
            for tru in truths:
                trus.append(xdf['truth' + tru].values)
            preds = [xdf['predict' + predict].values] * len(truths)
            plt.hist(preds,
                     bins=bins,
                     weights=trus,
                     alpha=0.8,
                     density=True,  # 'normed' was removed in recent matplotlib
                     label=truths,
                     stacked=True)
            plt.xlabel("Probability " + predict)
            plt.title("Stacked Distributions")
            plt.semilogy()
            plt.legend(title="True labels:")
            plt.savefig(outputDir + '/dist' + predict + '.png', dpi=300)

        for pred in truths:
            distribution(xdf, predict=pred)

    dists(df, truthnames)

    print "Testing prediction:"
    print "Total: ", len(predict_test[:, 0])
    for lab in truthnames:
        print lab, ":", sum(df['truth' + lab].values)
    print "Finished"
Example #11
    def train(self):

        placeholder_input, placeholder_output = self.model.get_placeholders()
        graph_output = self.model.get_compute_graphs()
        graph_loss = self.model.get_losses()
        graph_optimiser = self.model.get_optimizer()
        graph_summary = self.model.get_summary()

        if self.from_scratch:
            self.clean_summary_dir()

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            if self.use_tf_records:
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)
                record_batch_input, record_batch_target = self.get_record_placeholders(
                )
            else:
                input_data = self.config['train_data_path']
                train_data = DataCollection()
                train_data.readFromFile(input_data)

                val_data = train_data.split(0.1)
                train_data = train_data.split(0.9)
                train_data.setBatchSize(self.batch_size)
                val_data.setBatchSize(self.batch_size)
                val_data_generator = val_data.generator()
                train_data_generator = train_data.generator()

            summary_writer = tf.summary.FileWriter(self.summary_path,
                                                   sess.graph)

            if not self.from_scratch:
                self.saver_all.restore(sess, self.model_path)
                print("\n\nINFO: Loading model\n\n")
                with open(self.model_path + '.txt', 'r') as f:
                    iteration_number = int(f.read())
            else:
                iteration_number = 0

            print("Starting iterations")
            while iteration_number < self.train_for_iterations:
                if self.use_tf_records:
                    input, output = sess.run(
                        [record_batch_input, record_batch_target])
                    input = [
                        np.fromstring(''.join(i)).reshape(
                            13, 13, int(self.config['num_layers']),
                            int(self.config['num_channels'])) for i in input
                    ]
                    output = [
                        np.fromstring(''.join(i)).reshape(
                            13, 13, int(self.config['num_layers']))
                        for i in output
                    ]
                else:
                    input, output, _ = next(train_data_generator)
                    input = np.squeeze(input, axis=0)
                    output = np.squeeze(output, axis=0)

                _, eval_loss, _, eval_summary = sess.run(
                    [graph_output, graph_loss, graph_optimiser, graph_summary],
                    feed_dict={
                        placeholder_input: input,
                        placeholder_output: output
                    })
                print("Iteration %4d: loss %0.5f" %
                      (iteration_number, eval_loss))
                iteration_number += 1
                summary_writer.add_summary(eval_summary, iteration_number)
                if iteration_number % self.save_after_iterations == 0:
                    print("\n\nINFO: Saving model\n\n")
                    self.saver_all.save(sess, self.model_path)
                    with open(self.model_path + '.txt', 'w') as f:
                        f.write(str(iteration_number))
            if self.use_tf_records:
                # Stop the threads
                coord.request_stop()

                # Wait for threads to stop
                coord.join(threads)
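
For reference, a minimal sketch of the DataCollection generator feed used in the non-TFRecords branch above (path and batch size hypothetical):

# yields one (features, truth, weights) batch per call, mirroring the
# input, output, _ unpacking in the training loop
train_data = DataCollection()
train_data.readFromFile('dc/dataCollection.dc')
train_data.setBatchSize(128)
features, truth, weights = next(train_data.generator())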