Exemple #1
0
def dcToDf(dc_file, df_out):
    """Flatten the samples of a DataCollection into one pandas DataFrame.

    For every sample file listed in the collection the HDF5 datasets ``z1``,
    ``z0``, ``y0`` and ``w0`` are read, stacked across files and joined
    column-wise into a single table.

    Args:
        dc_file: Path handed to ``DataCollection.readFromFile``.
        df_out: Destination path for the pickled DataFrame, or ``None`` to
            skip writing it to disk.

    Returns:
        The assembled DataFrame. Its columns are ``dataclass.branches[1]``,
        then ``dataclass.branches[0]``, then the used truth names prefixed
        with ``'truth'``, then ``'weight'``.

    Raises:
        ValueError: If the collection lists no samples.
    """
    dc = DataCollection()
    dc.readFromFile(dc_file)

    NENT = 1  # Row stride: values > 1 keep only every NENT-th event.
    MAX_EVENTS = 1000000  # Stop opening new files once this many rows are read.

    feature_names = dc.dataclass.branches[1]
    spectator_names = dc.dataclass.branches[0]
    labels_names = ['truth' + name for name in dc.getUsedTruth()]

    # Per-file pieces are collected and concatenated once at the end; growing
    # the arrays with np.concatenate inside the loop copies them every pass.
    raw_features_parts = []
    spectators_parts = []
    labels_parts = []
    weights_parts = []

    count = 0
    for s in dc.samples:
        # The limit is checked before a file is opened, so the final row count
        # may exceed MAX_EVENTS by up to one file.
        if count > MAX_EVENTS:
            break
        # Read-only access; the context manager closes the handle even if a
        # dataset is missing. `[()]` copies each dataset into memory first.
        with h5py.File(dc.getSamplePath(s), 'r') as h5File:
            weights_i = h5File['w0'][()][::NENT]
            labels_i = h5File['y0'][()][::NENT, :]
            spectators_i = h5File['z0'][()][::NENT, 0, :]
            raw_features_i = h5File['z1'][()][::NENT, 0, :]
        weights_parts.append(weights_i)
        labels_parts.append(labels_i)
        spectators_parts.append(spectators_i)
        raw_features_parts.append(raw_features_i)
        # Count only the rows contributed by this file (not the running
        # total), otherwise the limit is reached far too early.
        count += labels_i.shape[0]

    if not labels_parts:
        raise ValueError("DataCollection %r contains no samples" % (dc_file,))

    weights_val = np.concatenate(weights_parts)
    entries = np.hstack((np.concatenate(raw_features_parts),
                         np.concatenate(spectators_parts),
                         np.concatenate(labels_parts),
                         weights_val.reshape((len(weights_val), 1))))
    df = pd.DataFrame(entries,
                      columns=feature_names + spectator_names + labels_names +
                      ['weight'])

    if df_out is not None:
        df.to_pickle(df_out)
        print("Saved df to %s" % (df_out,))
    return df
Exemple #2
0
def evaluate(testd, trainData, model, outputDir, storeInputs=False, adv=False):
    """Run ``model`` over every test sample and store the results as a DataFrame.

    Each sample file of ``testd`` is opened, its ``x<j>`` input arrays are fed
    to ``model.predict`` and the ``y0`` / ``z0`` (and optionally ``z1``)
    datasets are collected alongside the predictions. The combined table is
    pickled to ``outputDir + '/output.pkl'``.

    Args:
        testd: Collection object providing ``samples``, ``getSamplePath`` and
            ``dataclass.branches``.
        trainData: Path read with ``DataCollection.readFromFile``; its used
            truth names define the truth/prediction columns.
        model: Object with a ``predict`` method accepting the list of input
            arrays of one file.
        outputDir: Directory receiving ``output.pkl``.
        storeInputs: When true, also store the ``z1`` dataset under the names
            in ``dataclass.branches[1]``.
        adv: When true, class scores are read starting at column ``NBINS`` of
            the prediction and the first ``NBINS`` columns are stored as
            ``predict_massbin<j>``.

    Returns:
        The DataFrame that was written to disk.

    Raises:
        ValueError: If ``testd`` lists no samples.
    """
    from DeepJetCore.DataCollection import DataCollection

    NENT = 1  # Row stride: values > 1 keep only every NENT-th event.

    # Per-file pieces, concatenated once after the loop.
    predict_parts = []
    labels_parts = []
    spectators_parts = []
    raw_features_parts = []

    for s in testd.samples:
        # Read-only access; the context manager closes the handle. `[()]`
        # copies each dataset into memory before the file is closed.
        with h5py.File(testd.getSamplePath(s), 'r') as h5File:
            n_inputs = h5File['x_listlength'][()][0]
            features_val = [h5File['x%i' % j][()] for j in range(0, n_inputs)]
            labels_parts.append(h5File['y0'][()][::NENT, :])
            spectators_parts.append(h5File['z0'][()][::NENT, 0, :])
            if storeInputs:
                raw_features_parts.append(h5File['z1'][()][::NENT, 0, :])
        # Predict on every event, then apply the same stride as the labels so
        # the rows stay aligned.
        predict_parts.append(model.predict(features_val)[::NENT])

    if not predict_parts:
        raise ValueError("test collection contains no samples")

    predict_test = np.concatenate(predict_parts)
    labels_val = np.concatenate(labels_parts)
    spectators_val = np.concatenate(spectators_parts)

    # Column names
    print(testd.dataclass.branches)
    feature_names = testd.dataclass.branches[1]
    spectator_names = testd.dataclass.branches[0]

    # Truth names are taken from the training collection.
    traind = DataCollection()
    traind.readFromFile(trainData)
    truthnames = traind.getUsedTruth()

    # Store spectators (and optionally the raw inputs)
    print("Coulmns %s" % (spectator_names,))
    df = pd.DataFrame(spectators_val, columns=spectator_names)

    if storeInputs:
        raw_features_val = np.concatenate(raw_features_parts)
        for i, tname in enumerate(feature_names):
            df[tname] = raw_features_val[:, i]

    # Add truth labels and predictions
    print(truthnames)
    print(predict_test.shape)
    for i, tname in enumerate(truthnames):
        df['truth' + tname] = labels_val[:, i]
        # In adversarial mode the class scores follow the NBINS mass-bin
        # columns; NBINS is only looked up when adv is set.
        df['predict' + tname] = predict_test[:, NBINS + i if adv else i]

    if adv:
        # Written once, outside the truth loop: the previous `j + i` indexing
        # inside that loop overwrote each bin with a shifted column.
        for j in range(NBINS):
            df['predict_massbin%i' % j] = predict_test[:, j]

    print("Testing prediction:")
    print("Total:  %d" % len(predict_test[:, 0]))
    for lab in truthnames:
        print("%s : %s" % (lab, df['truth' + lab].values.sum()))

    df.to_pickle(outputDir + '/output.pkl')
    # Emitted before returning; it used to sit after the return statement and
    # was never reached.
    print("Finished storing dataframe")
    return df
Exemple #3
0
def _plot_score_distributions(xdf, truthnames, outputDir):
    """Save one stacked score histogram per truth name into ``outputDir``.

    For every name in ``truthnames`` the ``'predict' + name`` column is
    histogrammed once per truth class, weighted by the matching
    ``'truth' + class`` column, and written to ``dist<name>.png``.
    """
    print(truthnames)
    bins = np.linspace(0, 1, 70)
    # The per-class weights do not depend on which score is plotted.
    truth_weights = [xdf['truth' + tru].values for tru in truthnames]

    for predict in truthnames:
        fig = plt.figure(figsize=(10, 7))
        preds = [xdf['predict' + predict].values] * len(truthnames)
        # NOTE(review): `normed` is kept for old matplotlib releases; newer
        # ones dropped it in favour of `density=True` - confirm the target
        # matplotlib version before switching.
        plt.hist(preds,
                 bins=bins,
                 weights=truth_weights,
                 alpha=0.8,
                 normed=True,
                 label=truthnames,
                 stacked=True)
        plt.xlabel("Probability " + predict)
        plt.title("Stacked Distributions")
        plt.semilogy()
        plt.legend(title="True labels:")
        plt.savefig(outputDir + '/dist' + predict + '.png', dpi=300)
        # Release the figure; otherwise one stays open per truth name.
        plt.close(fig)


def evaluate(testd, trainData, model, outputDir):
    """Predict on every test sample, store the results and plot the scores.

    Predictions are obtained file by file from the ``x<j>`` input arrays,
    while labels and spectators come from ``testd.getAllLabels()`` and
    ``testd.getAllSpectators()``. The combined table is pickled to
    ``outputDir + '/output.pkl'`` and one ``dist<name>.png`` histogram is
    saved per truth name.

    Args:
        testd: Collection object providing ``samples``, ``getSamplePath``,
            ``getAllLabels``, ``getAllSpectators`` and ``dataclass.branches``.
        trainData: Path read with ``DataCollection.readFromFile``; its used
            truth names define the truth/prediction columns.
        model: Object with a ``predict`` method accepting the list of input
            arrays of one file.
        outputDir: Directory receiving ``output.pkl`` and the plots.

    Returns:
        The DataFrame that was written to disk.

    Raises:
        ValueError: If ``testd`` lists no samples.
    """
    from DeepJetCore.DataCollection import DataCollection

    NENT = 1  # Row stride: values > 1 keep only every NENT-th event.

    predict_parts = []
    for s in testd.samples:
        # Read-only access; the context manager closes the handle. `[()]`
        # copies each dataset into memory before the file is closed.
        with h5py.File(testd.getSamplePath(s), 'r') as h5File:
            n_inputs = h5File['x_listlength'][()][0]
            features_val = [h5File['x%i' % j][()] for j in range(0, n_inputs)]
        # Same stride as the labels below so the rows stay aligned.
        predict_parts.append(model.predict(features_val)[::NENT])

    if not predict_parts:
        raise ValueError("test collection contains no samples")
    predict_test = np.concatenate(predict_parts)

    # Values
    labels_val = testd.getAllLabels()[0][::NENT, :]
    all_spectators = testd.getAllSpectators()
    spectators_val = all_spectators[0][::NENT, 0, :]
    raw_features_val = all_spectators[-1][::NENT, 0, :]

    # Column names
    print(testd.dataclass.branches)
    feature_names = testd.dataclass.branches[1]
    spectator_names = testd.dataclass.branches[0]

    # Truth names are taken from the training collection.
    traind = DataCollection()
    traind.readFromFile(trainData)
    truthnames = traind.getUsedTruth()

    # Store features. The names are passed as a flat list: wrapping them in
    # another list makes pandas treat them as MultiIndex levels.
    df = pd.DataFrame(spectators_val, columns=spectator_names)

    for i, tname in enumerate(feature_names):
        df[tname] = raw_features_val[:, i]

    # Add truth labels and predictions
    print(truthnames)
    print(predict_test.shape)
    for i, tname in enumerate(truthnames):
        df['truth' + tname] = labels_val[:, i]
        df['predict' + tname] = predict_test[:, i]

    df.to_pickle(outputDir + '/output.pkl')
    print(df)
    # Read the pickle back and print it as a round-trip check.
    dt = pd.read_pickle(outputDir + '/output.pkl')
    print(dt)

    _plot_score_distributions(df, truthnames, outputDir)

    print("Testing prediction:")
    print("Total:  %d" % len(predict_test[:, 0]))
    for lab in truthnames:
        print("%s : %s" % (lab, df['truth' + lab].values.sum()))
    print("Finished")
    return df