Example #1
def main():

    parser = argparse.ArgumentParser(
        description="Train a Keras model over you pre-processed files")
    parser.add_argument(
        "-i",
        "--input",
        help=
        "Provide input, pre-processed HDF5 file with training, validation, and scaling data",
        required=True)
    parser.add_argument(
        "-m",
        "--model-name",
        help="Provide the name of the model to build and train",
        required=True)
    parser.add_argument(
        "-o",
        "--outdir",
        help="Provide an output directory do dump files [default: ./]",
        default="./")
    parser.add_argument("-n",
                        "--name",
                        help="Provide output filename descriptor",
                        default="test")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        default=False,
                        help="Be loud about it")
    parser.add_argument("--regress",
                        help="Provide a variable to regress on",
                        default="")
    args = parser.parse_args()

    training_samples, data_scaler = load_input_file(args)
    if len(training_samples) < 2:
        print("ERROR: not enough training samples loaded for training")
        sys.exit(1)
    print("Pre-processed file contained {} samples: {}, {}".format(
        len(training_samples), [s.name() for s in training_samples],
        [s.class_label() for s in training_samples]))

    input_features, targets, regression_targets = build_combined_input(
        training_samples,
        data_scaler=data_scaler,
        scale=True,
        regress_var=args.regress)

    if len(regression_targets) == 0:
        regression_targets = []

    #model = None
    #if args.regress == "" :
    #    model = build_keras_model( len(data_scaler.feature_list()), len(training_samples) )
    #else :
    #    model = build_keras_model_regression( len(data_scaler.feature_list()), len(training_samples) )

    ## TODO : save the fit_history object for later use
    #model, fit_history = train(len(training_samples), input_features, targets, model, regression_targets = regression_targets)
    n_inputs = len(data_scaler.feature_list())
    n_outputs = len(training_samples)
    my_model = wwbb_models.get_model(args.model_name)
    model, fit_history = build_and_train(my_model, n_inputs, n_outputs,
                                         input_features, targets)

    # dump the fit history to file for later use
    #with open("./ml_training_apr3_split_4/fit_history_{}.pkl".format(args.name), 'wb') as pickle_history :
    with open("fit_history_{}.pkl".format(args.name), 'wb') as pickle_history:
        pickle.dump(fit_history.history, pickle_history)

    # save
    job_suff = "_{}".format(args.name) if args.name else ""
    arch_name = "architecture{}.json".format(job_suff)
    weights_name = "weights{}.h5".format(job_suff)

    if args.outdir != "":
        mkdir_p(args.outdir)
    arch_name = "{}/{}".format(args.outdir, arch_name)
    weights_name = "{}/{}".format(args.outdir, weights_name)

    print("Saving architecture to: {}".format(os.path.abspath(arch_name)))
    print("Saving weights to     : {}".format(os.path.abspath(weights_name)))
    with open(arch_name, 'w') as arch_file:
        arch_file.write(model.to_json())
    model.save_weights(weights_name)
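
# ----------------------------------------------------------------------------
# Hedged sketch, not part of the original script: one way to reload the
# architecture/weights pair that main() saves above, using only the standard
# Keras calls model_from_json() and load_weights(). The helper name
# load_trained_model is made up here for illustration.
from keras.models import model_from_json

def load_trained_model(arch_path, weights_path):
    # rebuild the network topology from the JSON architecture dump
    with open(arch_path, 'r') as arch_file:
        model = model_from_json(arch_file.read())
    # restore the trained parameters from the HDF5 weights file
    model.load_weights(weights_path)
    return model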
Example #2
def extract_scale_dataset(args,
                          ignore_features=['eventweight', 'eventNumber']):
    """
    From the user-provided path to the input HDF5 file produced
    by 'preprocess.py', extract the scaling dataset and store
    in a new output file. Having the scaling data contained
    in the file made by 'preprocess.py' is useful as it keeps
    it near the data that produced it (makes it easier for training
    and validation) but when expanding out and using the network
    on general datasets, it should be nicer to have this separate.

    Currently this does not worry about handling "features to ignore", like
    event weights. The DataScaler which loads this should handle this.

    Args:
        args : command-line user-input
    """

    scaling_group_name = "scaling"
    scaling_dataset_name = "scaling_data"

    scaling_dataset = None

    with h5py.File(args.input, 'r', libver='latest') as input_file:

        if scaling_group_name in input_file:
            scaling_group = input_file[scaling_group_name]
            scaling_dataset = scaling_group[scaling_dataset_name]
        else:
            raise Exception("Input file (={}) does not contained the expected \
                scaling dataset".format(args.input))

        output_name = args.input.split("/")[-1].replace(".h5", "").replace(
            ".hdf5", "")
        output_name += "_scaling_data"
        if args.outdir != "":
            mkdir_p(args.outdir)
        output_name = "{}/{}".format(os.path.abspath(args.outdir), output_name)

        if args.to_json:
            output_json_name = output_name + ".json"

            # the fields are named in a specific way; if they are not
            # present as expected, then exit
            if 'name' not in scaling_dataset.dtype.fields.keys():
                print("ERROR 'name' field not in scaling dataset")
                sys.exit()
            if 'mean' not in scaling_dataset.dtype.fields.keys():
                print("ERROR 'mean' field not in scaling dataset")
                sys.exit()
            if 'scale' not in scaling_dataset.dtype.fields.keys():
                print("ERROR 'scale' field not in scaling dataset")
                sys.exit()

            import json
            variables = scaling_dataset['name']
            scales = scaling_dataset['scale']
            offsets = scaling_dataset['mean']

            jdata = {}
            jdata["variables"] = []
            for ivar, varname in enumerate(variables):
                jdata["variables"].append({
                    "name": varname,
                    "offset": -1.0 * offsets[ivar],
                    "scale": 1.0 / scales[ivar]
                })

            with open(output_json_name, 'w') as jsonfile:
                json.dump(jdata, jsonfile)

        output_name += ".h5"
        with h5py.File(output_name, 'w') as output_file:
            scaling_group = output_file.create_group(scaling_group_name)

            # store the name of the original input file so that we can (loosely)
            # correlate it with this output file in case this output file's
            # name drastically differs
            scaling_group.attrs['original_input_file'] = args.input

            out_ds = scaling_group.create_dataset(scaling_dataset_name,
                                                  shape=scaling_dataset.shape,
                                                  dtype=scaling_dataset.dtype,
                                                  data=scaling_dataset,
                                                  maxshape=(None, ))
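
# ----------------------------------------------------------------------------
# Hedged sketch, not part of the original script: how a downstream consumer
# might apply the JSON scaling file written above. Per extract_scale_dataset(),
# the JSON stores offset = -mean and scale = 1/scale, so (x + offset) * scale
# reproduces the (x - mean) / scale standardization used elsewhere in these
# examples. The helper name apply_json_scaling and the assumption that the
# feature columns follow the order of the "variables" list are illustrative.
import json

import numpy as np

def apply_json_scaling(features, json_path):
    # features: 2D array whose columns are ordered like the "variables" list
    with open(json_path, 'r') as jsonfile:
        jdata = json.load(jsonfile)
    offsets = np.array([v["offset"] for v in jdata["variables"]])
    scales = np.array([v["scale"] for v in jdata["variables"]])
    # broadcast over rows: equivalent to (x - mean) / scale
    return (features + offsets) * scales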
Example #3
def dump_scores(input_file, model, data_scaler, args):
    """
    From the input HDF5 file, go through it and get the NN output
    for the features, storing them to a single output file whose
    name is based on the input filename.

    Args:
        input_file : input filename for HDF5 file to be opened and processed
        model : loaded Keras model
        data_scaler : loaded DataScaler object used to scale the input features
            prior to network evaluation
        args : command line inputs
    """

    outname = input_file.split("/")[-1].replace(".h5", "").replace(".hdf5", "")
    outname += "_scores.h5"
    if args.outdir != "":
        mkdir_p(args.outdir)
    outname = "{}/{}".format(args.outdir, outname)

    #out_ds_created = False
    #out_ds = None

    #gen = chunk_generator(input_file, dataset_name = args.dataset)
    #chunk = next(gen)
    #chunk = chunk [ (chunk['nBJets']==2) ]
    #row_count = chunk.shape[0]

    #weights = chunk['eventweight']
    #input_features = chunk[data_scaler.feature_list()]
    #input_features = floatify(chunk[data_scaler.feature_list()], data_scaler.feature_list())
    #input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
    #scores = model.predict(input_features)
    #n_outputs = scores.shape[1]

    #ds = np.array( list(weights), dtype = [('eventweight', float)])
    #for io in range(n_outputs) :
    #    ds = recfunctions.append_fields( ds , names = 'nn_score_{}'.format(io), data = scores[:,io], dtypes = float )
    #dtype = ds.dtype
    #row_count = ds.shape[0]

    dataset_id = 0
    with h5py.File(outname, 'w', libver='latest') as outfile:

        for chunk in chunk_generator(input_file, dataset_name=args.dataset):

            # apply the selection here
            chunk = chunk[(chunk['nBJets'] >= 1) & (
                chunk['HT2Ratio'] > 0.5
            )]  # & (chunk['mt2_bb'] > 65) & (chunk['l1_pt']>20.) & (chunk['mll']>20.) ]
            #chunk = chunk[ (chunk['nBJets'] >= 1) & (chunk['mt2_bb'] > 55) & (chunk['l1_pt']>20.) & (chunk['mll']>20.) ]
            #chunk = chunk[ (chunk['nBJets'] >= 1) & (chunk['mt2_bb'] > 65) & (chunk['HT2Ratio']>0.5) ]
            if chunk.size == 0: continue

            weights = chunk['eventweight']
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features,
                                      data_scaler.feature_list())
            input_features = (input_features -
                              data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)
            n_outputs = scores.shape[1]
            discriminants = build_discriminant_array(scores, n_outputs)

            ds = np.array(list(weights), dtype=[('eventweight', float)])
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds,
                                                names='nn_score_{}'.format(io),
                                                data=scores[:, io],
                                                dtypes=np.float64)
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds,
                                                names='nn_disc_{}'.format(io),
                                                data=discriminants[io],
                                                dtypes=np.float64)
            maxshape = (None, ) + ds.shape[1:]

            dsname = "nn_scores_{}".format(dataset_id)
            out_ds = outfile.create_dataset(dsname,
                                            shape=ds.shape,
                                            maxshape=maxshape,
                                            chunks=ds.shape,
                                            dtype=ds.dtype)
            out_ds[:] = ds
            dataset_id += 1

    print(" > output saved : {}".format(os.path.abspath(outname)))
def make_nn_roc_curve(output_scores=None,
                      samples=[],
                      inputs=None,
                      targets=None,
                      signal_class=0,
                      args=None):

    class_labels = set(targets)
    targets_list = list(targets)
    nn_scores_dict = {}

    names = {}
    for sample in samples:
        names[sample.class_label()] = sample.name()

    for ilabel, label in enumerate(class_labels):
        left = targets_list.index(label)
        right = len(targets_list) - 1 - targets_list[::-1].index(label)
        nn_scores_dict[label] = output_scores[left:right + 1]

    lowbin = 0
    highbin = 1

    edges = np.concatenate([[-np.inf],
                            np.linspace(lowbin, highbin, 500), [np.inf]])

    # we want the sample efficiency to pass the signal eff
    sample_eff = {}
    h_total = []
    w_total = []
    for label in nn_scores_dict:
        # select out the scores for class 'label' for NN output 'signal_class'
        scores = nn_scores_dict[label][:, signal_class]

        weights = sample_with_label(label, samples).eventweights
        h_nn, _ = np.histogram(scores,
                               bins=edges,
                               weights=weights.reshape((scores.shape[0], )))
        if label != signal_class:
            h_total.append(h_nn)
            w_total.append(weights)

        # We want to integrate from the high end and then flip
        # to give the yield "to the right" of the value at
        # which the integration starts, since "to the right" is
        # signal like. We also normalize to give the value as
        # a relative fraction, or efficiency, of selecting that sample at the
        # given value where we integrate from.
        eff = np.cumsum(h_nn[::-1])[::-1] / h_nn.sum()
        sample_eff[label] = eff

    summed_bkg = h_total[0]
    for h in h_total[1:]:
        summed_bkg += h
    summed_weights = w_total[0]
    for h in w_total[1:]:
        summed_weights += h
    eff_total_bkg = np.cumsum(summed_bkg[::-1])[::-1] / summed_bkg.sum()

    signal_eff = None
    bkg_eff = {}
    for e in sample_eff:
        if e == signal_class:
            signal_eff = sample_eff[e]
        else:
            bkg_eff[e] = sample_eff[e]

    fig, ax = plt.subplots(1, 1)
    for bkg_label in bkg_eff:

        bkg = bkg_eff[bkg_label]
        valid_rej = bkg > 0
        sig = np.array(signal_eff[:])

        valid_sig = (sig != 1.0)
        valid = valid_rej & valid_sig

        bkg = bkg[valid]
        sig = sig[valid]

        bkg_rej = 1 / bkg
        ax.plot(sig, bkg_rej, label=names[bkg_label])

    valid_rej_total = eff_total_bkg > 0
    sig = np.array(signal_eff[:])
    valid_sig_total = sig != 1.0
    valid_total = valid_rej_total & valid_sig_total

    bkg_total = eff_total_bkg[valid_total]
    sig_total = sig[valid_total]
    bkg_rej_total = 1 / bkg_total
    ax.plot(sig_total, bkg_rej_total, label="Total Bkg")

    ax.set_yscale('log')
    ax.set_xlabel('$hh$ efficiency', horizontalalignment='right', x=1)
    ax.set_ylabel('Background rejection, $1/\\epsilon_{bkg}$',
                  horizontalalignment='right',
                  y=1)
    ax.legend(loc='best', frameon=False)

    # save
    savename = "nn_output_ROC_{}.pdf".format(args.name)
    if args.outdir != "":
        mkdir_p(args.outdir)
    savename = "{}/{}".format(args.outdir, savename)
    fig.savefig(savename, bbox_inches='tight', dpi=200)
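
# ----------------------------------------------------------------------------
# Hedged sketch, not part of the original script: the reversed-cumulative-sum
# trick used in make_nn_roc_curve(), shown on a toy histogram. Each entry of
# toy_eff is the fraction of the total yield at or above that bin, i.e. the
# efficiency of a cut that keeps everything "to the right" of the bin edge.
import numpy as np

toy_hist = np.array([4.0, 3.0, 2.0, 1.0])                   # yields per score bin, low -> high
toy_eff = np.cumsum(toy_hist[::-1])[::-1] / toy_hist.sum()  # -> [1.0, 0.6, 0.3, 0.1]
toy_rejection = 1.0 / toy_eff[toy_eff > 0]                  # background rejection, as plotted above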
def make_discriminant_plots(model=None,
                            inputs=None,
                            samples=None,
                            targets=None,
                            args=None):

    nn_scores = model.predict(inputs)
    class_labels = set(targets)
    targets_list = list(targets)

    # index the sample names by their class label
    names = {}
    for sample in samples:
        names[sample.class_label()] = sample.name()

    discriminants = build_discriminants(scores=nn_scores,
                                        labels=class_labels,
                                        targets_list=targets_list)

    idx_map = {}
    for ilabel, label in enumerate(class_labels):
        # left-most
        left = targets_list.index(label)
        # right-most
        right = len(targets_list) - 1 - targets_list[::-1].index(label)
        idx_map[label] = [left, right + 1]

    for label in class_labels:
        fig, ax = plt.subplots(1, 1)
        #        ax.set_xlim([-40,15])
        #        ax.set_ylim([1e-2,2])
        binning = np.arange(-40, 20, 1)
        centers = (binning[1:-2] + binning[2:-1]) / 2
        ax.set_xlim((centers[0] - 0.1, centers[-1] + 0.1))
        ax.set_yscale('log')

        for sample_label in discriminants:
            left, right = idx_map[sample_label][0], idx_map[sample_label][1]
            disc_scores_for_sample = discriminants[label][left:right]

            # since we took the log_ratio, lets clear out any invalid numbers
            ok_idx = valid_idx(disc_scores_for_sample)
            disc_scores_for_sample = disc_scores_for_sample[ok_idx]
            sample_weights = sample_with_label(sample_label,
                                               samples).eventweights
            sample_weights = sample_weights[ok_idx]
            yields, _ = np.histogram(disc_scores_for_sample,
                                     bins=binning,
                                     weights=sample_weights.reshape(
                                         (disc_scores_for_sample.shape[0], )))
            yields = yields / yields.sum()
            ax.step(centers,
                    yields[1:-1],
                    label=names[sample_label],
                    where='mid')

        ax.legend(loc='best', frameon=False)

        savename = "nn_discriminant_{}_class_{}.pdf".format(
            args.name, names[label])
        if args.outdir != "":
            mkdir_p(args.outdir)
        savename = "{}/{}".format(args.outdir, savename)
        fig.savefig(savename, bbox_inches='tight', dpi=200)
def make_nn_output_plots(model=None,
                         inputs=None,
                         samples=None,
                         targets=None,
                         args=None):

    # set of scores for each label: shape = (n_samples, n_outputs)
    nn_scores = model.predict(inputs)

    class_labels = set(targets)
    targets_list = list(targets)
    nn_scores_dict = {}

    # index the sample names by their class label
    names = {}
    for sample in samples:
        names[sample.class_label()] = sample.name()

    # break up the predicted scores by the class label
    for ilabel, label in enumerate(class_labels):
        # left-most appearance of the label
        left = targets_list.index(label)
        # right-most appearance of the label
        right = len(targets_list) - 1 - targets_list[::-1].index(label)
        nn_scores_dict[label] = nn_scores[left:right + 1]

    # start plotting
    for label in class_labels:
        fig, ax = plt.subplots(1, 1)
        ax.grid(color='k',
                which='both',
                linestyle='--',
                lw=0.5,
                alpha=0.1,
                zorder=0)
        ax.set_xlabel("NN output for label {}".format(names[label]),
                      horizontalalignment='right',
                      x=1)
        #ax.set_xlim([1e-2,1.0])
        ax.set_xlim([-0.01, 1.01])
        ax.set_yscale('log')
        binning = np.arange(0, 1, 0.02)
        centers = (binning[1:-2] + binning[2:-1]) / 2
        ax.set_xlim((centers[0] - 0.1, centers[-1] + 0.1))
        for sample_label in nn_scores_dict:
            sample_scores_for_label = nn_scores_dict[sample_label][:, label]
            sample_weights = sample_with_label(sample_label,
                                               samples).eventweights

            yields, _ = np.histogram(sample_scores_for_label,
                                     bins=binning,
                                     weights=sample_weights.reshape(
                                         (sample_scores_for_label.shape[0], )))
            yields = yields / yields.sum()
            ax.step(centers,
                    yields[1:-1],
                    label=names[sample_label],
                    where='mid')

            #ax.hist(sample_scores_for_label, bins = binning, alpha = 0.3, label = names[sample_label], density = True)
        ax.legend(loc='best', frameon=False)
        savename = "nn_outputs_{}_class_{}.pdf".format(args.name, names[label])
        if args.outdir != "":
            mkdir_p(args.outdir)
        savename = "{}/{}".format(args.outdir, savename)
        fig.savefig(savename, bbox_inches='tight', dpi=200)

    return nn_scores
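
# ----------------------------------------------------------------------------
# Hedged sketch, not part of the original script: the left/right index lookup
# used in the plotting functions above assumes that each class label occupies
# one contiguous block of rows in 'targets'. A toy example of that slicing:
toy_targets = [0, 0, 0, 1, 1, 2, 2, 2, 2]
toy_label = 1
left = toy_targets.index(toy_label)                                # 3
right = len(toy_targets) - 1 - toy_targets[::-1].index(toy_label)  # 4
block = toy_targets[left:right + 1]                                # rows belonging to label 1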