Python get_h5_generator Examples, orcanet.h5_generator.get_h5_generator Python Examples

Example #1

0

Show file

File: backend.py Project: KM3NeT/OrcaNet

def train_model(orga, model, epoch, batch_logger=False):
    """
    Run one training pass of a model over a single training file.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model : keras.Model
        A compiled keras model.
    epoch : tuple
        Current epoch (index 0) and the no of the file to train
        on (index 1).
    batch_logger : bool
        If True, add the orcanet BatchLogger to the callbacks.

    Returns
    -------
    history : dict
        Maps the name of each loss/metric reported by keras to its value
        for this file.

    """
    callbacks = [
        nn_utilities.RaiseOnNaN(),
        nn_utilities.TimeModel(print_func=orga.io.print_log),
    ]
    if batch_logger:
        callbacks.append(BatchLogger(orga, epoch))

    # The user may configure either several callbacks (an iterable) or
    # a single callback object; the latter makes extend raise TypeError.
    user_callbacks = orga.cfg.callback_train
    if user_callbacks is not None:
        try:
            callbacks.extend(user_callbacks)
        except TypeError:
            callbacks.append(user_callbacks)

    epoch_no, file_no = epoch[0], epoch[1]
    training_generator = h5_generator.get_h5_generator(
        orga,
        files_dict=orga.io.get_file("train", file_no),
        f_size=orga.cfg.n_events,
        phase="training",
        zero_center=orga.cfg.zero_center_folder is not None,
        shuffle=orga.cfg.shuffle_train,
    )
    # As of tf 2.5, ragged Tensors can only be fed to fit via a
    # tf dataset, not via a generator, so wrap the generator.
    dataset = h5_generator.make_dataset(training_generator)

    # initial_epoch/epochs are chosen so that exactly one keras epoch runs
    fit_result = model.fit(
        dataset,
        steps_per_epoch=len(training_generator),
        epochs=epoch_no,
        initial_epoch=epoch_no - 1,
        callbacks=callbacks,
        max_queue_size=orga.cfg.max_queue_size,
        verbose=orga.cfg.verbose_train,
    )
    training_generator.print_timestats(print_func=orga.io.print_log)

    # keras stores one value per epoch for each loss/metric; since only
    # one epoch was run, take the single entry of each list
    return {name: values[0] for name, values in fit_result.history.items()}

Example #2

0

Show file

    def test_batch(self):
        """
        The generator must yield samples 0-1 and then 2-3 of the stored
        reference content as (xs, ys) batches, and be exhausted afterwards.
        """
        batches = iter(get_h5_generator(self.orga, self.filepaths_file_1))

        ctnt_a = self.train_A_file_1_ctnt
        ctnt_b = self.train_B_file_1_ctnt

        # Build every expected batch before reading from the generator.
        # Index 0 of the content is used for the inputs, index 1 is what
        # gets passed through label_modifier.
        expected_batches = [
            (
                {"input_A": ctnt_a[0][rows], "input_B": ctnt_b[0][rows]},
                label_modifier({"y_values": ctnt_a[1][rows]}),
            )
            for rows in (slice(None, 2), slice(2, 4))
        ]

        for expected_xs, expected_ys in expected_batches:
            xs, ys = next(batches)
            assert_dict_arrays_equal(xs, expected_xs)
            assert_dict_arrays_equal(ys, expected_ys)

        # no third batch may be available
        with self.assertRaises(StopIteration):
            next(batches)

Example #3

0

Show file

    def test_batch_mc_infos(self):
        """
        With keras_mode=False the generator must yield info blobs whose
        "xs", "ys" and "y_values" entries match the reference content
        for samples 0-1 and then 2-3.
        """
        batches = iter(
            get_h5_generator(
                self.orga, self.filepaths_file_1, keras_mode=False))

        ctnt_a = self.train_A_file_1_ctnt
        ctnt_b = self.train_B_file_1_ctnt

        # Build all expected blobs before reading from the generator.
        expected_blobs = []
        for rows in (slice(None, 2), slice(2, 4)):
            expected_blobs.append({
                "xs": {
                    "input_A": ctnt_a[0][rows],
                    "input_B": ctnt_b[0][rows],
                },
                "ys": label_modifier({"y_values": ctnt_a[1][rows]}),
                # the unmodified structured array of the same rows
                "y_values": ctnt_a[1][rows],
            })

        for expected in expected_blobs:
            info_blob = next(batches)
            assert_dict_arrays_equal(info_blob["xs"], expected["xs"])
            assert_dict_arrays_equal(info_blob["ys"], expected["ys"])
            assert_equal_struc_array(
                info_blob["y_values"], expected["y_values"])

Example #4

0

Show file

    def test_batch_sample_modifier(self):
        """
        A sample modifier set in the config must be applied to the inputs
        of every batch: here each input array is expected to be doubled,
        while the labels stay as produced by label_modifier.
        """
        def sample_modifier(info_blob):
            """Return the x_values of the blob with every array doubled."""
            return {
                key: arr * 2 for key, arr in info_blob["x_values"].items()
            }

        # the modifier has to be configured before the generator is built
        self.orga.cfg.sample_modifier = sample_modifier
        batches = iter(get_h5_generator(self.orga, self.filepaths_file_1))

        ctnt_a = self.train_A_file_1_ctnt
        ctnt_b = self.train_B_file_1_ctnt

        # Build every expected batch before reading from the generator.
        expected_batches = [
            (
                {
                    "input_A": ctnt_a[0][rows] * 2,
                    "input_B": ctnt_b[0][rows] * 2,
                },
                label_modifier({"y_values": ctnt_a[1][rows]}),
            )
            for rows in (slice(None, 2), slice(2, 4))
        ]

        for expected_xs, expected_ys in expected_batches:
            xs, ys = next(batches)
            assert_dict_arrays_equal(xs, expected_xs)
            assert_dict_arrays_equal(ys, expected_ys)

Example #5

0

Show file

File: test_h5_generator.py Project: kabartay/OrcaNet

    def test_batch_zero_center(self):
        """
        With zero_center=True, the mean returned by orga.get_xs_mean must
        be subtracted from the inputs of every batch.
        """
        # a constant mean of 0.5 for every input, in the shape of n_bins
        xs_mean = {
            name: np.full(shape, 0.5) for name, shape in self.n_bins.items()
        }
        # replace the real mean lookup by the constant mean from above
        self.orga.get_xs_mean = MagicMock(return_value=xs_mean)

        batches = iter(
            get_h5_generator(
                self.orga, self.filepaths_file_1, zero_center=True))

        ctnt_a = self.train_A_file_1_ctnt
        ctnt_b = self.train_B_file_1_ctnt

        # Build every expected batch before reading from the generator.
        # The second batch covers everything from sample 2 onwards.
        expected_batches = [
            (
                {
                    "input_A": np.subtract(
                        ctnt_a[0][rows], xs_mean["input_A"]),
                    "input_B": np.subtract(
                        ctnt_b[0][rows], xs_mean["input_B"]),
                },
                label_modifier({"y_values": ctnt_a[1][rows]}),
            )
            for rows in (slice(None, 2), slice(2, None))
        ]

        for expected_xs, expected_ys in expected_batches:
            xs, ys = next(batches)
            assert_dict_arrays_equal(xs, expected_xs)
            assert_dict_arrays_equal(ys, expected_ys)

Example #6

0

Show file

def validate_model(orga, model):
    """
    Evaluate a model on every validation file and combine the results.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model : keras.Model
        A compiled keras model.

    Returns
    -------
    history : dict
        Maps each name in model.metrics_names to its value, averaged
        over all validation files (weighted with the file sizes).

    """
    f_sizes = orga.io.get_file_sizes("val")

    # one list of loss/metric values per validation file
    per_file_results = []
    for file_idx, files_dict in enumerate(orga.io.yield_files("val")):
        events_in_file = f_sizes[file_idx]
        if orga.cfg.n_events is None:
            f_size = events_in_file
        else:
            f_size = orga.cfg.n_events  # for testing purposes

        val_generator = get_h5_generator(
            orga,
            files_dict,
            f_size=f_size,
            phase="validation",
            zero_center=orga.cfg.zero_center_folder is not None,
        )
        result = model.evaluate(
            val_generator,
            steps=int(f_size / orga.cfg.batchsize),
            verbose=orga.cfg.verbose_val,
            max_queue_size=orga.cfg.max_queue_size,
        )
        # evaluate gives a bare value instead of a list if there is
        # only one entry; always store a list
        if isinstance(result, list):
            per_file_results.append(result)
        else:
            per_file_results.append([result])

    averaged = weighted_average(per_file_results, f_sizes)

    # evaluate returns plain values (unlike the dict from fitting),
    # so attach the metric names here
    return dict(zip(model.metrics_names, averaged))

Example #7

0

Show file

File: backend.py Project: KM3NeT/OrcaNet

def validate_model(orga, model):
    """
    Evaluate a model on every validation file and combine the results.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model : keras.Model
        A compiled keras model.

    Returns
    -------
    history : dict
        Maps each name in model.metrics_names to its value, averaged
        over all validation files (weighted with the file sizes).

    """
    f_sizes = orga.io.get_file_sizes("val")

    # one list of loss/metric values per validation file
    per_file_results = []
    for files_dict in orga.io.yield_files("val"):
        val_generator = h5_generator.get_h5_generator(
            orga,
            files_dict,
            f_size=orga.cfg.n_events,
            phase="validation",
            zero_center=orga.cfg.zero_center_folder is not None,
        )
        # As of tf 2.5, ragged Tensors can only be fed in via a
        # tf dataset, not via a generator, so wrap the generator.
        dataset = h5_generator.make_dataset(val_generator)
        result = model.evaluate(
            dataset,
            steps=len(val_generator),
            verbose=orga.cfg.verbose_val,
            max_queue_size=orga.cfg.max_queue_size,
        )
        # evaluate gives a bare value instead of a list if there is
        # only one entry; always store a list
        per_file_results.append(
            result if isinstance(result, list) else [result])

    averaged = weighted_average(per_file_results, f_sizes)

    # evaluate returns plain values (unlike the dict from fitting),
    # so attach the metric names here
    return dict(zip(model.metrics_names, averaged))

Example #8

0

Show file

def train_model(orga, model, epoch, batch_logger=False):
    """
    Run one training pass of a model over a single training file.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model : keras.Model
        A compiled keras model.
    epoch : tuple
        Current epoch (index 0) and the no of the file to train
        on (index 1).
    batch_logger : bool
        If True, add the orcanet BatchLogger to the callbacks.

    Returns
    -------
    history : dict
        Maps the name of each loss/metric reported by keras to its value
        for this file.

    """
    epoch_no, file_no = epoch[0], epoch[1]
    files_dict = orga.io.get_file("train", file_no)

    # n_events (if set) overrides the actual size of the file
    f_size = orga.cfg.n_events  # for testing purposes
    # TODO Can throw an error if n_events is larger than the file
    if f_size is None:
        # file numbers start at 1, the list of sizes at 0
        f_size = orga.io.get_file_sizes("train")[file_no - 1]

    callbacks = [
        nn_utilities.RaiseOnNaN(),
        nn_utilities.TimeModel(print_func=orga.io.print_log),
    ]
    if batch_logger:
        callbacks.append(BatchLogger(orga, epoch))

    # The user may configure either several callbacks (an iterable) or
    # a single callback object; the latter makes extend raise TypeError.
    user_callbacks = orga.cfg.callback_train
    if user_callbacks is not None:
        try:
            callbacks.extend(user_callbacks)
        except TypeError:
            callbacks.append(user_callbacks)

    training_generator = get_h5_generator(
        orga,
        files_dict,
        f_size=f_size,
        phase="training",
        zero_center=orga.cfg.zero_center_folder is not None,
        shuffle=orga.cfg.shuffle_train,
    )

    # initial_epoch/epochs are chosen so that exactly one keras epoch runs
    fit_result = model.fit(
        training_generator,
        steps_per_epoch=int(f_size / orga.cfg.batchsize),
        epochs=epoch_no,
        initial_epoch=epoch_no - 1,
        callbacks=callbacks,
        max_queue_size=orga.cfg.max_queue_size,
        verbose=orga.cfg.verbose_train,
    )
    training_generator.print_timestats(print_func=orga.io.print_log)

    # keras stores one value per epoch for each loss/metric; since only
    # one epoch was run, take the single entry of each list
    return {name: values[0] for name, values in fit_result.history.items()}

Example #9

0

Show file

def h5_inference(orga,
                 model,
                 files_dict,
                 output_path,
                 samples=None,
                 use_def_label=True):
    """
    Let a model predict on all samples in a h5 file, and save it as a h5 file.

    Per default, the h5 file will contain a datagroup y_values straight from
    the given files, as well as two datagroups per output layer of the network,
    which have the labels and the predicted values in them as numpy arrays,
    respectively.

    The results are written batch by batch to a temporary file in the
    directory of ``output_path``. Only after all batches are written is
    the temporary file renamed to ``output_path``.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model : keras.Model
        Trained Keras model of a neural network.
    files_dict : dict
        Dict mapping model input names to h5 file paths.
    output_path : str
        Name of the output h5 file containing the predictions.
    samples : int, optional
        Dont use all events in the file, but instead only the given number.
        Only whole batches are predicted (the number is rounded down to
        a multiple of the batchsize), and never more batches than the
        generator provides.
    use_def_label : bool
        If True and no label modifier is given by user, use the default
        label modifier instead of none.

    Raises
    ------
    FileExistsError
        If ``output_path`` exists once the predictions are done. The
        finished temporary file is kept in this case.

    """
    file_size = h5_get_number_of_rows(list(files_dict.values())[0],
                                      datasets=[orga.cfg.key_x_values])
    generator = get_h5_generator(
        orga,
        files_dict,
        zero_center=orga.cfg.zero_center_folder is not None,
        keras_mode=False,
        use_def_label=use_def_label,
        phase="inference",
    )
    itergen = iter(generator)

    n_batches = len(generator)
    if samples is None:
        steps = n_batches
    else:
        # Cap at the number of available batches: asking the iterator for
        # more would raise StopIteration in the middle of the prediction
        # and leave an unfinished temporary file behind.
        steps = min(int(samples / orga.cfg.batchsize), n_batches)
    # log the progress about ten times, but at least every 1000 and at
    # most every 100 steps
    print_every = max(100, min(int(round(steps / 10, -2)), 1000))
    model_time_total = 0.

    timestamp = time.strftime("%d-%m-%Y-%H-%M-%S", time.gmtime())
    temp_output_path = os.path.join(
        os.path.dirname(output_path),
        f"temp_{os.path.basename(output_path)}_{timestamp}")
    print(f"Creating temporary file {temp_output_path}")
    # mode 'x': create the file, fail if it exists already
    with h5py.File(temp_output_path, 'x') as h5_file:
        # add version and paths of h5files
        h5_file.attrs.create("orcanet", orcanet.__version__)
        for input_key, file in files_dict.items():
            h5_file.attrs.create(f"orcanet_inp_{input_key}", file)

        for s in range(steps):
            if s % print_every == 0:
                print(f"Predicting in step {s}/{steps} ({s / steps:0.2%})")

            info_blob = next(itergen)

            # only the time spent in the model itself is measured
            start_time = time.time()
            y_pred = model.predict_on_batch(info_blob["xs"])
            model_time_total += time.time() - start_time

            if not isinstance(y_pred, list):
                # if only one output, transform to a list
                y_pred = [y_pred]
            # transform y_pred to dict
            y_pred = {
                out: y_pred[i]
                for i, out in enumerate(model.output_names)
            }
            info_blob["y_pred"] = y_pred

            if info_blob.get("org_batchsize") is not None:
                _slice_to_size(info_blob)

            if orga.cfg.dataset_modifier is None:
                datasets = dataset_modifiers.as_array(info_blob)
            else:
                datasets = orga.cfg.dataset_modifier(info_blob)

            for dataset_name, data in datasets.items():
                if s == 0:
                    # create the datasets in the first step, resizable
                    # along axis 0 up to the number of rows in the file
                    h5_file.create_dataset(
                        dataset_name,
                        data=data,
                        maxshape=(file_size, ) + data.shape[1:],
                        chunks=True,  # (batchsize,) + data.shape[1:]
                        compression="gzip",
                        compression_opts=1,
                    )
                else:
                    # append data at the end of the dataset; an explicit
                    # start offset also works for an empty batch, where
                    # a negative slice [-0:] would select everything
                    dset = h5_file[dataset_name]
                    n_rows_before = dset.shape[0]
                    dset.resize(n_rows_before + data.shape[0], axis=0)
                    dset[n_rows_before:] = data

    if os.path.exists(output_path):
        raise FileExistsError(
            f"{output_path} exists already! But file {temp_output_path} "
            f"is finished and can be safely used.")
    os.rename(temp_output_path, output_path)
    generator.print_timestats()
    print("Statistics of model prediction:")
    print(f"\tTotal time:\t{model_time_total / 60:.2f} min")
    if steps > 0:
        # no per-batch time if nothing was predicted (would divide by zero)
        print(f"\tPer batch:\t{1000 * model_time_total / steps:.5} ms")

Example #10

0

Show file

 def test_y_field_names(self):
     """
     The y_values of a batch must only have the fields that are set
     as y_field_names in the config.
     """
     expected_names = ("mc_A", )
     self.orga.cfg.y_field_names = expected_names

     generator = get_h5_generator(
         self.orga, self.filepaths_file_1, keras_mode=False)
     first_batch = generator[0]
     self.assertTupleEqual(
         first_batch["y_values"].dtype.names, expected_names)