def test_as_array(self):
    # The test expects as_array to forward "y_values" untouched, to emit the
    # entries of "ys" under keys prefixed with "label_", and to emit the
    # entries of "y_pred" under keys prefixed with "pred_".
    passthrough = "y_values gets simply passed forward"
    labels = {"out_A": 1, "out_B": 2}
    predictions = {"out_pred_A": 3, "out_pred_B": 4}

    # Build the expected result piece by piece: passthrough, labels, preds.
    expected = {"y_values": passthrough}
    expected.update({"label_out_A": 1, "label_out_B": 2})
    expected.update({"pred_out_pred_A": 3, "pred_out_pred_B": 4})

    result = dmods.as_array(
        {"y_values": passthrough, "ys": labels, "y_pred": predictions}
    )
    self.assertDictEqual(result, expected)
def h5_inference(orga, model, files_dict, output_path, samples=None, use_def_label=True):
    """
    Let a model predict on all samples in a h5 file, and save it as a h5 file.

    Per default, the h5 file will contain a datagroup y_values straight from
    the given files, as well as two datagroups per output layer of the
    network, which have the labels and the predicted values in them as numpy
    arrays, respectively.

    The predictions are first written to a temporary file located next to
    ``output_path`` (named ``temp_<basename>_<UTC timestamp>``), which is
    renamed to ``output_path`` only after all batches have been processed.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model : keras.Model
        Trained Keras model of a neural network.
    files_dict : dict
        Dict mapping model input names to h5 file paths.
    output_path : str
        Name of the output h5 file containing the predictions.
    samples : int, optional
        Dont use all events in the file, but instead only the given number.
        The number of predicted batches is ``int(samples / batchsize)``.
    use_def_label : bool
        If True and no label modifier is given by user, use the default
        label modifier instead of none.

    Raises
    ------
    FileExistsError
        If ``output_path`` already exists once the prediction is finished.
        The temporary file is left in place in that case.

    """
    # The number of rows of the first input file is the upper bound for the
    # length of every dataset in the output file.
    file_size = h5_get_number_of_rows(
        list(files_dict.values())[0],
        datasets=[orga.cfg.key_x_values],
    )
    generator = get_h5_generator(
        orga,
        files_dict,
        zero_center=orga.cfg.zero_center_folder is not None,
        keras_mode=False,
        use_def_label=use_def_label,
        phase="inference",
    )
    itergen = iter(generator)

    if samples is None:
        steps = len(generator)
    else:
        steps = int(samples / orga.cfg.batchsize)
    # Report progress roughly every 10% of the steps, rounded to hundreds,
    # but never more often than every 100 or less often than every 1000.
    print_every = max(100, min(int(round(steps / 10, -2)), 1000))
    model_time_total = 0.

    temp_output_path = os.path.join(
        os.path.dirname(output_path),
        "temp_" + os.path.basename(output_path) + "_"
        + time.strftime("%d-%m-%Y-%H-%M-%S", time.gmtime()))
    print(f"Creating temporary file {temp_output_path}")

    with h5py.File(temp_output_path, 'x') as h5_file:
        # add version and paths of h5files
        h5_file.attrs.create("orcanet", orcanet.__version__)
        for input_key, file in files_dict.items():
            h5_file.attrs.create(f"orcanet_inp_{input_key}", file)

        for s in range(steps):
            if s % print_every == 0:
                print(f"Predicting in step {s}/{steps} ({s / steps:0.2%})")

            info_blob = next(itergen)

            # Only the time spent in the model itself is measured here.
            start_time = time.time()
            y_pred = model.predict_on_batch(info_blob["xs"])
            model_time_total += time.time() - start_time

            if not isinstance(y_pred, list):
                # if only one output, transform to a list
                y_pred = [y_pred]
            # transform y_pred to dict
            y_pred = {
                out: y_pred[i] for i, out in enumerate(model.output_names)
            }
            info_blob["y_pred"] = y_pred

            if info_blob.get("org_batchsize") is not None:
                _slice_to_size(info_blob)

            if orga.cfg.dataset_modifier is None:
                datasets = dataset_modifiers.as_array(info_blob)
            else:
                datasets = orga.cfg.dataset_modifier(info_blob)

            if s == 0:
                # create datasets in the first step
                for dataset_name, data in datasets.items():
                    h5_file.create_dataset(
                        dataset_name,
                        data=data,
                        maxshape=(file_size, ) + data.shape[1:],
                        chunks=True,
                        compression="gzip",
                        compression_opts=1,
                    )
            else:
                for dataset_name, data in datasets.items():
                    # append data at the end of the dataset
                    dset = h5_file[dataset_name]
                    dset.resize(dset.shape[0] + data.shape[0], axis=0)
                    dset[-data.shape[0]:] = data

    if os.path.exists(output_path):
        raise FileExistsError(
            f"{output_path} exists already! But file {temp_output_path} "
            f"is finished and can be safely used.")
    os.rename(temp_output_path, output_path)

    generator.print_timestats()
    print("Statistics of model prediction:")
    print(f"\tTotal time:\t{model_time_total / 60:.2f} min")
    if steps > 0:
        print(f"\tPer batch:\t{1000 * model_time_total / steps:.5} ms")
    else:
        # No batch was predicted (e.g. samples < batchsize), so there is no
        # per-batch average; avoid dividing by zero.
        print("\tPer batch:\tn/a (no batches were predicted)")