def evaluate_tflite(self,
                    tflite_filepath: str,
                    dataset: tf.data.Dataset,
                    steps: int,
                    json_file: str = None) -> Dict[str, float]:
    """Evaluate the EfficientDet TFLite model.

    Args:
      tflite_filepath: File path to the TFLite model.
      dataset: tf.data.Dataset used for evaluation. Each element is unpacked
        as `(images, labels)`, where `labels` is indexed with the keys
        'image_scales', 'source_ids' and 'groundtruth_data'.
      steps: Number of steps to evaluate the model.
      json_file: JSON with COCO data format containing golden bounding boxes.
        Used for validation. If None, use the ground truth from the
        dataloader. Refer to
        https://towardsdatascience.com/coco-data-format-for-object-detection-a4c5eaf518c5
        for the description of COCO data format.

    Returns:
      A dict containing AP metrics.
    """
    # TODO(b/182441458): Use the task library for evaluation instead once it
    # supports python interface.
    evaluator, label_map = self._get_evaluator_and_label_map(json_file)
    dataset = dataset.take(steps)

    lite_runner = eval_tflite.LiteRunner(tflite_filepath, only_network=False)
    progbar = tf.keras.utils.Progbar(steps)

    # The box scaling factor depends only on the configured image size, so it
    # is built once instead of on every step.
    height, width = utils.parse_image_size(self.config.image_size)
    normalize_factor = tf.constant([height, width, height, width],
                                   dtype=tf.float32)

    for i, (images, labels) in enumerate(dataset):
        # Get the output result after post-processing NMS op.
        nms_boxes, nms_classes, nms_scores, _ = lite_runner.run(images)

        # CLASS_OFFSET is used since label_id for `background` is 0 in
        # label_map while it's not actually included in the model. We don't
        # need to add the offset in the Android application.
        nms_classes += postprocess.CLASS_OFFSET

        # Scale the boxes by the image height/width
        # (presumably the runner returns normalized coordinates -- confirm).
        nms_boxes *= normalize_factor
        if labels['image_scales'] is not None:
            # Two trailing axes are appended so the per-image scale
            # broadcasts against the boxes tensor.
            scales = tf.expand_dims(
                tf.expand_dims(labels['image_scales'], -1), -1)
            nms_boxes = nms_boxes * tf.cast(scales, nms_boxes.dtype)

        detections = postprocess.generate_detections_from_nms_output(
            nms_boxes, nms_classes, nms_scores, labels['source_ids'])
        detections = postprocess.transform_detections(detections)
        evaluator.update_state(labels['groundtruth_data'].numpy(),
                               detections.numpy())

        # `i` is zero-based; report the number of completed steps so the bar
        # reaches `steps` after the final batch.
        progbar.update(i + 1)

    metric_dict = self._get_metric_dict(evaluator, label_map)
    return metric_dict
def _train(dataset: tf.data.Dataset, image_shape: Tuple[int, int, int],
           epochs: int) -> tf.keras.Model:
    """Run training for the given number of epochs.

    Training resumes from the checkpoint save counter, and after every epoch
    the checkpoint, the loss history and a reconstruction preview are written
    under ``data/simple_cae``.

    Args:
        dataset (tf.data.Dataset): Training dataset.
        image_shape (Tuple[int, int, int]): Size of a single training image.
        epochs (int): Number of epochs to train.

    Returns:
        tf.keras.Model: The trained model.
    """
    out_dir = pathlib.Path("data/simple_cae")
    history_filepath = out_dir / "history.pkl"
    history_imagepath = out_dir / "history.png"
    reconstruct_filepath = out_dir / "reconstruct.png"

    model = network.Autoencoder(image_shape)
    optimizer = tf.keras.optimizers.Adam(1e-4)
    loss = tf.keras.losses.mean_squared_error

    checkpoint = Checkpoint(
        save_dir=str(out_dir / "ckpts"),
        max_to_keep=3,
        restore=True,
        model=model,
        optimizer=optimizer,
    )
    epoch_history = history.restore(history_filepath)

    # A single element kept aside to visualise reconstructions each epoch.
    input_example = list(dataset.take(1))[-1]

    first_epoch = checkpoint.save_counter()
    progress_bar = tqdm(range(first_epoch, epochs))
    for epoch in progress_bar:
        # Learning: one pass over the dataset.
        batch_history = history.Batch()
        for batch in dataset:
            model.train_step(batch, loss, optimizer, batch_history)

        # Save results.
        checkpoint.save()
        batch_history.result()
        epoch_history.result(batch_history)
        history.save(epoch_history, history_filepath)

        # Show results.
        progress_bar.set_description(
            f"epoch: {epoch}, {epoch_history.get_latest()}")
        history.show_image(epoch_history, filepath=history_imagepath)
        reconstructed = network.reconstruct(model, input_example)
        visualize.show_images(
            input_example,
            reconstructed,
            image_shape,
            reconstruct_filepath,
        )

    return model
def convert_snippets_to_character_sequence_examples(
        dataset: tf.data.Dataset,
        batch_size: int,
        epochs: int,
        shuffle_buffer_size: int = 50,
        sequence_length: int = SEQUENCE_LENGTH,
        max_batches_per_client: int = -1) -> tf.data.Dataset:
    """Convert string snippets into input/output character ID sequences.

    Args:
      dataset: the `tf.data.Dataset` to apply preprocessing to.
      batch_size: the number of examples per yielded batch.
      epochs: the number of times the dataset is repeated.
      shuffle_buffer_size: Buffer size for shuffling the dataset. If
        nonpositive, no shuffling occurs.
      sequence_length: the length of each example in the batch.
      max_batches_per_client: If set to a positive integer, the maximum number
        of batches in each client's dataset. The value is passed straight to
        `Dataset.take`, so the default of -1 keeps every batch.

    Returns:
      A `tf.data.Dataset` yielding `(sequence of character IDs, sequence of
      character IDs)` where each sequence has `sequence_length` values.
    """
    autotune = tf.data.experimental.AUTOTUNE
    # One extra token per sequence so it can be split into input and target.
    tokens_per_example = sequence_length + 1
    to_tokens = _build_tokenize_fn(split_length=tokens_per_example)

    snippets = dataset.repeat(epochs)
    if shuffle_buffer_size > 0:
        snippets = snippets.shuffle(shuffle_buffer_size)

    # Convert snippets to int64 tokens and pad, then separate into individual
    # tokens.
    token_stream = snippets.map(to_tokens, num_parallel_calls=autotune)
    token_stream = token_stream.unbatch()

    # Join into sequences of the desired length. The preceding
    # map(to_tokens, ...) is expected to yield a token count divisible by
    # sequence_length + 1, so no batch dropping is expected.
    sequences = token_stream.batch(tokens_per_example, drop_remainder=True)

    # Batch sequences together for mini-batching purposes.
    mini_batches = sequences.batch(batch_size)

    # Convert batches into training examples.
    examples = mini_batches.map(_split_target, num_parallel_calls=autotune)

    # Take a maximum number of batches.
    return examples.take(max_batches_per_client)
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    """Turn a dataset of tokens into shuffled (window, next-id) batches.

    Elements are mapped through the module-level lookup `table`, flattened
    with `unbatch`, and cut into sliding windows of `window_size + 1` ids
    (shift 1, incomplete windows dropped). Each window is split into its
    first `window_size` ids (the input) and its last id minus one (the
    target).

    Args:
      dataset: source `tf.data.Dataset`; its elements are passed to
        `table.lookup`.
      window_size: number of ids in each input window.
      batch_size: number of examples per output batch.

    Returns:
      A batched dataset with a prefetch of one batch.
    """
    span = window_size + 1

    def _lookup(x):
        return table.lookup(x)

    def _window_to_tensor(window):
        # Each window is itself a dataset; batching it yields one tensor.
        return window.batch(span)

    def _split_input_target(x):
        return x[:-1], x[-1] - 1

    ids = dataset.map(_lookup).unbatch()
    windows = ids.window(span, shift=1, drop_remainder=True)
    sequences = windows.flat_map(_window_to_tensor)
    examples = sequences.map(_split_input_target).shuffle(10000)
    return examples.batch(batch_size).prefetch(1)
def compute_predictions(
    model: PredictionModel, dataset: tf.data.Dataset,
    strategy: tf.distribute.Strategy, batch_size: int
) -> Iterator[Tuple[types.ModelPredictions, types.Features]]:
    """Yield the predictions of the model on the given dataset.

    The dataset is batched, sharded by data across replicas and run through
    `strategy.run`. Wall-clock time per batch is divided evenly among the
    examples of that batch.

    Args:
      model: A function that takes tensor-valued features and returns a vector
        of predictions.
      dataset: The dataset that the function consumes to produce the
        predictions. Elements are indexed with "metadata" (and with "image"
        under a TPU strategy).
      strategy: The distribution strategy to use when computing.
      batch_size: The batch size that should be used.

    Yields:
      Pairs of model predictions and the corresponding metadata.
    """
    with strategy.scope():
        shard_options = tf.data.Options()
        shard_options.experimental_distribute.auto_shard_policy = (
            tf.data.experimental.AutoShardPolicy.DATA)
        batched = dataset.batch(batch_size)
        batched = batched.prefetch(tf.data.experimental.AUTOTUNE)
        batched = batched.with_options(shard_options)

        on_tpu = isinstance(strategy, tf.distribute.experimental.TPUStrategy)

        for features in strategy.experimental_distribute_dataset(batched):
            time_start = time.time()
            if on_tpu:
                # TODO(josipd): Figure this out better. We can't easily
                # filter, as they are PerReplica values, not tensors.
                model_inputs = {"image": features["image"]}
            else:
                model_inputs = features

            per_replica = strategy.run(model, args=(model_inputs,))
            predictions = materialize(strategy, per_replica)
            elapsed = time.time() - time_start

            n_examples = predictions.shape[0]
            seconds_per_example = elapsed / n_examples

            metadatas = materialize(strategy, features["metadata"])
            for row in range(n_examples):
                yield (
                    types.ModelPredictions(
                        predictions=[predictions[row]],
                        time_in_s=seconds_per_example),
                    _slice_dictionary(metadatas, row),
                )
def __init__(self, factory: TFToxicDataSetsFactory,
             dataset: tf.data.Dataset, size: int):
    """Wrap `dataset` as a shuffled, batched and prefetched pipeline.

    Args:
        factory: the `TFToxicDataSetsFactory` this object belongs to.
        dataset: the `tf.data.Dataset` to wrap.
        size: positive size stored on the instance as `_size`.
    """
    assert isinstance(factory, TFToxicDataSetsFactory)
    assert isinstance(dataset, tf.data.Dataset)
    assert size > 0

    self._factory = factory
    # `self.batch_size` is defined elsewhere on the class and is read here,
    # after `_factory` is set but before `_size`.
    shuffled = dataset.shuffle(1000)
    self._dataset = shuffled.batch(self.batch_size).prefetch(1)
    self._size = size
    self._batch_index = 0
def preprocess(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Apply the preprocessing to the inputs and the targets.

    Each element is a 4-tuple `(target_ghi_dummy, metadata, image,
    target_ghi)`. The image is normalized with `self.scaling_image`, the
    metadata and target with `self.scaling_ghi`, and the dummy target is
    passed through unchanged.
    """

    def _normalize_example(target_ghi_dummy, metadata, image, target_ghi):
        scaled_image = self.scaling_image.normalize(image)
        # NOTE(review): metadata shares the GHI scaler with the target --
        # confirm this is intended.
        scaled_metadata = self.scaling_ghi.normalize(metadata)
        scaled_target = self.scaling_ghi.normalize(target_ghi)
        return target_ghi_dummy, scaled_metadata, scaled_image, scaled_target

    autotune = tf.data.experimental.AUTOTUNE
    return dataset.map(_normalize_example, num_parallel_calls=autotune)
def transform_dataset(self, input_ds: tf.data.Dataset) -> tf.data.Dataset:
    """Create a dataset whose examples keep only `self.keep_keys`."""

    def filter_keys(example):
        """Copy the entries named in `self.keep_keys` into a new dict."""
        kept = {}
        for key in self.keep_keys:
            kept[key] = example[key]
        return kept

    # Map the filtering function over every example.
    return input_ds.map(
        filter_keys, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def pretext_dataset(dataset:tf.data.Dataset, start_label:int)->tf.data.Dataset:
    """Unfinished stub for building a pretext-task dataset.

    As written, this function has no `return` statement, so it returns None
    despite the `tf.data.Dataset` annotation. The filtered dataset and both
    nested helpers are defined but never used.

    Args:
        dataset: dataset whose elements are indexed with 'label' and 'image'.
        start_label: examples with `label >= start_label` pass the filter.
    """
    # Keep only examples whose label is at or above `start_label`.
    # NOTE(review): `filtered` is never used below.
    filtered = dataset.filter(lambda data:data['label'] >= start_label)
    def supervised_transform(data):
        # Casts the image to float32 and divides by 255.0, but returns
        # nothing, so the result is discarded.
        image = data['image']
        image = tf.cast(image, tf.float32)
        image = image / 255.0
    # NOTE(review): placeholder with no behavior; the source formatting was
    # lost, so its nesting level is assumed -- confirm.
    def random_transform(image):
        pass
def _add_parsing(dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Parse serialized example protos into `OrderedDict(snippets=...)`."""
    feature_spec = {
        'snippets': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    }

    def _parse_example_bytes(serialized_proto_tensor):
        parsed = tf.io.parse_example(serialized_proto_tensor, feature_spec)
        return collections.OrderedDict(snippets=parsed['snippets'])

    return dataset.map(
        _parse_example_bytes, num_parallel_calls=tf.data.AUTOTUNE)
def wrap_detection_dataset(ds: tf.data.Dataset,
                           im_size: Tuple[int, int],
                           num_classes: int) -> tf.data.Dataset:
    """Map `_compute_gt` over `ds` with anchors bound in.

    Anchors are generated once from a default `config.AnchorsConfig()` and
    `im_size[0]`, then passed to `_compute_gt` together with `num_classes`
    for every element.

    Args:
        ds: dataset to wrap.
        im_size: image size; only its first entry is used here.
        num_classes: forwarded to `_compute_gt`.

    Returns:
        The mapped dataset.
    """
    anchor_config = config.AnchorsConfig()
    anchors = _generate_anchors(anchor_config, im_size[0])

    # Wrap the dataset so it returns the anchor labels.
    compute_targets = functools.partial(
        _compute_gt, anchors=anchors, num_classes=num_classes)
    return ds.map(compute_targets)
def plot_predicted_images(y_pred: np.ndarray,
                          test_dataset: tf.data.Dataset) -> None:
    """Print the predicted class and show the image for up to 10 examples.

    Images come from the first element of `test_dataset`, which is unpacked
    as an `(images, _)` pair. `y_pred[i][0]` is compared against 0.5: above
    it the example is reported as "Cat", otherwise as "Dog" with the
    complementary value.

    Args:
        y_pred: predictions indexed as `y_pred[i][0]`.
        test_dataset: dataset whose first element holds the images to show.
    """
    images, _ = next(test_dataset.take(1).as_numpy_iterator())

    # Never index past the images or predictions actually available; a
    # smaller batch would otherwise raise IndexError.
    n_shown = min(10, len(images), len(y_pred))
    for i in range(n_shown):
        cat_score = y_pred[i][0]
        if cat_score > 0.5:
            print("I am {a:.2%} sure I am Cat".format(a=cat_score))
        else:
            print("I am {a:.2%} sure I am Dog".format(a=(1 - cat_score)))
        plt.imshow(images[i])
        plt.show()
def to_batch_dataset(dataset: tf.data.Dataset, batchsize: int = 100,
                     drop_remainder: bool = False):
    """
    Convert a tf.data.Dataset (e.g. one produced by `from_generator`) into a
    batched dataset.

    :param dataset: Tensorflow dataset to batch
    :param batchsize: The number of data records to include in each batch
    :param drop_remainder: Whether to drop the final records that do not fill
        a complete batch
    :return: The result of `dataset.batch(...)`
    """
    batched = dataset.batch(batchsize, drop_remainder=drop_remainder)
    return batched
def split_dataset(dataset: tf.data.Dataset, val_split: float,
                  test_split: float):
    """Split a dataset into training, validation and test datasets.

    The split is made by element index modulo 100, so fractions are rounded
    to whole percents. Out of every 100 consecutive elements, the first
    `round(test_split * 100)` go to the test set. The remaining elements are
    re-enumerated, and out of every 100 of those the first
    `round(val_split * 100)` go to the validation set. `val_split` is
    therefore a fraction of the non-test data, not of the whole dataset.

    Source:
    https://stackoverflow.com/questions/59669413/what-is-the-canonical-way-to-split-tf-dataset-into-test-and-validation-subsets

    Args:
        dataset: the input dataset to split.
        val_split: the fraction of val data as a float between 0 and 1.
        test_split: the fraction of the test data as a float between 0 and 1.

    Returns:
        A tuple of three tf.data.Datasets as (training, validation, test).

    Raises:
        ValueError: if either fraction rounds to a percent outside [0, 100].
    """
    test_data_percent = round(test_split * 100)
    if not (0 <= test_data_percent <= 100):
        raise ValueError("test data fraction must be ∈ [0,1]")

    val_data_percent = round(val_split * 100)
    if not (0 <= val_data_percent <= 100):
        raise ValueError("val data fraction must be ∈ [0,1]")

    # `f % 100` ranges over 0..99, so `< percent` selects exactly `percent`
    # indices per hundred. The previous `<=` / `>` pair selected one too
    # many, and a fraction of 0 still produced a non-empty split.
    indexed = dataset.enumerate()
    train_val_dataset = indexed.filter(
        lambda f, data: f % 100 >= test_data_percent)
    test_dataset = indexed.filter(
        lambda f, data: f % 100 < test_data_percent)

    # Remove enumeration.
    train_val_dataset = train_val_dataset.map(lambda f, data: data)
    test_dataset = test_dataset.map(lambda f, data: data)

    # Carve the validation set out of the remaining training data.
    train_val_dataset = train_val_dataset.enumerate()
    train_dataset = train_val_dataset.filter(
        lambda f, data: f % 100 >= val_data_percent)
    val_dataset = train_val_dataset.filter(
        lambda f, data: f % 100 < val_data_percent)

    # Remove enumeration.
    train_dataset = train_dataset.map(lambda f, data: data)
    val_dataset = val_dataset.map(lambda f, data: data)

    return train_dataset, val_dataset, test_dataset
def get_top_tokens(corpus: tf.data.Dataset,
                   n_top: int = 1000) -> Tuple[dict, int, int]:
    """
    Build the token mapping used to initialize the model's word embeddings.

    Counts every token in the corpus and keeps the most frequent ones. Ids 0
    and 1 are reserved for ``<s>`` and ``</s>``; the kept tokens are numbered
    from 2 in order of decreasing frequency.

    Parameters
    ----------
    corpus : tf.data.Dataset
        Entire dataset object; each element is split with
        ``tf.strings.split(x, sep='')``
    n_top : int, optional
        Number of most frequent vocab terms to keep for training, by default
        1000

    Returns
    -------
    (dict, int, int)
        (token->integer lookup, maximum sequence length, size of data set)
    """
    token_counts = Counter()
    longest_sequence = 0
    n_examples = 0

    tokenized = corpus.map(
        lambda x: tf.strings.split(x, sep=''),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ragged_batches = tokenized.apply(
        tf.data.experimental.dense_to_ragged_batch(32)).prefetch(5)

    for ragged in ragged_batches:
        token_counts.update(ragged.flat_values.numpy())
        batch_longest = int(ragged.row_lengths().numpy().max())
        longest_sequence = max(longest_sequence, batch_longest)
        n_examples += int(ragged.nrows())

    # tensorflow converts strings to bytes, let's maintain that (no decoding)
    ranked = token_counts.most_common(n_top)
    hash_map = {token: rank for rank, (token, _) in enumerate(ranked, start=2)}
    # The special tokens are written last so they win over a corpus token
    # with the same bytes.
    hash_map[b"<s>"] = 0
    hash_map[b"</s>"] = 1
    return hash_map, longest_sequence, n_examples
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
    """
    Return a batched, distributed test :class:`~tf.data.Dataset`.

    The dataset is batched with ``self.args.eval_batch_size`` (dropping the
    last partial batch when ``self.args.dataloader_drop_last`` is set) and
    handed to ``self.args.strategy.experimental_distribute_dataset``.

    Args:
        test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
    """
    args = self.args
    batched = test_dataset.batch(
        args.eval_batch_size, drop_remainder=args.dataloader_drop_last)
    return args.strategy.experimental_distribute_dataset(batched)
def _ApplyDecoderToDataset(
        self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Decode each record and keep only tensors with a known representation.

    The decoder is loaded from `self._saved_decoder_path`. A decoded tensor is
    kept only when its key is present in `self.TensorRepresentations()`.
    """
    decoder = tf_graph_record_decoder.load_decoder(self._saved_decoder_path)

    def _ParseFn(record):
        decoded = decoder.decode_record(record)
        selected = {}
        for name, tensor in decoded.items():
            if name in self.TensorRepresentations():
                selected[name] = tensor
        return selected

    return dataset.map(_ParseFn)
def get_label(max_k: tf.data.Dataset, labels: List[str]) -> str:
    """Pick the category with the largest aggregated weight.

    The first element of `max_k` is read as a `(values, labels)` pair. Each
    value is added to the running total of the category matching its label,
    and the category with the highest total is returned.

    Args:
        max_k: dataset whose first element is a `(values, labels)` pair.
        labels: candidate category names; they are UTF-8 encoded before
            being compared with the dataset's labels.

    Returns:
        The winning category, taken from the encoded category array (so it
        is a bytes value rather than a `str`, despite the annotation).
    """
    # to bytes: https://stackoverflow.com/questions/6269765/what-does-the-b-character-do-in-front-of-a-string-literal
    encoded: List[bytes] = [name.encode("UTF-8") for name in labels]
    neighbour_values, neighbour_labels = next(max_k.as_numpy_iterator())

    totals = np.zeros(len(encoded))
    categories = np.array(encoded)

    # Effectively reducing over the input category list; weights decline
    # with distance because they have been inverted.
    for position, weight in enumerate(neighbour_values):
        totals[categories == neighbour_labels[position]] += weight

    # The label with the highest aggregate, distance-discounted weight.
    return categories[np.argmax(totals)]
def pack_as_supervised_ds(
    ds: tf.data.Dataset,
    ds_info: DatasetInfo,
) -> tf.data.Dataset:
    """Pack `(input, label)` dataset as `{'key0': input, 'key1': label}`.

    The keys come from `ds_info.supervised_keys`. A dataset is repacked only
    when those keys are set and its element spec is a 2-tuple; anything else
    is returned as-is.
    """
    is_supervised_pair = (
        ds_info.supervised_keys
        and isinstance(ds.element_spec, tuple)
        and len(ds.element_spec) == 2
    )
    if not is_supervised_pair:
        # Not a supervised (input, label) tuple: leave untouched.
        return ds

    x_key, y_key = ds_info.supervised_keys
    return ds.map(lambda x, y: {x_key: x, y_key: y})
def _prepare_ds(
    ds: tf.data.Dataset,
    img_shape: Tuple[Optional[int], Optional[int], Optional[int]],
    batch_size: int = 8,
):
    """Resize images, then batch and prefetch the dataset.

    Args:
        ds: dataset of `(image, label)` pairs.
        img_shape: target shape; only its first two entries are used as the
            resize size.
        batch_size: number of examples per batch.

    Returns:
        The resized, batched and prefetched dataset.
    """
    target_size = list(img_shape)[:2]

    def prepare_img(image, label):
        return tf.image.resize(image, target_size), label

    resized = ds.map(prepare_img, num_parallel_calls=AUTOTUNE)
    return resized.batch(batch_size).prefetch(AUTOTUNE)
def get_normalization_layer(name: str, ds: tf.data.Dataset, weighted=False):
    """Create a normalization layer adapted to one numeric feature.

    :param name: Name of the numeric column (feature)
    :param ds: Tensorflow Dataset object yielding `(x, y)` elements, or
        `(x, y, w)` when `weighted` is set; `x` is indexed by `name`
    :param weighted: Boolean argument specifying if the dataset contains
        sample weights
    :return: Normalization layer adapted to the feature scale
    """
    # The selector's arity must match the dataset's element structure.
    if weighted:
        def select_feature(x, y, w):
            return x[name]
    else:
        def select_feature(x, y):
            return x[name]

    # Normalization layer for the feature.
    normalizer = tf.keras.layers.experimental.preprocessing.Normalization(
        axis=None)

    # Adapt the layer on a dataset that only yields the chosen feature.
    normalizer.adapt(ds.map(select_feature))
    return normalizer
def make_submission(model, image_ds: tf.data.Dataset,
                    filename: str = "submission.csv"):
    """Write model predictions to a CSV file with `image_id,label` rows.

    Args:
        model: callable applied to each image; the argmax over its last axis
            is taken as the predicted label.
        image_ds: dataset of `(image, image_id)` pairs, where `image_id` is a
            UTF-8 encoded bytes tensor.
        filename: path of the CSV file to (over)write.
    """
    # The context manager closes the file even if the model or the dataset
    # raises part-way through.
    with open(filename, "w") as f:
        f.write("image_id,label\n")
        # take(-1) iterates the entire dataset.
        for image, image_id in image_ds.take(-1):
            pred = tf.argmax(model(image), axis=-1)
            f.write(f"{image_id.numpy().decode('utf-8')},{pred[0]}\n")
def _prepare_train_dataset(dataset: tf.data.Dataset, batch_size,
                           cache_path='', shuffle_buffer_size=1000):
    """Cache (optionally), shuffle, repeat, batch and prefetch a dataset.

    Args:
        dataset: training dataset.
        batch_size: number of examples per batch.
        cache_path: sub-directory of `opt.data_path` in which the cache file
            `dataset_train.tfcache` is stored; the empty string disables
            caching.
        shuffle_buffer_size: buffer size used for shuffling.

    Returns:
        An endlessly repeating, batched and prefetched dataset.
    """
    if cache_path != '':
        cache_file = os.path.join(
            opt.data_path, cache_path, 'dataset_train.tfcache')
        dataset = dataset.cache(cache_file)

    shuffled = dataset.shuffle(buffer_size=shuffle_buffer_size)
    # Repeat forever.
    batches = shuffled.repeat().batch(batch_size=batch_size)
    # `prefetch` lets the dataset fetch batches in the background
    # while the model is training.
    return batches.prefetch(buffer_size=AUTOTUNE)
def preprocess(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Encode images and return them as input and target.

    Images are normalized with `self.scaling_image`, then encoded by
    `self.encoder` in inference mode through `tf.py_function`. The encoded
    features are split into all-but-last (input) and all-but-first (target),
    i.e. the target is the input shifted by one position.
    """

    def _encode(images):
        return self.encoder(images, training=False)

    def _to_input_and_target(images):
        normalized = self.scaling_image.normalize(images)
        features = tf.py_function(
            func=_encode, inp=[normalized], Tout=tf.float32)
        inputs = features[0:-1]
        targets = features[1:]
        return (inputs, targets)

    return dataset.map(_to_input_and_target)
def prepare_dataset(
    ds: tf.data.Dataset,
    batch_size: int,
    shuffle: bool = False,
    drop_remainder: bool = False,
):
    """Optionally shuffle, then batch, flatten and rescale an image dataset.

    Args:
        ds: dataset whose elements are indexed with "image" and "label".
        batch_size: number of examples per batch.
        shuffle: when True, shuffle with a buffer covering the whole dataset,
            seeded with the module-level `SEED`.
        drop_remainder: whether to drop the last partial batch.

    Returns:
        A prefetched dataset of `(image, label)` batches, where each image is
        cast to float32, divided by 255.0 and flattened to one row per
        example.
    """
    if shuffle:
        # Counting requires a full eager pass over the dataset, so it is done
        # only when the count is needed for the shuffle buffer.
        size_of_dataset = ds.reduce(0, lambda x, _: x + 1).numpy()
        ds = ds.shuffle(buffer_size=size_of_dataset, seed=SEED)

    ds = ds.batch(batch_size, drop_remainder=drop_remainder)

    @tf.function
    def prepare_data(features):
        image = tf.cast(features["image"], tf.float32)
        # The batch dimension is read dynamically because the last batch may
        # be smaller.
        bs = tf.shape(image)[0]
        image = tf.reshape(image / 255.0, (bs, -1))
        return image, features["label"]

    autotune = tf.data.experimental.AUTOTUNE
    ds = ds.map(prepare_data, num_parallel_calls=autotune).prefetch(autotune)
    return ds
def batch(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Batch `dataset` with `ops.bucket_by_quantiles`.

    Histogram bounds run from `self.hist_min` up to (but excluding)
    `self.hist_max` in steps of `self.hist_step`. The length of an example is
    the size of the first dimension of its `PREMISE_KEY` entry.
    """
    bounds = list(range(self.hist_min, self.hist_max, self.hist_step))
    logging.info("Quantile bucketing from %d-%d with %d buckets",
                 bounds[0], bounds[-1], len(bounds))

    def _premise_length(x):
        return tf.shape(x[PREMISE_KEY])[0]

    bucketing = ops.bucket_by_quantiles(
        len_fn=_premise_length,
        batch_size=self.batch_size,
        n_buckets=self.n_buckets,
        hist_bounds=bounds)
    return dataset.apply(bucketing)
def _valid_step(self, dataset: tf.data.Dataset, steps_per_epoch: int,
                progress_bar: ProgressBar, *args, **kwargs) -> Dict:
    """Run one validation pass.

    :param dataset: dataset for the validation pass, yielding
        `(utterances, responses, label)` elements
    :param steps_per_epoch: total number of validation steps
    :param progress_bar: replaced by a freshly created `ProgressBar` before
        use, so the passed-in instance is not used
    :return: dict of the resulting metrics
    """
    print("验证轮次")
    start_time = time.time()
    self.loss_metric.reset_states()
    self.accuracy_metric.reset_states()
    progress_bar = ProgressBar(total=steps_per_epoch, num=self.batch_size)

    # Accumulators start empty and grow by one batch per step.
    all_scores = tf.constant([], dtype=self.model.dtype)
    all_labels = tf.constant([], dtype=self.model.dtype)

    batches = dataset.take(steps_per_epoch)
    for (step, (utterances, responses, label)) in enumerate(batches):
        score = self._valid_ont_step(
            utterances=utterances, responses=responses, label=label)
        # Column 1 of the score is kept for the recall computation.
        all_scores = tf.concat(values=[all_scores, score[:, 1]], axis=0)
        label_as_float = tf.cast(x=label, dtype=self.model.dtype)
        all_labels = tf.concat(values=[all_labels, label_as_float], axis=0)

        running = "- train_loss: {:.4f} - train_accuracy: {:.4f}".format(
            self.loss_metric.result(), self.accuracy_metric.result())
        progress_bar(current=step + 1, metrics=running)

    rn_k = recall_at_position_k_in_n(
        labels=[all_scores.numpy(), all_labels.numpy()],
        k=[1, 2, 5], n=10, tar=1.0)

    message = {
        "train_loss": self.loss_metric.result(),
        "train_accuracy": self.accuracy_metric.result(),
        "valid_R10@1": rn_k[0],
        "valid_R10@2": rn_k[1],
        "valid_R10@5": rn_k[2]
    }

    progress_bar(current=steps_per_epoch,
                 metrics=get_dict_string(data=message))
    progress_bar.done(step_time=time.time() - start_time)

    return message
def supervised_dataset(dataset:tf.data.Dataset, max_label:int)->tf.data.Dataset:
    """Build an `(image, one-hot label)` dataset for labels below `max_label`.

    Args:
        dataset: dataset whose elements are indexed with 'image' and 'label'.
        max_label: exclusive upper bound on the labels kept; also the depth
            of the one-hot encoding.

    Returns:
        Dataset of `(image, label)` pairs, with images cast to float32 and
        divided by 255.0.
    """
    def _keep(data):
        return data['label'] < max_label

    def supervised_transform(data):
        scaled = tf.cast(data['image'], tf.float32) / 255.0
        one_hot = tf.one_hot(data['label'], max_label)
        return scaled, one_hot

    kept = dataset.filter(_keep)
    return kept.map(supervised_transform,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
def transform_dataset(self, input_ds: tf.data.Dataset) -> tf.data.Dataset:
    """Create a dataset that contains instance cropped data.

    Each frame-level example is expanded into one example per centroid. The
    crops are taken from the full image around each centroid, and frame-level
    keys are repeated so every instance carries them.
    """
    keys_to_expand = ["scale", "video_ind", "frame_ind"]
    if self.other_keys_to_keep:
        keys_to_expand.extend(self.other_keys_to_keep)
    if self.keep_instances_gt:
        keys_to_expand.append("instances")

    def crop_instances(frame_data):
        """Local processing function for dataset mapping."""
        full_image = frame_data[self.full_image_key]
        full_scale = frame_data[self.full_image_scale_key]

        # Undo the current scale and apply the full image scale to the
        # centroids, then make bounding boxes centered on them.
        full_centroids = frame_data[self.centroids_key] / frame_data["scale"]
        full_centroids = full_centroids * full_scale
        bboxes = make_centered_bboxes(
            full_centroids,
            box_height=self.crop_height,
            box_width=self.crop_width,
        )

        # From here on "scale" refers to the full image scale. This must
        # happen after the centroids were rescaled above.
        frame_data["scale"] = full_scale

        # Crop images from bounding boxes.
        instance_images = crop_bboxes(full_image, bboxes)
        n_instances = tf.shape(bboxes)[0]
        full_shape = tf.shape(full_image)

        # Create multi-instance example.
        instances_data = {
            "instance_image": instance_images,
            "bbox": bboxes,
            "center_instance_ind": tf.range(n_instances, dtype=tf.int32),
            "centroid": full_centroids,
            "centroid_confidence": frame_data[self.centroid_confidences_key],
            "full_image_height": tf.repeat(full_shape[0], n_instances),
            "full_image_width": tf.repeat(full_shape[1], n_instances),
        }

        # Repeat frame-level values so each instance gets a copy.
        for key in keys_to_expand:
            with_instance_axis = tf.expand_dims(frame_data[key], axis=0)
            instances_data[key] = tf.repeat(
                with_instance_axis, n_instances, axis=0)
        return instances_data

    # Map the main processing function to each example.
    per_frame = input_ds.map(
        crop_instances, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )

    # Unbatch to split frame-level examples into individual instance-level
    # examples.
    return per_frame.unbatch()
def separate_by_target(ds: tf.data.Dataset,
                       idx: int = 1,
                       thr: float = 0.5
                       ) -> typing.Tuple[tf.data.Dataset, tf.data.Dataset]:
    """Split a dataset in two by thresholding one component of each element.

    Args:
        ds: dataset whose elements are tuples.
        idx: position of the component compared against `thr`.
        thr: threshold applied after casting the component to float32.

    Returns:
        `(below, at_or_above)`: elements whose component is `< thr`, and
        elements whose component is `>= thr`.
    """
    def _target(components):
        return tf.cast(components[idx], tf.float32)

    def _is_below(*args):
        return _target(args) < thr

    def _is_at_or_above(*args):
        return _target(args) >= thr

    below = ds.filter(_is_below)
    at_or_above = ds.filter(_is_at_or_above)
    return below, at_or_above
def draw_images(self, ds: tf.data.Dataset, n=9):
    """Draw images from dataset in a grid with three columns.

    Uses the TF1 one-shot iterator / session API. Drawing stops early when
    the dataset runs out of elements.

    Args:
        ds: dataset yielding `(image, label)` pairs; `label[0]` is scanned
            for the first entry equal to 1 to pick the class, and the image
            is reshaped to (224, 224, 3).
        n: draw at most the first n images. It is rounded down to a multiple
            of the column count. NOTE(review): an n below 3 rounds down to
            zero rows, which matplotlib is expected to reject -- confirm.
    """
    import matplotlib.pyplot as plt

    cols = 3
    rows = n // cols
    n = rows * cols

    # squeeze=False keeps `ax` two-dimensional even for a single row, so the
    # [row, col] indexing below works for every grid size.
    fig, ax = plt.subplots(ncols=cols, nrows=rows, squeeze=False)

    it = ds.make_one_shot_iterator()
    b = it.get_next()

    with tf.Session() as s:
        for i in range(n):
            try:
                image, label = s.run(b)
            except tf.errors.OutOfRangeError:
                # The dataset has fewer than n elements.
                break

            class_idx = next(
                idx for idx, flag in enumerate(label[0]) if flag == 1)
            class_name = self.image_classes[class_idx]

            image_data = np.asarray(image).astype(np.uint8)
            image_data = np.reshape(image_data, (224, 224, 3))

            image_fig = ax[i // cols, i % cols]
            image_fig.imshow(image_data)
            image_fig.set_title(class_name)

    fig.tight_layout()