def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
        dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
        # Shuffle the input files. The result must be reassigned:
        # tf.data transformations return new datasets instead of
        # mutating in place, so a bare `dataset.shuffle(...)` is a no-op.
        dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
        dataset = dataset.repeat()

    # Read the data from disk in parallel.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self._shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel.
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    # Apply mixup (disabled at evaluation time via alpha=0).
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps in-feed with training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
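# A minimal sketch of how a pipeline like this is consumed under tf.distribute
# (assumed usage, not part of the code above). distribute_datasets_from_function
# is what multiplies the per-replica batch size by the replica count, which is
# why the multi-GPU branch above batches with the *local* batch size.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def dataset_fn(input_context: tf.distribute.InputContext) -> tf.data.Dataset:
    # Stand-in for builder.pipeline(...); any per-replica dataset works here.
    return tf.data.Dataset.range(8).batch(2)

dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)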
def model_fn(self, train_dataset: tf.data.Dataset,
             eval_dataset: tf.data.Dataset):
    """Define and train the feedforward neural network model.

    Args:
        train_dataset: tf.data.Dataset containing the training data.
        eval_dataset: tf.data.Dataset containing the evaluation data.

    Returns:
        model: A trained feedforward neural network model.
    """
    train_dataset = train_dataset.batch(self.batch_size, drop_remainder=True)
    eval_dataset = eval_dataset.batch(self.batch_size, drop_remainder=True)

    features = list(train_dataset.element_spec[0].keys())
    labels = list(train_dataset.element_spec[1].keys())

    input_layers = [
        tf.keras.layers.Input(shape=(1,), name=k) for k in features
    ]
    d = tf.keras.layers.Concatenate()(input_layers)
    for size in self.hidden_layers:
        d = tf.keras.layers.Dense(size, activation=self.hidden_activation)(d)
        d = tf.keras.layers.Dropout(self.dropout_chance)(d)
    output_layers = {
        l: tf.keras.layers.Dense(self.output_units,
                                 activation=self.last_activation,
                                 name=l)(d)
        for l in labels
    }

    model = tf.keras.Model(inputs=input_layers, outputs=output_layers)
    # `lr` is a deprecated alias in tf.keras optimizers; use `learning_rate`.
    model.compile(loss=self.loss,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr),
                  metrics=self.metrics)
    model.summary()

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=self.log_dir)
    model.fit(train_dataset,
              epochs=self.epochs,
              validation_data=eval_dataset,
              callbacks=[tensorboard_callback])
    return model
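# Sketch of the element structure model_fn expects: (features, labels) tuples
# of dicts with one scalar column per key. Column names here are hypothetical;
# the dict keys become the model's input and output layer names.
import tensorflow as tf

example_train = tf.data.Dataset.from_tensor_slices((
    {"feature_a": [0.1, 0.2, 0.3], "feature_b": [1.0, 2.0, 3.0]},
    {"label_x": [0.0, 1.0, 0.0]},
))
print(example_train.element_spec)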
def iterator_from_dataset( dataset: tf.data.Dataset, batch_size: int, repeat: bool = True, prefetch_size: int = 0, devices: Optional[Sequence[Any]] = None, ): """Create a data iterator that returns JAX arrays from a TF dataset. Args: dataset: the dataset to iterate over. batch_size: the batch sizes the iterator should return. repeat: whether the iterator should repeat the dataset. prefetch_size: the number of batches to prefetch to device. devices: the devices to prefetch to. Returns: An iterator that returns data batches. """ if repeat: dataset = dataset.repeat() if batch_size > 0: dataset = dataset.batch(batch_size) it = map(prepare_tf_data, dataset) else: it = map(prepare_tf_data_unbatched, dataset) if prefetch_size > 0: it = jax_utils.prefetch_to_device(it, prefetch_size, devices) return it
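# `prepare_tf_data` is not shown above. In Flax-style input pipelines it
# conventionally converts TF tensors to NumPy and shards the leading batch
# dimension across local devices for pmap; a sketch under that assumption:
import jax

def prepare_tf_data(batch):
    local_device_count = jax.local_device_count()

    def _prepare(x):
        x = x._numpy()  # convert the TF tensor, reusing its buffer if possible
        return x.reshape((local_device_count, -1) + x.shape[1:])

    return jax.tree_util.tree_map(_prepare, batch)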
def __init__(self,
             tf_dataset: tf.data.Dataset,
             train_ratio: float,
             validation_ratio: float,
             batch_size: int = 300):
    # Materialize the dataset once instead of three times.
    examples = list(tf_dataset.as_numpy_iterator())
    self.article_length = len(examples[0][0])
    self.theme_count = len(examples[0][1])
    self.count = len(examples)
    self.dataset = tf_dataset  # unsplit source dataset

    self.trainSize = int(train_ratio * self.count)
    self.validationSize = int(validation_ratio * self.count)
    self.testSize = self.count - self.trainSize - self.validationSize

    # Split by example counts *before* batching, so take/skip operate on
    # examples rather than on whole batches, and skip past both the train
    # and validation splits to reach the test split. Only the training
    # split is shuffled.
    self.trainData = tf_dataset.take(self.trainSize).shuffle(
        self.trainSize).batch(batch_size).repeat()
    self.validationData = tf_dataset.skip(self.trainSize).take(
        self.validationSize).batch(batch_size).repeat()
    self.testData = tf_dataset.skip(self.trainSize +
                                    self.validationSize).batch(batch_size)

    self.train_batch_count = int(math.ceil(self.trainSize / batch_size))
    self.test_batch_count = int(math.ceil(self.testSize / batch_size))
    self.validation_batch_count = int(
        math.ceil(self.validationSize / batch_size))
def _prepare_dataset(
    self,
    dataset: tf.data.Dataset,
    shuffle: bool = False,
    augment: bool = False
) -> tf.data.Dataset:
    preprocessing_model = self._build_preprocessing()
    dataset = dataset.map(
        map_func=lambda x, y: (preprocessing_model(x, training=False), y),
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)
    dataset = dataset.batch(batch_size=self.batch_size)
    if augment:
        data_augmentation_model = self._build_data_augmentation()
        # Random augmentation layers are only active when training=True;
        # calling them with training=False would make this map a no-op.
        dataset = dataset.map(
            map_func=lambda x, y: (data_augmentation_model(x, training=True), y),
            num_parallel_calls=tf.data.experimental.AUTOTUNE
        )
    return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
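# The two builder helpers above are not shown. Typical Keras implementations
# look like the following sketch (layer choices are assumptions, written as
# standalone functions rather than methods; requires TF >= 2.6 for these
# layer names):
import tensorflow as tf

def _build_preprocessing() -> tf.keras.Model:
    return tf.keras.Sequential([tf.keras.layers.Rescaling(1.0 / 255)])

def _build_data_augmentation() -> tf.keras.Model:
    return tf.keras.Sequential([
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomRotation(0.1),
    ])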
def create_text_vectorization_model( text_vectorization_filepath: str, dataset_all_tokens: tf.data.Dataset) -> tf.keras.models.Sequential: """ create text vectorization model this vectorizer converts an array of strings to an array of integers """ if exists(text_vectorization_filepath): logger.info('found text vectorization model') return tf.keras.models.load_model(text_vectorization_filepath, compile=False) vectorize_layer = TextVectorization(max_tokens=vocab_size, output_mode='int') logger.success('created text vectorization layer') # batch the dataset to make it easier to store # in memory vectorize_layer.adapt(dataset_all_tokens.batch(batch_size)) logger.success('adapted vectorization to training dataset') text_vectorization_model = tf.keras.models.Sequential( [tf.keras.Input(shape=(1, ), dtype=tf.string), vectorize_layer]) # simple text vectorization test logger.info(text_vectorization_model.predict(["this is a test"])) text_vectorization_model.save(text_vectorization_filepath) return text_vectorization_model
def extract_patches(tf_dataset: tf.data.Dataset, k=1, stride=1, scale=1, batch_size=100, batch_input=False): scales = scale if hasattr(scale, '__iter__') else [scale] def _extract_patches(*xs): assert len(xs) == len( scales ), 'inputs must be aligned with scales: got {}, expected {}'.format( len(xs), len(scales)) if len(xs) > 1: return tuple([ nn.extract_patches(x, k * scale, stride * scale) for x, scale in zip(xs, scales) ]) else: return nn.extract_patches(xs[0], k * scale, stride * scale) if batch_input: return tf_dataset.map(_extract_patches) else: return tf_dataset.batch(batch_size).map(_extract_patches).unbatch()
def get_augmented_data( dataset: tf.data.Dataset, batch_size: int, map_func: Callable, shuffle_buffer: Optional[int] = None, shuffle_seed: Optional[int] = None, augment_seed: Optional[int] = None, use_stateless_map: bool = False, ) -> RepeatedData: if shuffle_buffer is not None: dataset = dataset.shuffle(shuffle_buffer, seed=shuffle_seed) dataset = dataset.batch(batch_size) steps_per_epoch = tf.keras.backend.get_value(dataset.cardinality()) # repeat before map so stateless map is different across epochs dataset = dataset.repeat() AUTOTUNE = tf.data.experimental.AUTOTUNE if use_stateless_map: dataset = dataset.apply( tfrng.data.stateless_map( map_func, seed=augment_seed, num_parallel_calls=AUTOTUNE, )) else: # if map_func has random elements this won't be deterministic dataset = dataset.map(map_func, num_parallel_calls=AUTOTUNE) dataset = dataset.prefetch(AUTOTUNE) return RepeatedData(dataset, steps_per_epoch)
def create_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset: dataset = dataset.map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE) dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) return dataset
def fit(self,
        dataset: tf.data.Dataset,
        epochs: int = 10,
        batch_size: int = 10) -> List[float]:
    """
    Trains model.
    :param dataset: TensorFlow dataset
    :param epochs: number of epochs (default: 10)
    :param batch_size: batch size (default: 10)
    :return: list of batch errors
    """
    assert epochs > 0, "Number of epochs must be positive"
    errors = []
    for epoch in range(epochs):
        self.logger.info('Starting epoch %d', epoch)
        epoch_err_sum = 0.0
        epoch_err_num = 0
        for batch in dataset.batch(batch_size):
            err_t = self.step(batch)
            err_f: float = err_t.numpy().item()
            self.logger.debug('Batch error: %f', err_f)
            errors.append(err_f)
            epoch_err_sum += err_f
            epoch_err_num += 1
        self.logger.info('Epoch error: %f', epoch_err_sum / epoch_err_num)
    return errors
def evaluate_fn(model: tff.learning.Model,
                dataset: tf.data.Dataset) -> OrderedDict[str, tf.Tensor]:
    """Evaluates a model on the given dataset.

    The returned metrics include those given by
    `model.report_local_unfinalized_metrics`. These are specified by the `loss`
    and `metrics` arguments when the model is created by
    `tff.learning.from_keras_model`. The returned metrics also contain an
    integer metric with name 'num_test_examples'.

    Args:
      model: A `tff.learning.Model` created by `tff.learning.from_keras_model`.
      dataset: An unbatched `tf.data.Dataset`.

    Returns:
      An `OrderedDict` of metric names to scalar `tf.Tensor`s.
    """
    # Resets the model's local variables. This is necessary because
    # `model.report_local_unfinalized_metrics()` aggregates the metrics from
    # *all* previous calls to `forward_pass` (which include the metrics
    # computed in training). Resetting ensures that the returned metrics are
    # computed on test data. Similar to the `reset_states` method of
    # `tf.keras.metrics.Metric`.
    model.reset_metrics()

    def reduce_fn(num_examples_sum, batch):
        output = model.forward_pass(batch, training=False)
        return num_examples_sum + output.num_examples

    # Runs `reduce_fn` over the input dataset. The final metrics can be
    # accessed by `model.report_local_unfinalized_metrics()`.
    num_examples_sum = dataset.batch(_EVAL_BATCH_SIZE).reduce(
        initial_state=0, reduce_func=reduce_fn)
    eval_metrics = collections.OrderedDict()
    eval_metrics['num_test_examples'] = num_examples_sum
    local_outputs = model.report_local_unfinalized_metrics()

    # Postprocesses the metric values. This is needed because the values
    # returned by `model.report_local_unfinalized_metrics()` are values of the
    # state variables in each `tf.keras.metrics.Metric`. These values should
    # be processed in the same way as the `result()` method of a
    # `tf.keras.metrics.Metric`.
    for name, metric in local_outputs.items():
        if not isinstance(metric, list):
            raise TypeError(
                f'The metric value returned by `report_local_unfinalized_metrics` is '
                f'expected to be a list, but found an instance of '
                f'{type(metric)}. Please check that your TFF model is '
                'built from a keras model.')
        if len(metric) == 2:
            # The loss and accuracy metrics used in this p13n example have two
            # values: one represents `sum`, and the other represents `count`.
            eval_metrics[name] = metric[0] / metric[1]
        elif len(metric) == 1:
            eval_metrics[name] = metric[0]
        else:
            raise ValueError(
                f'The metric value returned by `report_local_unfinalized_metrics` '
                f'is expected to be a list of length 1 or 2, but found '
                f'one with length {len(metric)}.')
    return eval_metrics
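# For intuition on the [sum, count] postprocessing above: a tf.keras metric
# such as tf.keras.metrics.Mean keeps its unfinalized state as total and
# count variables, which is why two-element metrics finalize as
# metric[0] / metric[1] (illustrative, runnable sketch):
import tensorflow as tf

m = tf.keras.metrics.Mean()
m.update_state([1.0, 2.0, 3.0])
unfinalized = [v.numpy() for v in m.variables]   # [6.0, 3.0] -> total, count
print(unfinalized[0] / unfinalized[1])           # 2.0, same as m.result()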
def get_prediction_uncertainty_deepensemble(
        model_paths: list,
        dataset_evaluation: tf.data.Dataset,
        batch_size: int = 256) -> np.ndarray:
    """Predict standard deviation across the predictions of a deep ensemble.

    Args:
        model_paths: Paths of the trained ensemble member models
        dataset_evaluation: Dataset on which the evaluation is performed
        batch_size: Batch size used for prediction

    Returns:
        predictions, array shape (N_SAMPLES, )
    """
    dataset = dataset_evaluation.batch(batch_size)
    logger.info("Start predicting uncertainty")

    # Go through all models and compute standard deviation of predictions.
    start = time.time()
    predictions_list = [
        _load_and_predict(model_path, dataset) for model_path in model_paths
    ]
    end = time.time()
    logger.info("Total time for uncertainty prediction experiment: %.3f sec",
                end - start)

    std = _calculate_std(predictions_list)
    return std
def mixup(
    ds: tf.data.Dataset,
    postmix_fn: typing.Callable[..., typing.Any] = None,
    num_parallel_calls: int = None,
):
    """mixup on tf.data: <https://arxiv.org/abs/1710.09412>

    Args:
        ds: the source dataset
        postmix_fn: processing applied after mixing
        num_parallel_calls: number of parallel calls for the map
    """
    @tf.function
    def mixup_fn(*data):
        r = _tf_random_beta(alpha=0.2, beta=0.2)
        # Each element is a pair of consecutive examples (from batch(2));
        # blend the pair with mixing ratio r.
        data = [
            tf.cast(d[0], tf.float32) * r + tf.cast(d[1], tf.float32) * (1 - r)
            for d in data
        ]
        return data if postmix_fn is None else postmix_fn(*data)

    ds = ds.repeat()
    ds = ds.batch(2)
    ds = ds.map(
        mixup_fn,
        num_parallel_calls=num_parallel_calls,
        deterministic=None if num_parallel_calls is None else False,
    )
    return ds
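# `_tf_random_beta` is not shown above. A common way to sample Beta(alpha,
# beta) in TensorFlow is via two Gamma draws; a sketch under that assumption:
import tensorflow as tf

def _tf_random_beta(alpha: float, beta: float) -> tf.Tensor:
    ga = tf.random.gamma(shape=[], alpha=alpha)
    gb = tf.random.gamma(shape=[], alpha=beta)
    return ga / (ga + gb)  # Beta(alpha, beta)-distributed scalar in [0, 1]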
def create_dataset(dataset: tf.data.Dataset, num_classes: int,
                   is_training: bool) -> tf.data.Dataset:
    """Produces a full, augmented dataset from the input dataset."""
    _, _, resolution, _ = efficientnet_builder.efficientnet_params(
        FLAGS.model_name)

    def process_data(image, label):
        image = preprocessing.preprocess_image(
            image,
            is_training=is_training,
            use_bfloat16=FLAGS.strategy == 'tpus',
            image_size=resolution,
            augment_name=FLAGS.augment_name,
            randaug_num_layers=FLAGS.randaug_num_layers,
            randaug_magnitude=FLAGS.randaug_magnitude,
            resize_method=None)
        label = tf.one_hot(label, num_classes)
        return image, label

    dataset = dataset.map(process_data,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
def prepare_Dataset(dataset: tf.data.Dataset, shuffle: bool = False, augment: bool = False) -> tf.data.Dataset: """Prepare the dataset object with preprocessing and data augmentation. Parameters ---------- dataset : tf.data.Dataset The dataset object shuffle : bool, optional Whether to shuffle the dataset, by default False augment : bool, optional Whether to augment the train dataset, by default False Returns ------- tf.data.Dataset The prepared dataset """ preprocessing_model = build_preprocessing() dataset = dataset.map(map_func=lambda x, y: (preprocessing_model(x), y), num_parallel_calls=AUTOTUNE) if shuffle: dataset = dataset.shuffle(buffer_size=1_000) dataset = dataset.batch(batch_size=BATCH_SIZE) if augment: data_augmentation_model = build_data_augmentation() dataset = dataset.map(map_func=lambda x, y: (data_augmentation_model(x), y), num_parallel_calls=AUTOTUNE) return dataset.prefetch(buffer_size=AUTOTUNE)
def batch_dataset(dataset: tf.data.Dataset,
                  model: LineRecognizer,
                  batch_size=32,
                  bucket_boundaries=None,
                  padded=True):
    # Add image widths and text lengths.
    dataset = dataset.map(lambda i, t: (
        i, tf.shape(i)[1], t, tf.strings.length(t, unit='UTF8_CHAR')))
    dataset = dataset.map(lambda image, width, text, length: (
        image, width, model.encoder.encode(text), length))

    output_shapes = (model.image_shape, [], [None], [])

    if bucket_boundaries:
        if isinstance(batch_size, int):
            batch_size = [batch_size] * (len(bucket_boundaries) + 1)
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                lambda i, w, label, length: w,
                bucket_boundaries=bucket_boundaries,
                bucket_batch_sizes=batch_size,
                padded_shapes=output_shapes))
    elif padded:
        dataset = dataset.padded_batch(batch_size=batch_size,
                                       padded_shapes=output_shapes)
    else:
        dataset = dataset.batch(batch_size)

    return dataset
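# A self-contained sketch of how bucket_by_sequence_length groups
# variable-length elements (toy data; not tied to LineRecognizer):
import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in [3, 5, 9, 2, 8]),
    output_signature=tf.TensorSpec(shape=[None], dtype=tf.int32))

ds = ds.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=lambda x: tf.shape(x)[0],
        bucket_boundaries=[4, 8],      # three buckets: <4, 4-7, >=8
        bucket_batch_sizes=[2, 2, 2],  # one batch size per bucket
        padded_shapes=[None]))         # pad to the longest element per batch

for batch in ds:
    print(batch.shape)  # (2, 3), (2, 9), then the (1, 5) remainder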
def get_test_tfdataset(self, test_dataset: tf.data.Dataset):
    """
    Returns a test :class:`~tf.data.Dataset` along with its step and example counts.

    Args:
        test_dataset (:class:`~tf.data.Dataset`):
            The dataset to use. The dataset should yield tuples of ``(features, labels)`` where ``features`` is a
            dict of input features and ``labels`` is the labels. If ``labels`` is a tensor, the loss is calculated
            by the model by calling ``model(features, labels=labels)``. If ``labels`` is a dict, such as when using
            a QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
            ``model(features, **labels)``.

    Subclass and override this method if you want to inject some custom behavior.
    """
    num_examples = test_dataset.cardinality().numpy()

    if num_examples < 0:
        raise ValueError("The test dataset must have an asserted cardinality")

    steps = math.ceil(num_examples / self.args.eval_batch_size)
    ds = test_dataset.batch(self.args.eval_batch_size).prefetch(
        tf.data.experimental.AUTOTUNE)

    return self.args.strategy.experimental_distribute_dataset(
        ds), steps, num_examples
def __init__(self, dataset: tf.data.Dataset, key: jnp.ndarray, batch_size: int): """Creates an iterator. Args: dataset: underlying tf Dataset key: a key to be used for random number generation batch_size: batch size """ # Read the whole dataset. We use artificially large batch_size to make sure # we capture the whole dataset. data = next(dataset.batch(1000000000).as_numpy_iterator()) dataset_size = jax.tree_flatten( jax.tree_map(lambda x: x.shape[0], data))[0][0] self._jax_dataset = jax.tree_map(jnp.asarray, data) logging.info('Finished loading a dataset into memory. Elements: %d', dataset_size) self._key = key def sample(key: jnp.ndarray) -> Tuple[Any, jnp.ndarray]: key, key_randint = jax.random.split(key) indices = jax.random.randint(key_randint, (batch_size, ), minval=0, maxval=dataset_size) demo_transitions = jax.tree_map( lambda d: jnp.take(d, indices, axis=0), self._jax_dataset) return demo_transitions, key self._sample = jax.jit(sample)
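# The sampling trick used above, in miniature (assumed shapes; runnable on
# CPU): draw random indices, then gather the same rows from every leaf of
# the in-memory pytree.
import jax
import jax.numpy as jnp

data = {"obs": jnp.arange(10.0).reshape(5, 2)}  # 5 transitions in memory
key = jax.random.PRNGKey(0)
key, sub = jax.random.split(key)
indices = jax.random.randint(sub, (3,), minval=0, maxval=5)
batch = jax.tree_util.tree_map(lambda d: jnp.take(d, indices, axis=0), data)
print(batch["obs"].shape)  # (3, 2): a uniformly sampled batch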
def preprocessing(dsData: tf.data.Dataset, window_size, batch_size): dsData = dsData.window(window_size + 1, shift=1, drop_remainder=True) dsData = dsData.flat_map(lambda w: w.batch(window_size + 1)) dsData = dsData.map(lambda x: (x[:-1], x[-1])) dsData = dsData.shuffle(1000) dsData = dsData.batch(batch_size).prefetch(1) return dsData
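# Quick check of the windowing above on a toy series (assumed usage): each
# element becomes a window of `window_size` inputs plus the next value as
# the label.
import tensorflow as tf

series = tf.data.Dataset.range(10)
windows = preprocessing(series, window_size=4, batch_size=2)
for x, y in windows.take(1):
    print(x.shape, y.shape)  # (2, 4) input windows, (2,) next-value labels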
def train(self, dataset: tf.data.Dataset, nr_records: int):
    # Shuffle examples before batching so individual examples (not whole
    # batches) are mixed; repeat last so steps_per_epoch delimits epochs.
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(self.batch_size).map(self.transform_example)
    dataset = dataset.repeat()
    self.model.fit(dataset,
                   epochs=self.epochs,
                   steps_per_epoch=nr_records // self.batch_size)
def generate_recovery_examples(tf_dataset: tf.data.Dataset, modes: List[str], mode: str, fwd_model, classifier_model, dataset, labeling_params, batch_size, start_at, stop_at): action_sequence_horizon = labeling_params['action_sequence_horizon'] tf_dataset = tf_dataset.batch(batch_size) action_rng = np.random.RandomState(0) n_batches = 0 for _ in tf_dataset: n_batches += 1 t0 = perf_counter() for in_batch_idx, example in enumerate(tf_dataset): if start_at is not None and (modes.index(mode) == modes.index( start_at[0]) and in_batch_idx < start_at[1]): continue if stop_at is not None and (modes.index(mode) == modes.index( stop_at[0]) and in_batch_idx >= stop_at[1]): print(Fore.GREEN + "Done!" + Fore.RESET) return dt = perf_counter() - t0 print(Fore.GREEN + f"{mode}: {in_batch_idx}/{n_batches}, {dt:.3f}s" + Fore.RESET) actual_batch_size = int(example['traj_idx'].shape[0]) # iterate over every subsequence of exactly length actions_sequence_horizon for start_t in range( 0, dataset.steps_per_traj - action_sequence_horizon + 1, labeling_params['start_step']): end_t = start_t + action_sequence_horizon actual_states_from_start_t = { k: example[k][:, start_t:end_t] for k in fwd_model.state_keys } actions_from_start_t = { k: example[k][:, start_t:end_t - 1] for k in fwd_model.action_keys } data = ( example, actions_from_start_t, actual_states_from_start_t, labeling_params, dataset.data_collection_params, ) constants = (actual_batch_size, action_sequence_horizon, classifier_model.horizon, actual_batch_size, start_t, end_t) out_examples = generate_recovery_actions_examples( fwd_model=fwd_model, classifier_model=classifier_model, scenario_metadata=dataset.scenario_metadata, data=data, constants=constants, action_rng=action_rng) yield out_examples
def create_dataset(self, dataset: tf.data.Dataset, input_columns, output_columns, batch_size: int, use_cache: bool): dataset = self.add_feature_columns_to_dataset(dataset, input_columns, output_columns) if use_cache: dataset = dataset.cache("cache").repeat() dataset = dataset.shuffle(1000, reshuffle_each_iteration=True) dataset = dataset.batch(batch_size, drop_remainder=True) return dataset
def prepare_for_testing(data_set: tf.data.Dataset, batch_size, cache_path=''): if cache_path != '': cache_filename = 'dataset_test.tfcache' data_set = data_set.cache(''.join([cache_path, '/', cache_filename])) data_set = data_set.repeat() data_set = data_set.batch(batch_size=batch_size) return data_set
def configure_for_performance(ds: tf.data.Dataset) -> tf.data.Dataset:
    """Function applies cache(), batch() and prefetch() to the dataset to optimize data processing.

    :param ds: TensorFlow Dataset object
    :return: Batched TensorFlow Dataset object
    """
    ds = ds.batch(BATCH_SIZE)
    # Cache before prefetch: prefetch should be the last transformation so
    # it overlaps input production with model consumption.
    ds = ds.cache()
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    dataset = dataset.map(lambda x: table.lookup(x))
    dataset = dataset.unbatch()
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda ds: ds.batch(window_size + 1))
    dataset = dataset.map(lambda x: (x[:-1], x[-1] - 1))
    dataset = dataset.shuffle(10000)
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
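# `table` is a global lookup table not shown above. A minimal sketch that
# matches the 1-based ids implied by the `x[-1] - 1` label shift (vocabulary
# contents are assumptions):
import tensorflow as tf

vocab = ["the", "quick", "brown", "fox"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab, values=tf.range(1, len(vocab) + 1, dtype=tf.int64))
table = tf.lookup.StaticHashTable(init, default_value=0)  # 0 = out-of-vocab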
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset: """ Returns a test :class:`~tf.data.Dataset`. Args: test_dataset (:class:`~tf.data.Dataset`): The dataset to use. """ ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last) return self.args.strategy.experimental_distribute_dataset(ds)
def _prepare_dataset(self, dataset: tf.data.Dataset, shuffle: bool = False, augment: bool = False) -> tf.data.Dataset: if shuffle: dataset = dataset.shuffle(buffer_size=1_000) dataset = dataset.batch(batch_size=self.batch_size) return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
def keras_fit( model: tf.keras.Model, dataset: tf.data.Dataset, num_epochs: int, batch_size: int, callbacks: List[tf.keras.callbacks.Callback], ) -> None: """Train the model using model.fit(...).""" ds_train = dataset.batch(batch_size=batch_size, drop_remainder=False) model.fit(ds_train, epochs=num_epochs, callbacks=callbacks, verbose=2)
def to_batch_dataset(dataset: tf.data.Dataset,
                     batchsize: int = 100,
                     drop_remainder: bool = False):
    """
    Convert a tf.data.Dataset (e.g. one produced by `from_generator`) to a batched dataset.

    :param dataset: TensorFlow dataset generated from the use of the `from_generator` TensorFlow function
    :param batchsize: The number of data records to be included in the batches for training
    :param drop_remainder: Whether data samples that don't fit in the specified batches should be dropped
    :return: The batched dataset
    """
    return dataset.batch(batchsize, drop_remainder)
def train(self, dataset: tf.data.Dataset, nr_records: int):
    # Shuffle examples before batching so batches are re-mixed rather than
    # merely reordered.
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(self.batch_size)
    nr_steps = nr_records // self.batch_size
    for i in range(self.epochs):
        step = 0
        for data in dataset:
            loss_value, grads = grad(self.model, data)
            self.optimizer.apply_gradients(
                zip(grads, [self.model.U, self.model.P]))
            printProgressBar(step,
                             nr_steps,
                             'Epoch {}, loss: {:.3f}'.format(i, loss_value),
                             length=80)
            step += 1