Example 1
  def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
      dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
      # Shuffle the input files.
      dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
      dataset = dataset.repeat()

    # Read the data from disk in parallel
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
      dataset = dataset.cache()

    if self.is_training:
      dataset = dataset.shuffle(self._shuffle_buffer_size)
      dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
      # The batch size of the dataset will be multiplied by the number of
      # replicas automatically when strategy.distribute_datasets_from_function
      # is called, so we use local batch size here.
      dataset = dataset.batch(self.local_batch_size,
                              drop_remainder=self.is_training)
    else:
      dataset = dataset.batch(self.global_batch_size,
                              drop_remainder=self.is_training)

    # Apply Mixup
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
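
A minimal sketch of how a pipeline method like this is typically consumed, following the comment about strategy.distribute_datasets_from_function; the strategy, file pattern, and builder instance below are illustrative assumptions, not part of the example:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
builder = ...  # hypothetical instance of the class that defines pipeline()

def dataset_fn(input_context):
    # Each replica builds its own input pipeline; batching inside pipeline()
    # therefore uses the local (per-replica) batch size.
    files = tf.data.Dataset.list_files("train-*.tfrecord")  # placeholder pattern
    return builder.pipeline(files)

dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)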
Example 2
    def model_fn(self, train_dataset: tf.data.Dataset,
                 eval_dataset: tf.data.Dataset):
        """
        Function defining the training flow of the feedforward neural network
        model.

        Args:
            train_dataset: tf.data.Dataset containing the training data.
            eval_dataset: tf.data.Dataset containing the evaluation data.

        Returns:
            model: A trained feedforward neural network model.
        """

        train_dataset = train_dataset.batch(self.batch_size,
                                            drop_remainder=True)
        eval_dataset = eval_dataset.batch(self.batch_size, drop_remainder=True)

        features = list(train_dataset.element_spec[0].keys())
        labels = list(train_dataset.element_spec[1].keys())

        input_layers = [
            tf.keras.layers.Input(shape=(1, ), name=k) for k in features
        ]

        d = tf.keras.layers.Concatenate()(input_layers)

        for size in self.hidden_layers:
            d = tf.keras.layers.Dense(size,
                                      activation=self.hidden_activation)(d)
            d = tf.keras.layers.Dropout(self.dropout_chance)(d)

        output_layers = {
            l: tf.keras.layers.Dense(self.output_units,
                                     activation=self.last_activation,
                                     name=l)(d)
            for l in labels
        }

        model = tf.keras.Model(inputs=input_layers, outputs=output_layers)

        model.compile(loss=self.loss,
                      optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr),
                      metrics=self.metrics)

        model.summary()

        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=self.log_dir)

        model.fit(train_dataset,
                  epochs=self.epochs,
                  validation_data=eval_dataset,
                  callbacks=[tensorboard_callback])

        return model
Example 3
def iterator_from_dataset(
    dataset: tf.data.Dataset,
    batch_size: int,
    repeat: bool = True,
    prefetch_size: int = 0,
    devices: Optional[Sequence[Any]] = None,
):
    """Create a data iterator that returns JAX arrays from a TF dataset.

    Args:
      dataset: the dataset to iterate over.
      batch_size: the batch sizes the iterator should return.
      repeat: whether the iterator should repeat the dataset.
      prefetch_size: the number of batches to prefetch to device.
      devices: the devices to prefetch to.

    Returns:
      An iterator that returns data batches.
    """
    if repeat:
        dataset = dataset.repeat()

    if batch_size > 0:
        dataset = dataset.batch(batch_size)
        it = map(prepare_tf_data, dataset)
    else:
        it = map(prepare_tf_data_unbatched, dataset)

    if prefetch_size > 0:
        it = jax_utils.prefetch_to_device(it, prefetch_size, devices)

    return it
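
`prepare_tf_data` and `prepare_tf_data_unbatched` are not shown here; a common implementation, sketched below under that assumption, converts each TF batch to NumPy and reshapes the leading axis so the batch can be split across local devices:

import jax
import numpy as np

def prepare_tf_data(batch):
    """Convert a batch of TF tensors to NumPy arrays sharded across devices."""
    def _prepare(x):
        x = np.asarray(x)
        # [batch, ...] -> [num_devices, batch // num_devices, ...] for pmap.
        return x.reshape((jax.local_device_count(), -1) + x.shape[1:])
    return jax.tree_util.tree_map(_prepare, batch)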
Example 4
    def __init__(self,
                 tf_dataset: tf.data.Dataset,
                 train_ratio: float,
                 validation_ratio: float,
                 batch_size: int = 300):

        # Inspect a single element for its shapes and count the dataset once,
        # instead of materializing the whole dataset three times.
        first_example = next(iter(tf_dataset.as_numpy_iterator()))
        self.article_length = len(first_example[0])
        self.theme_count = len(first_example[1])
        self.count = sum(1 for _ in tf_dataset)

        self.dataset = tf_dataset.batch(batch_size)

        self.trainSize = int(train_ratio * self.count)
        self.validationSize = int(validation_ratio * self.count)
        self.testSize = self.count - self.trainSize - self.validationSize

        # Split before batching so take()/skip() count examples, not batches,
        # and skip past both the train and validation portions for the test set.
        self.trainData = tf_dataset.take(self.trainSize).shuffle(
            batch_size).batch(batch_size).repeat()
        self.validationData = tf_dataset.skip(self.trainSize).take(
            self.validationSize).batch(batch_size).repeat()
        self.testData = tf_dataset.skip(self.trainSize +
                                        self.validationSize).batch(batch_size)

        self.train_batch_count = int(math.ceil(self.trainSize / batch_size))
        self.test_batch_count = int(math.ceil(self.testSize / batch_size))
        self.validation_batch_count = int(
            math.ceil(self.validationSize / batch_size))
Example 5
    def _prepare_dataset(
        self,
        dataset: tf.data.Dataset,
        shuffle: bool = False,
        augment: bool = False
    ) -> tf.data.Dataset:
        preprocessing_model = self._build_preprocessing()
        dataset = dataset.map(
            map_func=lambda x, y: (preprocessing_model(x, training=False), y),
            num_parallel_calls=tf.data.experimental.AUTOTUNE
        )

        if shuffle:
            dataset = dataset.shuffle(buffer_size=1_000)

        dataset = dataset.batch(batch_size=self.batch_size)

        if augment:
            data_augmentation_model = self._build_data_augmentation()
            dataset = dataset.map(
                map_func=lambda x, y: (data_augmentation_model(x, training=True), y),
                num_parallel_calls=tf.data.experimental.AUTOTUNE
            )

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
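
A brief usage sketch under the assumption that raw train and validation splits are available as `raw_train_ds` and `raw_val_ds` (hypothetical names): shuffling and augmentation are typically enabled only for the training split:

# Inside the same class as _prepare_dataset (illustrative):
train_ds = self._prepare_dataset(raw_train_ds, shuffle=True, augment=True)
val_ds = self._prepare_dataset(raw_val_ds, shuffle=False, augment=False)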
Example 6
def create_text_vectorization_model(
        text_vectorization_filepath: str,
        dataset_all_tokens: tf.data.Dataset) -> tf.keras.models.Sequential:
    """
    create text vectorization model
    this vectorizer converts an array of strings to an array of integers
    """
    if exists(text_vectorization_filepath):
        logger.info('found text vectorization model')
        return tf.keras.models.load_model(text_vectorization_filepath,
                                          compile=False)

    vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                        output_mode='int')
    logger.success('created text vectorization layer')
    # batch the dataset to make it easier to store
    # in memory
    vectorize_layer.adapt(dataset_all_tokens.batch(batch_size))
    logger.success('adapted vectorization to training dataset')

    text_vectorization_model = tf.keras.models.Sequential(
        [tf.keras.Input(shape=(1, ), dtype=tf.string), vectorize_layer])
    # simple text vectorization test
    logger.info(text_vectorization_model.predict(["this is a test"]))
    text_vectorization_model.save(text_vectorization_filepath)
    return text_vectorization_model
Example 7
def extract_patches(tf_dataset: tf.data.Dataset,
                    k=1,
                    stride=1,
                    scale=1,
                    batch_size=100,
                    batch_input=False):
    scales = scale if hasattr(scale, '__iter__') else [scale]

    def _extract_patches(*xs):
        assert len(xs) == len(
            scales
        ), 'inputs must be aligned with scales: got {}, expected {}'.format(
            len(xs), len(scales))
        if len(xs) > 1:
            return tuple([
                nn.extract_patches(x, k * scale, stride * scale)
                for x, scale in zip(xs, scales)
            ])
        else:
            return nn.extract_patches(xs[0], k * scale, stride * scale)

    if batch_input:
        return tf_dataset.map(_extract_patches)
    else:
        return tf_dataset.batch(batch_size).map(_extract_patches).unbatch()
Example 8
def get_augmented_data(
    dataset: tf.data.Dataset,
    batch_size: int,
    map_func: Callable,
    shuffle_buffer: Optional[int] = None,
    shuffle_seed: Optional[int] = None,
    augment_seed: Optional[int] = None,
    use_stateless_map: bool = False,
) -> RepeatedData:
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer, seed=shuffle_seed)
    dataset = dataset.batch(batch_size)
    steps_per_epoch = tf.keras.backend.get_value(dataset.cardinality())
    # repeat before map so stateless map is different across epochs
    dataset = dataset.repeat()
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    if use_stateless_map:
        dataset = dataset.apply(
            tfrng.data.stateless_map(
                map_func,
                seed=augment_seed,
                num_parallel_calls=AUTOTUNE,
            ))
    else:
        # if map_func has random elements this won't be deterministic
        dataset = dataset.map(map_func, num_parallel_calls=AUTOTUNE)
    dataset = dataset.prefetch(AUTOTUNE)
    return RepeatedData(dataset, steps_per_epoch)
Example 9
def create_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset:
    dataset = dataset.map(normalize_img,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
Example 10
    def fit(self,
            dataset: tf.data.Dataset,
            epoches: int = 10,
            batch_size: int = 10) -> List[float]:
        """
        Trains model.

        :param dataset: TensorFlow dataset
        :param epoches: number of epochs (default: 10)
        :param batch_size: batch size (default: 10)
        :return: list of batch errors
        """
        assert epoches > 0, "Number of epochs must be positive"

        errors = []

        for epoch in range(epoches):
            self.logger.info('Starting epoch %d', epoch)

            epoch_err_sum = 0.0
            epoch_err_num = 0

            for batch in dataset.batch(batch_size):
                err_t = self.step(batch)
                err_f: float = err_t.numpy().item()
                self.logger.debug('Batch error: %f', err_f)
                errors.append(err_f)
                epoch_err_sum += err_f
                epoch_err_num += 1

            self.logger.info('Epoch error: %f', epoch_err_sum / epoch_err_num)

        return errors
Example 11
def evaluate_fn(model: tff.learning.Model,
                dataset: tf.data.Dataset) -> OrderedDict[str, tf.Tensor]:
    """Evaluates a model on the given dataset.

  The returned metrics include those given by
  `model.report_local_unfinalized_metrics`. These are specified by the `loss`
  and `metrics` arguments when the model is created by
  `tff.learning.from_keras_model`. The returned metrics also contain an integer
  metric with name 'num_test_examples'.

  Args:
    model: A `tff.learning.Model` created by `tff.learning.from_keras_model`.
    dataset: An unbatched `tf.data.Dataset`.

  Returns:
    An `OrderedDict` of metric names to scalar `tf.Tensor`s.
  """
    # Resets the model's local variables. This is necessary because
    # `model.report_local_unfinalized_metrics()` aggregates the metrics from *all*
    # previous calls to `forward_pass` (which include the metrics computed in
    # training).
    # Resetting ensures that the returned metrics are computed on test data.
    # Similar to the `reset_states` method of `tf.keras.metrics.Metric`.
    model.reset_metrics()

    def reduce_fn(num_examples_sum, batch):
        output = model.forward_pass(batch, training=False)
        return num_examples_sum + output.num_examples

    # Runs `reduce_fn` over the input dataset. The final metrics can be accessed
    # by `model.report_local_unfinalized_metrics()`.
    num_examples_sum = dataset.batch(_EVAL_BATCH_SIZE).reduce(
        initial_state=0, reduce_func=reduce_fn)
    eval_metrics = collections.OrderedDict()
    eval_metrics['num_test_examples'] = num_examples_sum
    local_outputs = model.report_local_unfinalized_metrics()
    # Postprocesses the metric values. This is needed because the values returned
    # by `model.report_local_unfinalized_metrics()` are values of the state
    # variables in each `tf.keras.metrics.Metric`. These values should be
    # processed in the same way as the `result()` method of a
    # `tf.keras.metrics.Metric`.
    for name, metric in local_outputs.items():
        if not isinstance(metric, list):
            raise TypeError(
                f'The metric value returned by `report_local_unfinalized_metrics` is '
                f'expected to be a list, but found an instance of '
                f'{type(metric)}. Please check that your TFF model is '
                'built from a keras model.')
        if len(metric) == 2:
            # The loss and accuracy metrics used in this p13n example have two values:
            # one represents `sum`, and the other represents `count`.
            eval_metrics[name] = metric[0] / metric[1]
        elif len(metric) == 1:
            eval_metrics[name] = metric[0]
        else:
            raise ValueError(
                f'The metric value returned by `report_local_unfinalized_metrics` '
                f'is expected to be a list of length 1 or 2, but found '
                f'one with length {len(metric)}.')
    return eval_metrics
Example 12
def get_prediction_uncertainty_deepensemble(
        model_paths: list,
        dataset_evaluation: tf.data.Dataset,
        batch_size: int = 256) -> np.ndarray:
    """Predict standard deviation of multiple predictions with different dropouts
    Args:
        model_path: Path of the trained model
        dataset_evaluation: dataset in which the evaluation need to performed
    Returns:
        predictions, array shape (N_SAMPLES, )
    """
    dataset = dataset_evaluation.batch(batch_size)
    logger.info("Start predicting uncertainty")

    # Go through all models and compute standard deviation of predictions
    start = time.time()
    predictions_list = [
        _load_and_predict(model_path, dataset) for model_path in model_paths
    ]
    end = time.time()
    logger.info("Total time for uncertainty prediction experiment: %f:.3 sec",
                end - start)

    std = _calculate_std(predictions_list)
    return std
Example 13
def mixup(
    ds: tf.data.Dataset,
    postmix_fn: typing.Callable[..., typing.Any] = None,
    num_parallel_calls: int = None,
):
    """tf.dataでのmixup: <https://arxiv.org/abs/1710.09412>

    Args:
        ds: 元のデータセット
        postmix_fn: mixup後の処理
        num_parallel_calls: premix_fnの並列数

    """
    @tf.function
    def mixup_fn(*data):
        r = _tf_random_beta(alpha=0.2, beta=0.2)
        data = [
            tf.cast(d[0], tf.float32) * r + tf.cast(d[1], tf.float32) * (1 - r)
            for d in data
        ]
        return data if postmix_fn is None else postmix_fn(*data)

    ds = ds.repeat()
    ds = ds.batch(2)
    ds = ds.map(
        mixup_fn,
        num_parallel_calls=num_parallel_calls,
        deterministic=None if num_parallel_calls is None else False,
    )
    return ds
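
A hedged usage sketch (the input tensors and batch size are illustrative): because `mixup` internally pairs consecutive elements with `batch(2)`, the input should already be shuffled, and the mixed examples are usually batched again afterwards:

ds = tf.data.Dataset.from_tensor_slices((images, onehot_labels))  # illustrative inputs
ds = ds.shuffle(1024)
ds = mixup(ds)  # yields one mixed (image, label) example per pair
ds = ds.batch(32).prefetch(tf.data.experimental.AUTOTUNE)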
Example 14
def create_dataset(dataset: tf.data.Dataset, num_classes: int,
                   is_training: bool) -> tf.data.Dataset:
  """Produces a full, augmented dataset from the inptu dataset."""
  _, _, resolution, _ = efficientnet_builder.efficientnet_params(
      FLAGS.model_name)

  def process_data(image, label):
    image = preprocessing.preprocess_image(
        image,
        is_training=is_training,
        use_bfloat16=FLAGS.strategy == 'tpus',
        image_size=resolution,
        augment_name=FLAGS.augment_name,
        randaug_num_layers=FLAGS.randaug_num_layers,
        randaug_magnitude=FLAGS.randaug_magnitude,
        resize_method=None)

    label = tf.one_hot(label, num_classes)
    return image, label

  dataset = dataset.map(
      process_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  return dataset
Example 15
def prepare_Dataset(dataset: tf.data.Dataset,
                    shuffle: bool = False,
                    augment: bool = False) -> tf.data.Dataset:
    """Prepare the dataset object with preprocessing and data augmentation.

    Parameters
    ----------
    dataset : tf.data.Dataset
        The dataset object
    shuffle : bool, optional
        Whether to shuffle the dataset, by default False
    augment : bool, optional
        Whether to augment the train dataset, by default False

    Returns
    -------
    tf.data.Dataset
        The prepared dataset
    """
    preprocessing_model = build_preprocessing()
    dataset = dataset.map(map_func=lambda x, y: (preprocessing_model(x), y),
                          num_parallel_calls=AUTOTUNE)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)

    dataset = dataset.batch(batch_size=BATCH_SIZE)

    if augment:
        data_augmentation_model = build_data_augmentation()
        dataset = dataset.map(map_func=lambda x, y:
                              (data_augmentation_model(x), y),
                              num_parallel_calls=AUTOTUNE)

    return dataset.prefetch(buffer_size=AUTOTUNE)
Example 16
def batch_dataset(dataset: tf.data.Dataset,
                  model: LineRecognizer,
                  batch_size=32,
                  bucket_boundaries=None,
                  padded=True):
    # add image widths and text length
    dataset = dataset.map(
        lambda i, t: (i, tf.shape(i)[1], t, tf.strings.length(t, unit='UTF8_CHAR')))

    dataset = dataset.map(lambda image, width, text, length:
                          (image, width, model.encoder.encode(text), length))

    output_shapes = (model.image_shape, [], [None], [])

    if bucket_boundaries:
        if isinstance(batch_size, int):
            batch_size = [batch_size] * (len(bucket_boundaries) + 1)

        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                lambda i, w, label, length: w,
                bucket_boundaries=bucket_boundaries,
                bucket_batch_sizes=batch_size,
                padded_shapes=output_shapes))

    elif padded:
        dataset = dataset.padded_batch(batch_size=batch_size,
                                       padded_shapes=output_shapes)
    else:
        dataset = dataset.batch(batch_size)

    return dataset
Example 17
    def get_test_tfdataset(self,
                           test_dataset: tf.data.Dataset) -> tf.data.Dataset:
        """
        Returns a test :class:`~tf.data.Dataset`.

        Args:
            test_dataset (:class:`~tf.data.Dataset`):
                The dataset to use. The dataset should yield tuples of ``(features, labels)`` where ``features`` is a
                dict of input features and ``labels`` is the labels. If ``labels`` is a tensor, the loss is calculated
                by the model by calling ``model(features, labels=labels)``. If ``labels`` is a dict, such as when using
                a QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
                ``model(features, **labels)``.

        Subclass and override this method if you want to inject some custom behavior.
        """

        num_examples = test_dataset.cardinality().numpy()

        if num_examples < 0:
            raise ValueError(
                "The training dataset must have an asserted cardinality")

        steps = math.ceil(num_examples / self.args.eval_batch_size)
        ds = test_dataset.batch(self.args.eval_batch_size).prefetch(
            tf.data.experimental.AUTOTUNE)

        return self.args.strategy.experimental_distribute_dataset(
            ds), steps, num_examples
Example 18
    def __init__(self, dataset: tf.data.Dataset, key: jnp.ndarray,
                 batch_size: int):
        """Creates an iterator.

    Args:
      dataset: underlying tf Dataset
      key: a key to be used for random number generation
      batch_size: batch size
    """
        # Read the whole dataset. We use artificially large batch_size to make sure
        # we capture the whole dataset.
        data = next(dataset.batch(1000000000).as_numpy_iterator())
        dataset_size = jax.tree_flatten(
            jax.tree_map(lambda x: x.shape[0], data))[0][0]
        self._jax_dataset = jax.tree_map(jnp.asarray, data)
        logging.info('Finished loading a dataset into memory. Elements: %d',
                     dataset_size)
        self._key = key

        def sample(key: jnp.ndarray) -> Tuple[Any, jnp.ndarray]:
            key, key_randint = jax.random.split(key)
            indices = jax.random.randint(key_randint, (batch_size, ),
                                         minval=0,
                                         maxval=dataset_size)
            demo_transitions = jax.tree_map(
                lambda d: jnp.take(d, indices, axis=0), self._jax_dataset)
            return demo_transitions, key

        self._sample = jax.jit(sample)
Example 19
def preprocessing(dsData: tf.data.Dataset, window_size, batch_size):
    dsData = dsData.window(window_size + 1, shift=1, drop_remainder=True)
    dsData = dsData.flat_map(lambda w: w.batch(window_size + 1))
    dsData = dsData.map(lambda x: (x[:-1], x[-1]))
    dsData = dsData.shuffle(1000)
    dsData = dsData.batch(batch_size).prefetch(1)
    return dsData
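
A small illustration of what this windowing produces on a plain numeric series (a sketch, not part of the original code):

series = tf.data.Dataset.range(10)                 # 0, 1, ..., 9
windows = preprocessing(series, window_size=4, batch_size=2)
# Each element is a (features, label) pair such as ([0, 1, 2, 3], 4),
# shuffled and grouped into batches of 2.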
Example 20
    def train(self, dataset: tf.data.Dataset, nr_records: int):
        dataset = dataset.batch(self.batch_size).map(self.transform_example)
        dataset = dataset.repeat()
        dataset = dataset.shuffle(1000)
        self.model.fit(dataset,
                       epochs=self.epochs,
                       steps_per_epoch=nr_records // self.batch_size)
Example 21
def generate_recovery_examples(tf_dataset: tf.data.Dataset, modes: List[str],
                               mode: str, fwd_model, classifier_model, dataset,
                               labeling_params, batch_size, start_at, stop_at):
    action_sequence_horizon = labeling_params['action_sequence_horizon']
    tf_dataset = tf_dataset.batch(batch_size)
    action_rng = np.random.RandomState(0)
    n_batches = 0
    for _ in tf_dataset:
        n_batches += 1

    t0 = perf_counter()
    for in_batch_idx, example in enumerate(tf_dataset):
        if start_at is not None and (modes.index(mode) == modes.index(
                start_at[0]) and in_batch_idx < start_at[1]):
            continue
        if stop_at is not None and (modes.index(mode) == modes.index(
                stop_at[0]) and in_batch_idx >= stop_at[1]):
            print(Fore.GREEN + "Done!" + Fore.RESET)
            return

        dt = perf_counter() - t0
        print(Fore.GREEN + f"{mode}: {in_batch_idx}/{n_batches}, {dt:.3f}s" +
              Fore.RESET)
        actual_batch_size = int(example['traj_idx'].shape[0])

        # iterate over every subsequence of exactly length action_sequence_horizon
        for start_t in range(
                0, dataset.steps_per_traj - action_sequence_horizon + 1,
                labeling_params['start_step']):
            end_t = start_t + action_sequence_horizon

            actual_states_from_start_t = {
                k: example[k][:, start_t:end_t]
                for k in fwd_model.state_keys
            }
            actions_from_start_t = {
                k: example[k][:, start_t:end_t - 1]
                for k in fwd_model.action_keys
            }

            data = (
                example,
                actions_from_start_t,
                actual_states_from_start_t,
                labeling_params,
                dataset.data_collection_params,
            )
            constants = (actual_batch_size, action_sequence_horizon,
                         classifier_model.horizon, actual_batch_size, start_t,
                         end_t)
            out_examples = generate_recovery_actions_examples(
                fwd_model=fwd_model,
                classifier_model=classifier_model,
                scenario_metadata=dataset.scenario_metadata,
                data=data,
                constants=constants,
                action_rng=action_rng)
            yield out_examples
Example 22
    def create_dataset(self, dataset: tf.data.Dataset,
                       input_columns, output_columns,
                       batch_size: int, use_cache: bool):
        dataset = self.add_feature_columns_to_dataset(dataset, input_columns,
                                                      output_columns)
        if use_cache:
            dataset = dataset.cache("cache").repeat()
        dataset = dataset.shuffle(1000, reshuffle_each_iteration=True)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset
Example 23
def prepare_for_testing(data_set: tf.data.Dataset, batch_size, cache_path=''):
    if cache_path != '':
        cache_filename = 'dataset_test.tfcache'
        data_set = data_set.cache(''.join([cache_path, '/', cache_filename]))

    data_set = data_set.repeat()
    data_set = data_set.batch(batch_size=batch_size)

    return data_set
Example 24
def configure_for_performance(ds: tf.data.Dataset) -> tf.data.Dataset:
    """Function applies batch() and prefetch() functions
    to the dataset to optimize data processing.
    :param ds: TensorFlow Dataset object
    :return Batched TensorFlow Dataset object
    """
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(buffer_size=AUTOTUNE).cache()
    return ds
Example 25
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    dataset = dataset.map(lambda x: table.lookup(x))
    dataset = dataset.unbatch()
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda w: w.batch(window_size + 1))
    dataset = dataset.map(lambda x: (x[:-1], x[-1] - 1))
    dataset = dataset.shuffle(10000)
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
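
The `table` used by the lookup is not defined in this snippet; a common construction, assumed here for illustration, is a static hash table mapping tokens to integer ids:

keys = tf.constant(["the", "cat", "sat"])           # illustrative vocabulary
values = tf.constant([1, 2, 3], dtype=tf.int64)
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, values), default_value=0)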
Example 26
    def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
        """
        Returns a test :class:`~tf.data.Dataset`.

        Args:
            test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
        """
        ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)

        return self.args.strategy.experimental_distribute_dataset(ds)
Example 27
    def _prepare_dataset(self,
                         dataset: tf.data.Dataset,
                         shuffle: bool = False,
                         augment: bool = False) -> tf.data.Dataset:
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1_000)

        dataset = dataset.batch(batch_size=self.batch_size)

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
Example 28
def keras_fit(
    model: tf.keras.Model,
    dataset: tf.data.Dataset,
    num_epochs: int,
    batch_size: int,
    callbacks: List[tf.keras.callbacks.Callback],
) -> None:
    """Train the model using model.fit(...)."""
    ds_train = dataset.batch(batch_size=batch_size, drop_remainder=False)
    model.fit(ds_train, epochs=num_epochs, callbacks=callbacks, verbose=2)
Example 29
def to_batch_dataset(dataset: tf.data.Dataset, batchsize: int = 100, drop_remainder: bool = False):
    """
    Function for converting from tf.data.Dataset type output by the `from_generator` function to a `BatchDataset`

    :param dataset: Tensorflow dataset generated from the use of `from_generator` Tensorflow function
    :param batchsize: The number of data records to be included in the batches for training
    :param drop_remainder: Boolean for determining whether or not data samples that dont fit in the specified batches
    should be dropped or not
    :return:
    """
    return dataset.batch(batchsize, drop_remainder)
Example 30
    def train(self, dataset: tf.data.Dataset, nr_records: int):
        dataset = dataset.batch(self.batch_size)
        dataset = dataset.shuffle(1000)
        nr_steps = nr_records // self.batch_size
        for i in range(self.epochs):
            step = 0
            for data in dataset:
                loss_value, grads = grad(self.model, data)
                self.optimizer.apply_gradients(
                    zip(grads, [self.model.U, self.model.P]))
                printProgressBar(step, nr_steps,
                                 'Epoch {}, loss: {:.3f}'.format(i, loss_value),
                                 length=80)
                step += 1