Code Example #1
    def _get_input_iterator(
            self, input_fn: Callable[[Optional[params_dict.ParamsDict]],
                                     tf.data.Dataset],
            strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
        """Returns a distributed dataset iterator.

        Args:
          input_fn: (params: dict) -> tf.data.Dataset.
          strategy: an instance of tf.distribute.Strategy.

        Returns:
          An iterator that yields input tensors.
        """
        if input_fn is None:
            return None
        # When training with multiple TPU workers, datasets need to be cloned
        # across workers. Since a Dataset instance cannot be cloned in eager
        # mode, we instead pass a callable that returns a dataset.
        if self._is_multi_host:
            return iter(
                strategy.experimental_distribute_datasets_from_function(
                    input_fn))
        else:
            input_data = input_fn(self._params)
            return iter(strategy.experimental_distribute_dataset(input_data))
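A minimal usage sketch (not part of the original project) showing how such an iterator is consumed under a MirroredStrategy. Note that in TF 2.4+ experimental_distribute_datasets_from_function was renamed to distribute_datasets_from_function; the dataset function receives a tf.distribute.InputContext as its single argument.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def input_fn(input_context=None):
    # Toy stand-in for the project's real input pipeline; the strategy
    # passes a tf.distribute.InputContext as the single argument.
    return tf.data.Dataset.range(8).batch(2)

dist_iterator = iter(strategy.distribute_datasets_from_function(input_fn))
batch = next(dist_iterator)  # A PerReplica value when replicas > 1.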
Code Example #2
def create_dataset(
    dataset_builder: base.BaseDataset, batch_size: int, process_fn: Any,
    distributed_strategy: tf.distribute.Strategy, distributed: bool
) -> Union[tf.data.Dataset, tf.distribute.DistributedDataset]:
    """Creates (optionally distributed) dataset from dataset_builder and process_fn."""
    dataset = dataset_builder.load(batch_size=batch_size).map(process_fn)
    if distributed:
        dataset = distributed_strategy.experimental_distribute_dataset(dataset)
    return dataset
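For reference, the same distribution step with a plain tf.data.Dataset, independent of the project's base.BaseDataset builder (a minimal sketch, assuming a MirroredStrategy):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.range(32).map(lambda x: x * 2).batch(8)
dist_dataset = strategy.experimental_distribute_dataset(dataset)
for batch in dist_dataset:
    pass  # Each element is a PerReplica value when replicas > 1.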
Code Example #3
def read(
        self,
        mode: str,
        mirrored_strategy: Optional[tf.distribute.Strategy] = None
) -> Tuple[tf.data.Dataset, int]:
    """Returns the (optionally distributed) dataset and its iteration count."""
    if mirrored_strategy:
        num_gpus = mirrored_strategy.num_replicas_in_sync
        with mirrored_strategy.scope():
            dataset, num_iters = self._read(mode,
                                            self._batch_size * num_gpus)
            dataset = mirrored_strategy.experimental_distribute_dataset(
                dataset)
        return dataset, num_iters
    else:
        return self._read(mode, self._batch_size)
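The key detail above is that the batch size passed to _read is multiplied by the replica count: experimental_distribute_dataset expects a globally batched dataset and splits each batch across replicas. A minimal illustration (hypothetical sizes, not from the original code):

import tensorflow as tf

per_replica_batch_size = 32
strategy = tf.distribute.MirroredStrategy()
# Each replica sees per_replica_batch_size examples per step once the
# globally batched dataset is split across replicas.
global_batch_size = per_replica_batch_size * strategy.num_replicas_in_sync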
Code Example #4
def compute_predictions(
    model: PredictionModel, dataset: tf.data.Dataset,
    strategy: tf.distribute.Strategy, batch_size: int
) -> Iterator[Tuple[types.ModelPredictions, types.Features]]:
  """Yield the predictions of the model on the given dataset.

  Args:
    model: A function that takes tensor-valued features and returns a vector of
      predictions.
    dataset: The dataset that the function consumes to produce the predictions.
    strategy: The distribution strategy to use when computing.
    batch_size: The batch size that should be used.

  Yields:
    Pairs of model predictions and the corresponding metadata.
  """
  with strategy.scope():
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = (
        tf.data.experimental.AutoShardPolicy.DATA)
    dataset = dataset.with_options(options)

  for features in strategy.experimental_distribute_dataset(dataset):
    time_start = time.time()
    if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
      # TODO(josipd): Figure this out better. We can't easily filter,
      #               as they are PerReplica values, not tensors.
      features_model = {"image": features["image"]}
    else:
      features_model = features
    predictions = materialize(strategy,
                              strategy.run(model, args=(features_model,)))
    time_end = time.time()
    time_delta_per_example = (time_end - time_start) / predictions.shape[0]
    metadatas = materialize(strategy, features["metadata"])
    for i in range(predictions.shape[0]):
      model_predictions = types.ModelPredictions(
          predictions=[predictions[i]],
          time_in_s=time_delta_per_example)
      metadata_i = _slice_dictionary(metadatas, i)
      yield model_predictions, metadata_i
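materialize and _slice_dictionary are project helpers not shown here. A plausible minimal materialize, assuming it gathers per-replica results into a single local array (an assumption, not the project's actual code):

import tensorflow as tf

def materialize(strategy, values):
    # Hypothetical sketch: strategy.experimental_local_results returns one
    # tensor per local replica; concatenating them recovers the global batch.
    # The real helper presumably also handles nested structures such as the
    # metadata dictionaries (e.g. via tf.nest.map_structure).
    local = strategy.experimental_local_results(values)
    return tf.concat(local, axis=0).numpy()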
Code Example #5
def compute_predictions(
        model: PredictionModel, dataset: tf.data.Dataset,
        strategy: tf.distribute.Strategy) -> Iterator[types.ModelPredictions]:
    """Yield the predictions of the model on the given dataset.

  Note that the dataset is expected to yield batches of tensors.

  Args:
    model: A function that takes tensor-valued features and returns a vector of
      predictions.
    dataset: The dataset that the function consumes to produce the predictions.
    strategy: The distribution strategy to use when computing.

  Yields:
    The predictions of the model on the dataset.
  """

    for features in strategy.experimental_distribute_dataset(dataset):
        # TODO(josipd): Figure out how to pass only tpu-allowed types.
        time_start = time.time()
        predictions = materialize(
            strategy, strategy.run(model,
                                   args=({
                                       "image": features["image"]
                                   }, )))
        time_end = time.time()
        time_delta_per_example = (time_end - time_start) / predictions.shape[0]
        try:
            element_ids = materialize(strategy, features["element_id"])
        except KeyError:
            element_ids = [None] * predictions.shape[0]
        metadatas = materialize(strategy, features["metadata"])
        for i in range(predictions.shape[0]):
            yield types.ModelPredictions(element_id=element_ids[i],
                                         metadata=_slice_dictionary(
                                             metadatas, i),
                                         predictions=[predictions[i]],
                                         time_in_s=time_delta_per_example)
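_slice_dictionary is likewise defined elsewhere in the project; from its call sites it plausibly extracts one example from every batched entry of a dictionary. A sketch under that assumption:

def _slice_dictionary(tensor_dict, index):
    # Hypothetical sketch: pick example `index` out of each batched value.
    return {key: value[index] for key, value in tensor_dict.items()}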
Code Example #6
def get_datasets(args,
                 strategy: tf.distribute.Strategy,
                 buffer_size: int = 256):
    """ Load and return preprocessed and distributed horse2zebra dataset """
    dataset, metadata = tfds.load("cycle_gan/horse2zebra",
                                  with_info=True,
                                  as_supervised=True)
    train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
    test_horses, test_zebras = dataset["testA"], dataset["testB"]

    # calculate the number of train and test steps needed per epoch
    get_size = lambda name: metadata.splits.get(name).num_examples
    num_train_samples = min([get_size('trainA'), get_size('trainB')])
    num_test_samples = min([get_size('testA'), get_size('testB')])
    args.train_steps = ceil(num_train_samples / args.global_batch_size)
    args.test_steps = ceil(num_test_samples / args.global_batch_size)

    def normalize_image(image):
        """ normalize image to [-1, 1] """
        image = tf.cast(image, dtype=tf.float32)
        return (image / 127.5) - 1.0

    def preprocess_train(image, _):
        image = tf.image.random_flip_left_right(image)
        image = tf.image.resize(image, size=IMAGE_SHAPE)
        image = tf.image.random_crop(image, size=INPUT_SHAPE)
        image = normalize_image(image)
        return image

    def preprocess_test(image, _):
        image = tf.image.resize(image, size=INPUT_SHAPE[:2])
        image = normalize_image(image)
        return image

    train_horses = train_horses.take(num_train_samples)
    train_horses = train_horses.map(preprocess_train,
                                    num_parallel_calls=AUTOTUNE)
    train_horses = train_horses.cache()
    train_horses = train_horses.shuffle(buffer_size)

    train_zebras = train_zebras.take(num_train_samples)
    train_zebras = train_zebras.map(preprocess_train,
                                    num_parallel_calls=AUTOTUNE)
    train_zebras = train_zebras.cache()
    train_zebras = train_zebras.shuffle(buffer_size)

    test_horses = test_horses.take(num_test_samples)
    test_horses = test_horses.map(preprocess_test, num_parallel_calls=AUTOTUNE)
    test_horses = test_horses.cache()

    test_zebras = test_zebras.take(num_test_samples)
    test_zebras = test_zebras.map(preprocess_test, num_parallel_calls=AUTOTUNE)
    test_zebras = test_zebras.cache()

    train_ds = tf.data.Dataset.zip(
        (train_horses.batch(args.global_batch_size),
         train_zebras.batch(args.global_batch_size))).prefetch(AUTOTUNE)
    test_ds = tf.data.Dataset.zip((test_horses.batch(args.global_batch_size),
                                   test_zebras.batch(args.global_batch_size)))
    # take 5 samples from the test set for plotting
    plot_ds = tf.data.Dataset.zip(
        (test_horses.take(5).batch(1), test_zebras.take(5).batch(1)))

    # create distributed datasets
    train_ds = strategy.experimental_distribute_dataset(train_ds)
    test_ds = strategy.experimental_distribute_dataset(test_ds)

    return train_ds, test_ds, plot_ds
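A hedged usage sketch for get_datasets: the args object only needs a global_batch_size attribute (train_steps and test_steps are written onto it inside the function). The argparse setup below is an illustrative assumption:

import argparse
import tensorflow as tf

parser = argparse.ArgumentParser()
parser.add_argument("--global_batch_size", type=int, default=8)
args = parser.parse_args([])

strategy = tf.distribute.MirroredStrategy()
train_ds, test_ds, plot_ds = get_datasets(args, strategy)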
Code Example #7
def run(train_dataset: tf.data.Dataset, eval_datasets: Dict[str,
                                                            tf.data.Dataset],
        steps_per_eval: Dict[str, int], params: utils.ModelParameters,
        model_dir: str, strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter, loss_type: str,
        graph_augmenter: augmentation_utils.GraphAugment):
    """Trains and evaluates the model."""
    with strategy.scope():
        model = ub.models.mpnn(
            nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
            edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
            num_heads=params.num_heads,
            num_layers=params.num_layers,
            message_layer_size=params.message_layer_size,
            readout_layer_size=params.readout_layer_size,
            use_gp_layer=params.use_gp_layer)
        optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=params.learning_rate)
        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/roc_auc': tf.keras.metrics.AUC(),
        }

        for dataset_name in eval_datasets:
            metrics[f'{dataset_name}/accuracy'] = (
                tf.keras.metrics.CategoricalAccuracy())
            metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
            metrics[f'{dataset_name}/negative_log_likelihood'] = (
                tf.keras.metrics.Mean())
            if dataset_name == 'test2':
                ece_num_bins = 5
            else:
                ece_num_bins = 10
            metrics[f'{dataset_name}/ece'] = (
                rm.metrics.ExpectedCalibrationError(num_bins=ece_num_bins))
            metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

    @tf.function
    def train_step(iterator):
        """Training StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            if len(inputs) == 3:
                features, labels, sample_weights = inputs
            else:
                features, labels = inputs
                sample_weights = 1

            if params.augmentations:
                # TODO(jihyeonlee): For now, choose 1 augmentation function from all
                # possible with equal probability. Allow user to specify number of
                # augmentations to apply per graph.
                features = graph_augmenter.augment(features)

            with tf.GradientTape() as tape:
                probs = model(features, training=True)
                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(labels, probs) *
                    sample_weights)

                l2_loss = sum(model.losses)
                if loss_type == 'focal':
                    focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
                    focal_loss = tf.reduce_mean(
                        focal_loss_fn(labels, probs) * sample_weights)
                    loss = focal_loss + l2_loss
                else:
                    loss = negative_log_likelihood + l2_loss
                # Scale the loss, given that tf.distribute.Strategy will sum
                # gradients across all replicas. See details in
                # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, probs)
            metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

        for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def eval_step(iterator, dataset_name, num_steps):
        """Evaluation StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            if len(inputs) == 3:
                features, labels, _ = inputs
            else:
                features, labels = inputs

            probs = model(features, training=False)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(labels, probs))

            metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
            metrics[f'{dataset_name}/roc_auc'].update_state(
                labels[:, 1], probs[:, 1])
            metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1],
                                                     label=labels[:, 1])
            metrics[f'{dataset_name}/brier'].add_batch(probs,
                                                       label=labels[:, 1])

        for _ in tf.range(tf.cast(num_steps, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    # Makes datasets into distributed version.
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    eval_datasets = {
        ds_name: strategy.experimental_distribute_dataset(ds)
        for ds_name, ds in eval_datasets.items()
    }
    logging.info('Number of replicas in sync: %s',
                 strategy.num_replicas_in_sync)

    train_iterator = iter(train_dataset)
    start_time = time.time()
    metrics_history = collections.defaultdict(list)
    for epoch in range(params.num_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        train_step(train_iterator)

        current_step = (epoch + 1) * params.steps_per_epoch
        max_steps = params.steps_per_epoch * params.num_epochs
        time_elapsed = time.time() - start_time
        steps_per_sec = float(current_step) / time_elapsed
        eta_seconds = (max_steps - current_step) / steps_per_sec
        message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                   'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                       current_step / max_steps, epoch + 1, params.num_epochs,
                       steps_per_sec, eta_seconds / 60, time_elapsed / 60))
        logging.info(message)

        # Start evaluation.
        logging.info('Starting to run eval at epoch: %s', epoch)
        for dataset_name, eval_dataset in eval_datasets.items():
            eval_iterator = iter(eval_dataset)
            eval_step(eval_iterator, dataset_name,
                      steps_per_eval[dataset_name])

        metrics_history['epoch'].append(epoch + 1)
        with summary_writer.as_default():
            for name, metric in metrics.items():
                result = utils.get_metric_result_value(metric)
                tf.summary.scalar(name, result, step=epoch + 1)
                metrics_history[name].append(str(result))

        for metric in metrics.values():
            metric.reset_states()

        model.save(os.path.join(model_dir, f'model_{epoch + 1}'),
                   overwrite=True)

    utils.write_params(metrics_history,
                       os.path.join(model_dir, 'metrics_history.json'))
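The manual division by num_replicas_in_sync in step_fn follows TensorFlow's recommended loss scaling for synchronous training. tf.nn.compute_average_loss expresses the same idea via the global batch size; a hedged equivalent with toy values (not the project's code):

import tensorflow as tf

# Dividing the summed per-example loss by the global batch size matches the
# per-replica mean divided by num_replicas_in_sync when every replica sees
# an equal share of the batch. The tensors below are illustrative only.
global_batch_size = 4
labels = tf.constant([[0., 1.], [1., 0.], [0., 1.], [1., 0.]])
probs = tf.constant([[0.2, 0.8], [0.7, 0.3], [0.4, 0.6], [0.9, 0.1]])
per_example_loss = tf.keras.losses.categorical_crossentropy(labels, probs)
scaled_loss = tf.nn.compute_average_loss(
    per_example_loss, global_batch_size=global_batch_size)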
Code Example #8
File: sngp.py  Project: google/uncertainty-baselines
def run(
    train_dataset: tf.data.Dataset,
    eval_datasets: Dict[str, tf.data.Dataset],
    steps_per_eval: Dict[str, int],
    params: utils.ModelParameters,
    model_dir: str,
    gp_layer_kwargs: Dict[str, Any],
    strategy: tf.distribute.Strategy,
    summary_writer: tf.summary.SummaryWriter,
    loss_type: str,
    use_spec_norm: bool,
    spec_norm_multiplier: float,
    use_spec_norm_mp: bool,
    spec_norm_multiplier_mp: float):
  """Trains and evaluates the model.

  Args:
    train_dataset: tf dataset that provides training data.
    eval_datasets: A dictionary of tf datasets that provides data for model
      evaluation.
    steps_per_eval: A dictionary of steps needed for each evaluation dataset.
    params: ModelParameters object containing MPNN model parameters.
    model_dir: Directory for files generated during training and evaluation.
    gp_layer_kwargs: A dictionary of parameters used for GP layer.
    strategy: tf Distributed training strategy object.
    summary_writer: tf summary writer to log training and evaluation metrics.
    loss_type: str, loss type to use during training. Currently only
      supports focal loss and cross-entropy loss.
    use_spec_norm: Whether to use Spectral normalization for the dense layer.
    spec_norm_multiplier: Multiplier used to control the magnitude of
      eigenvalue of the dense layer weight matrix.
    use_spec_norm_mp: Whether to use Spectral normalization for the MP layer.
    spec_norm_multiplier_mp: Multiplier used to control the magnitude of
      eigenvalue of the MP layer weight matrix.

  """
  with strategy.scope():
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer,
        gp_layer_kwargs=gp_layer_kwargs,
        use_spec_norm=use_spec_norm,
        spec_norm_multiplier=spec_norm_multiplier,
        use_spec_norm_mp=use_spec_norm_mp,
        spec_norm_multiplier_mp=spec_norm_multiplier_mp)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)
    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }

    for dataset_name in eval_datasets:
      metrics[
          f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy()
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[
          f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean()
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  def per_replica_train_step_fn(inputs):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, sample_weights = inputs
    else:
      features, labels = inputs
      sample_weights = 1

    with tf.GradientTape() as tape:
      probs = model(features, training=True)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs) *
          sample_weights)

      l2_loss = sum(model.losses)
      if loss_type == 'focal':
        focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
        focal_loss = tf.reduce_mean(
            focal_loss_fn(labels, probs) * sample_weights)
        loss = focal_loss + l2_loss
      else:
        loss = negative_log_likelihood + l2_loss
      # Scale the loss, given that tf.distribute.Strategy will sum gradients
      # across all replicas. See details in
      # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
      scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, probs)
    metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

  def per_replica_eval_step_fn(inputs, dataset_name):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, _ = inputs
    else:
      features, labels = inputs

    probs = model(features, training=False)
    negative_log_likelihood = tf.reduce_mean(
        tf.keras.losses.categorical_crossentropy(labels, probs))

    metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
    metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
    metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
    metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

  @tf.function
  def distributed_train_step(iterator):
    """Training StepFn."""
    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(per_replica_train_step_fn, args=(next(iterator),))

  @tf.function
  def distributed_eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""
    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(
          per_replica_eval_step_fn, args=(next(iterator), dataset_name))

  # Makes datasets into distributed version.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  train_iterator = iter(train_dataset)
  start_time = time.time()
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    distributed_train_step(train_iterator)

    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      eval_iterator = iter(eval_dataset)
      distributed_eval_step(eval_iterator, dataset_name,
                            steps_per_eval[dataset_name])

    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        metrics_history[name].append(str(result))

    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
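run() expects steps_per_eval to carry a fixed step count per evaluation dataset. A hypothetical wiring sketch (names and sizes are assumptions, not from the project):

import math

global_batch_size = 32
eval_sizes = {'test': 1000, 'test2': 500}
steps_per_eval = {
    name: math.ceil(size / global_batch_size)
    for name, size in eval_sizes.items()
}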