def benchmark_custom_training_mnist_bs_512_gpu_2(self):
    """Benchmark the custom MNIST training loop on two GPUs.

    Runs with batch_size=512, run_iters=10, num_gpus=2 and
    distribution_strategy='mirrored', then hands the measured wall time
    and metrics to `report_benchmark`.
    """
    batch_size, run_iters = 512, 10

    shuffled = self.train_dataset.shuffle(buffer_size=1024)
    train_dataset = shuffled.batch(batch_size)

    distribution_strategy = 'mirrored'
    strategy = distribution_util.get_distribution_strategy(
        distribution_strategy=distribution_strategy, num_gpus=2)

    # The dataset is left undistributed only when the strategy is 'off'.
    if not distribution_strategy == 'off':
        train_dataset = strategy.experimental_distribute_dataset(
            train_dataset)

    # Loss, optimizer and model are all created inside the strategy scope.
    with distribution_util.get_strategy_scope(strategy):
        # The loss is built with reduction disabled (Reduction.NONE).
        no_reduction = tf.keras.losses.Reduction.NONE
        loss_fn = tf.keras.losses.CategoricalCrossentropy(
            reduction=no_reduction)
        optimizer = tf.keras.optimizers.Adam()
        model = self._build_model()

    metrics, wall_time = self.measure_performance(
        model, train_dataset, loss_fn, optimizer, batch_size, run_iters,
        self.epochs, strategy)

    extras = benchmark_util.get_keras_examples_metadata(
        'conv', batch_size, '.keras.ctl_graph')
    self.report_benchmark(
        iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras)
Example #2
0
def measure_performance(model_fn,
                        x=None,
                        y=None,
                        epochs=2,
                        batch_size=32,
                        run_iters=4,
                        optimizer=None,
                        loss=None,
                        metrics=None,
                        verbose=0,
                        num_gpus=0,
                        distribution_strategy='off'):
    """Run models and measure the performance.

    Each of the `run_iters` iterations builds a fresh model with `model_fn`,
    compiles it, runs one untimed warm-up epoch and then a timed `fit()` of
    `epochs` epochs. The reported values are means over all iterations.

    Args:
      model_fn: Model function to be benchmarked.
      x: Input data. See `x` in the `fit()` method of `keras.Model`.
      y: Target data. See `y` in the `fit()` method of `keras.Model`.
      epochs: Integer. Number of epochs to train the model.
        If unspecified, `epochs` will default to 2.
      batch_size: Integer. Number of samples per gradient update. If
        unspecified, `batch_size` will default to 32.
      run_iters: Integer. Number of iterations to run the performance
        measurement. If unspecified, `run_iters` will default to 4.
      optimizer: String (name of optimizer) or optimizer instance. See
        `tf.keras.optimizers`.
      loss: String (name of objective function), objective function or
        `tf.keras.losses.Loss` instance. See `tf.keras.losses`.
      metrics: Lists of metrics to be evaluated by the model during training.
        See `metrics` in the `compile()` method of `keras.Model`.
      verbose: 0, 1, 2. Verbosity mode. See `verbose` in the `fit()` method
        of `keras.Model`. If unspecified, `verbose` will default to 0.
      num_gpus: Number of GPUs to run the model.
      distribution_strategy: Distribution strategies. It could be
        `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified,
        `distribution_strategy` will default to 'off'. Note that, `TPU`
        and `parameter_server` are not supported yet.

    Returns:
      A tuple `(metrics, wall_time, extras)` where `metrics` is a list of
      `{'name': ..., 'value': ...}` dicts for build_time, compile_time,
      startup_time, avg_epoch_time, exp_per_sec and epochs; `wall_time` is
      the mean wall time of one iteration; and `extras` is a dict holding
      `distribution_strategy` and `num_gpus`.

    Raises:
      ValueError: If `x` is None or if `optimizer` is not provided or
        if `loss` is not provided or if `num_gpus` is negative.
    """
    # Compare the arguments themselves (not string literals) against None so
    # that the documented validation actually triggers.
    if x is None:
        raise ValueError('Input data is required.')
    if optimizer is None:
        raise ValueError('Optimizer is required.')
    if loss is None:
        raise ValueError('Loss function is required.')
    if num_gpus < 0:
        raise ValueError('`num_gpus` cannot be negative')

    # TODO(xingyulong): we will add tfds support later and
    #  get the `num_examples` from info.
    num_examples = x.shape[0]
    total_num_examples = epochs * num_examples

    build_time_list, compile_time_list, startup_time_list = [], [], []
    avg_epoch_time_list, wall_time_list, exp_per_sec_list = [], [], []

    strategy = distribution_util.get_distribution_strategy(
        distribution_strategy=distribution_strategy, num_gpus=num_gpus)

    timer = timeit.default_timer

    for _ in range(run_iters):
        start_time = timer()
        # Init the distribution strategy scope for each iteration.
        strategy_scope = distribution_util.get_strategy_scope(strategy)
        with strategy_scope:
            t0 = timer()
            model = model_fn()
            build_time = timer() - t0

            t1 = timer()
            model.compile(
                optimizer=optimizer,
                loss=loss,
                metrics=metrics,
            )
            compile_time = timer() - t1

        # Run one warm up epoch; it honours `verbose` like the timed run so
        # that verbose=0 keeps the whole benchmark quiet.
        model.fit(x=x, y=y, batch_size=batch_size, epochs=1, verbose=verbose)

        cbk = TimerCallBack()
        t2 = timer()
        model.fit(x=x,
                  y=y,
                  batch_size=batch_size,
                  epochs=epochs,
                  callbacks=[cbk],
                  verbose=verbose)
        end_time = timer()

        build_time_list.append(build_time)
        compile_time_list.append(compile_time)
        startup_time_list.append(cbk.startup_time)
        avg_epoch_time_list.append(np.mean(cbk.times))
        # Wall time spans build, compile, warm-up and the timed fit.
        wall_time_list.append(end_time - start_time)
        # Throughput only counts the timed fit (warm-up excluded).
        exp_per_sec_list.append(total_num_examples / (end_time - t2))

    # Use a separate name so the `metrics` argument is not shadowed.
    results = [
        {'name': 'build_time', 'value': np.mean(build_time_list)},
        {'name': 'compile_time', 'value': np.mean(compile_time_list)},
        {'name': 'startup_time', 'value': np.mean(startup_time_list)},
        {'name': 'avg_epoch_time', 'value': np.mean(avg_epoch_time_list)},
        {'name': 'exp_per_sec', 'value': np.mean(exp_per_sec_list)},
        {'name': 'epochs', 'value': epochs},
    ]

    wall_time = np.mean(wall_time_list)
    extras = {
        'distribution_strategy': distribution_strategy,
        'num_gpus': num_gpus
    }

    return results, wall_time, extras