Example #1
def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters):
    """Builds the graph for the benchmark.

  Args:
    tower_devices: A list of device strings of the devices to run the all-reduce
      benchmark on.
    tensor_shapes: A list of shapes of the tensors that will be aggregated for
      the all-reduce.
    variable_mgr: The VariableMgr to perform the all-reduce.
    num_iters: Number of iterations to aggregate tensors for.
  Returns:
    An op that runs the benchmark.
  """
    all_device_tensors = []
    for i, tower_device in enumerate(tower_devices):
        with tf.device(tower_device):
            device_tensors = []
            for j, shape in enumerate(tensor_shapes):
                tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32),
                                     name='tensor_%d_on_device_%d' % (j, i))
                device_tensors.append(tensor)
        all_device_tensors.append(device_tensors)

    log_fn('Building all-reduce ops')
    benchmark_op = build_all_reduce_iterations(all_device_tensors,
                                               tower_devices, variable_mgr,
                                               num_iters)
    log_fn('Done building all-reduce ops')
    return benchmark_op
def run_benchmark(bench_cnn, num_iters):
  """Runs the all-reduce benchmark.

  Args:
    bench_cnn: The BenchmarkCNN where params, the variable manager, and other
      attributes are obtained.
    num_iters: Number of iterations to do all-reduce for.

  Raises:
    ValueError: Invalid params of bench_cnn.
  """
  if bench_cnn.params.variable_update != 'replicated':
    raise ValueError('--variable_update=replicated must be specified to use '
                     'the all-reduce benchmark')
  if bench_cnn.params.variable_consistency == 'relaxed':
    raise ValueError('--variable_consistency=relaxed is not supported')

  benchmark_op = build_graph(bench_cnn.raw_devices,
                             get_var_shapes(bench_cnn.model),
                             bench_cnn.variable_mgr, num_iters)
  init_ops = [
      tf.global_variables_initializer(),
      bench_cnn.variable_mgr.get_post_init_ops()
  ]
  loss_op = tf.no_op()

  if bench_cnn.graph_file:
    path, filename = os.path.split(bench_cnn.graph_file)
    as_text = filename.endswith('txt')
    log_fn('Writing GraphDef as %s to %s' % (
        'text' if as_text else 'binary', bench_cnn.graph_file))
    tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
                         path, filename, as_text)

  run_graph(benchmark_op, bench_cnn, init_ops, loss_op)
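
run_benchmark above optionally dumps the graph with tf.train.write_graph when bench_cnn.graph_file is set, writing text format when the filename ends in 'txt' and binary otherwise. A minimal sketch of loading such a dump back for inspection, assuming the same TF 1.x API used in these examples; the helper name and the file path are illustrative only:

import tensorflow as tf
from google.protobuf import text_format

def load_graph_def(graph_file):
    """Reads a GraphDef written by tf.train.write_graph (text or binary)."""
    graph_def = tf.GraphDef()
    if graph_file.endswith('txt'):
        # Text-format dump, matching the as_text convention used above.
        with tf.gfile.GFile(graph_file, 'r') as f:
            text_format.Merge(f.read(), graph_def)
    else:
        with tf.gfile.GFile(graph_file, 'rb') as f:
            graph_def.ParseFromString(f.read())
    return graph_def

# Hypothetical path; import the dump into a fresh graph for inspection.
graph = tf.Graph()
with graph.as_default():
    tf.import_graph_def(load_graph_def('/tmp/benchmark_graph.pbtxt'), name='')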
Example #3
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
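
main() above expects absl to have parsed the command-line flags (for example iters_per_step) before it runs, and to receive only the leftover positional arguments. A minimal sketch of the usual absl entry-point wiring; the flag definition and its default below are illustrative, not the benchmark's actual declaration:

from absl import app as absl_app
from absl import flags as absl_flags

# Illustrative only: the real benchmark defines this flag elsewhere.
absl_flags.DEFINE_integer('iters_per_step', 10,
                          'Number of all-reduce iterations per benchmark step.')

if __name__ == '__main__':
    # Parses flags, then calls main() with the remaining positional arguments
    # (argv[0] is the program name, hence the len() >= 1 assertion above).
    absl_app.run(main)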
Example #4
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
                              params.model):
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

        tfversion = cnn_util.tensorflow_version_tuple()
        log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

        bench.print_info()
        bench.run()
Example #5
def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op):
    """Runs the graph for the benchmark.

  Args:
    benchmark_op: An op that runs the benchmark.
    bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
    init_ops: A list of ops that are run before `benchmark_op` for
      initialization.
    dummy_loss_op: Any op. We must pass a loss op to
      `benchmark_cnn.benchmark_one_step`, but the result of the op is never
      actually used.
  """
    config = benchmark_cnn.create_config_proto(bench_cnn.params)
    with tf.Session(config=config) as sess:
        for op in init_ops:
            sess.run(op)
        step_train_times = []
        fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op}
        log_fn('Running warmup')
        for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches):
            if i == 0:
                log_fn('Running all-reduce ops')
                start = time.time()
            if i > 0 and i % bench_cnn.params.display_every == 0:
                log_fn('Iteration: %d. Average time per step so far: %s' %
                       (i, (time.time() - start) / i))
            # Call benchmark_one_step instead of directly calling sess.run(...), to
            # potentially get a trace file, partitioned graphs, etc.
            benchmark_cnn.benchmark_one_step(
                sess=sess,
                fetches=fetches,
                step=i,
                # The batch size is only used for the images/sec calculation, which is
                # not actually calculated because we pass show_images_per_sec=False.
                batch_size=None,
                step_train_times=step_train_times,
                trace_filename=bench_cnn.trace_filename,
                partitioned_graph_file_prefix=(
                    bench_cnn.params.partitioned_graph_file_prefix),
                profiler=None,
                image_producer=None,
                params=bench_cnn.params,
                show_images_per_sec=False)
        log_fn('Average time per step: %s' %
               ((time.time() - start) / bench_cnn.num_batches))
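
The loop above uses a negative step index for the warmup iterations and only starts the timer once the index reaches zero, so the reported average excludes warmup. A minimal, TensorFlow-free sketch of that pattern; timed_loop and the iteration counts are placeholders:

import time

def timed_loop(step_fn, num_warmup, num_batches):
    """Runs step_fn num_warmup + num_batches times; times only the last num_batches."""
    start = None
    for i in range(-num_warmup, num_batches):
        if i == 0:
            start = time.time()  # Timing begins after the last warmup step.
        step_fn()
    return (time.time() - start) / num_batches

# Example: a trivial step, 5 warmup iterations, 20 timed iterations.
average_seconds = timed_loop(lambda: sum(range(10000)), num_warmup=5, num_batches=20)
print('Average time per step: %s' % average_seconds)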
Example #6
    def postprocess(self, results):
        """Postprocess results returned from model."""
        try:
            from cnn_quantization.tf_cnn_benchmarks import coco_metric  # pylint: disable=g-import-not-at-top
        except ImportError:
            raise ImportError(
                'To use the COCO dataset, you must clone the '
                'repo https://github.com/tensorflow/models and add '
                'tensorflow/models and tensorflow/models/research to '
                'the PYTHONPATH, and compile the protobufs. To evaluate using '
                'the COCO metric, download and install the Python COCO API '
                'from https://github.com/cocodataset/cocoapi.')

        pred_boxes = results[ssd_constants.PRED_BOXES]
        pred_scores = results[ssd_constants.PRED_SCORES]
        # TODO(haoyuzhang): maybe use these values for visualization.
        # gt_boxes = results['gt_boxes']
        # gt_classes = results['gt_classes']
        source_id = results[ssd_constants.SOURCE_ID]
        raw_shape = results[ssd_constants.RAW_SHAPE]

        # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
        # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
        # `num_eval_epochs` to 1 is not enough and will often miss some images. We
        # expect user to set `num_eval_epochs` to >1, which will leave some unused
        # images from previous steps in `predictions`. Here we check if we are doing
        # eval at a new global step.
        if results['global_step'] > self.eval_global_step:
            self.eval_global_step = results['global_step']
            self.predictions.clear()

        for i, sid in enumerate(source_id):
            self.predictions[int(sid)] = {
                ssd_constants.PRED_BOXES: pred_boxes[i],
                ssd_constants.PRED_SCORES: pred_scores[i],
                ssd_constants.SOURCE_ID: source_id[i],
                ssd_constants.RAW_SHAPE: raw_shape[i]
            }

        # COCO metric calculates mAP only after a full epoch of evaluation. Return
        # dummy results for top_N_accuracy to be compatible with benchmark_cnn.py.
        if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
            log_fn('Got results for all {:d} eval examples. Calculating mAP...'.
                   format(ssd_constants.COCO_NUM_VAL_IMAGES))

            annotation_file = os.path.join(self.params.data_dir,
                                           ssd_constants.ANNOTATION_FILE)
            # The size of predictions before decoding is about 15--30 GB, while
            # the size after decoding is 100--200 MB. When using async eval
            # mode, decoding takes 20--30 seconds of main-thread time but is
            # necessary to avoid OOM during inter-process communication.
            decoded_preds = coco_metric.decode_predictions(
                self.predictions.values())
            self.predictions.clear()

            if self.params.collect_eval_results_async:

                def _eval_results_getter():
                    """Iteratively get eval results from async eval process."""
                    while True:
                        step, eval_results = (
                            self.async_eval_results_queue.get())
                        self.eval_coco_ap = eval_results['COCO/AP']
                        mlperf.logger.log_eval_accuracy(
                            self.eval_coco_ap, step,
                            self.batch_size * self.params.num_gpus,
                            ssd_constants.COCO_NUM_TRAIN_IMAGES)
                        if self.reached_target():
                            # Reached target, clear all pending messages in predictions queue
                            # and insert poison pill to stop the async eval process.
                            while not self.async_eval_predictions_queue.empty():
                                self.async_eval_predictions_queue.get()
                            self.async_eval_predictions_queue.put('STOP')
                            break

                if not self.async_eval_process:
                    # Limit the number of messages in the predictions queue to
                    # prevent OOM. Each message (predictions data) can consume a
                    # lot of memory, and normally there should only be a few
                    # messages in the queue. If this often blocks, consider
                    # reducing the eval frequency.
                    self.async_eval_predictions_queue = multiprocessing.Queue(2)
                    self.async_eval_results_queue = multiprocessing.Queue()

                    # The reason to use a Process rather than a Thread is mainly
                    # the computationally intensive eval runner: Python threads
                    # do not truly run in parallel, so a runner thread would get
                    # significantly delayed (or would delay the main thread).
                    self.async_eval_process = multiprocessing.Process(
                        target=coco_metric.async_eval_runner,
                        args=(self.async_eval_predictions_queue,
                              self.async_eval_results_queue, annotation_file))
                    self.async_eval_process.daemon = True
                    self.async_eval_process.start()

                    self.async_eval_results_getter_thread = threading.Thread(
                        target=_eval_results_getter, args=())
                    self.async_eval_results_getter_thread.daemon = True
                    self.async_eval_results_getter_thread.start()

                self.async_eval_predictions_queue.put(
                    (self.eval_global_step, decoded_preds))
                return {'top_1_accuracy': 0, 'top_5_accuracy': 0.}

            eval_results = coco_metric.compute_map(decoded_preds,
                                                   annotation_file)
            self.eval_coco_ap = eval_results['COCO/AP']
            ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
            for metric_key, metric_value in eval_results.items():
                ret[constants.SIMPLE_VALUE_RESULT_PREFIX +
                    metric_key] = metric_value
            mlperf.logger.log_eval_accuracy(
                self.eval_coco_ap, self.eval_global_step,
                self.batch_size * self.params.num_gpus,
                ssd_constants.COCO_NUM_TRAIN_IMAGES)
            return ret
        log_fn('Got {:d} out of {:d} eval examples.'
               ' Waiting for the remaining to calculate mAP...'.format(
                   len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
        return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
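
The async path above pushes decoded predictions to a separate eval process through a bounded queue, reads the metrics back on a daemon thread, and stops the worker with a 'STOP' poison pill. A minimal sketch of that producer/consumer pattern with a stand-in worker in place of coco_metric.async_eval_runner; every name below is illustrative:

import multiprocessing
import threading

def eval_worker(task_queue, result_queue):
    """Stand-in worker: consume tasks until the 'STOP' poison pill arrives."""
    for step, payload in iter(task_queue.get, 'STOP'):
        result_queue.put((step, {'score': len(payload)}))  # Placeholder "metric".

def main():
    task_queue = multiprocessing.Queue(2)   # Bounded, as in the code above.
    result_queue = multiprocessing.Queue()

    process = multiprocessing.Process(target=eval_worker,
                                      args=(task_queue, result_queue))
    process.daemon = True
    process.start()

    def results_getter():
        step, results = result_queue.get()
        print('step %d -> %s' % (step, results))

    getter = threading.Thread(target=results_getter)
    getter.daemon = True
    getter.start()

    task_queue.put((0, [1, 2, 3]))  # Producer side: enqueue one batch of work.
    getter.join()
    task_queue.put('STOP')          # Poison pill: ends the worker's iter() loop.
    process.join()

if __name__ == '__main__':
    main()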