def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters):
  """Builds the graph for the benchmark.

  Args:
    tower_devices: A list of device strings of the devices to run the
      all-reduce benchmark on.
    tensor_shapes: A list of shapes of the tensors that will be aggregated for
      the all-reduce.
    variable_mgr: The VariableMgr to perform the all-reduce.
    num_iters: Number of iterations to aggregate tensors for.

  Returns:
    An op that runs the benchmark.
  """
  all_device_tensors = []
  for i, tower_device in enumerate(tower_devices):
    with tf.device(tower_device):
      device_tensors = []
      for j, shape in enumerate(tensor_shapes):
        tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32),
                             name='tensor_%d_on_device_%d' % (j, i))
        device_tensors.append(tensor)
    all_device_tensors.append(device_tensors)

  log_fn('Building all-reduce ops')
  benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices,
                                             variable_mgr, num_iters)
  log_fn('Done building all-reduce ops')
  return benchmark_op
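# A minimal usage sketch with hypothetical values: build_graph only needs the
# tower device strings, the tensor shapes to aggregate, a VariableMgr, and an
# iteration count, e.g.
#
#   benchmark_op = build_graph(
#       tower_devices=['/gpu:0', '/gpu:1'],
#       tensor_shapes=[[1024, 1024], [4096]],
#       variable_mgr=bench_cnn.variable_mgr,
#       num_iters=100)
#
# In practice these arguments come from a BenchmarkCNN instance, as done in
# run_benchmark below.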
def run_benchmark(bench_cnn, num_iters):
  """Runs the all-reduce benchmark.

  Args:
    bench_cnn: The BenchmarkCNN where params, the variable manager, and other
      attributes are obtained.
    num_iters: Number of iterations to do all-reduce for.

  Raises:
    ValueError: If the params of bench_cnn are invalid.
  """
  if bench_cnn.params.variable_update != 'replicated':
    raise ValueError('--variable_update=replicated must be specified to use '
                     'the all-reduce benchmark')
  if bench_cnn.params.variable_consistency == 'relaxed':
    raise ValueError('--variable_consistency=relaxed is not supported')

  benchmark_op = build_graph(bench_cnn.raw_devices,
                             get_var_shapes(bench_cnn.model),
                             bench_cnn.variable_mgr, num_iters)
  init_ops = [
      tf.global_variables_initializer(),
      bench_cnn.variable_mgr.get_post_init_ops()
  ]
  loss_op = tf.no_op()

  if bench_cnn.graph_file:
    path, filename = os.path.split(bench_cnn.graph_file)
    as_text = filename.endswith('txt')
    log_fn('Writing GraphDef as %s to %s' % (
        'text' if as_text else 'binary', bench_cnn.graph_file))
    tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
                         path, filename, as_text)

  run_graph(benchmark_op, bench_cnn, init_ops, loss_op)
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To
  # prevent this from silently running with distortions, we do not allow
  # positional arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))

  run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To
  # prevent this from silently running with distortions, we do not allow
  # positional arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
                            params.model):
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))

    bench.print_info()
    bench.run()
def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op):
  """Runs the graph for the benchmark.

  Args:
    benchmark_op: An op that runs the benchmark.
    bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
    init_ops: A list of ops that are run before `benchmark_op` for
      initialization.
    dummy_loss_op: Any op. We must pass a loss op to
      `benchmark_cnn.benchmark_one_step`, but the result of the op is never
      actually used.
  """
  config = benchmark_cnn.create_config_proto(bench_cnn.params)
  with tf.Session(config=config) as sess:
    for op in init_ops:
      sess.run(op)
    step_train_times = []
    fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op}
    log_fn('Running warmup')
    for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches):
      if i == 0:
        log_fn('Running all-reduce ops')
        start = time.time()
      if i > 0 and i % bench_cnn.params.display_every == 0:
        log_fn('Iteration: %d. Average time per step so far: %s' %
               (i, (time.time() - start) / i))
      # Call benchmark_one_step instead of directly calling sess.run(...), to
      # potentially get a trace file, partitioned graphs, etc.
      benchmark_cnn.benchmark_one_step(
          sess=sess,
          fetches=fetches,
          step=i,
          # The batch size is only used for the images/sec calculation, which
          # is not actually calculated because we pass
          # show_images_per_sec=False.
          batch_size=None,
          step_train_times=step_train_times,
          trace_filename=bench_cnn.trace_filename,
          partitioned_graph_file_prefix=(
              bench_cnn.params.partitioned_graph_file_prefix),
          profiler=None,
          image_producer=None,
          params=bench_cnn.params,
          show_images_per_sec=False)
    log_fn('Average time per step: %s' %
           ((time.time() - start) / bench_cnn.num_batches))
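# Timing note for the loop above: steps in [-num_warmup_batches, 0) are warmup
# and are excluded from timing; `start` is captured at step 0, so the final
# average divides the post-warmup wall time by num_batches. A hedged
# invocation sketch (normally run_benchmark builds these arguments):
#
#   run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op=tf.no_op())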
def postprocess(self, results):
    """Postprocesses results returned from the model."""
    try:
      # pylint: disable=g-import-not-at-top
      from cnn_quantization.tf_cnn_benchmarks import coco_metric
    except ImportError:
      raise ImportError('To use the COCO dataset, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models and tensorflow/models/research to '
                        'the PYTHONPATH, and compile the protobufs. '
                        'To evaluate using the COCO metric, download and '
                        'install the Python COCO API from '
                        'https://github.com/cocodataset/cocoapi')

    pred_boxes = results[ssd_constants.PRED_BOXES]
    pred_scores = results[ssd_constants.PRED_SCORES]
    # TODO(haoyuzhang): maybe use these values for visualization.
    # gt_boxes = results['gt_boxes']
    # gt_classes = results['gt_classes']
    source_id = results[ssd_constants.SOURCE_ID]
    raw_shape = results[ssd_constants.RAW_SHAPE]

    # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
    # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
    # `num_eval_epochs` to 1 is not enough and will often miss some images. We
    # expect the user to set `num_eval_epochs` to >1, which will leave some
    # unused images from previous steps in `predictions`. Here we check whether
    # we are doing eval at a new global step.
    if results['global_step'] > self.eval_global_step:
      self.eval_global_step = results['global_step']
      self.predictions.clear()

    for i, sid in enumerate(source_id):
      self.predictions[int(sid)] = {
          ssd_constants.PRED_BOXES: pred_boxes[i],
          ssd_constants.PRED_SCORES: pred_scores[i],
          ssd_constants.SOURCE_ID: source_id[i],
          ssd_constants.RAW_SHAPE: raw_shape[i]
      }

    # The COCO metric calculates mAP only after a full epoch of evaluation.
    # Return dummy results for top_N_accuracy to be compatible with
    # benchmark_cnn.py.
    if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
      log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format(
          ssd_constants.COCO_NUM_VAL_IMAGES))

      annotation_file = os.path.join(self.params.data_dir,
                                     ssd_constants.ANNOTATION_FILE)
      # The size of predictions before decoding is about 15--30GB, while the
      # size after decoding is 100--200MB. When using async eval mode, decoding
      # takes 20--30 seconds of main thread time but is necessary to avoid OOM
      # during inter-process communication.
      decoded_preds = coco_metric.decode_predictions(self.predictions.values())
      self.predictions.clear()

      if self.params.collect_eval_results_async:
        def _eval_results_getter():
          """Iteratively gets eval results from the async eval process."""
          while True:
            step, eval_results = self.async_eval_results_queue.get()
            self.eval_coco_ap = eval_results['COCO/AP']
            mlperf.logger.log_eval_accuracy(
                self.eval_coco_ap, step,
                self.batch_size * self.params.num_gpus,
                ssd_constants.COCO_NUM_TRAIN_IMAGES)
            if self.reached_target():
              # Reached target: clear all pending messages in the predictions
              # queue and insert the poison pill to stop the async eval
              # process.
              while not self.async_eval_predictions_queue.empty():
                self.async_eval_predictions_queue.get()
              self.async_eval_predictions_queue.put('STOP')
              break

        if not self.async_eval_process:
          # Limit the number of messages in the predictions queue to prevent
          # OOM. Each message (predictions data) can potentially consume a lot
          # of memory, and normally there should only be a few messages in the
          # queue. If the main thread is often blocked here, consider reducing
          # the eval frequency.
          self.async_eval_predictions_queue = multiprocessing.Queue(2)
          self.async_eval_results_queue = multiprocessing.Queue()

          # The reason to use a Process as opposed to a Thread is mainly that
          # the eval runner is computationally intensive. Python multithreading
          # does not truly run in parallel, so a runner thread would get
          # significantly delayed (or alternatively delay the main thread).
          self.async_eval_process = multiprocessing.Process(
              target=coco_metric.async_eval_runner,
              args=(self.async_eval_predictions_queue,
                    self.async_eval_results_queue, annotation_file))
          self.async_eval_process.daemon = True
          self.async_eval_process.start()

          self.async_eval_results_getter_thread = threading.Thread(
              target=_eval_results_getter, args=())
          self.async_eval_results_getter_thread.daemon = True
          self.async_eval_results_getter_thread.start()

        self.async_eval_predictions_queue.put(
            (self.eval_global_step, decoded_preds))
        return {'top_1_accuracy': 0, 'top_5_accuracy': 0.}

      eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
      self.eval_coco_ap = eval_results['COCO/AP']
      ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
      for metric_key, metric_value in eval_results.items():
        ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
      mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step,
                                      self.batch_size * self.params.num_gpus,
                                      ssd_constants.COCO_NUM_TRAIN_IMAGES)
      return ret

    log_fn('Got {:d} out of {:d} eval examples.'
           ' Waiting for the remaining to calculate mAP...'.format(
               len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
    return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
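# Illustrative sketch (not part of the benchmark code): the async-eval wiring
# above is a bounded-queue producer/consumer pattern with a 'STOP' poison
# pill. The toy below uses hypothetical names (_toy_eval_runner,
# _toy_async_eval_demo) and a dummy metric; it only demonstrates the control
# flow, not coco_metric.async_eval_runner itself, and assumes the same
# module-level multiprocessing/threading imports used by postprocess above.
def _toy_eval_runner(work_queue, results_queue):
  """Consumes (step, data) messages until the 'STOP' poison pill arrives."""
  while True:
    message = work_queue.get()
    if message == 'STOP':
      break
    step, data = message
    # Stand-in for the expensive eval; the real runner computes COCO mAP.
    results_queue.put((step, {'metric': float(len(data))}))


def _toy_async_eval_demo():
  """Wires up the queues, eval process, and results-getter thread."""
  work_queue = multiprocessing.Queue(2)  # bounded, to limit memory use
  results_queue = multiprocessing.Queue()

  runner = multiprocessing.Process(
      target=_toy_eval_runner, args=(work_queue, results_queue))
  runner.daemon = True
  runner.start()

  collected = []

  def _results_getter():
    # Stop after the first result, mimicking "reached target" above: drain any
    # pending work, then send the poison pill so the eval process exits.
    collected.append(results_queue.get())
    while not work_queue.empty():
      work_queue.get()
    work_queue.put('STOP')

  getter = threading.Thread(target=_results_getter)
  getter.daemon = True
  getter.start()

  # Producer side: (step, predictions), like the put() in postprocess above.
  work_queue.put((1, [0.1, 0.2, 0.3]))
  getter.join()
  runner.join()
  return collected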