def test_profile(self): profiler.start() with traceme.TraceMe('three_times_five'): three = constant_op.constant(3) five = constant_op.constant(5) product = three * five self.assertAllEqual(15, product) with self.assertRaises(profiler.ProfilerAlreadyRunningError): profiler.start() profile_result = profiler.stop() profile_pb = trace_events_pb2.Trace() profile_pb.ParseFromString(profile_result) devices = frozenset(device.name for device in profile_pb.devices.values()) self.assertIn('/host:CPU', devices) if not test_util.IsBuiltWithROCm() and config.list_physical_devices( 'GPU'): # device tracing is not yet supported on the ROCm platform self.assertIn('/device:GPU:0', devices) events = frozenset(event.name for event in profile_pb.trace_events) self.assertIn('three_times_five', events) self.assertIn('Mul', events) with self.assertRaises(profiler.ProfilerNotRunningError): profiler.stop()
def benchmark_keras_model_functional_fit_graph_mode_with_profiler(self): profiler.start() with context.graph_mode(): model = make_keras_model(initializer="glorot_uniform") self._benchmark_keras_model_fit(model) result = profiler.stop() assert result is not None
def trace_on(graph=True, profiler=False): # pylint: disable=redefined-outer-name """Enables execution trace. Args: graph: whether to collect graphs used in execution profiler: whether to enable profiler. Returns: None """ if not context.context().executing_eagerly(): logging.warn("Must enable trace in eager mode.") return global _current_trace_context with _current_trace_context_lock: if _current_trace_context: logging.warn("Trace already enabled") return if graph and not profiler: context.context().enable_graph_collection() if profiler: context.context().enable_run_metadata() _profiler.start() _current_trace_context = _TraceContext(graph=graph, profiler=profiler)
def benchmark_keras_model_functional_fit_run_model_eagerly_with_profiler( self): profiler.start() model = make_keras_model(initializer="glorot_uniform") self._benchmark_keras_model_fit(model, run_eagerly=True) result = profiler.stop() assert result is not None
def main(): profile = None if "--run-cProfile" in sys.argv: import cProfile import pstats profile_file = "cProfile.log" print("Running cProfile, writing output to '{}'".format(profile_file), file=sys.stderr) profile = cProfile.Profile() profile.enable() parser = create_argparser() if len(sys.argv) < 2: parser.error("Too few arguments, run '{} --help' for more information.".format(lidbox.__name__)) # TODO when a subcommand is used incorrectly, get usage strings for its subparser instead of the root parser args = parser.parse_args() tf_profiler = None if args.run_tf_profiler: from tensorflow.python.eager import profiler as tf_profiler tf_profile_file = os.path.abspath(os.path.join("tf_profile", str(int(time.time())))) print("Running TensorFlow profiler, writing output to '{}'".format(tf_profile_file), file=sys.stderr) tf_profiler.start() ret = 1 try: # Initialize a Command object from the class specified in args.cmd_class and remove the class from args command = args.__dict__.pop("cmd_class")(args) ret = command.run() finally: if tf_profiler: tf_profiler_result = tf_profiler.stop() tf_profiler.save(tf_profile_file, tf_profiler_result) if profile: profile.disable() with open(profile_file, "w") as out_f: pstats.Stats(profile, stream=out_f).sort_stats("tottime").print_stats() if ret: sys.exit(ret)
def _start_profiler(): from tensorflow.python.eager import profiler # pylint: disable=g-import-not-at-top try: logging.info('Starting profiler') profiler.start() except Exception: # pylint: disable=W0703 logging.error('Profiler failed to start due to error:\n %s', traceback.format_exc())
def test_profile_exceptions(self): logdir = self.get_temp_dir() profiler.start(logdir) with self.assertRaises(errors.AlreadyExistsError): profiler.start(logdir) profiler.stop() with self.assertRaises(errors.UnavailableError): profiler.stop()
def _start_profiler(): from tensorflow.python.eager import profiler # pylint: disable=g-import-not-at-top try: profiler.start() logging.info('Started TensorFlow profiler') except Exception: # pylint: disable=broad-except logging.error('TensorFlow profiler failed to start due to error:\n %s', traceback.format_exc())
def test_profile(self): profiler.start() three = constant_op.constant(3) five = constant_op.constant(5) product = three * five self.assertAllEqual(15, product) profile_result = profiler.stop() profile_pb = trace_events_pb2.Trace() profile_pb.ParseFromString(profile_result) profile_pb_str = '%s' % profile_pb self.assertTrue('Mul' in profile_pb_str)
def test_save_profile(self): logdir = self.get_temp_dir() profiler.start(logdir) with traceme.TraceMe('three_times_five'): three = constant_op.constant(3) five = constant_op.constant(5) product = three * five self.assertAllEqual(15, product) profiler.stop() file_list = gfile.ListDirectory(logdir) self.assertEqual(len(file_list), 2) for file_name in gfile.ListDirectory(logdir): if gfile.IsDirectory(os.path.join(logdir, file_name)): self.assertEqual(file_name, 'plugins') else: self.assertTrue(file_name.endswith('.profile-empty')) profile_dir = os.path.join(logdir, 'plugins', 'profile') run = gfile.ListDirectory(profile_dir)[0] hostname = socket.gethostname() overview_page = os.path.join(profile_dir, run, hostname + '.overview_page.pb') self.assertTrue(gfile.Exists(overview_page)) input_pipeline = os.path.join(profile_dir, run, hostname + '.input_pipeline.pb') self.assertTrue(gfile.Exists(input_pipeline)) tensorflow_stats = os.path.join(profile_dir, run, hostname + '.tensorflow_stats.pb') self.assertTrue(gfile.Exists(tensorflow_stats)) trace_file = os.path.join(profile_dir, run, hostname + '.trace') self.assertTrue(gfile.Exists(trace_file)) with gfile.Open(trace_file, 'rb') as f: profile_pb = trace_events_pb2.Trace() profile_pb.ParseFromString(f.read()) devices = frozenset(device.name for device in profile_pb.devices.values()) self.assertIn('/host:CPU', devices) if config.list_physical_devices('GPU'): self.assertIn('/device:GPU:0', devices) events = frozenset(event.name for event in profile_pb.trace_events) self.assertIn('three_times_five', events) self.assertIn('Mul:Mul', events)
def trace_on(graph=True, profiler=False): # pylint: disable=redefined-outer-name """Starts a trace to record computation graphs and profiling information. Must be invoked in eager mode. When enabled, TensorFlow runtime will collection information that can later be exported and consumed by TensorBoard. The trace is activated across the entire TensorFlow runtime and affects all threads of execution. To stop the trace and export the collected information, use `tf.summary.trace_export`. To stop the trace without exporting, use `tf.summary.trace_off`. Args: graph: If True, enables collection of executed graphs. It includes ones from tf.function invocation and ones from the legacy graph mode. The default is True. profiler: If True, enables the advanced profiler. Enabling profiler implicitly enables the graph collection. The profiler may incur a high memory overhead. The default is False. """ if ops.inside_function(): logging.warn("Cannot enable trace inside a tf.function.") return if not context.executing_eagerly(): logging.warn("Must enable trace in eager mode.") return global _current_trace_context with _current_trace_context_lock: if _current_trace_context: logging.warn("Trace already enabled") return if graph and not profiler: context.context().enable_graph_collection() if profiler: context.context().enable_run_metadata() _profiler.start() _current_trace_context = _TraceContext(graph=graph, profiler=profiler)
def trace_on(graph=True, profiler=False): # pylint: disable=redefined-outer-name """Starts a trace to record computation graphs and profiling information. Must be invoked in eager mode. When enabled, TensorFlow runtime will collection information that can later be exported and consumed by TensorBoard. The trace is activated across the entire TensorFlow runtime and affects all threads of execution. To stop the trace and export the collected information, use `tf.summary.trace_export`. To stop the trace without exporting, use `tf.summary.trace_off`. Args: graph: If True, enables collection of executed graphs. It includes ones from tf.function invocation and ones from the legacy graph mode. The default is True. profiler: If True, enables the advanced profiler. Enabling profiler implicitly enables the graph collection. The profiler may incur a high memory overhead. The default is False. """ if ops.inside_function(): logging.warn("Cannot enable trace inside a tf.function.") return if not context.context().executing_eagerly(): logging.warn("Must enable trace in eager mode.") return global _current_trace_context with _current_trace_context_lock: if _current_trace_context: logging.warn("Trace already enabled") return if graph and not profiler: context.context().enable_graph_collection() if profiler: context.context().enable_run_metadata() _profiler.start() _current_trace_context = _TraceContext(graph=graph, profiler=profiler)
def test_profile(self): profiler.start() three = constant_op.constant(3) five = constant_op.constant(5) product = three * five self.assertAllEqual(15, product) with self.assertRaises(profiler.ProfilerAlreadyRunningError): profiler.start() profile_result = profiler.stop() profile_pb = trace_events_pb2.Trace() profile_pb.ParseFromString(profile_result) devices = frozenset(device.name for device in profile_pb.devices.values()) self.assertIn('/host:CPU', devices) if config.list_physical_devices('GPU'): self.assertIn('/device:GPU:0', devices) events = frozenset(event.name for event in profile_pb.trace_events) self.assertIn('Mul:Mul', events) with self.assertRaises(profiler.ProfilerNotRunningError): profiler.stop()
def custom_training(waveGlow, hparams, optimizer, checkpoint, manager_checkpoint): step = tf.cast(checkpoint.step, tf.int64) for epoch in tf.range(1): tf.summary.text(name='epoch', data='Start epoch {}'.format(epoch.numpy()) +\ 'at ' + datetime.now().strftime("%Y%m%d-%H%M%S"), step=step) for int_step, (step, x_train) in zip(range(50), training_dataset.enumerate(start=step)): if int_step == 2: profiler.start() train_step(step=step, x_train=x_train, waveGlow=waveGlow, hparams=hparams, optimizer=optimizer) if tf.equal(step % hparams['save_model_every'], 0): save_path = manager_checkpoint.save() tf.summary.text(name='save_checkpoint', data="Saved checkpoint in" + save_path, step=step) if tf.equal(step % hparams['save_audio_every'], 0): utils.eval_step(eval_dataset=validation_dataset, waveGlow=waveGlow, hparams=hparams, step=step) if int_step == 50: profiler_result = profiler.stop() profiler.save(hparams['log_dir'], profiler_result) break checkpoint.step.assign_add(1)
def on_batch_end(self, batch, logs=None): """Writes scalar summaries for metrics on every training batch. Performs profiling if current batch is in profiler_batches. """ # Don't output batch_size and batch number as TensorBoard summaries logs = logs or {} self._samples_seen += logs.get('size', 1) samples_seen_since = self._samples_seen - self._samples_seen_at_last_write if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq: batch_logs = {('batch_' + k): v for k, v in logs.items() if k not in ['batch', 'size', 'num_steps']} self._write_custom_summaries(self._total_batches_seen, batch_logs) self._samples_seen_at_last_write = self._samples_seen self._total_batches_seen += 1 if self._is_profiling: profiler.save(self.log_dir, profiler.stop()) self._is_profiling = False elif (not self._is_profiling and self._total_batches_seen == self._profile_batch - 1): profiler.start() self._is_profiling = True
def run_profile(net, train=False, n_steps=150, do_profile=True, reps=1, dtype="float32", **kwargs): """ Run profiler on a benchmark network. Parameters ---------- net : `~nengo.Network` The nengo Network to be profiled. train : bool If True, profile the `.Simulator.fit` function. Otherwise, profile the `.Simulator.run` function. n_steps : int The number of timesteps to run the simulation. do_profile : bool Whether or not to run profiling reps : int Repeat the run this many times (only profile data from the last run will be kept). dtype : str Simulation dtype (e.g. "float32") Returns ------- exec_time : float Time (in seconds) taken to run the benchmark, taking the minimum over ``reps``. Notes ----- kwargs will be passed on to `.Simulator` """ exec_time = 1e10 n_batches = 1 with net: nengo_dl.configure_settings(inference_only=not train, dtype=dtype) with nengo_dl.Simulator(net, **kwargs) as sim: if hasattr(net, "inp"): x = { net.inp: np.random.randn(sim.minibatch_size * n_batches, n_steps, net.inp.size_out) } elif hasattr(net, "inp_a"): x = { net.inp_a: np.random.randn(sim.minibatch_size * n_batches, n_steps, net.inp_a.size_out), net.inp_b: np.random.randn(sim.minibatch_size * n_batches, n_steps, net.inp_b.size_out), } else: x = None if train: y = { net.p: np.random.randn(sim.minibatch_size * n_batches, n_steps, net.p.size_in) } sim.compile(tf.optimizers.SGD(0.001), loss=tf.losses.mse) # run once to eliminate startup overhead start = timeit.default_timer() sim.fit(x, y, epochs=1, n_steps=n_steps) print("Warmup time:", timeit.default_timer() - start) for _ in range(reps): if do_profile: profiler.start() start = timeit.default_timer() sim.fit(x, y, epochs=1, n_steps=n_steps) exec_time = min(timeit.default_timer() - start, exec_time) if do_profile: profiler.save("profile", profiler.stop()) else: # run once to eliminate startup overhead start = timeit.default_timer() sim.predict(x, n_steps=n_steps) print("Warmup time:", timeit.default_timer() - start) for _ in range(reps): if do_profile: profiler.start() start = timeit.default_timer() sim.predict(x, n_steps=n_steps) exec_time = min(timeit.default_timer() - start, exec_time) if do_profile: profiler.save("profile", profiler.stop()) exec_time /= n_batches print("Execution time:", exec_time) return exec_time
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ # TODO(tobyboyd): Remove eager flag when tf 1.0 testing ends. # Eager is default in tf 2.0 and should not be toggled if keras_common.is_v2_0(): keras_common.set_config_v2() else: config = keras_common.get_config_proto_v1() if flags_obj.enable_eager: tf.compat.v1.enable_eager_execution(config=config) else: sess = tf.Session(config=config) tf.keras.backend.set_session(sess) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_common.set_gpu_thread_mode_and_count(flags_obj) dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'float16': policy = tf.keras.mixed_precision.experimental.Policy('infer_float32_vars') tf.keras.mixed_precision.experimental.set_policy(policy) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, num_workers=distribution_utils.configure_cluster()) strategy_scope = keras_common.get_strategy_scope(strategy) # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = keras_common.get_synth_input_fn( height=imagenet_main.DEFAULT_IMAGE_SIZE, width=imagenet_main.DEFAULT_IMAGE_SIZE, num_channels=imagenet_main.NUM_CHANNELS, num_classes=imagenet_main.NUM_CLASSES, dtype=dtype) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_main.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras, dtype=dtype) with strategy_scope: optimizer = keras_common.get_optimizer() if dtype == 'float16': # TODO(reedwm): Remove manually wrapping optimizer once mixed precision # can be enabled with a single line of code. optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, loss_scale=flags_core.get_loss_scale(flags_obj)) if flags_obj.use_trivial_model: model = trivial_model.trivial_model(imagenet_main.NUM_CLASSES) else: model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES, dtype=dtype) model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['sparse_categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, imagenet_main.NUM_IMAGES['train']) train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None callbacks = [time_callback, lr_callback] if flags_obj.enable_tensorboard: callbacks.append(tensorboard_callback) if flags_obj.enable_e2e_xprof: profiler.start() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) if flags_obj.enable_e2e_xprof: results = profiler.stop() profiler.save(flags_obj.model_dir, results) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = keras_common.build_stats(history, eval_output, time_callback) return stats
def on_batch_begin(self, batch, logs=None): if batch == self.start_step: profiler.start() tf.compat.v1.logging.info('Profiler started at Step %s', self.start_step)
def on_train_begin(self, logs=None): if self._profile_batch == 1: profiler.start() self._is_profiling = True
def on_batch_begin(self, batch, logs=None): if batch == self.start_step_in_epoch and self.should_start: self.should_start = False profiler.start() tf.compat.v1.logging.info('Profiler started at Step %s', self.start_step)
value_grad = tf.random.normal(value.shape, dtype=value.dtype) v = tf.reduce_sum(value * value_grad) grad, = tf.gradients(v, feature) return grad, value_grad @tf.function def get_gradient_gt(feature, coordinate, value_grad): value = resampler.resampler(feature, coordinate) v = tf.reduce_sum(value * value_grad) grad, = tf.gradients(v, feature) return grad mask, value = resample_image(feature, coordinate, tfnative=TFNATIVE) gt = resampler.resampler(feature, coordinate) grad, value_grad = get_gradient(feature, coordinate, TFNATIVE) value_grad = tf.where(mask[..., None], value_grad, 0) grad_gt = get_gradient_gt(feature, coordinate, value_grad) profiler.start() for i in range(3): # feature_grad, _ = get_gradient(feature, coordinate, tfnative=TFNATIVE) # print(feature_grad.shape) mask, value = resample_image(feature, coordinate, tfnative=TFNATIVE) profiler_result = profiler.stop() profiler.save('./profile.log', profiler_result)