def testAutoProfiling(self): ops.reset_default_graph() time_dir = os.path.join(test.get_temp_dir(), 'time') memory_dir = os.path.join(test.get_temp_dir(), 'memory') profile_dir = os.path.join(test.get_temp_dir(), 'dir/dir2/profile') # TODO(xpan): Should we create parent directory for them? gfile.MkDir(time_dir) gfile.MkDir(memory_dir) time_opts = (builder(builder.time_and_memory()).with_file_output( os.path.join(time_dir, 'profile')).select(['micros']).build()) memory_opts = (builder(builder.time_and_memory()).with_file_output( os.path.join(memory_dir, 'profile')).select(['bytes']).build()) time_steps = [2, 3] memory_steps = [1, 3] dump_steps = [3, 4] x = lib.BuildSmallModel() with profile_context.ProfileContext(profile_dir, trace_steps=[1, 2, 3], dump_steps=[3, 4]) as pctx: pctx.add_auto_profiling('scope', time_opts, time_steps) pctx.add_auto_profiling('scope', memory_opts, memory_steps) self._trainLoop(x, 10, time_dir, time_steps, memory_dir, memory_steps, profile_dir, dump_steps)
def testComplexCodeView(self): ops.reset_default_graph() outfile = os.path.join(test.get_temp_dir(), 'dump') opts = (builder(builder.trainable_variables_parameter()) .with_file_output(outfile) .with_accounted_types(['.*']) .with_node_names(show_name_regexes= ['.*model_analyzer_testlib.py.*']) .account_displayed_op_only(False) .select(['params', 'float_ops']).build()) with profile_context.ProfileContext(test.get_temp_dir(), trace_steps=[], dump_steps=[]) as pctx: with session.Session() as sess: x = lib.BuildFullModel() sess.run(variables.global_variables_initializer()) pctx.trace_next_step() _ = sess.run(x) tfprof_node = pctx.profiler.profile_python(options=opts) # pylint: disable=line-too-long with gfile.Open(outfile, 'r') as f: lines = f.read().split('\n') self.assertGreater(len(lines), 5) result = '\n'.join([l[:min(len(l), 80)] for l in lines]) self.assertTrue( compat.as_text(lib.CheckAndRemoveDoc(result)) .startswith('node name | # parameters | # float_ops')) self.assertLess(0, tfprof_node.total_exec_micros) self.assertEqual(2844, tfprof_node.total_parameters) self.assertLess(168800, tfprof_node.total_float_ops) self.assertEqual(8, len(tfprof_node.children)) self.assertEqual('_TFProfRoot', tfprof_node.name) self.assertEqual( 'model_analyzer_testlib.py:63:BuildFullModel', tfprof_node.children[0].name) self.assertEqual( 'model_analyzer_testlib.py:63:BuildFullModel (gradient)', tfprof_node.children[1].name) self.assertEqual( 'model_analyzer_testlib.py:67:BuildFullModel', tfprof_node.children[2].name) self.assertEqual( 'model_analyzer_testlib.py:67:BuildFullModel (gradient)', tfprof_node.children[3].name) self.assertEqual( 'model_analyzer_testlib.py:69:BuildFullModel', tfprof_node.children[4].name) self.assertEqual( 'model_analyzer_testlib.py:70:BuildFullModel', tfprof_node.children[5].name) self.assertEqual( 'model_analyzer_testlib.py:70:BuildFullModel (gradient)', tfprof_node.children[6].name) self.assertEqual( 'model_analyzer_testlib.py:72:BuildFullModel', tfprof_node.children[7].name)
def testBasics(self): ops.reset_default_graph() outfile = os.path.join(test.get_temp_dir(), "dump") opts = builder( builder.time_and_memory()).with_file_output(outfile).build() x = lib.BuildFullModel() profile_str = None profile_step50 = os.path.join(test.get_temp_dir(), "profile_50") with profile_context.ProfileContext(test.get_temp_dir()) as pctx: pctx.add_auto_profiling("op", options=opts, profile_steps=[15, 50, 100]) with session.Session() as sess: sess.run(variables.global_variables_initializer()) total_steps = 101 if test.is_gpu_available() else 50 for i in range(total_steps): sess.run(x) if i == 14 or i == 99: self.assertTrue(gfile.Exists(outfile)) gfile.Remove(outfile) if i == 49: self.assertTrue(gfile.Exists(profile_step50)) with gfile.Open(outfile, "r") as f: profile_str = f.read() gfile.Remove(outfile) with lib.ProfilerFromFile( os.path.join(test.get_temp_dir(), "profile_50")) as profiler: profiler.profile_operations(options=opts) with gfile.Open(outfile, "r") as f: self.assertEqual(profile_str, f.read())
def testComplexCodeView(self): ops.reset_default_graph() outfile = os.path.join(test.get_temp_dir(), 'dump') opts = (builder(builder.trainable_variables_parameter()) .with_file_output(outfile) .with_accounted_types(['.*']) .with_node_names(show_name_regexes= ['.*model_analyzer_testlib.py.*']) .account_displayed_op_only(False) .select(['params', 'float_ops']).build()) with profile_context.ProfileContext(test.get_temp_dir(), trace_steps=[], dump_steps=[]) as pctx: with session.Session() as sess: x = lib.BuildFullModel() sess.run(variables.global_variables_initializer()) pctx.trace_next_step() _ = sess.run(x) tfprof_node = pctx.profiler.profile_python(options=opts) # pylint: disable=line-too-long with gfile.Open(outfile, 'r') as f: lines = f.read().split('\n') result = '\n'.join([l[:min(len(l), 80)] for l in lines]) self.assertEqual(compat.as_bytes('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.85k flops)\n model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.57k flops)\n model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'), compat.as_bytes(result)) self.assertLess(0, tfprof_node.total_exec_micros) self.assertEqual(2844, tfprof_node.total_parameters) self.assertEqual(168854, tfprof_node.total_float_ops) self.assertEqual(8, len(tfprof_node.children)) self.assertEqual('_TFProfRoot', tfprof_node.name) self.assertEqual( 'model_analyzer_testlib.py:63:BuildFullModel', tfprof_node.children[0].name) self.assertEqual( 'model_analyzer_testlib.py:63:BuildFullModel (gradient)', tfprof_node.children[1].name) self.assertEqual( 'model_analyzer_testlib.py:67:BuildFullModel', tfprof_node.children[2].name) self.assertEqual( 'model_analyzer_testlib.py:67:BuildFullModel (gradient)', tfprof_node.children[3].name) self.assertEqual( 'model_analyzer_testlib.py:69:BuildFullModel', tfprof_node.children[4].name) self.assertEqual( 'model_analyzer_testlib.py:70:BuildFullModel', tfprof_node.children[5].name) self.assertEqual( 'model_analyzer_testlib.py:70:BuildFullModel (gradient)', tfprof_node.children[6].name) self.assertEqual( 'model_analyzer_testlib.py:72:BuildFullModel', tfprof_node.children[7].name)
def testDisabled(self): ops.reset_default_graph() x = lib.BuildFullModel() with profile_context.ProfileContext(test.get_temp_dir(), enabled=False) as pctx: with session.Session() as sess: self.evaluate(variables.global_variables_initializer()) for _ in range(10): self.evaluate(x) self.assertTrue(pctx.profiler is None) self.assertTrue( getattr(session.BaseSession, "profile_context", None) is None) with profile_context.ProfileContext(test.get_temp_dir()) as pctx: with session.Session() as sess: self.evaluate(variables.global_variables_initializer()) for _ in range(10): self.evaluate(x) self.assertFalse(pctx.profiler is None) self.assertFalse( getattr(session.BaseSession, "profile_context", None) is None)
def testBasics(self): ops.reset_default_graph() outfile = os.path.join(test.get_temp_dir(), "dump") opts = builder( builder.time_and_memory()).with_file_output(outfile).build() x = lib.BuildFullModel() profile_str = None profile_step100 = os.path.join(test.get_temp_dir(), "profile_100") with profile_context.ProfileContext(test.get_temp_dir()) as pctx: pctx.add_auto_profiling("op", options=opts, profile_steps=[15, 50, 100]) with session.Session() as sess: sess.run(variables.global_variables_initializer()) total_steps = 101 for i in range(total_steps): sess.run(x) if i == 14 or i == 49: self.assertTrue(gfile.Exists(outfile)) gfile.Remove(outfile) if i == 99: self.assertTrue(gfile.Exists(profile_step100)) with gfile.Open(outfile, "r") as f: profile_str = f.read() gfile.Remove(outfile) self.assertEqual(set([15, 50, 100]), set(pctx.get_profiles("op").keys())) with lib.ProfilerFromFile( os.path.join(test.get_temp_dir(), "profile_100")) as profiler: profiler.profile_operations(options=opts) with gfile.Open(outfile, "r") as f: if test.is_built_with_rocm(): # The profiler output for ROCm mode, includes an extra warning related # to the lack of stream tracing in ROCm moed. Need to skip this warning # when doing the diff in ROCm mode profile_str = "\n".join(profile_str.split("\n")[7:]) self.assertEqual(profile_str, f.read())
def testAutoTracingInDeubMode(self): ops.reset_default_graph() x = lib.BuildFullModel() with profile_context.ProfileContext(test.get_temp_dir(), debug=True): with session.Session() as sess: self.evaluate(variables.global_variables_initializer()) for _ in range(10): self.evaluate(x) for f in gfile.ListDirectory(test.get_temp_dir()): # Warm up, no tracing. self.assertFalse("run_meta" in f) self.evaluate(x) self.assertTrue( gfile.Exists(os.path.join(test.get_temp_dir(), "run_meta_11"))) gfile.Remove(os.path.join(test.get_temp_dir(), "run_meta_11")) # fetched already. self.evaluate(x) for f in gfile.ListDirectory(test.get_temp_dir()): self.assertFalse("run_meta" in f)
def testSelectEverythingDetail(self): ops.reset_default_graph() dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0' outfile = os.path.join(test.get_temp_dir(), 'dump') opts = (builder( builder.trainable_variables_parameter()).with_file_output( outfile).with_accounted_types(['.*']).select([ 'micros', 'bytes', 'params', 'float_ops', 'occurrence', 'device', 'op_types', 'input_shapes' ]).build()) with profile_context.ProfileContext(test.get_temp_dir(), trace_steps=[], dump_steps=[]) as pctx: with session.Session() as sess, ops.device(dev): x = lib.BuildSmallModel() sess.run(variables.global_variables_initializer()) pctx.trace_next_step() pctx.dump_next_step() _ = sess.run(x) pctx.profiler.profile_name_scope(options=opts) with gfile.Open(outfile, 'r') as f: # pylint: disable=line-too-long dump_str = lib.CheckAndRemoveDoc(f.read()) outputs = dump_str.split('\n') self.assertEqual( outputs[0], 'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes' ) for o in outputs[1:]: if o.find('Conv2D ') > 0: metrics = o[o.find('(') + 1:o.find(')')].split(',') # Make sure time is profiled. gap = 1 if test.is_gpu_available() else 2 for i in range(3, 6, gap): mat = re.search('(.*)[um]s/(.*)[um]s', metrics[i]) self.assertGreater(float(mat.group(1)), 0.0) self.assertGreater(float(mat.group(2)), 0.0) # Make sure device is profiled. if test.is_gpu_available(): self.assertTrue(metrics[6].find('gpu') > 0) self.assertFalse(metrics[6].find('cpu') > 0) else: self.assertFalse(metrics[6].find('gpu') > 0) self.assertTrue(metrics[6].find('cpu') > 0) # Make sure float_ops is profiled. mat = re.search('(.*)k/(.*)k flops', metrics[1].strip()) self.assertGreater(float(mat.group(1)), 0.0) self.assertGreater(float(mat.group(2)), 0.0) # Make sure op_count is profiled. self.assertEqual(metrics[8].strip(), '1/1|1/1') # Make sure input_shapes is profiled. self.assertEqual(metrics[9].strip(), '0:2x6x6x3|1:3x3x3x6') if o.find('DW (3x3x3x6') > 0: metrics = o[o.find('(') + 1:o.find(')')].split(',') mat = re.search('(.*)/(.*) params', metrics[1].strip()) self.assertGreater(float(mat.group(1)), 0.0) self.assertGreater(float(mat.group(2)), 0.0) # pylint: enable=line-too-long # Test that profiler restored from profile file gives the same result. gfile.Remove(outfile) profile_file = os.path.join(test.get_temp_dir(), 'profile_1') with lib.ProfilerFromFile(profile_file) as profiler: profiler.profile_name_scope(options=opts) with gfile.Open(outfile, 'r') as f: self.assertEqual(dump_str, lib.CheckAndRemoveDoc(f.read()))
def main(_): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets') if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (x_train, y_train), (x_test, y_test) = \ keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank()) # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it # into (-1, 784) to feed into our network. Also, need to normalize the # features between 0 and 1. x_train = np.reshape(x_train, (-1, 784)) / 255.0 x_test = np.reshape(x_test, (-1, 784)) / 255.0 # Build model... with tf.name_scope('input'): image = tf.placeholder(tf.float32, [None, 784], name='image') label = tf.placeholder(tf.float32, [None], name='label') predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) global_step = tf.train.get_or_create_global_step() train_op = opt.minimize(loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of GPUs. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=100) ] # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None training_batch_generator = train_input_generator(x_train, y_train, batch_size=100) # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. builder = option_builder.ProfileOptionBuilder opts1 = builder(builder.time_and_memory()).\ order_by('micros').\ with_max_depth(10).\ with_file_output("./pctx/opts1-rank-%d" % hvd.rank()).\ build() opts2 = builder.trainable_variables_parameter() # with profile_context.ProfileContext("./pctx", # trace_steps=range(100, 110), # dump_steps=[110]) as pctx: with profile_context.ProfileContext("./pctx") as pctx: pctx.add_auto_profiling('op', opts1, [800, 900, 1000]) pctx.add_auto_profiling('scope', opts2, [1000]) with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess: while not mon_sess.should_stop(): # Run a training step synchronously. image_, label_ = next(training_batch_generator) mon_sess.run(train_op, feed_dict={ image: image_, label: label_ }) pctx.profiler.advise(options=model_analyzer.ALL_ADVICE)