def testDumpingDebugHookWithStatefulLegacyWatchFnWorks(self):
  """Exercises DumpingDebugHook with a stateful, tuple-returning watch_fn.

  The watch_fn keeps a run counter: odd-numbered runs (1-based) watch
  everything, even-numbered runs watch nothing. After four runs there must be
  four run directories, of which only every other one holds dumped tensors.
  """
  watch_fn_state = {"run_counter": 0}

  def counting_watch_fn(fetches, feed_dict):
    del fetches, feed_dict  # Only the number of calls matters.
    watch_fn_state["run_counter"] += 1
    if watch_fn_state["run_counter"] % 2 == 0:
      # Even-index run: watch nothing.
      return "DebugIdentity", r"$^", r"$^"
    # Odd-index run (1-based): watch everything.
    return "DebugIdentity", r".*", r".*"

  hook = hooks.DumpingDebugHook(
      self.session_root, watch_fn=counting_watch_fn, log_usage=False)
  hooked_sess = monitored_session._HookedSession(self.sess, [hook])
  for _ in range(4):
    hooked_sess.run(self.inc_v)

  def run_index(run_dir):
    # Sort key: the integer that follows the first underscore of the name.
    return int(os.path.basename(run_dir).split("_")[1])

  run_dirs = sorted(
      glob.glob(os.path.join(self.session_root, "run_*")), key=run_index)
  self.assertEqual(4, len(run_dirs))

  for i, run_dir in enumerate(run_dirs):
    self._assert_correct_run_subdir_naming(os.path.basename(run_dir))
    dump = debug_data.DebugDumpDir(run_dir)
    if i % 2:
      # Odd 0-based index is an even-numbered run: nothing was watched.
      self.assertEqual(0, dump.size)
    else:
      self.assertAllClose([10.0 + 1.0 * i],
                          dump.get_tensors("v", 0, "DebugIdentity"))
    self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
    self.assertEqual(repr(None), dump.run_feed_keys_info)
def testDebugDumpDir_invalidFileNamingPattern(self):
  """Checks that a badly named dump file makes DebugDumpDir raise ValueError."""
  # File name with too few underscores should lead to an exception.
  bad_file_path = os.path.join(self._dump_root, "node1_DebugIdentity_1234")
  # Create the (empty) file inside a `with` block so the handle is closed
  # before the directory is loaded, instead of being left to the garbage
  # collector.
  with open(bad_file_path, "wb"):
    pass

  # assertRaisesRegex is the non-deprecated spelling of assertRaisesRegexp.
  with self.assertRaisesRegex(ValueError,
                              "does not conform to the naming pattern"):
    debug_data.DebugDumpDir(self._dump_root)
def _session_run_for_graph_structure_lookup(self):
  """Runs a small u -> v -> w graph with debug watches and loads its dump.

  v is u + u and w is v + v. The whole graph is watched with DebugIdentity
  and the run's partition graphs are handed to the loaded dump.

  Returns:
    A tuple (u_name, v_name, w_name, dump) with the three node names and the
    `debug_data.DebugDumpDir` loaded from `self._dump_root`.
  """
  u_name = "testDumpGraphStructureLookup/u"
  v_name = "testDumpGraphStructureLookup/v"
  w_name = "testDumpGraphStructureLookup/w"

  with session.Session() as sess:
    u = variables.Variable(constant_op.constant([2.0, 4.0]), name=u_name)
    v = math_ops.add(u, u, name=v_name)
    w = math_ops.add(v, v, name=w_name)
    u.initializer.run()

    run_metadata = config_pb2.RunMetadata()
    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugIdentity"],
        debug_urls=self._debug_urls())
    sess.run(w, options=run_options, run_metadata=run_metadata)

    partition_graphs = run_metadata.partition_graphs
    self.assertEqual(self._expected_partition_graph_count,
                     len(partition_graphs))
    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)

  return u_name, v_name, w_name, dump
def testDuplicateNodeNamesInGraphDefOfSingleDeviceRaisesException(self):
  """Checks that a node name repeated in one device's GraphDef is rejected.

  The same node name on *different* devices is part of the fixture; only the
  second GPU graph contains the name twice, which must raise ValueError.
  """
  self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()

  def add_foo_node(graph_def, device_name):
    # Appends one "node_foo_1" FooOp node placed on `device_name`.
    foo_node = graph_def.node.add()
    foo_node.name = "node_foo_1"
    foo_node.op = "FooOp"
    foo_node.device = device_name

  graph_cpu_0 = graph_pb2.GraphDef()
  add_foo_node(graph_cpu_0, "/job:localhost/replica:0/task:0/cpu:0")

  graph_gpu_0 = graph_pb2.GraphDef()
  add_foo_node(graph_gpu_0, "/job:localhost/replica:0/task:0/device:GPU:0")

  graph_gpu_1 = graph_pb2.GraphDef()
  add_foo_node(graph_gpu_1, "/job:localhost/replica:0/task:0/device:GPU:1")
  # Here is the duplicate.
  add_foo_node(graph_gpu_1, "/job:localhost/replica:0/task:0/device:GPU:1")

  all_graphs = [graph_cpu_0, graph_gpu_0, graph_gpu_1]
  with self.assertRaisesRegex(ValueError, r"Duplicate node name on device "):
    debug_data.DebugDumpDir(self._dump_root, partition_graphs=all_graphs)
def testGrpcDebugWrapperSessionWithWatchFnWorks(self):
  """Checks GrpcDebugWrapperSession with a watch_fn limited to ".*/read".

  Both debug ops are requested for the two variable-read nodes, so the dump
  is expected to hold four entries in total.
  """

  def watch_fn(feeds, fetch_keys):
    del feeds, fetch_keys  # The same watch configuration is used every run.
    return ["DebugIdentity", "DebugNumericSummary"], r".*/read", None

  u = variables.Variable(2.1, name="u")
  v = variables.Variable(20.0, name="v")
  w = math_ops.multiply(u, v, name="w")

  plain_sess = session.Session(
      config=session_debug_testlib.no_rewrite_session_config())
  plain_sess.run(u.initializer)
  plain_sess.run(v.initializer)

  debug_sess = grpc_wrapper.GrpcDebugWrapperSession(
      plain_sess, "localhost:%d" % self._server_port, watch_fn=watch_fn)
  self.assertAllClose(42.0, debug_sess.run(w))

  dump = debug_data.DebugDumpDir(self._dump_root)
  self.assertEqual(4, dump.size)
  for read_node, expected_value in (("u/read", 2.1), ("v/read", 20.0)):
    self.assertAllClose([expected_value],
                        dump.get_tensors(read_node, 0, "DebugIdentity"))
    numeric_summary = dump.get_tensors(read_node, 0, "DebugNumericSummary")[0]
    self.assertEqual(14, len(numeric_summary))
def testAdditionalHooks(self):
  """Checks that evaluate_once runs extra hooks (here a DumpingDebugHook)."""
  checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
  log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')

  # Save the current model to a checkpoint first.
  self._prepareCheckpoint(checkpoint_path)

  # The metric to evaluate.
  value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
                                                      self._labels)

  dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir')
  debug_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False)
  try:
    # Run the evaluation and verify the result.
    accuracy_value = evaluation.evaluate_once(
        '',
        checkpoint_path,
        log_dir,
        eval_op=update_op,
        final_op=value_op,
        hooks=[debug_hook])
    self.assertAlmostEqual(accuracy_value, self._expected_accuracy)

    first_run_dir = glob.glob(os.path.join(dumping_root, 'run_*'))[0]
    dump = debug_data.DebugDumpDir(first_run_dir)
    # Only assert that dumped data was loaded and is non-empty; the detailed
    # model-internal tensors and their values are irrelevant here.
    self.assertTrue(dump.dumped_tensor_data)
  finally:
    # Remove the dump directory whether or not the assertions passed.
    if os.path.isdir(dumping_root):
      shutil.rmtree(dumping_root)
def testTrainWithSessionWrapper(self):
  """Test that slim.learning.train can take `session_wrapper` args.

  One of the applications of `session_wrapper` is the wrappers of TensorFlow
  Debugger (tfdbg), which intercept methods calls to `tf.Session` (e.g., run)
  to achieve debugging. `DumpingDebugWrapperSession` is used here for testing
  purpose.

  The dump directory is created with `tempfile.mkdtemp()`, which never deletes
  it on its own, so it is removed in a `finally` clause.
  """
  # Function-scope import: needed only for the temp-dir cleanup below.
  import shutil

  dump_root = tempfile.mkdtemp()
  try:

    def dumping_wrapper(sess):  # pylint: disable=invalid-name
      return dumping_wrapper_lib.DumpingDebugWrapperSession(sess, dump_root)

    with ops.Graph().as_default():
      random_seed.set_random_seed(0)

      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = LogisticClassifier(tf_inputs)
      loss_ops.log_loss(tf_predictions, tf_labels)
      total_loss = loss_ops.get_total_loss()

      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      train_op = learning.create_train_op(total_loss, optimizer)

      loss = learning.train(
          train_op, None, number_of_steps=1, session_wrapper=dumping_wrapper)
    self.assertIsNotNone(loss)

    # Inspect the last run directory returned by the glob.
    run_root = glob.glob(os.path.join(dump_root, 'run_*'))[-1]
    dump = debug_data.DebugDumpDir(run_root)
    self.assertAllEqual(0,
                        dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
  finally:
    # Best-effort cleanup; a cleanup failure must not mask a test failure.
    shutil.rmtree(dump_root, ignore_errors=True)
def createAndRunGraphWithWhileLoop(self):
  """Create and run a TensorFlow Graph with a while loop to generate dumps.

  Sets the following attributes on `self`:
    dump_root: Temp directory (from `self.get_temp_dir()`) used as the file
      debug URL target.
    curr_file_path: Absolute path of the file containing this method.
    traceback_first_line: Value of `line_number_above()` taken right after the
      loop-body lambda is defined.
    dump: `debug_data.DebugDumpDir` loaded from `dump_root`, with the Python
      graph of the session attached.
  """
  self.dump_root = self.get_temp_dir()
  self.curr_file_path = os.path.abspath(
      tf_inspect.getfile(tf_inspect.currentframe()))

  # Run a simple TF graph to generate some debug dumps that can be used in
  # source annotation.
  with session.Session() as sess:
    # NOTE(review): line_number_above() is defined elsewhere; presumably it
    # reports the line of the statement directly above its call site, so the
    # next two statements must stay adjacent (no blank line or comment in
    # between) -- confirm before reformatting.
    loop_body = lambda i: math_ops.add(i, 2)
    self.traceback_first_line = line_number_above()

    loop_cond = lambda i: math_ops.less(i, 16)

    i = constant_op.constant(10, name="i")
    loop = control_flow_ops.while_loop(loop_cond, loop_body, [i])

    # Request partition graphs so they can be passed to the loaded dump.
    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(run_options, sess.graph, debug_urls=["file://%s" % self.dump_root])
    run_metadata = config_pb2.RunMetadata()
    sess.run(loop, options=run_options, run_metadata=run_metadata)

    self.dump = debug_data.DebugDumpDir(
        self.dump_root, partition_graphs=run_metadata.partition_graphs)
    self.dump.set_python_graph(sess.graph)
def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
  """Test watching output slots not attached to any outgoing edges."""
  with session.Session() as sess:
    u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
    u = constant_op.constant(u_init_val, shape=[2, 2], name="u")

    # z depends on u only through a control edge, so u is executed although
    # its output tensor u:0 feeds no other node. The debugger must still be
    # able to watch that tensor.
    with ops.control_dependencies([u]):
      z = control_flow_ops.no_op(name="z")

    run_metadata = config_pb2.RunMetadata()
    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugIdentity"],
        debug_urls=self._debug_urls())
    sess.run(z, options=run_options, run_metadata=run_metadata)

    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)

    # Exactly one datum is expected: the DebugIdentity watch on u:0.
    dumped_data = dump.dumped_tensor_data
    self.assertEqual(1, len(dumped_data))
    u_datum = dump.dumped_tensor_data[0]
    self.assertEqual("u", u_datum.node_name)
    self.assertEqual(0, u_datum.output_slot)
    self.assertEqual("DebugIdentity", u_datum.debug_op)
    self.assertAllClose([[5.0, 3.0], [-1.0, 0.0]], u_datum.get_tensor())
def testDumpingWithWatchFnWithNonDefaultDebugOpsWorks(self):
  """Use a watch_fn that specifies non-default debug ops.

  The watch_fn restricts watching to nodes whose names start with "v" and to
  "_ref" dtypes, so "inc_v" and "delta" must not appear in the dump.
  """

  def watch_fn(fetches, feeds):
    del fetches, feeds  # The same WatchOptions are returned for every run.
    return framework.WatchOptions(
        debug_ops=["DebugIdentity", "DebugNumericSummary"],
        node_name_regex_whitelist=r"^v.*",
        op_type_regex_whitelist=r".*",
        tensor_dtype_regex_whitelist=".*_ref")

  sess = dumping_wrapper.DumpingDebugWrapperSession(
      self.sess,
      session_root=self.session_root,
      watch_fn=watch_fn,
      log_usage=False)

  sess.run(self.inc_v)

  dump_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
  self.assertEqual(1, len(dump_dirs))

  dump = debug_data.DebugDumpDir(dump_dirs[0])

  self.assertAllClose([10.0], dump.get_tensors("v", 0, "DebugIdentity"))
  self.assertEqual(
      14, len(dump.get_tensors("v", 0, "DebugNumericSummary")[0]))

  # Use a loop variable that does not shadow `dump`: under Python 2 scoping a
  # list-comprehension variable leaks and would rebind `dump` to a datum.
  dumped_nodes = [datum.node_name for datum in dump.dumped_tensor_data]
  self.assertNotIn("inc_v", dumped_nodes)
  self.assertNotIn("delta", dumped_nodes)
def testDebugDumpDir_usesGfileGlob(self):
  """Checks that loading a dump directory globs its metadata via gfile.Glob.

  gfile.Glob is patched to return nothing; the test then verifies that it was
  called with the core-metadata, fetches-info, feed-keys-info and device
  patterns under the dump root (in any order).
  """
  if platform.system() == "Windows":
    self.skipTest("gfile.Glob is not used on Windows.")

  self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()

  def glob_finding_nothing(glob_pattern):
    del glob_pattern  # Every pattern yields an empty match list.
    return []

  with test.mock.patch.object(
      gfile, "Glob", side_effect=glob_finding_nothing,
      autospec=True) as fake:
    debug_data.DebugDumpDir(self._dump_root)

    metadata_tags = (
        debug_data.CORE_METADATA_TAG,
        debug_data.FETCHES_INFO_FILE_TAG,
        debug_data.FEED_KEYS_INFO_FILE_TAG,
        debug_data.DEVICE_TAG,
    )
    expected_calls = [
        test.mock.call(
            os.path.join(self._dump_root,
                         (debug_data.METADATA_FILE_PREFIX + tag + "*")))
        for tag in metadata_tags
    ]
    fake.assert_has_calls(expected_calls, any_order=True)
def testDebugNumericSummaryOnUninitializedTensorGivesCorrectResult(self):
  """Checks DebugNumericSummary output for a not-yet-initialized variable.

  The watch is active during the run of the variable's initializer. The
  summary is expected to contain zeros in elements 0-7, +inf and -inf in
  elements 8 and 9, and NaN in elements 10 and 11.
  """
  with session.Session() as sess:
    a = variables.Variable(
        [42], dtype=np.float32, name="numeric_summary_uninit/a")

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    run_metadata = config_pb2.RunMetadata()
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugNumericSummary"],
        debug_urls=self._debug_urls())

    sess.run(a.initializer, options=run_options, run_metadata=run_metadata)

    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)
    self.assertTrue(dump.loaded_partition_graphs())

    # The summary should reflect the uninitialized state of the tensor.
    summary = dump.get_tensors("numeric_summary_uninit/a", 0,
                               "DebugNumericSummary")[0]
    self.assertAllClose([0.0] * 8, summary[0:8])

    # Element 8 is +inf and element 9 is -inf.
    self.assertTrue(np.isinf(summary[8]))
    self.assertGreater(summary[8], 0.0)
    self.assertTrue(np.isinf(summary[9]))
    self.assertLess(summary[9], 0.0)

    # Elements 10 and 11 are NaN.
    self.assertTrue(np.isnan(summary[10]))
    self.assertTrue(np.isnan(summary[11]))
def testDebugNumericSummaryOnInitializedTensorGivesCorrectResult(self):
  """Checks DebugNumericSummary output for an initialized 18-element tensor.

  The watched tensor mixes NaN, zero, negative, positive and +/-inf values;
  the dumped summary of "numeric_summary/a/read" is compared against a fixed
  12-element expectation.
  """
  a_values = [
      np.nan, np.nan, 0.0, 0.0, 0.0, -1.0, -3.0, 3.0, 7.0, -np.inf, -np.inf,
      np.inf, np.inf, np.inf, np.inf, np.inf, np.nan, np.nan
  ]
  expected_summary = [[
      1.0, 18.0, 2.0, 2.0, 3.0, 2.0, 5.0, 4.0, -3.0, 7.0, 0.85714286,
      8.97959184
  ]]

  with session.Session() as sess:
    a = variables.Variable(
        a_values, dtype=np.float32, name="numeric_summary/a")
    b = variables.Variable(
        [0.0] * 18, dtype=np.float32, name="numeric_summary/b")
    c = math_ops.add(a, b, name="numeric_summary/c")

    sess.run(variables.global_variables_initializer())

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    run_metadata = config_pb2.RunMetadata()
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugNumericSummary"],
        debug_urls=self._debug_urls())

    sess.run(c, options=run_options, run_metadata=run_metadata)

    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)
    self.assertTrue(dump.loaded_partition_graphs())

    actual_summary = dump.get_tensors("numeric_summary/a/read", 0,
                                      "DebugNumericSummary")
    self.assertAllClose(expected_summary, actual_summary)
def on_run_end(self, request):
  """Overrides on-run-end callback.

  After a debug run, loads the dump from `self._dump_root`, brings up the
  analyzer CLI and removes the dump directory afterwards. If a tensor filter
  is active and no dumped tensor passes it, the dump is removed and the CLI
  is not launched. After a non-debug run, only the stored run-start response
  is cleared.

  Args:
    request: The run-end request. The fields read here are
      `performed_action`, `run_metadata`, `client_graph_def` and `tf_error`
      (presumably a `framework.OnRunEndRequest` -- confirm).

  Returns:
    An instance of `framework.OnRunEndResponse`.

  Raises:
    The exception held in `request.tf_error`, if it is set and the dump root
    directory does not exist.
  """
  self._is_run_start = False

  if request.performed_action != framework.OnRunStartAction.DEBUG_RUN:
    # No debug information to show following a non-debug run() call.
    self._run_start_response = None
    return framework.OnRunEndResponse()

  # Prefer the partition graphs of the run; fall back to the client graph.
  if request.run_metadata and request.run_metadata.partition_graphs:
    partition_graphs = request.run_metadata.partition_graphs
  elif request.client_graph_def:
    partition_graphs = [request.client_graph_def]
  else:
    partition_graphs = None

  if request.tf_error and not os.path.isdir(self._dump_root):
    # The dump root may be missing because the error occurred before graph
    # execution (e.g., invalid device assignments). In that case just raise
    # the exception as the unwrapped Session does.
    raise request.tf_error

  debug_dump = debug_data.DebugDumpDir(
      self._dump_root, partition_graphs=partition_graphs)
  debug_dump.set_python_graph(self._sess.graph)

  passed_filter = None
  if self._active_tensor_filter:
    filter_callable = self._tensor_filters[self._active_tensor_filter]
    if not debug_dump.find(filter_callable, first_n=1):
      # No dumped tensor passes the filter in this run: clean up the dump
      # directory and move on.
      self._remove_dump_root()
      return framework.OnRunEndResponse()

    # Some dumped tensor(s) from this run passed the filter.
    passed_filter = self._active_tensor_filter
    self._active_tensor_filter = None

  self._prep_cli_for_run_end(debug_dump, request.tf_error, passed_filter)
  self._run_start_response = self._launch_cli()

  # Clean up the dump generated by this run.
  self._remove_dump_root()

  # Placeholder response that currently holds no additional information.
  return framework.OnRunEndResponse()
def _load_dumped_intermediate_tensors(self, dump_path, target_name):
  """Records dumped tensors that are not already tracked by this object.

  Loads the dump at `dump_path` without validation and stores every datum in
  `self._dumped_intermediate_tensors`, keyed by tensor name, unless that name
  is a ref tensor, has a tensor handle, is overridden, or equals
  `target_name`.

  Args:
    dump_path: Path of the dump directory to load.
    target_name: Tensor name to leave out of the recorded tensors.
  """
  loaded_dump = debug_data.DebugDumpDir(dump_path, validate=False)
  for datum in loaded_dump.dumped_tensor_data:
    tensor_name = datum.tensor_name
    already_accounted_for = (
        tensor_name in self._ref_tensor_names or
        tensor_name in self._tensor_handles or
        tensor_name in self._override_tensors or
        tensor_name == target_name)
    if already_accounted_for:
      continue
    self._dumped_intermediate_tensors[tensor_name] = datum
def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self):
  """Checks that partition graphs can be loaded from the dump directory."""
  # Only the run itself is needed here; its return values are not used.
  _, _, _, _ = self._session_run_for_graph_structure_lookup()

  # Load the dump again, this time without passing partition graphs. No error
  # should be raised because the partition graphs are loaded from the dump
  # directory.
  reloaded_dump = debug_data.DebugDumpDir(self._dump_root, validate=False)
  self.assertTrue(reloaded_dump.loaded_partition_graphs())
def testDebugDumpDir_invalidFileNamingPattern(self):
  """Checks that a badly named dump file makes DebugDumpDir raise ValueError.

  The offending file is placed inside a device directory under the dump root.
  """
  # File name with too few underscores should lead to an exception.
  device_dir = os.path.join(
      self._dump_root,
      debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
      ",job_localhost,replica_0,task_0,cpu_0")
  os.makedirs(device_dir)
  # Create the (empty) file inside a `with` block so the handle is closed
  # before the directory is loaded, instead of being left to the garbage
  # collector.
  with open(os.path.join(device_dir, "node1_DebugIdentity_1234"), "wb"):
    pass

  with self.assertRaisesRegex(ValueError,
                              "does not conform to the naming pattern"):
    debug_data.DebugDumpDir(self._dump_root)
def testDumpUninitializedVariable(self):
  """Checks dumps of variables watched while they are being initialized.

  A float variable u and a string variable s are watched during the run of
  the global initializer. Each must yield exactly one dumped datum whose
  value is None, and both must hold their initial values afterwards.
  """
  op_namespace = "testDumpUninitializedVariable"
  with session.Session() as sess:
    u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
    s_init_val = b"str1"

    u_name = "%s/u" % op_namespace
    s_name = "%s/s" % op_namespace

    u = variables.Variable(
        constant_op.constant(u_init_val, shape=[2, 2]), name=u_name)
    s = variables.Variable(constant_op.constant(s_init_val), name=s_name)

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_urls = self._debug_urls()

    # Add debug tensor watches for u and s (output slot 0 of each).
    for watched_name in (u_name, s_name):
      debug_utils.add_debug_tensor_watch(
          run_options, "%s" % watched_name, 0, debug_urls=debug_urls)

    run_metadata = config_pb2.RunMetadata()

    # Initialize u and s.
    sess.run(
        variables.global_variables_initializer(),
        options=run_options,
        run_metadata=run_metadata)

    # Verify the dump for the uninitialized values.
    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)

    self.assertEqual(2, dump.size)
    self.assertEqual(self._expected_partition_graph_count,
                     len(run_metadata.partition_graphs))

    # Each watched variable has one dumped datum, with no value in it.
    u_vals = dump.get_tensors(u_name, 0, "DebugIdentity")
    s_vals = dump.get_tensors(s_name, 0, "DebugIdentity")
    for dumped_vals in (u_vals, s_vals):
      self.assertEqual(1, len(dumped_vals))
      self.assertIsNone(dumped_vals[0])

    # Call run() again, to check that u and s were initialized properly.
    self.assertAllClose(u_init_val, sess.run(u))
    self.assertEqual(s_init_val, sess.run(s))
def on_run_end(self, request):
  """Overrides on-run-end callback.

  Actions taken after a debug run:
    1) Load the debug dump.
    2) Bring up the Analyzer CLI.
    3) Remove the dump directory.

  If a tensor filter is active and no dumped tensor passes it, the dump is
  removed and the CLI is not launched. After a non-debug run, only the stored
  run-start response is cleared.

  Args:
    request: The run-end request. The fields read here are
      `performed_action`, `run_metadata`, `client_graph_def` and `tf_error`
      (presumably a `framework.OnRunEndRequest` -- confirm).

  Returns:
    An instance of `framework.OnRunEndResponse`.

  Raises:
    The exception held in `request.tf_error`, if it is set and the dump root
    directory does not exist.
  """
  # Function-scope import: needed only for the dump-root existence check.
  import os

  self._is_run_start = False
  if request.performed_action == framework.OnRunStartAction.DEBUG_RUN:
    partition_graphs = None
    if request.run_metadata and request.run_metadata.partition_graphs:
      partition_graphs = request.run_metadata.partition_graphs
    elif request.client_graph_def:
      partition_graphs = [request.client_graph_def]

    if request.tf_error and not os.path.isdir(self._dump_root):
      # The run failed and produced no dump directory (e.g., the error
      # happened before graph execution). Loading the dump would only fail
      # with an unrelated error, so surface the original exception, as the
      # unwrapped Session would.
      raise request.tf_error

    debug_dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=partition_graphs)
    debug_dump.set_python_graph(self._sess.graph)

    passed_filter = None
    if self._active_tensor_filter:
      if not debug_dump.find(
          self._tensor_filters[self._active_tensor_filter], first_n=1):
        # No dumped tensor passes the filter in this run. Clean up the dump
        # directory and move on.
        self._remove_dump_root()
        return framework.OnRunEndResponse()
      else:
        # Some dumped tensor(s) from this run passed the filter.
        passed_filter = self._active_tensor_filter
        self._active_tensor_filter = None

    self._prep_cli_for_run_end(debug_dump, request.tf_error, passed_filter)

    self._run_start_response = self._launch_cli()

    # Clean up the dump generated by this run.
    self._remove_dump_root()
  else:
    # No debug information to show following a non-debug run() call.
    self._run_start_response = None

  # Return placeholder response that currently holds no additional
  # information.
  return framework.OnRunEndResponse()
def _generate_dump_from_simple_addition_graph(self):
  """Runs w = matmul(u, v) with watches on the variable reads and loads the dump.

  Returns:
    A `SimpleAddResults` namedtuple with fields u_init_val, v_init_val, u, v,
    w, u_name, v_name, w_name and dump, where dump is the
    `debug_data.DebugDumpDir` loaded from `self._dump_root`.
  """
  with session.Session() as sess:
    u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
    v_init_val = np.array([[2.0], [-1.0]])

    # Use node names with overlapping namespace (i.e., parent directory) to
    # test concurrent, non-racing directory creation.
    u_name = "u"
    v_name = "v"
    w_name = "w"

    u = variables.Variable(
        constant_op.constant(u_init_val, shape=[2, 2]), name=u_name)
    v = variables.Variable(
        constant_op.constant(v_init_val, shape=[2, 1]), name=v_name)
    w = math_ops.matmul(u, v, name=w_name)

    u.initializer.run()
    v.initializer.run()

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_urls = "file://%s" % self._dump_root

    # Add debug tensor watches for the read nodes of u and v.
    for variable_name in (u_name, v_name):
      debug_utils.add_debug_tensor_watch(
          run_options, "%s/read" % variable_name, 0, debug_urls=debug_urls)

    run_metadata = config_pb2.RunMetadata()

    # Invoke Session.run().
    sess.run(w, options=run_options, run_metadata=run_metadata)

    self.assertEqual(self._expected_partition_graph_count,
                     len(run_metadata.partition_graphs))

    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)

  result_fields = [
      "u_init_val", "v_init_val", "u", "v", "w", "u_name", "v_name", "w_name",
      "dump"
  ]
  simple_add_results = collections.namedtuple("SimpleAddResults",
                                              result_fields)
  return simple_add_results(
      u_init_val=u_init_val,
      v_init_val=v_init_val,
      u=u,
      v=v,
      w=w,
      u_name=u_name,
      v_name=v_name,
      w_name=w_name,
      dump=dump)
def testWatchingVariableUpdateOpsSeesUpdatedValues(self):
  """Watch output slots on Variable-updating ops, with no emitted edges."""
  with session.Session() as sess:
    u = variables.Variable(constant_op.constant(10.0), name="gdo/u")
    v = variables.Variable(constant_op.constant(20.0), name="gdo/v")
    w = math_ops.multiply(u, v, name="gdo/w")
    # gdo stands for GradientDescentOptimizer.

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
    train_op = optimizer.minimize(w, name="gdo/train")

    u.initializer.run()
    v.initializer.run()

    run_metadata = config_pb2.RunMetadata()
    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugIdentity"],
        debug_urls=self._debug_urls())

    sess.run(train_op, options=run_options, run_metadata=run_metadata)

    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)

    # Gradient descent on u: w = u * v, so dw / du = v.
    # Updated value of u should be:
    #   10.0 - learning_rate * v = 10.0 - 0.1 * 20.0 = 8.0
    u_updates = dump.watch_key_to_data(
        "gdo/train/update_gdo/u/ApplyGradientDescent:0:DebugIdentity")
    self.assertEqual(1, len(u_updates))
    self.assertAllClose(8.0, u_updates[0].get_tensor())

    # Gradient descent on v: w = u * v, so dw / dv = u.
    # Updated value of v should be:
    #   20.0 - learning_rate * u = 20.0 - 0.1 * 10.0 = 19.0
    v_updates = dump.watch_key_to_data(
        "gdo/train/update_gdo/v/ApplyGradientDescent:0:DebugIdentity")
    self.assertEqual(1, len(v_updates))
    self.assertAllClose(19.0, v_updates[0].get_tensor())

    # Verify that the Variables u and v are updated properly.
    self.assertAllClose(8.0, sess.run(u))
    self.assertAllClose(19.0, sess.run(v))
def testDumpingOnASingleRunWorksWithRelativePathForDebugDumpDir(self):
  """Checks that DebugDumpDir can be loaded through a relative path.

  The working directory is switched to the session root while the dump is
  loaded via a path relative to it, and is restored afterwards.
  """
  wrapped_sess = dumping_wrapper.DumpingDebugWrapperSession(
      self.sess, session_root=self.session_root, log_usage=False)
  wrapped_sess.run(self.inc_v)

  run_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
  original_cwd = os.getcwd()
  try:
    os.chdir(self.session_root)
    # The relative path is computed after chdir, as it is resolved against
    # the current working directory.
    relative_run_dir = os.path.relpath(run_dirs[0], self.session_root)
    dump = debug_data.DebugDumpDir(relative_run_dir)
    self.assertAllClose([10.0], dump.get_tensors("v", 0, "DebugIdentity"))
  finally:
    # Always restore the working directory.
    os.chdir(original_cwd)
def testGradientsValuesFromDumpWorks(self):
  """Checks gradient_values_from_dump for watched and unwatched x-tensors.

  Gradients with respect to w, u and y are registered with a
  GradientsDebugger; one training step is run with file-based debug dumps,
  and the gradient values are then looked up from the loaded dump. Looking up
  the unregistered tensor v must raise LookupError.
  """
  y = math_ops.add(self.w, -1.0, name="y")
  z = math_ops.square(y, name="z")

  grad_debugger = debug_gradients.GradientsDebugger()
  with grad_debugger.watch_gradients_by_tensors(self.sess.graph,
                                                [self.w, self.u, y]):
    train_op = gradient_descent.GradientDescentOptimizer(0.1).minimize(z)

  self.sess.run(variables.global_variables_initializer())

  run_options = config_pb2.RunOptions(output_partition_graphs=True)
  dump_dir = tempfile.mkdtemp()
  # Everything that uses the temp dir runs inside try/finally so that the
  # directory is removed even when one of the assertions below fails.
  try:
    debug_url = "file://" + dump_dir
    debug_utils.watch_graph(
        run_options, self.sess.graph, debug_urls=debug_url)
    run_metadata = config_pb2.RunMetadata()
    self.assertAllClose(2.0, self.sess.run(self.u))
    self.sess.run(train_op, options=run_options, run_metadata=run_metadata)
    self.assertAllClose(-1.0, self.sess.run(self.u))

    dump = debug_data.DebugDumpDir(
        dump_dir, partition_graphs=run_metadata.partition_graphs)
    dump.set_python_graph(self.sess.graph)

    y_grad_values = debug_gradients.gradient_values_from_dump(
        grad_debugger, y, dump)
    self.assertEqual(1, len(y_grad_values))
    self.assertAllClose(10.0, y_grad_values[0])

    w_grad_values = debug_gradients.gradient_values_from_dump(
        grad_debugger, self.w, dump)
    self.assertEqual(1, len(w_grad_values))
    self.assertAllClose(10.0, w_grad_values[0])

    u_grad_values = debug_gradients.gradient_values_from_dump(
        grad_debugger, self.u, dump)
    self.assertEqual(1, len(u_grad_values))
    self.assertAllClose(30.0, u_grad_values[0])

    # assertRaisesRegex is the non-deprecated spelling of
    # assertRaisesRegexp.
    with self.assertRaisesRegex(
        LookupError,
        r"This GradientsDebugger has not received any gradient tensor for "
        r"x-tensor v:0"):
      debug_gradients.gradient_values_from_dump(grad_debugger, self.v, dump)
  finally:
    # Cleanup.
    shutil.rmtree(dump_dir)
def testDumpingDebugHookWithoutWatchFnWorks(self):
  """Checks a DumpingDebugHook without watch_fn on a single hooked run.

  One run directory is expected; it must carry the dumped value of "v" and
  the repr of the fetches and feed keys of the run.
  """
  hook = hooks.DumpingDebugHook(self.session_root, log_usage=False)
  hooked_sess = monitored_session._HookedSession(self.sess, [hook])
  hooked_sess.run(self.inc_v)

  run_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
  self.assertEqual(1, len(run_dirs))
  only_run_dir = run_dirs[0]
  self._assert_correct_run_subdir_naming(os.path.basename(only_run_dir))

  dump = debug_data.DebugDumpDir(only_run_dir)
  self.assertAllClose([10.0], dump.get_tensors("v", 0, "DebugIdentity"))

  # Metadata about the run is stored as reprs.
  self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
  self.assertEqual(repr(None), dump.run_feed_keys_info)
def testLookUpNodePythonTracebackWorks(self):
  """Checks DebugDumpDir.node_traceback before and after set_python_graph.

  Before the Python graph is set, lookups raise LookupError. Afterwards,
  unknown nodes raise KeyError, and both node names and tensor names yield a
  non-empty list of tuples.
  """
  with session.Session() as sess:
    u_init = constant_op.constant(10.0)
    u = variables.Variable(u_init, name="traceback/u")
    v_init = constant_op.constant(20.0)
    v = variables.Variable(v_init, name="traceback/v")

    w = math_ops.multiply(u, v, name="traceback/w")

    sess.run(variables.global_variables_initializer())

    run_metadata = config_pb2.RunMetadata()
    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options, sess.graph, debug_urls=self._debug_urls())

    sess.run(w, options=run_options, run_metadata=run_metadata)
    dump = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)

    def assert_traceback_is_nonempty_list_of_tuples(name):
      # Shared check for node-name and tensor-name lookups.
      traceback = dump.node_traceback(name)
      self.assertIsInstance(traceback, list)
      self.assertGreater(len(traceback), 0)
      for trace in traceback:
        self.assertIsInstance(trace, tuple)

    # Prior to setting the Python graph, attempts to do traceback lookup
    # should lead to exceptions. (assertRaisesRegex is the non-deprecated
    # spelling of assertRaisesRegexp.)
    with self.assertRaisesRegex(
        LookupError, "Python graph is not available for traceback lookup"):
      dump.node_traceback("traceback/w")

    dump.set_python_graph(sess.graph)

    # After setting the Python graph, attempts to look up nonexistent nodes
    # should lead to exceptions.
    with self.assertRaisesRegex(
        KeyError, r"Cannot find node \"foo\" in Python graph"):
      dump.node_traceback("foo")

    # Lookup should work with node name input.
    assert_traceback_is_nonempty_list_of_tuples("traceback/w")

    # Lookup should also work with tensor name input.
    assert_traceback_is_nonempty_list_of_tuples("traceback/w:0")
def testDumpingOnASingleRunWorks(self):
  """Checks a single run through DumpingDebugWrapperSession.

  One run directory is expected; it must carry the dumped value of "v" and
  the repr of the fetches and feed keys of the run.
  """
  wrapped_sess = dumping_wrapper.DumpingDebugWrapperSession(
      self.sess, session_root=self.session_root, log_usage=False)
  wrapped_sess.run(self.inc_v)

  run_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
  self.assertEqual(1, len(run_dirs))
  only_run_dir = run_dirs[0]
  self._assert_correct_run_subdir_naming(os.path.basename(only_run_dir))

  dump = debug_data.DebugDumpDir(only_run_dir)
  self.assertAllClose([10.0], dump.get_tensors("v", 0, "DebugIdentity"))

  # Metadata about the run is stored as reprs.
  self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
  self.assertEqual(repr(None), dump.run_feed_keys_info)
def testMultiGPUSessionRun(self):
  """Checks debug dumps of a graph spread over two GPUs.

  Skipped when fewer than two GPUs are available. u0 = v + v is placed on the
  first GPU and u1 = v * v on the second; the dump must cover three devices
  and hold the expected values of v, u0 and u1.
  """
  gpu_device_names = sorted(
      device.name
      for device in device_lib.list_local_devices()
      if device.device_type == "GPU")
  if len(gpu_device_names) < 2:
    self.skipTest(
        "This test requires at least 2 GPUs, but only %d is available." %
        len(gpu_device_names))

  first_gpu, second_gpu = gpu_device_names[0], gpu_device_names[1]
  with session.Session() as sess:
    v = variables.Variable([10.0, 15.0], dtype=dtypes.float32, name="v")
    with ops.device(first_gpu):
      u0 = math_ops.add(v, v, name="u0")
    with ops.device(second_gpu):
      u1 = math_ops.multiply(v, v, name="u1")
    w = math_ops.subtract(u1, u0, name="w")

    sess.run(v.initializer)

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options, sess.graph, debug_urls="file://" + self._dump_root)
    run_metadata = config_pb2.RunMetadata()
    w_value = sess.run(w, options=run_options, run_metadata=run_metadata)
    self.assertAllClose([80.0, 195.0], w_value)

    debug_dump_dir = debug_data.DebugDumpDir(
        self._dump_root, partition_graphs=run_metadata.partition_graphs)
    self.assertEqual(3, len(debug_dump_dir.devices()))

    expected_by_node = (
        ("v", [10.0, 15.0]),
        ("u0", [20.0, 30.0]),
        ("u1", [100.0, 225.0]),
    )
    for node_name, expected_value in expected_by_node:
      self.assertAllClose(
          expected_value,
          debug_dump_dir.get_tensors(node_name, 0, "DebugIdentity")[0])
def _compareOriginalAndReconstructedGraphDefs(self,
                                              sess,
                                              fetches,
                                              feed_dict=None,
                                              expected_output=None):
  """Compares non-debug partition graphs with ones rebuilt from a debug run.

  The fetches are run twice with the same RunOptions object: first without
  debug watches, to collect the reference partition graphs, then with the
  whole graph watched. The graphs reconstructed from the dump, and from the
  debug run's partition graphs, must equal the reference graphs once
  blacklisted nodes are filtered out.

  Args:
    sess: Session to run the fetches in.
    fetches: Fetches passed to `sess.run`.
    feed_dict: Optional feed dict passed to `sess.run`.
    expected_output: If not None, the output both runs are checked against.
  """
  run_options = config_pb2.RunOptions(output_partition_graphs=True)

  # First run: no debug watches.
  plain_metadata = config_pb2.RunMetadata()
  output = sess.run(
      fetches,
      feed_dict=feed_dict,
      options=run_options,
      run_metadata=plain_metadata)
  if expected_output is not None:
    self.assertAllClose(expected_output, output)
  non_debug_graph_defs = plain_metadata.partition_graphs

  # Second run: same options object, now with debug watches added.
  debug_utils.watch_graph(
      run_options, sess.graph, debug_urls=self._debug_url)
  run_metadata = config_pb2.RunMetadata()
  output = sess.run(
      fetches,
      feed_dict=feed_dict,
      options=run_options,
      run_metadata=run_metadata)
  if expected_output is not None:
    self.assertAllClose(expected_output, output)

  dump = debug_data.DebugDumpDir(
      self._dump_dir,
      partition_graphs=run_metadata.partition_graphs,
      validate=True)
  reconstructed = dump.reconstructed_non_debug_partition_graphs()

  self.assertEqual(len(non_debug_graph_defs), len(reconstructed))
  for i, non_debug_graph_def in enumerate(non_debug_graph_defs):
    device_name = debug_graphs._infer_device_name(non_debug_graph_def)
    from_dump = self._graphDefWithoutBlacklistedNodes(
        reconstructed[device_name])
    reference = self._graphDefWithoutBlacklistedNodes(non_debug_graph_def)
    test_util.assert_equal_graph_def(from_dump, reference)

    # Test debug_graphs.reconstruct_non_debug_graph_def.
    reconstructed_again = debug_graphs.reconstruct_non_debug_graph_def(
        run_metadata.partition_graphs[i])
    from_partition_graph = self._graphDefWithoutBlacklistedNodes(
        reconstructed_again)
    reference_again = self._graphDefWithoutBlacklistedNodes(
        non_debug_graph_def)
    test_util.assert_equal_graph_def(from_partition_graph, reference_again)
def createAndRunGraphHelper(self):
  """Create and run a TensorFlow Graph to generate debug dumps.

  This is intentionally done in a separate method, to make it easier to test
  the stack-top mode of source annotation.

  Sets the following attributes on `self`:
    dump_root: Temp directory (from `self.get_temp_dir()`) used as the file
      debug URL target.
    curr_file_path: Absolute path of the file containing this method.
    u_init, u, v_init, v, w: The graph elements that are created.
    u_init_line_number, u_line_number, v_init_line_number, v_line_number,
      w_line_number: Values of `line_number_above()` taken right after the
      corresponding graph element is created.
    dump: `debug_data.DebugDumpDir` loaded from `dump_root`, with the Python
      graph of the session attached.
  """
  self.dump_root = self.get_temp_dir()
  self.curr_file_path = os.path.abspath(
      tf_inspect.getfile(tf_inspect.currentframe()))

  # Run a simple TF graph to generate some debug dumps that can be used in
  # source annotation.
  with session.Session() as sess:
    # NOTE(review): line_number_above() is defined elsewhere; presumably it
    # reports the line of the statement directly above its call site, so each
    # creation statement and the call that follows it must stay adjacent (no
    # blank line or comment in between) -- confirm before reformatting.
    self.u_init = constant_op.constant(
        np.array([[5.0, 3.0], [-1.0, 0.0]]), shape=[2, 2], name="u_init")
    self.u_init_line_number = line_number_above()

    self.u = variables.Variable(self.u_init, name="u")
    self.u_line_number = line_number_above()

    self.v_init = constant_op.constant(
        np.array([[2.0], [-1.0]]), shape=[2, 1], name="v_init")
    self.v_init_line_number = line_number_above()

    self.v = variables.Variable(self.v_init, name="v")
    self.v_line_number = line_number_above()

    self.w = math_ops.matmul(self.u, self.v, name="w")
    self.w_line_number = line_number_above()

    self.evaluate(self.u.initializer)
    self.evaluate(self.v.initializer)

    # Request partition graphs so they can be passed to the loaded dump.
    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(run_options, sess.graph, debug_urls=["file://%s" % self.dump_root])
    run_metadata = config_pb2.RunMetadata()
    sess.run(self.w, options=run_options, run_metadata=run_metadata)

    self.dump = debug_data.DebugDumpDir(
        self.dump_root, partition_graphs=run_metadata.partition_graphs)
    self.dump.set_python_graph(sess.graph)
def testDumpingOnMultipleRunsWorks(self):
  """Checks that three wrapped runs yield three correctly ordered dumps.

  Run directories are sorted by the integer that follows the first underscore
  in their names; the i-th dump (0-based) must hold the value 10.0 + i for
  "v".
  """
  wrapped_sess = dumping_wrapper.DumpingDebugWrapperSession(
      self.sess, session_root=self.session_root, log_usage=False)
  for _ in range(3):
    wrapped_sess.run(self.inc_v)

  def run_index(run_dir):
    # Sort key: the integer that follows the first underscore of the name.
    return int(os.path.basename(run_dir).split("_")[1])

  run_dirs = sorted(
      glob.glob(os.path.join(self.session_root, "run_*")), key=run_index)
  self.assertEqual(3, len(run_dirs))

  for i, run_dir in enumerate(run_dirs):
    self._assert_correct_run_subdir_naming(os.path.basename(run_dir))
    dump = debug_data.DebugDumpDir(run_dir)
    expected_v = [10.0 + 1.0 * i]
    self.assertAllClose(expected_v, dump.get_tensors("v", 0, "DebugIdentity"))
    self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
    self.assertEqual(repr(None), dump.run_feed_keys_info)