def testWriteGraphExecutionTraceEventsWithCircularBuffer(self): writer = debug_events_writer.DebugEventsWriter(self.dump_root) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): trace = debug_event_pb2.GraphExecutionTrace() trace.op_name = "Op%d" % i writer.WriteGraphExecutionTrace(trace) with debug_events_reader.DebugEventsReader(self.dump_root) as reader: actuals = list(reader.graph_execution_traces_iterator()) # Before FlushExecutionFiles() is called. No data should have been written # to the file. self.assertEmpty(actuals) writer.FlushExecutionFiles() actuals = list( item.debug_event.graph_execution_trace for item in reader.graph_execution_traces_iterator()) self.assertLen(actuals, debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE) for i in range(debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE): self.assertEqual( actuals[i].op_name, "Op%d" % (i + debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE))
def testWriteGraphOpCreationAndDebuggedGraphs(self): writer = debug_events_writer.DebugEventsWriter(self.dump_root, self.tfdbg_run_id) num_op_creations = 10 for i in range(num_op_creations): graph_op_creation = debug_event_pb2.GraphOpCreation() graph_op_creation.op_type = "Conv2D" graph_op_creation.op_name = "Conv2D_%d" % i writer.WriteGraphOpCreation(graph_op_creation) debugged_graph = debug_event_pb2.DebuggedGraph() debugged_graph.graph_id = "deadbeaf" debugged_graph.graph_name = "MyGraph1" writer.WriteDebuggedGraph(debugged_graph) writer.FlushNonExecutionFiles() reader = debug_events_reader.DebugEventsReader(self.dump_root) actuals = list(item.debug_event for item in reader.graphs_iterator()) self.assertLen(actuals, num_op_creations + 1) for i in range(num_op_creations): self.assertEqual(actuals[i].graph_op_creation.op_type, "Conv2D") self.assertEqual(actuals[i].graph_op_creation.op_name, "Conv2D_%d" % i) self.assertEqual(actuals[num_op_creations].debugged_graph.graph_id, "deadbeaf")
def testConcurrentWritesToNonExecutionFilesWorks(self): writer = debug_events_writer.DebugEventsWriter(self.dump_root) source_file_state = {"counter": 0, "lock": threading.Lock()} def WriteSourceFile(): source_file = debug_event_pb2.SourceFile() with source_file_state["lock"]: source_file.file_path = "/home/tf2user/file_%d.py" % source_file_state[ "counter"] source_file_state["counter"] += 1 writer.WriteSourceFile(source_file) # More-frequent-than-necessary concurrent flushing is not recommended, # but tolerated. writer.FlushNonExecutionFiles() stack_frame_state = {"counter": 0, "lock": threading.Lock()} def WriteStackFrame(): stack_frame = debug_event_pb2.StackFrameWithId() with stack_frame_state["lock"]: stack_frame.id = "stack_frame_%d" % stack_frame_state["counter"] stack_frame_state["counter"] += 1 writer.WriteStackFrameWithId(stack_frame) # More-frequent-than-necessary concurrent flushing is not recommended, # but tolerated. writer.FlushNonExecutionFiles() graph_op_state = {"counter": 0, "lock": threading.Lock()} def WriteGraphOpCreation(): graph_op_creation = debug_event_pb2.GraphOpCreation() with graph_op_state["lock"]: graph_op_creation.op_name = "Op%d" % graph_op_state["counter"] graph_op_state["counter"] += 1 writer.WriteGraphOpCreation(graph_op_creation) # More-frequent-than-necessary concurrent flushing is not recommended, # but tolerated. writer.FlushNonExecutionFiles() num_threads = 9 threads = [] for i in range(num_threads): if i % 3 == 0: target = WriteSourceFile elif i % 3 == 1: target = WriteStackFrame else: target = WriteGraphOpCreation thread = threading.Thread(target=target) thread.start() threads.append(thread) for thread in threads: thread.join() # Verify the content of the .source_files file. with debug_events_reader.DebugEventsReader(self.dump_root) as reader: source_files_iter = reader.source_files_iterator() actuals = list(source_files_iter) file_paths = sorted( [actual.source_file.file_path for actual in actuals]) self.assertEqual(file_paths, [ "/home/tf2user/file_0.py", "/home/tf2user/file_1.py", "/home/tf2user/file_2.py" ]) # Verify the content of the .stack_frames file. actuals = list(reader.stack_frames_iterator()) stack_frame_ids = sorted( [actual.stack_frame_with_id.id for actual in actuals]) self.assertEqual(stack_frame_ids, ["stack_frame_0", "stack_frame_1", "stack_frame_2"]) # Verify the content of the .graphs file. actuals = list(reader.graphs_iterator()) graph_op_names = sorted( [actual.graph_op_creation.op_name for actual in actuals]) self.assertEqual(graph_op_names, ["Op0", "Op1", "Op2"])
def testPureEagerOpExecution(self, tensor_debug_mode): """Test catching Infinity in eager op execution: float32.""" writer = dumping_callback.enable_dump_debug_info( self.dump_root, tensor_debug_mode=tensor_debug_mode) x = constant_op.constant(10.0) zero = constant_op.constant(0.0) one = constant_op.constant(1.0) two = constant_op.constant(2.0) three = constant_op.constant(3.0) # Use Collatz conjecture as a test case. while x > one: if math_ops.equal(x % two, zero): x = x / two else: x = x * three + one writer.FlushNonExecutionFiles() self._readAndCheckMetadataFile() stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames() # Before FlushExecutionFiles() is called, the .execution file should be # empty. with debug_events_reader.DebugEventsReader(self.dump_root) as reader: execution_iter = reader.execution_iterator() with self.assertRaises(StopIteration): next(execution_iter) # After the flushing, the .execution file should hold the appropriate # contents. writer.FlushExecutionFiles() execution_iter = reader.execution_iterator() prev_wall_time = 1 executed_op_types = [] tensor_values = collections.defaultdict(lambda: []) for debug_event in execution_iter: self.assertGreaterEqual(debug_event.wall_time, prev_wall_time) prev_wall_time = debug_event.wall_time execution = debug_event.execution executed_op_types.append(execution.op_type) self.assertTrue(execution.input_tensor_ids) self.assertTrue(execution.output_tensor_ids) if tensor_debug_mode == "NO_TENSOR": # Due to the NO_TENSOR tensor debug mode, tensor_protos ought to # be empty. self.assertFalse(execution.tensor_protos) elif tensor_debug_mode == "FULL_TENSOR": # Under the FULL_TENSOR mode, the value of the tensor should be # available through `tensor_protos`. tensor_value = float( tensor_util.MakeNdarray(execution.tensor_protos[0])) tensor_values[execution.op_type].append(tensor_value) # Verify the code_location field. self.assertTrue(execution.code_location.stack_frame_ids) for stack_frame_id in execution.code_location.stack_frame_ids: self.assertIn(stack_frame_id, stack_frame_by_id) if tensor_debug_mode == "FULL_TENSOR": self.assertAllClose(tensor_values["Greater"], [1, 1, 1, 1, 1, 1, 0]) self.assertAllClose(tensor_values["RealDiv"], [5, 8, 4, 2, 1]) self.assertAllClose(tensor_values["Mul"], [15]) self.assertAllClose(tensor_values["AddV2"], [16]) self.assertEqual( executed_op_types, [ "Greater", "FloorMod", "Equal", "RealDiv", # 10 --> 5 "Greater", "FloorMod", "Equal", "Mul", "AddV2", # 5 --> 16 "Greater", "FloorMod", "Equal", "RealDiv", # 16 --> 8 "Greater", "FloorMod", "Equal", "RealDiv", # 8 --> 4 "Greater", "FloorMod", "Equal", "RealDiv", # 4 --> 2 "Greater", "FloorMod", "Equal", "RealDiv", # 2 --> 1 "Greater" ]) # Due to the pure eager op execution, the .graph file and the # .graph_execution_traces file ought to be empty. graphs_iterator = reader.graphs_iterator() with self.assertRaises(StopIteration): next(graphs_iterator) graph_trace_iter = reader.graph_execution_traces_iterator() with self.assertRaises(StopIteration): next(graph_trace_iter)
def testFunctionExecutionWithControlFlow(self, tensor_debug_mode): writer = dumping_callback.enable_dump_debug_info( self.dump_root, tensor_debug_mode=tensor_debug_mode) @def_function.function def iterative_doubling(x, times): i = constant_op.constant(0, dtype=dtypes.int32) while i < times: x = x * 2.0 i += 1 return x x = constant_op.constant(0.5, dtype=dtypes.float32) times = constant_op.constant(4, dtype=dtypes.int32) self.assertAllClose(self.evaluate(iterative_doubling(x, times)), 8.0) writer.FlushNonExecutionFiles() stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames() # Verify the content of the .graphs file. context_ids, op_types, op_name_to_op_type, _ = ( self._readAndCheckGraphsFile(stack_frame_by_id)) self.assertIn("Less", op_types) self.assertIn("Mul", op_types) self.assertIn("AddV2", op_types) # Before FlushExecutionFiles() is called, the .execution and # .graph_execution_traces files should be both empty. with debug_events_reader.DebugEventsReader(self.dump_root) as reader: execution_iter = reader.execution_iterator() graph_execution_traces_iter = reader.graph_execution_traces_iterator( ) with self.assertRaises(StopIteration): next(execution_iter) with self.assertRaises(StopIteration): next(graph_execution_traces_iter) # TODO(cais): Backport execution instrumentation to tf.Session. writer.FlushExecutionFiles() # After the flushing, the .execution file should hold the appropriate # contents. if context.executing_eagerly(): (executed_op_types, input_tensor_ids, output_tensor_ids, tensor_debug_modes, tensor_values) = self._readAndCheckExecutionFile() # NOTE(b/142486213): Execution of the TF function happens with # Session.run() in v1 graph mode, hence it doesn't get logged to the # .execution file. self.assertLen(executed_op_types, 1) self.assertIn("iterative_doubling", executed_op_types[0]) self.assertLen(input_tensor_ids[0], 2) self.assertLen(output_tensor_ids[0], 1) self.assertEqual( tensor_debug_modes[0], debug_event_pb2.TensorDebugMode.Value(tensor_debug_mode)) if tensor_debug_mode == "FULL_TENSOR": self.assertAllClose(tensor_values, [[8.0]]) (op_names, _, output_slots, tensor_values ) = self._readAndCheckGraphExecutionTracesFile(context_ids) executed_op_types = [ op_name_to_op_type[op_name] for op_name in op_names ] # The Less op should have been executed 5 times. self.assertEqual(executed_op_types.count("Less"), 5) # The last executed op should be Less. self.assertEqual(executed_op_types[-1], "Less") # The Mul op should have been executed 4 times. self.assertEqual(executed_op_types.count("Mul"), 4) # The AddV2 op should have been run, but we refrain from asserting on how # many times it's executed. self.assertIn("AddV2", executed_op_types) for output_slot in output_slots: self.assertEqual(output_slot, 0) if tensor_debug_mode == "NO_TENSOR": # Under the default NO_TENSOR tensor-debug mode, the tensor_proto ought # to be an empty float32 tensor. for tensor_value in tensor_values: self.assertEqual(tensor_value.dtype, np.float32) self.assertEqual(tensor_value.shape, (0, )) elif tensor_debug_mode == "FULL_TENSOR": less_values = [ tensor_values[i] for i, op_type in enumerate(executed_op_types) if op_type == "Less" ] self.assertAllClose(less_values, [True, True, True, True, False]) mul_values = [ tensor_values[i] for i, op_type in enumerate(executed_op_types) if op_type == "Mul" ] self.assertAllClose(mul_values, [1.0, 2.0, 4.0, 8.0])
def testSingleTensorFullTensorDebugModeWithCircularBufferBehavior(self): @def_function.function def write_debug_trace(x): square = math_ops.square(x) gen_debug_ops.debug_identity_v2( square, tfdbg_context_id="deadbeaf", op_name="Square", output_slot=0, tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR, debug_urls=["file://%s" % self.dump_root]) sqrt = math_ops.sqrt(x) gen_debug_ops.debug_identity_v2( sqrt, tfdbg_context_id="beafdead", op_name="Sqrt", output_slot=0, tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR, debug_urls=["file://%s" % self.dump_root]) return square + sqrt x = np.array([3.0, 4.0]) # Only the graph-execution trace of the last iteration should be written # to self.dump_root. for _ in range(self.circular_buffer_size // 2 + 1): self.assertAllClose(write_debug_trace(x), [9.0 + np.sqrt(3.0), 16.0 + 2.0]) with debug_events_reader.DebugEventsReader(self.dump_root) as reader: # Check that the .metadata DebugEvents data file has been created, even # before FlushExecutionFiles() is called. self.assertGreater(reader.starting_wall_time(), 0) self.assertTrue(reader.tensorflow_version()) self.assertTrue( reader.tfdbg_file_version().startswith("debug.Event")) graph_trace_iter = reader.graph_execution_traces_iterators()[0] # Before FlushExecutionFiles() is called, the .graph_execution_traces file # ought to be empty. with self.assertRaises(StopIteration): next(graph_trace_iter) # Flush the circular buffer. self.writer.FlushExecutionFiles() graph_trace_iter = reader.graph_execution_traces_iterators()[0] # The circular buffer has a size of 4. So only the data from the # last two iterations should have been written to self.dump_root. for _ in range(2): debug_event = next(graph_trace_iter).debug_event self.assertGreater(debug_event.wall_time, 0) trace = debug_event.graph_execution_trace self.assertEqual(trace.tfdbg_context_id, "deadbeaf") self.assertEqual(trace.op_name, "Square") self.assertEqual(trace.output_slot, 0) self.assertEqual(trace.tensor_debug_mode, debug_event_pb2.TensorDebugMode.FULL_TENSOR) tensor_value = tensor_util.MakeNdarray(trace.tensor_proto) self.assertAllClose(tensor_value, [9.0, 16.0]) debug_event = next(graph_trace_iter).debug_event self.assertGreater(debug_event.wall_time, 0) trace = debug_event.graph_execution_trace self.assertEqual(trace.tfdbg_context_id, "beafdead") self.assertEqual(trace.op_name, "Sqrt") self.assertEqual(trace.output_slot, 0) self.assertEqual(trace.tensor_debug_mode, debug_event_pb2.TensorDebugMode.FULL_TENSOR) tensor_value = tensor_util.MakeNdarray(trace.tensor_proto) self.assertAllClose(tensor_value, [np.sqrt(3.0), 2.0]) # Only the graph-execution trace of the last iteration should be written # to self.dump_root. with self.assertRaises(StopIteration): next(graph_trace_iter)
def _readAndCheckGraphsFile(self, stack_frame_by_id): """Read and verify the content of the .graphs debug-event file. Args: stack_frame_by_id: A dict mapping unique string IDs to stack frames. It is used by this method to look up stack frames. Returns: context_ids: IDs of op creation contexts (e.g., TensorFlow graphs), as a `list` of `str`s. op_types: Types of the ops that are created, as a `list` of `str`s with the same length as `context_ids`. op_name_to_op_type: An `OrderedDict` mapping op name to op type. op_name_to_context_id: A `dict` mapping op name to the ID of the innermost containing graph (context). """ with debug_events_reader.DebugEventsReader(self.dump_root) as reader: graphs_iter = reader.graphs_iterator() prev_wall_time = 0 op_types = [] op_name_to_op_type = collections.OrderedDict() op_name_to_context_id = dict( ) # Maps op name to ID of innermost context. context_ids = set() symbolic_tensor_ids = set() # Maps context ID to ID of directly enclosing context (`None` for # outermost contexts). context_id_to_outer_id = dict() for debug_event in graphs_iter: self.assertGreaterEqual(debug_event.wall_time, prev_wall_time) prev_wall_time = debug_event.wall_time # A DebugEvent in the .graphs file contains either of the two fields: # - graph_op_creation for creation of a symbolic op in a graph context. # - debugged_graph for information regarding the graph (context). if debug_event.graph_op_creation.ByteSize(): graph_op_creation = debug_event.graph_op_creation self.assertTrue(graph_op_creation.op_type) op_types.append(graph_op_creation.op_type) self.assertTrue(graph_op_creation.op_name) op_name_to_op_type[ graph_op_creation.op_name] = graph_op_creation.op_type op_name_to_context_id[ graph_op_creation.op_name] = graph_op_creation.graph_id self.assertTrue(graph_op_creation.graph_id) context_ids.add(graph_op_creation.graph_id) self.assertTrue(graph_op_creation.code_location) if graph_op_creation.num_outputs: self.assertLen(graph_op_creation.output_tensor_ids, graph_op_creation.num_outputs) # Check that all symblic tensor IDs are unique. for tensor_id in graph_op_creation.output_tensor_ids: self.assertNotIn(tensor_id, symbolic_tensor_ids) symbolic_tensor_ids.add(tensor_id) for stack_frame_id in graph_op_creation.code_location.stack_frame_ids: self.assertIn(stack_frame_id, stack_frame_by_id) else: debugged_graph = debug_event.debugged_graph if debugged_graph.outer_context_id: inner_id = debugged_graph.graph_id outer_id = debugged_graph.outer_context_id if inner_id in context_id_to_outer_id: # The outer context of a context must be always the same. self.assertEqual(context_id_to_outer_id[inner_id], outer_id) else: context_id_to_outer_id[inner_id] = outer_id else: # This is an outermost context. if debugged_graph.graph_id in context_id_to_outer_id: self.assertIsNone(context_id_to_outer_id[ debugged_graph.graph_id]) else: context_id_to_outer_id[ debugged_graph.graph_id] = None # If any graph is created, the graph context hierarchy must be populated. # In addition, the context of each graph op must be locatable within the # graph context hierarchy. for context_id in op_name_to_context_id.values(): self.assertIn(context_id, context_id_to_outer_id) return context_ids, op_types, op_name_to_op_type, op_name_to_context_id