Example #1
    def testWriteGraphExecutionTraceEventsWithCircularBuffer(self):
        writer = debug_events_writer.DebugEventsWriter(self.dump_root)
        num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
        for i in range(num_execution_events):
            trace = debug_event_pb2.GraphExecutionTrace()
            trace.op_name = "Op%d" % i
            writer.WriteGraphExecutionTrace(trace)

        with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
            actuals = list(reader.graph_execution_traces_iterator())
            # Before FlushExecutionFiles() is called, no data should have been
            # written to the file.
            self.assertEmpty(actuals)

            writer.FlushExecutionFiles()
            actuals = list(
                item.debug_event.graph_execution_trace
                for item in reader.graph_execution_traces_iterator())
            self.assertLen(actuals,
                           debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE)
            for i in range(debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE):
                self.assertEqual(
                    actuals[i].op_name, "Op%d" %
                    (i + debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE))
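
The retention behavior asserted above can be reproduced without TensorFlow: a circular buffer of capacity N keeps only the last N of 2*N writes, so the surviving op names start at "OpN". A minimal sketch, using collections.deque and a stand-in constant for debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE (illustrative only, not the writer's actual implementation):

    import collections

    BUFFER_SIZE = 1000  # Stand-in for DEFAULT_CIRCULAR_BUFFER_SIZE.
    buffer = collections.deque(maxlen=BUFFER_SIZE)
    for i in range(BUFFER_SIZE * 2):
        buffer.append("Op%d" % i)
    # Only the most recent BUFFER_SIZE entries survive: OpN through Op(2N-1).
    assert list(buffer)[0] == "Op%d" % BUFFER_SIZE
    assert list(buffer)[-1] == "Op%d" % (BUFFER_SIZE * 2 - 1)
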
Example #2
    def testWriteGraphOpCreationAndDebuggedGraphs(self):
        writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                       self.tfdbg_run_id)
        num_op_creations = 10
        for i in range(num_op_creations):
            graph_op_creation = debug_event_pb2.GraphOpCreation()
            graph_op_creation.op_type = "Conv2D"
            graph_op_creation.op_name = "Conv2D_%d" % i
            writer.WriteGraphOpCreation(graph_op_creation)
        debugged_graph = debug_event_pb2.DebuggedGraph()
        debugged_graph.graph_id = "deadbeaf"
        debugged_graph.graph_name = "MyGraph1"
        writer.WriteDebuggedGraph(debugged_graph)
        writer.FlushNonExecutionFiles()

        reader = debug_events_reader.DebugEventsReader(self.dump_root)
        actuals = list(item.debug_event for item in reader.graphs_iterator())
        self.assertLen(actuals, num_op_creations + 1)
        for i in range(num_op_creations):
            self.assertEqual(actuals[i].graph_op_creation.op_type, "Conv2D")
            self.assertEqual(actuals[i].graph_op_creation.op_name,
                             "Conv2D_%d" % i)
        self.assertEqual(actuals[num_op_creations].debugged_graph.graph_id,
                         "deadbeaf")
Example #3
    def testConcurrentWritesToNonExecutionFilesWorks(self):
        writer = debug_events_writer.DebugEventsWriter(self.dump_root)

        source_file_state = {"counter": 0, "lock": threading.Lock()}

        def WriteSourceFile():
            source_file = debug_event_pb2.SourceFile()
            with source_file_state["lock"]:
                source_file.file_path = "/home/tf2user/file_%d.py" % source_file_state[
                    "counter"]
                source_file_state["counter"] += 1
            writer.WriteSourceFile(source_file)
            # More-frequent-than-necessary concurrent flushing is not recommended,
            # but tolerated.
            writer.FlushNonExecutionFiles()

        stack_frame_state = {"counter": 0, "lock": threading.Lock()}

        def WriteStackFrame():
            stack_frame = debug_event_pb2.StackFrameWithId()
            with stack_frame_state["lock"]:
                stack_frame.id = "stack_frame_%d" % stack_frame_state["counter"]
                stack_frame_state["counter"] += 1
            writer.WriteStackFrameWithId(stack_frame)
            # More-frequent-than-necessary concurrent flushing is not recommended,
            # but tolerated.
            writer.FlushNonExecutionFiles()

        graph_op_state = {"counter": 0, "lock": threading.Lock()}

        def WriteGraphOpCreation():
            graph_op_creation = debug_event_pb2.GraphOpCreation()
            with graph_op_state["lock"]:
                graph_op_creation.op_name = "Op%d" % graph_op_state["counter"]
                graph_op_state["counter"] += 1
            writer.WriteGraphOpCreation(graph_op_creation)
            # More-frequent-than-necessary concurrent flushing is not recommended,
            # but tolerated.
            writer.FlushNonExecutionFiles()

        num_threads = 9
        threads = []
        for i in range(num_threads):
            if i % 3 == 0:
                target = WriteSourceFile
            elif i % 3 == 1:
                target = WriteStackFrame
            else:
                target = WriteGraphOpCreation
            thread = threading.Thread(target=target)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        # Verify the content of the .source_files file.
        with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
            source_files_iter = reader.source_files_iterator()
            actuals = list(source_files_iter)
            file_paths = sorted(
                [actual.source_file.file_path for actual in actuals])
            self.assertEqual(file_paths, [
                "/home/tf2user/file_0.py", "/home/tf2user/file_1.py",
                "/home/tf2user/file_2.py"
            ])

            # Verify the content of the .stack_frames file. (These reads use
            # the same still-open reader, so they belong inside the `with`
            # block above.)
            actuals = list(reader.stack_frames_iterator())
            stack_frame_ids = sorted(
                [actual.stack_frame_with_id.id for actual in actuals])
            self.assertEqual(
                stack_frame_ids,
                ["stack_frame_0", "stack_frame_1", "stack_frame_2"])

            # Verify the content of the .graphs file.
            actuals = list(reader.graphs_iterator())
            graph_op_names = sorted(
                [actual.graph_op_creation.op_name for actual in actuals])
            self.assertEqual(graph_op_names, ["Op0", "Op1", "Op2"])
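
Each writer thread above follows the same pattern: claim a unique counter value under a lock, then perform the write. A self-contained sketch of that pattern, with a plain list standing in for the debug-events file:

    import threading

    state = {"counter": 0, "lock": threading.Lock()}
    written = []
    written_lock = threading.Lock()

    def write_record():
        # Claim a unique name under the lock, so concurrent threads never
        # produce duplicates.
        with state["lock"]:
            name = "record_%d" % state["counter"]
            state["counter"] += 1
        with written_lock:
            written.append(name)

    threads = [threading.Thread(target=write_record) for _ in range(3)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # The writes may land in any order, but the names are distinct.
    assert sorted(written) == ["record_0", "record_1", "record_2"]
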
Example #4
    def testPureEagerOpExecution(self, tensor_debug_mode):
        """Test catching Infinity in eager op execution: float32."""
        writer = dumping_callback.enable_dump_debug_info(
            self.dump_root, tensor_debug_mode=tensor_debug_mode)

        x = constant_op.constant(10.0)
        zero = constant_op.constant(0.0)
        one = constant_op.constant(1.0)
        two = constant_op.constant(2.0)
        three = constant_op.constant(3.0)
        # Use Collatz conjecture as a test case.
        while x > one:
            if math_ops.equal(x % two, zero):
                x = x / two
            else:
                x = x * three + one

        writer.FlushNonExecutionFiles()
        self._readAndCheckMetadataFile()
        stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames()

        # Before FlushExecutionFiles() is called, the .execution file should be
        # empty.
        with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
            execution_iter = reader.execution_iterator()
            with self.assertRaises(StopIteration):
                next(execution_iter)

            # After the flushing, the .execution file should hold the appropriate
            # contents.
            writer.FlushExecutionFiles()
            execution_iter = reader.execution_iterator()
            prev_wall_time = 1
            executed_op_types = []
            tensor_values = collections.defaultdict(lambda: [])
            for debug_event in execution_iter:
                self.assertGreaterEqual(debug_event.wall_time, prev_wall_time)
                prev_wall_time = debug_event.wall_time
                execution = debug_event.execution
                executed_op_types.append(execution.op_type)
                self.assertTrue(execution.input_tensor_ids)
                self.assertTrue(execution.output_tensor_ids)
                if tensor_debug_mode == "NO_TENSOR":
                    # Due to the NO_TENSOR tensor debug mode, tensor_protos ought to
                    # be empty.
                    self.assertFalse(execution.tensor_protos)
                elif tensor_debug_mode == "FULL_TENSOR":
                    # Under the FULL_TENSOR mode, the value of the tensor should be
                    # available through `tensor_protos`.
                    tensor_value = float(
                        tensor_util.MakeNdarray(execution.tensor_protos[0]))
                    tensor_values[execution.op_type].append(tensor_value)
                # Verify the code_location field.
                self.assertTrue(execution.code_location.stack_frame_ids)
                for stack_frame_id in execution.code_location.stack_frame_ids:
                    self.assertIn(stack_frame_id, stack_frame_by_id)
            if tensor_debug_mode == "FULL_TENSOR":
                self.assertAllClose(tensor_values["Greater"],
                                    [1, 1, 1, 1, 1, 1, 0])
                self.assertAllClose(tensor_values["RealDiv"], [5, 8, 4, 2, 1])
                self.assertAllClose(tensor_values["Mul"], [15])
                self.assertAllClose(tensor_values["AddV2"], [16])

            self.assertEqual(
                executed_op_types,
                [
                    "Greater",
                    "FloorMod",
                    "Equal",
                    "RealDiv",  # 10 --> 5
                    "Greater",
                    "FloorMod",
                    "Equal",
                    "Mul",
                    "AddV2",  # 5 --> 16
                    "Greater",
                    "FloorMod",
                    "Equal",
                    "RealDiv",  # 16 --> 8
                    "Greater",
                    "FloorMod",
                    "Equal",
                    "RealDiv",  # 8 --> 4
                    "Greater",
                    "FloorMod",
                    "Equal",
                    "RealDiv",  # 4 --> 2
                    "Greater",
                    "FloorMod",
                    "Equal",
                    "RealDiv",  # 2 --> 1
                    "Greater"
                ])

            # Due to the pure eager op execution, the .graph file and the
            # .graph_execution_traces file ought to be empty.
            graphs_iterator = reader.graphs_iterator()
            with self.assertRaises(StopIteration):
                next(graphs_iterator)
            graph_trace_iter = reader.graph_execution_traces_iterator()
            with self.assertRaises(StopIteration):
                next(graph_trace_iter)
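
The expected op sequence follows directly from the Collatz iteration. A plain-Python sketch of the same arithmetic, annotated with the eager ops that the assertions above attribute to each step:

    x = 10.0
    visited = [x]
    while x > 1.0:                  # traced as Greater
        if x % 2.0 == 0.0:          # traced as FloorMod + Equal
            x = x / 2.0             # traced as RealDiv
        else:
            x = x * 3.0 + 1.0       # traced as Mul + AddV2
        visited.append(x)
    # Starting from 10.0, the loop visits 10 -> 5 -> 16 -> 8 -> 4 -> 2 -> 1:
    # five RealDiv steps, one Mul/AddV2 pair, and seven Greater comparisons.
    assert visited == [10.0, 5.0, 16.0, 8.0, 4.0, 2.0, 1.0]
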
Example #5
    def testFunctionExecutionWithControlFlow(self, tensor_debug_mode):
        writer = dumping_callback.enable_dump_debug_info(
            self.dump_root, tensor_debug_mode=tensor_debug_mode)

        @def_function.function
        def iterative_doubling(x, times):
            i = constant_op.constant(0, dtype=dtypes.int32)
            while i < times:
                x = x * 2.0
                i += 1
            return x

        x = constant_op.constant(0.5, dtype=dtypes.float32)
        times = constant_op.constant(4, dtype=dtypes.int32)
        self.assertAllClose(self.evaluate(iterative_doubling(x, times)), 8.0)

        writer.FlushNonExecutionFiles()
        stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames()

        # Verify the content of the .graphs file.
        context_ids, op_types, op_name_to_op_type, _ = (
            self._readAndCheckGraphsFile(stack_frame_by_id))
        self.assertIn("Less", op_types)
        self.assertIn("Mul", op_types)
        self.assertIn("AddV2", op_types)

        # Before FlushExecutionFiles() is called, the .execution and
        # .graph_execution_traces files should be both empty.
        with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
            execution_iter = reader.execution_iterator()
            graph_execution_traces_iter = (
                reader.graph_execution_traces_iterator())
            with self.assertRaises(StopIteration):
                next(execution_iter)
            with self.assertRaises(StopIteration):
                next(graph_execution_traces_iter)

            # TODO(cais): Backport execution instrumentation to tf.Session.
            writer.FlushExecutionFiles()
            # After the flushing, the .execution file should hold the appropriate
            # contents.
            if context.executing_eagerly():
                (executed_op_types, input_tensor_ids, output_tensor_ids,
                 tensor_debug_modes,
                 tensor_values) = self._readAndCheckExecutionFile()
                # NOTE(b/142486213): Execution of the TF function happens with
                # Session.run() in v1 graph mode, hence it doesn't get logged to the
                # .execution file.
                self.assertLen(executed_op_types, 1)
                self.assertIn("iterative_doubling", executed_op_types[0])
                self.assertLen(input_tensor_ids[0], 2)
                self.assertLen(output_tensor_ids[0], 1)
                self.assertEqual(
                    tensor_debug_modes[0],
                    debug_event_pb2.TensorDebugMode.Value(tensor_debug_mode))
                if tensor_debug_mode == "FULL_TENSOR":
                    self.assertAllClose(tensor_values, [[8.0]])

            (op_names, _, output_slots, tensor_values
             ) = self._readAndCheckGraphExecutionTracesFile(context_ids)
            executed_op_types = [
                op_name_to_op_type[op_name] for op_name in op_names
            ]
            # The Less op should have been executed 5 times.
            self.assertEqual(executed_op_types.count("Less"), 5)
            # The last executed op should be Less.
            self.assertEqual(executed_op_types[-1], "Less")
            # The Mul op should have been executed 4 times.
            self.assertEqual(executed_op_types.count("Mul"), 4)
            # The AddV2 op should have been run, but we refrain from asserting on how
            # many times it's executed.
            self.assertIn("AddV2", executed_op_types)
            for output_slot in output_slots:
                self.assertEqual(output_slot, 0)
            if tensor_debug_mode == "NO_TENSOR":
                # Under the default NO_TENSOR tensor-debug mode, the tensor_proto ought
                # to be an empty float32 tensor.
                for tensor_value in tensor_values:
                    self.assertEqual(tensor_value.dtype, np.float32)
                    self.assertEqual(tensor_value.shape, (0, ))
            elif tensor_debug_mode == "FULL_TENSOR":
                less_values = [
                    tensor_values[i]
                    for i, op_type in enumerate(executed_op_types)
                    if op_type == "Less"
                ]
                self.assertAllClose(less_values,
                                    [True, True, True, True, False])
                mul_values = [
                    tensor_values[i]
                    for i, op_type in enumerate(executed_op_types)
                    if op_type == "Mul"
                ]
                self.assertAllClose(mul_values, [1.0, 2.0, 4.0, 8.0])
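
The asserted trace values follow from the loop arithmetic alone: doubling 0.5 four times yields 8.0. A plain-Python sketch of iterative_doubling's control flow, annotated with the graph ops the test attributes to each step:

    x, times = 0.5, 4
    i = 0
    mul_values, less_values = [], []
    while True:
        less = i < times            # traced as Less
        less_values.append(less)
        if not less:
            break
        x = x * 2.0                 # traced as Mul
        mul_values.append(x)
        i += 1                      # traced as AddV2
    assert x == 8.0
    assert mul_values == [1.0, 2.0, 4.0, 8.0]
    assert less_values == [True, True, True, True, False]
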
Example #6
    def testSingleTensorFullTensorDebugModeWithCircularBufferBehavior(self):
        @def_function.function
        def write_debug_trace(x):
            square = math_ops.square(x)
            gen_debug_ops.debug_identity_v2(
                square,
                tfdbg_context_id="deadbeaf",
                op_name="Square",
                output_slot=0,
                tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR,
                debug_urls=["file://%s" % self.dump_root])

            sqrt = math_ops.sqrt(x)
            gen_debug_ops.debug_identity_v2(
                sqrt,
                tfdbg_context_id="beafdead",
                op_name="Sqrt",
                output_slot=0,
                tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR,
                debug_urls=["file://%s" % self.dump_root])
            return square + sqrt

        x = np.array([3.0, 4.0])
        # Run enough iterations to overflow the circular buffer, so that only
        # the most recent graph-execution traces are retained in
        # self.dump_root.
        for _ in range(self.circular_buffer_size // 2 + 1):
            self.assertAllClose(write_debug_trace(x),
                                [9.0 + np.sqrt(3.0), 16.0 + 2.0])

        with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
            # Check that the .metadata DebugEvents data file has been created, even
            # before FlushExecutionFiles() is called.
            self.assertGreater(reader.starting_wall_time(), 0)
            self.assertTrue(reader.tensorflow_version())
            self.assertTrue(
                reader.tfdbg_file_version().startswith("debug.Event"))

            graph_trace_iter = reader.graph_execution_traces_iterators()[0]
            # Before FlushExecutionFiles() is called, the .graph_execution_traces file
            # ought to be empty.
            with self.assertRaises(StopIteration):
                next(graph_trace_iter)

            # Flush the circular buffer.
            self.writer.FlushExecutionFiles()
            graph_trace_iter = reader.graph_execution_traces_iterators()[0]

            # The circular buffer has a size of 4, so only the data from the
            # last two iterations should have been written to self.dump_root.
            for _ in range(2):
                debug_event = next(graph_trace_iter).debug_event
                self.assertGreater(debug_event.wall_time, 0)
                trace = debug_event.graph_execution_trace
                self.assertEqual(trace.tfdbg_context_id, "deadbeaf")
                self.assertEqual(trace.op_name, "Square")
                self.assertEqual(trace.output_slot, 0)
                self.assertEqual(trace.tensor_debug_mode,
                                 debug_event_pb2.TensorDebugMode.FULL_TENSOR)
                tensor_value = tensor_util.MakeNdarray(trace.tensor_proto)
                self.assertAllClose(tensor_value, [9.0, 16.0])

                debug_event = next(graph_trace_iter).debug_event
                self.assertGreater(debug_event.wall_time, 0)
                trace = debug_event.graph_execution_trace
                self.assertEqual(trace.tfdbg_context_id, "beafdead")
                self.assertEqual(trace.op_name, "Sqrt")
                self.assertEqual(trace.output_slot, 0)
                self.assertEqual(trace.tensor_debug_mode,
                                 debug_event_pb2.TensorDebugMode.FULL_TENSOR)
                tensor_value = tensor_util.MakeNdarray(trace.tensor_proto)
                self.assertAllClose(tensor_value, [np.sqrt(3.0), 2.0])

            # The traces from earlier iterations were overwritten in the
            # circular buffer, so the iterator should now be exhausted.
            with self.assertRaises(StopIteration):
                next(graph_trace_iter)
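
The retention arithmetic can be checked in isolation. A sketch assuming a circular buffer size of 4 (the value the comments above give for self.circular_buffer_size): each call to write_debug_trace emits two traces (Square, then Sqrt), 4 // 2 + 1 == 3 calls emit six traces, and only the last four survive:

    import collections

    CIRCULAR_BUFFER_SIZE = 4  # Assumed value of self.circular_buffer_size.
    buffer = collections.deque(maxlen=CIRCULAR_BUFFER_SIZE)
    for call in range(CIRCULAR_BUFFER_SIZE // 2 + 1):
        buffer.append(("Square", call))
        buffer.append(("Sqrt", call))
    # Only the last two calls' traces remain, alternating Square and Sqrt.
    assert [op for op, _ in buffer] == ["Square", "Sqrt", "Square", "Sqrt"]
    assert [call for _, call in buffer] == [1, 1, 2, 2]
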
Example #7
    def _readAndCheckGraphsFile(self, stack_frame_by_id):
        """Read and verify the content of the .graphs debug-event file.

    Args:
      stack_frame_by_id: A dict mapping unique string IDs to stack frames.
        It is used by this method to look up stack frames.

    Returns:
      context_ids: IDs of op creation contexts (e.g., TensorFlow graphs), as a
        `list` of `str`s.
      op_types: Types of the ops that are created, as a `list` of `str`s with
        the same length as `context_ids`.
      op_name_to_op_type: An `OrderedDict` mapping op name to op type.
      op_name_to_context_id: A `dict` mapping op name to the ID of the innermost
        containing graph (context).
    """
        with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
            graphs_iter = reader.graphs_iterator()
            prev_wall_time = 0
            op_types = []
            op_name_to_op_type = collections.OrderedDict()
            # Maps op name to the ID of its innermost enclosing context.
            op_name_to_context_id = dict()
            context_ids = set()
            symbolic_tensor_ids = set()
            # Maps context ID to ID of directly enclosing context (`None` for
            # outermost contexts).
            context_id_to_outer_id = dict()

            for debug_event in graphs_iter:
                self.assertGreaterEqual(debug_event.wall_time, prev_wall_time)
                prev_wall_time = debug_event.wall_time
                # A DebugEvent in the .graphs file contains exactly one of two
                # fields:
                # - graph_op_creation for the creation of a symbolic op in a
                #   graph context.
                # - debugged_graph for information regarding the graph
                #   (context).
                if debug_event.graph_op_creation.ByteSize():
                    graph_op_creation = debug_event.graph_op_creation
                    self.assertTrue(graph_op_creation.op_type)
                    op_types.append(graph_op_creation.op_type)
                    self.assertTrue(graph_op_creation.op_name)
                    op_name_to_op_type[
                        graph_op_creation.op_name] = graph_op_creation.op_type
                    op_name_to_context_id[
                        graph_op_creation.op_name] = graph_op_creation.graph_id
                    self.assertTrue(graph_op_creation.graph_id)
                    context_ids.add(graph_op_creation.graph_id)
                    self.assertTrue(graph_op_creation.code_location)
                    if graph_op_creation.num_outputs:
                        self.assertLen(graph_op_creation.output_tensor_ids,
                                       graph_op_creation.num_outputs)
                        # Check that all symbolic tensor IDs are unique.
                        for tensor_id in graph_op_creation.output_tensor_ids:
                            self.assertNotIn(tensor_id, symbolic_tensor_ids)
                            symbolic_tensor_ids.add(tensor_id)
                    for stack_frame_id in graph_op_creation.code_location.stack_frame_ids:
                        self.assertIn(stack_frame_id, stack_frame_by_id)
                else:
                    debugged_graph = debug_event.debugged_graph
                    if debugged_graph.outer_context_id:
                        inner_id = debugged_graph.graph_id
                        outer_id = debugged_graph.outer_context_id
                        if inner_id in context_id_to_outer_id:
                            # The outer context of a given context must always
                            # be the same.
                            self.assertEqual(context_id_to_outer_id[inner_id],
                                             outer_id)
                        else:
                            context_id_to_outer_id[inner_id] = outer_id
                    else:
                        # This is an outermost context.
                        if debugged_graph.graph_id in context_id_to_outer_id:
                            self.assertIsNone(context_id_to_outer_id[
                                debugged_graph.graph_id])
                        else:
                            context_id_to_outer_id[
                                debugged_graph.graph_id] = None

            # If any graph is created, the graph context hierarchy must be populated.
            # In addition, the context of each graph op must be locatable within the
            # graph context hierarchy.
            for context_id in op_name_to_context_id.values():
                self.assertIn(context_id, context_id_to_outer_id)
            return context_ids, op_types, op_name_to_op_type, op_name_to_context_id
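
The context_id_to_outer_id map built above encodes the graph-context hierarchy, with None marking outermost contexts. A minimal sketch (with hypothetical context IDs) of resolving a context to its outermost enclosing graph, which is what makes every op's context "locatable" in the hierarchy:

    def outermost_context(context_id, context_id_to_outer_id):
        # Follow outer-context links until an outermost context (mapped to
        # None) is reached.
        while context_id_to_outer_id.get(context_id) is not None:
            context_id = context_id_to_outer_id[context_id]
        return context_id

    # Example: a while-loop body graph nested inside a tf.function graph.
    hierarchy = {"outer_graph": None, "while_body": "outer_graph"}
    assert outermost_context("while_body", hierarchy) == "outer_graph"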