def testReadingTwoFileSetsWithTheSameDumpRootSucceeds(self):
        """Two file sets with the same tfdbg_run_id can be merged and read."""
        # To simulate a multi-host data dump, we first generate file sets in two
        # different directories, with the same tfdbg_run_id, and then combine them.
        tfdbg_run_id = "foo"
        for i in range(2):
            writer = debug_events_writer.DebugEventsWriter(
                os.path.join(self.dump_root, str(i)),
                tfdbg_run_id,
                circular_buffer_size=-1)
            if i == 0:
                debugged_graph = debug_event_pb2.DebuggedGraph(
                    graph_id="graph1", graph_name="graph1")
                writer.WriteDebuggedGraph(debugged_graph)
                op_name = "Op_0"
                graph_op_creation = debug_event_pb2.GraphOpCreation(
                    op_type="FooOp", op_name=op_name, graph_id="graph1")
                writer.WriteGraphOpCreation(graph_op_creation)
                op_name = "Op_1"
                graph_op_creation = debug_event_pb2.GraphOpCreation(
                    op_type="FooOp", op_name=op_name, graph_id="graph1")
                writer.WriteGraphOpCreation(graph_op_creation)
            # File set 0 holds 10 traces of Op_0; file set 1 holds 10 of Op_1.
            for _ in range(10):
                trace = debug_event_pb2.GraphExecutionTrace(
                    op_name="Op_%d" % i, tfdbg_context_id="graph1")
                writer.WriteGraphExecutionTrace(trace)
                writer.FlushNonExecutionFiles()
                writer.FlushExecutionFiles()

        # Move all files from the subdirectory /1 to subdirectory /0.
        dump_root_0 = os.path.join(self.dump_root, "0")
        src_paths = glob.glob(os.path.join(self.dump_root, "1", "*"))
        for src_path in src_paths:
            dst_path = os.path.join(
                dump_root_0,
                # Rename the file set to avoid file name collision.
                re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1",
                       os.path.basename(src_path)))
            os.rename(src_path, dst_path)

        with debug_events_reader.DebugDataReader(dump_root_0) as reader:
            reader.update()
            # Verify the content of the .graph_execution_traces file.
            trace_digests = reader.graph_execution_traces(digest=True)
            self.assertLen(trace_digests, 20)
            # BUG FIX: these loops previously iterated with `for _ in range(10)`
            # but indexed `trace_digests` with the stale variable `i` left over
            # from the writer loop above, so only two fixed digests were read.
            for i in range(10):
                trace = reader.read_graph_execution_trace(trace_digests[i])
                self.assertEqual(trace.op_name, "Op_0")
            for i in range(10):
                trace = reader.read_graph_execution_trace(trace_digests[i + 10])
                self.assertEqual(trace.op_name, "Op_1")
  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Op callback for tracing (dumping) a TF program's execution.

    Writes a GraphOpCreation event when called during graph construction,
    or an Execution event when called under eager execution.
    """
    del attrs  # Unused

    writer = self.get_writer()
    if not graph:
      # Eager execution: dump the concrete output tensors directly.
      tensor_ids = [t._id for t in inputs]  # pylint:disable=protected-access
      writer.WriteExecution(
          self._dump_eager_tensors(outputs, op_type, tensor_ids))
      return None

    # Graph construction: record the op's creation in its enclosing context.
    context_id = self._get_context_id(graph)
    assert op_name is not None
    symbolic_output_ids = self._get_symbolic_tensor_ids(len(outputs))
    writer.WriteGraphOpCreation(
        debug_event_pb2.GraphOpCreation(
            op_type=op_type,
            op_name=op_name,
            graph_name=graph.name if hasattr(graph, "name") else None,
            graph_id=context_id,
            input_names=[input_tensor.name for input_tensor in inputs],
            num_outputs=len(outputs),
            output_tensor_ids=symbolic_output_ids,
            code_location=self._process_stack_frames()))
    if outputs and compat.as_bytes(
        op_type) not in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
      return self._instrument_symbolic_tensors(
          outputs, op_type, op_name, context_id, symbolic_output_ids)
Example #3
0
    def testWriteGraphOpCreationAndDebuggedGraphs(self):
        """Op-creation and debugged-graph events land in the .graphs file in order."""
        writer = debug_events_writer.DebugEventsWriter(self.dump_root)
        num_op_creations = 10
        for index in range(num_op_creations):
            op_creation = debug_event_pb2.GraphOpCreation(
                op_type="Conv2D", op_name="Conv2D_%d" % index)
            writer.WriteGraphOpCreation(op_creation)
        graph_proto = debug_event_pb2.DebuggedGraph(
            graph_id="deadbeaf", graph_name="MyGraph1")
        writer.WriteDebuggedGraph(graph_proto)
        writer.FlushNonExecutionFiles()

        # Exactly one .graphs file should exist, containing all events written
        # above, in order, with the DebuggedGraph event last.
        graphs_paths = glob.glob(os.path.join(self.dump_root, "*.graphs"))
        self.assertEqual(len(graphs_paths), 1)
        actuals = ReadDebugEvents(graphs_paths[0])
        self.assertEqual(len(actuals), num_op_creations + 1)
        for index in range(num_op_creations):
            self.assertEqual(actuals[index].graph_op_creation.op_type, "Conv2D")
            self.assertEqual(actuals[index].graph_op_creation.op_name,
                             "Conv2D_%d" % index)
        self.assertEqual(actuals[num_op_creations].debugged_graph.graph_id,
                         "deadbeaf")
  def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin,
                                           expected_end):
    """Reading traces with begin/end bounds yields exactly the expected slice."""
    writer = debug_events_writer.DebugEventsWriter(
        self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
    writer.WriteDebuggedGraph(
        debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1"))
    # Write five op creations, each followed by a matching execution trace.
    for idx in range(5):
      name = "Op_%d" % idx
      writer.WriteGraphOpCreation(
          debug_event_pb2.GraphOpCreation(op_name=name, graph_id="graph1"))
      writer.WriteGraphExecutionTrace(
          debug_event_pb2.GraphExecutionTrace(
              op_name=name, tfdbg_context_id="graph1"))
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()
    writer.Close()

    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
      reader.update()
      traces = reader.graph_execution_traces(begin=begin, end=end)
    self.assertLen(traces, expected_end - expected_begin)
    self.assertEqual(traces[0].op_name, "Op_%d" % expected_begin)
    self.assertEqual(traces[-1].op_name, "Op_%d" % (expected_end - 1))
 def WriteGraphOpCreation():
     """Write one op-creation event, drawing a unique op name under the lock."""
     with graph_op_state["lock"]:
         op_creation = debug_event_pb2.GraphOpCreation(
             op_name="Op%d" % graph_op_state["counter"])
         graph_op_state["counter"] += 1
     writer.WriteGraphOpCreation(op_creation)
     # More-frequent-than-necessary concurrent flushing is not recommended,
     # but tolerated.
     writer.FlushNonExecutionFiles()
Example #6
0
  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Op callback for tracing (dumping) a TF program's execution.

    Dispatches on `graph`: when set, records a GraphOpCreation event and
    possibly instruments the symbolic output tensors; when `None` (eager
    mode), records an Execution event for the concrete output tensors.

    Args:
      op_type: Type of the op (e.g., "Conv2D"). In eager mode this may also
        name a function, judging by `_func_graph_id_from_func_name(op_type)`
        below.
      inputs: Input tensors of the op.
      attrs: Attributes of the op (unused here).
      outputs: Output tensors of the op.
      op_name: Name of the op inside `graph`; may be overridden for
        Const/Placeholder ops (see below). Presumably `None` in eager mode.
      graph: The graph under construction, or `None` under eager execution.

    Returns:
      Possibly-instrumented replacement output tensors in graph mode;
      `None` when instrumentation is skipped or in eager mode.
    """
    del attrs  # Unused

    writer = self.get_writer()
    if graph:
      is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
      context_id = self._get_context_id(graph)  # Innermost context ID.
      output_tensor_ids = self._get_symbolic_tensor_ids(len(outputs))
      if op_type in ("Const", "Placeholder", "PlaceholderWithDefault"):
        # In some cases, the op name of a Const or Placeholder op in a graph
        # can be duplicate (e.g., `None` or "resource").
        # When this happens, we use the output tensor name to infer
        # the non-duplicated tensor name.
        op_name = outputs[0].name.split(":")[0]
      if is_v1_graph_mode:
        # V1 graph mode: chain each debug tensor associated with an input
        # placeholder as a control dependency of this op's first output, so
        # the debug op is guaranteed to run before consumers.
        for input_tensor in inputs:
          if input_tensor in self._placeholder_to_debug_tensor and outputs:
            outputs[0].op._add_control_input(  # pylint: disable=protected-access
                self._placeholder_to_debug_tensor[input_tensor].op)
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_type=op_type,
          op_name=op_name,
          graph_name=graph.name if hasattr(graph, "name") else None,
          graph_id=context_id,
          input_names=[
              self._lookup_tensor_name(input_tensor) for input_tensor in inputs
          ],
          num_outputs=len(outputs),
          output_tensor_ids=output_tensor_ids,
          code_location=self._process_stack_frames())
      writer.WriteGraphOpCreation(graph_op_creation)
      if outputs and compat.as_bytes(
          op_type) not in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
        return self._instrument_symbolic_tensors(
            outputs, op_type, op_name, context_id, output_tensor_ids)
    else:
      op_type_bytes = compat.as_bytes(op_type)
      if op_type_bytes == b"DebugNumericSummaryV2":
        # TODO(b/140334369): Remove this special casing logic once op_callback.
        # automatically prevents infinite recursion in eager mode.
        return None
      if op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
        return None
      context_id = self._func_graph_id_from_func_name(op_type)
      input_ids = [t._id for t in inputs]  # pylint:disable=protected-access
      # Register each output's device with the writer so the Execution proto
      # can refer to devices by ID.
      output_tensor_device_ids = [writer.RegisterDeviceAndGetId(output.device)
                                  for output in outputs] if outputs else []
      writer.WriteExecution(self._dump_eager_tensors(
          outputs, op_type, input_ids, output_tensor_device_ids,
          graph_id=context_id))
 def write_graph_execution_trace():
   """Write one op-creation plus matching trace, naming ops under the lock."""
   with graph_execution_trace_state["lock"]:
     name = "Op%d" % graph_execution_trace_state["counter"]
     graph_execution_trace_state["counter"] += 1
     op_creation = debug_event_pb2.GraphOpCreation(
         op_type="FooOp", op_name=name, graph_id="graph1")
     trace_proto = debug_event_pb2.GraphExecutionTrace(
         op_name=name, tfdbg_context_id="graph1")
   # The writes themselves happen outside the lock, as in the original.
   writer.WriteGraphOpCreation(op_creation)
   writer.WriteGraphExecutionTrace(trace_proto)
 def write_and_update_job():
   """Write op-creation/trace pairs and refresh the reader until told to stop."""
   while not writer_state["done"]:
     name = "Op%d" % writer_state["counter"]
     writer.WriteGraphOpCreation(
         debug_event_pb2.GraphOpCreation(
             op_type="FooOp", op_name=name, graph_id="graph1"))
     writer.WriteGraphExecutionTrace(
         debug_event_pb2.GraphExecutionTrace(
             op_name=name, tfdbg_context_id="graph1"))
     writer_state["counter"] += 1
     writer.FlushNonExecutionFiles()
     writer.FlushExecutionFiles()
     reader.update()
    def callback(self,
                 op_type,
                 inputs,
                 attrs,
                 outputs,
                 op_name=None,
                 graph=None):
        """Op callback for tracing (dumping) a TF program's execution.

        Writes a GraphOpCreation event when tracing graph construction, or
        an Execution event under eager execution (skipping debug ops to
        avoid recursion).
        """
        del attrs  # Unused

        writer = self.get_writer()
        if not graph:
            # Eager execution path.
            type_bytes = compat.as_bytes(op_type)
            if type_bytes == b"DebugNumericSummaryV2":
                # TODO(b/140334369): Remove this special casing logic once op_callback.
                # automatically prevents infinite recursion in eager mode.
                return None
            if type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
                return None
            func_graph_id = self._func_graph_id_from_func_name(op_type)
            tensor_ids = [t._id for t in inputs]  # pylint:disable=protected-access
            writer.WriteExecution(
                self._dump_eager_tensors(outputs,
                                         op_type,
                                         tensor_ids,
                                         graph_id=func_graph_id))
            return None

        # Graph-construction path.
        context_id = self._get_context_id(graph)  # Innermost context ID.
        assert op_name is not None
        symbolic_output_ids = self._get_symbolic_tensor_ids(len(outputs))
        writer.WriteGraphOpCreation(
            debug_event_pb2.GraphOpCreation(
                op_type=op_type,
                op_name=op_name,
                graph_name=graph.name if hasattr(graph, "name") else None,
                graph_id=context_id,
                input_names=[input_tensor.name for input_tensor in inputs],
                num_outputs=len(outputs),
                output_tensor_ids=symbolic_output_ids,
                code_location=self._process_stack_frames()))
        if outputs and compat.as_bytes(
                op_type) not in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
            return self._instrument_symbolic_tensors(
                outputs, op_type, op_name, context_id, symbolic_output_ids)
    def testConcurrentGraphExecutionTraceRandomReads(self):
        """Concurrent random-order reads of trace digests return correct traces."""
        circular_buffer_size = -1
        writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                       self.tfdbg_run_id,
                                                       circular_buffer_size)
        debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
                                                       graph_name="graph1")
        writer.WriteDebuggedGraph(debugged_graph)

        for i in range(100):
            op_name = "Op%d" % i
            graph_op_creation = debug_event_pb2.GraphOpCreation(
                op_type="FooOp", op_name=op_name, graph_id="graph1")
            writer.WriteGraphOpCreation(graph_op_creation)
            trace = debug_event_pb2.GraphExecutionTrace(
                op_name=op_name, tfdbg_context_id="graph1")
            writer.WriteGraphExecutionTrace(trace)
        writer.FlushNonExecutionFiles()
        writer.FlushExecutionFiles()

        # FIX: use the reader as a context manager (consistent with the other
        # tests in this file) so its resources are released even on failure;
        # previously the reader was never closed.
        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            reader.update()
            traces = [None] * 100

            def read_job_1():
                # Reads the first half of the digests, in reverse order.
                digests = reader.graph_execution_traces(digest=True)
                for i in range(49, -1, -1):
                    traces[i] = reader.read_graph_execution_trace(digests[i])

            def read_job_2():
                # Reads the second half of the digests, in reverse order.
                digests = reader.graph_execution_traces(digest=True)
                for i in range(99, 49, -1):
                    traces[i] = reader.read_graph_execution_trace(digests[i])

            thread_1 = threading.Thread(target=read_job_1)
            thread_2 = threading.Thread(target=read_job_2)
            thread_1.start()
            thread_2.start()
            thread_1.join()
            thread_2.join()
            for i in range(100):
                self.assertEqual(traces[i].op_name, "Op%d" % i)
  def testWriteGraphOpCreationAndDebuggedGraphs(self):
    """Graph events are read back from the graphs iterator in write order."""
    writer = debug_events_writer.DebugEventsWriter(self.dump_root)
    num_op_creations = 10
    for index in range(num_op_creations):
      writer.WriteGraphOpCreation(
          debug_event_pb2.GraphOpCreation(
              op_type="Conv2D", op_name="Conv2D_%d" % index))
    writer.WriteDebuggedGraph(
        debug_event_pb2.DebuggedGraph(
            graph_id="deadbeaf", graph_name="MyGraph1"))
    writer.FlushNonExecutionFiles()

    # All written events should come back in order, DebuggedGraph last.
    reader = debug_events_reader.DebugEventsReader(self.dump_root)
    actuals = list(reader.graphs_iterator())
    self.assertLen(actuals, num_op_creations + 1)
    for index in range(num_op_creations):
      self.assertEqual(actuals[index].graph_op_creation.op_type, "Conv2D")
      self.assertEqual(actuals[index].graph_op_creation.op_name,
                       "Conv2D_%d" % index)
    self.assertEqual(actuals[num_op_creations].debugged_graph.graph_id,
                     "deadbeaf")