コード例 #1
0
    def testLimitingInfNanMonitorAlertCountWorks(self):
        """Verifies that `limit=3` caps how many alerts the monitor records.

        Ten CURT_HEALTH executions, each flagging inf/nan on output slot 1, are
        reported to the monitor; alerts are expected for only the first three.
        """
        reader = test.mock.MagicMock()
        inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader, limit=3)
        curt_health = debug_event_pb2.TensorDebugMode.CURT_HEALTH
        for execution_index in range(10):
            digest = debug_events_reader.ExecutionDigest(
                execution_index * 1000,
                1,
                "FooOp",
                output_tensor_device_ids=[0, 1])
            # Each row is [tensor_id, any_inf_nan]; only the second output is
            # flagged.
            flagged_second_output = [[-1, 0], [-1, 1]]
            inf_nan_monitor.on_execution(
                execution_index,
                debug_events_reader.Execution(
                    digest,
                    "worker01",
                    ["a1", "b2", "e3"],
                    curt_health,
                    graph_id=None,
                    input_tensor_ids=[12, 34],
                    output_tensor_ids=[56, 78],
                    debug_tensor_values=flagged_second_output))

        recorded_alerts = inf_nan_monitor.alerts()
        self.assertLen(recorded_alerts, 3)
        for position, alert in enumerate(recorded_alerts):
            # The surviving alerts must correspond to executions 0, 1 and 2.
            self.assertEqual(alert.execution_index, position)
            self.assertEqual(alert.wall_time, position * 1000)
            self.assertEqual(alert.op_type, "FooOp")
            self.assertEqual(alert.output_slot, 1)
            self.assertIsNone(alert.graph_execution_trace_index)
            # CURT_HEALTH mode carries no element counts, by design.
            self.assertIsNone(alert.size)
            self.assertIsNone(alert.num_neg_inf)
            self.assertIsNone(alert.num_pos_inf)
            self.assertIsNone(alert.num_nan)
コード例 #2
0
    def testInfNanMonitorOnExecutionUnderHealthMode(self, tensor_debug_mode,
                                                    debug_tensor_values):
        """Checks the single alert produced for one eager execution.

        Args:
          tensor_debug_mode: Debug mode forwarded to the `Execution` under test.
          debug_tensor_values: Debug tensor values forwarded to the `Execution`.

        NOTE(review): both arguments are presumably supplied by a parameterized
        decorator that is not visible in this snippet -- confirm. The expected
        alert is output slot 0 with size 10, 1 -inf, 2 +inf and 3 nan.
        """
        reader = test.mock.MagicMock()
        inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader)
        digest = debug_events_reader.ExecutionDigest(
            1234, 1, "BarOp", output_tensor_device_ids=[0, 1])
        bar_execution = debug_events_reader.Execution(
            digest,
            "worker01",
            ["a1", "b2", "e3"],
            tensor_debug_mode,
            graph_id=None,
            input_tensor_ids=[12, 34],
            output_tensor_ids=[56, 78],
            debug_tensor_values=debug_tensor_values)

        inf_nan_monitor.on_execution(60, bar_execution)

        self.assertLen(inf_nan_monitor.alerts(), 1)
        only_alert = inf_nan_monitor.alerts()[0]
        # Where the alert came from.
        self.assertEqual(only_alert.execution_index, 60)
        self.assertIsNone(only_alert.graph_execution_trace_index)
        self.assertEqual(only_alert.wall_time, 1234)
        self.assertEqual(only_alert.op_type, "BarOp")
        self.assertEqual(only_alert.output_slot, 0)
        # Element counts reported for the offending tensor.
        self.assertEqual(only_alert.size, 10)
        self.assertEqual(only_alert.num_neg_inf, 1)
        self.assertEqual(only_alert.num_pos_inf, 2)
        self.assertEqual(only_alert.num_nan, 3)
コード例 #3
0
    def testInfNanMonitorOnExecutionUnderCurtHealthMode(self):
        """Checks the alert raised for a CURT_HEALTH execution.

        The second output is flagged as containing inf/nan, so exactly one
        alert is expected, on output slot 1 and without any element counts.
        """
        reader = test.mock.MagicMock()
        inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader)
        digest = debug_events_reader.ExecutionDigest(
            1234, 1, "FooOp", output_tensor_device_ids=[0, 1])
        # Each row is [tensor_id, any_inf_nan].
        curt_health_values = [[-1, 0], [-1, 1]]
        foo_execution = debug_events_reader.Execution(
            digest,
            "worker01",
            ["a1", "b2", "e3"],
            debug_event_pb2.TensorDebugMode.CURT_HEALTH,
            graph_id=None,
            input_tensor_ids=[12, 34],
            output_tensor_ids=[56, 78],
            debug_tensor_values=curt_health_values)

        inf_nan_monitor.on_execution(50, foo_execution)

        self.assertLen(inf_nan_monitor.alerts(), 1)
        only_alert = inf_nan_monitor.alerts()[0]
        self.assertEqual(only_alert.execution_index, 50)
        self.assertIsNone(only_alert.graph_execution_trace_index)
        self.assertEqual(only_alert.wall_time, 1234)
        self.assertEqual(only_alert.op_type, "FooOp")
        self.assertEqual(only_alert.output_slot, 1)
        # CURT_HEALTH mode carries no element counts, by design.
        self.assertIsNone(only_alert.size)
        self.assertIsNone(only_alert.num_neg_inf)
        self.assertIsNone(only_alert.num_pos_inf)
        self.assertIsNone(only_alert.num_nan)
コード例 #4
0
 def testExecutionNoGraphNoInputButWithOutputToJson(self):
   """Checks `Execution.to_json()` when output-related fields are None.

   NOTE(review): despite the method name, this fixture sets a graph ID and
   input tensor IDs while leaving the output tensor IDs and debug tensor
   values as None -- confirm whether the name or the fixture is intended.
   """
   digest = debug_events_reader.ExecutionDigest(
       1234, 5678, "FooOp", output_tensor_device_ids=[1357])
   full_health = debug_event_pb2.TensorDebugMode.FULL_HEALTH
   execution = debug_events_reader.Execution(
       digest,
       "localhost",
       ("a1", "b2"),
       full_health,
       graph_id="abcd",
       input_tensor_ids=[13, 37],
       output_tensor_ids=None,
       debug_tensor_values=None)

   # Named `execution_json` so the local does not shadow the `json` module.
   execution_json = execution.to_json()
   self.jsonRoundTripCheck(execution_json)

   # Fields inherited from the digest.
   self.assertEqual(execution_json["wall_time"], 1234)
   self.assertEqual(execution_json["op_type"], "FooOp")
   self.assertEqual(execution_json["output_tensor_device_ids"], (1357,))
   # Fields specific to the full Execution record.
   self.assertEqual(execution_json["host_name"], "localhost")
   self.assertEqual(execution_json["stack_frame_ids"], ("a1", "b2"))
   self.assertEqual(execution_json["tensor_debug_mode"], full_health)
   self.assertEqual(execution_json["graph_id"], "abcd")
   self.assertEqual(execution_json["input_tensor_ids"], (13, 37))
   # Values passed as None must stay None in the JSON form.
   self.assertIsNone(execution_json["output_tensor_ids"])
   self.assertIsNone(execution_json["debug_tensor_values"])
コード例 #5
0
 def testExecutionDigestWithTwoOutputsToJson(self):
   """Checks `ExecutionDigest.to_json()` for a digest with two output devices."""
   two_output_digest = debug_events_reader.ExecutionDigest(
       1234, 5678, "FooOp", output_tensor_device_ids=[1357, 2468])

   # Named `digest_json` so the local does not shadow the `json` module.
   digest_json = two_output_digest.to_json()
   self.jsonRoundTripCheck(digest_json)

   self.assertEqual(digest_json["wall_time"], 1234)
   self.assertEqual(digest_json["op_type"], "FooOp")
   # The device IDs were passed as a list and are compared against a tuple.
   self.assertEqual(digest_json["output_tensor_device_ids"], (1357, 2468))
コード例 #6
0
 def testExecutionWithNoOutputTensorsReturnsZeroForNumOutputs(
     self, output_tensor_ids):
   """Checks that `Execution.num_outputs` is 0 for the given output IDs.

   Args:
     output_tensor_ids: Value forwarded as the execution's output tensor IDs.
       NOTE(review): presumably supplied by a parameterized decorator that is
       not visible in this snippet -- confirm.
   """
   digest = debug_events_reader.ExecutionDigest(1234, 5678, "FooOp")
   execution_without_outputs = debug_events_reader.Execution(
       digest,
       "localhost",
       ("a1", "b2"),
       debug_event_pb2.TensorDebugMode.FULL_HEALTH,
       graph_id="abcd",
       input_tensor_ids=[13, 37],
       output_tensor_ids=output_tensor_ids,
       debug_tensor_values=None)

   self.assertEqual(execution_without_outputs.num_outputs, 0)
コード例 #7
0
    def testInfNanMonitorOnExecutionUnderModeWithNoInfNanInfo(
            self, tensor_debug_mode, debug_tensor_values):
        """Expects no alerts for the given debug mode and tensor values.

        Args:
          tensor_debug_mode: Debug mode forwarded to the `Execution` under test.
          debug_tensor_values: Debug tensor values forwarded to the `Execution`.

        NOTE(review): both arguments are presumably supplied by a parameterized
        decorator that is not visible in this snippet -- confirm.
        """
        reader = test.mock.MagicMock()
        inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader)
        digest = debug_events_reader.ExecutionDigest(
            1234, 1, "BarOp", output_tensor_device_ids=[0, 1])
        bar_execution = debug_events_reader.Execution(
            digest,
            "worker01",
            ["a1", "b2", "e3"],
            tensor_debug_mode,
            graph_id=None,
            input_tensor_ids=[12, 34],
            output_tensor_ids=[56, 78],
            debug_tensor_values=debug_tensor_values)

        inf_nan_monitor.on_execution(60, bar_execution)

        self.assertEmpty(inf_nan_monitor.alerts())
コード例 #8
0
    def testInfNanMonitorOnExecutionUnderFullTensorModeWorks(
            self, tensor_value, dtype, expected_size, expected_num_neg_inf,
            expected_num_pos_inf, expected_num_nan):
        """Checks inf/nan alerting for a FULL_TENSOR execution.

        The mocked reader returns two output tensors: a finite one and one
        built from `tensor_value`/`dtype`. An alert on output slot 1 is
        expected only when any of the expected inf/nan counts is non-zero.

        Args:
          tensor_value: Contents of the second output tensor.
          dtype: NumPy dtype used to build the second output tensor.
          expected_size: Expected `size` field of the alert.
          expected_num_neg_inf: Expected `num_neg_inf` field of the alert.
          expected_num_pos_inf: Expected `num_pos_inf` field of the alert.
          expected_num_nan: Expected `num_nan` field of the alert.

        NOTE(review): the arguments are presumably supplied by a parameterized
        decorator that is not visible in this snippet -- confirm.
        """
        mock_reader = test.mock.MagicMock()
        # First output contains only finite values; the second is the tensor
        # under test.
        mock_reader.execution_to_tensor_values.return_value = [
            np.array([[0.0, -1.0, 1.0]]),
            np.array(tensor_value, dtype=dtype)
        ]
        monitor = debug_events_monitors.InfNanMonitor(mock_reader)
        execution_digest = debug_events_reader.ExecutionDigest(
            1234,
            1,
            "__inference_bar_function_1234",
            output_tensor_device_ids=[0, 1])
        execution = debug_events_reader.Execution(
            execution_digest,
            "worker01", ["a1", "b2", "e3"],
            debug_event_pb2.TensorDebugMode.FULL_TENSOR,
            graph_id=None,
            input_tensor_ids=[12, 34],
            output_tensor_ids=[56, 78])
        monitor.on_execution(70, execution)

        if expected_num_neg_inf or expected_num_pos_inf or expected_num_nan:
            self.assertLen(monitor.alerts(), 1)
            alert = monitor.alerts()[0]
            self.assertEqual(alert.wall_time, 1234)
            self.assertEqual(alert.op_type, "__inference_bar_function_1234")
            self.assertEqual(alert.output_slot, 1)
            self.assertEqual(alert.size, expected_size)
            self.assertEqual(alert.num_neg_inf, expected_num_neg_inf)
            self.assertEqual(alert.num_pos_inf, expected_num_pos_inf)
            self.assertEqual(alert.num_nan, expected_num_nan)
            self.assertEqual(alert.execution_index, 70)
            # assertIsNone takes (obj, msg); the stray `70` that used to be
            # passed here was only ever a failure message, so it is dropped.
            self.assertIsNone(alert.graph_execution_trace_index)
        else:
            self.assertEmpty(monitor.alerts())