def testMultipleEventsFromDifferentDevicesAndSameTensorName(self):
    # Registers three alerts for the same tensor name: one on the task:1
    # device and two on the task:0 device. The report is expected to hold one
    # row per device, with the task:0 row listed first, carrying timestamp
    # 1234 and the combined counts of both of its alerts.
    task0_device = "/job:worker/replica:0/task:0/gpu:0"
    task1_device = "/job:worker/replica:0/task:1/gpu:0"
    tensor = "xent/Log:0"

    alerts = (
        numerics_alert.NumericsAlert(task1_device, tensor, 1434, 0, 1, 1),
        numerics_alert.NumericsAlert(task0_device, tensor, 1234, 0, 1, 1),
        numerics_alert.NumericsAlert(task0_device, tensor, 1634, 1, 1, 1),
    )
    registry = numerics_alert.NumericsAlertRegistry()
    for alert in alerts:
        registry.register(alert)

    expected_rows = [
        numerics_alert.NumericsAlertReportRow(
            task0_device, tensor, 1234, 1, 2, 2
        ),
        numerics_alert.NumericsAlertReportRow(
            task1_device, tensor, 1434, 0, 1, 1
        ),
    ]
    self.assertEqual(expected_rows, registry.report())
def testAlertingEventCallback(self):
    # Checks that the stream handler calls numerics_alert_callback exactly
    # once for each value event carrying a NaN, -Inf or +Inf value, and does
    # not call it for events without such values.
    alert_callback = tf.test.mock.Mock()
    handler = debugger_server_lib.DebuggerDataStreamHandler(
        events_writer_manager=FakeEventsWriterManager(self.events_written),
        numerics_alert_callback=alert_callback,
    )
    handler.on_core_metadata_event(tf.Event())

    def send_numeric_summary(summary_values):
        # Feeds one DebugNumericSummary value event for "Add", slot 0.
        handler.on_value_event(
            self._create_event_with_float_tensor(
                "Add", 0, "DebugNumericSummary", summary_values
            )
        )

    def expected_alert(nan_count, neg_inf_count, pos_inf_count):
        # Builds the alert the callback is expected to receive.
        return numerics_alert.NumericsAlert(
            "/job:localhost/replica:0/task:0/cpu:0",
            "Add:0",
            0,
            nan_count,
            neg_inf_count,
            pos_inf_count,
        )

    # One good event followed by one with a NaN value: only the second one
    # should have triggered the callback.
    send_numeric_summary([0] * 14)
    send_numeric_summary([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    alert_callback.assert_called_once_with(expected_alert(1, 0, 0))

    # An event with a -Inf value.
    alert_callback.reset_mock()
    send_numeric_summary([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    alert_callback.assert_called_once_with(expected_alert(0, 1, 0))

    # An event with a +Inf value.
    alert_callback.reset_mock()
    send_numeric_summary([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
    alert_callback.assert_called_once_with(expected_alert(0, 0, 1))

    # An event without any flagged values must leave the callback untouched.
    alert_callback.reset_mock()
    send_numeric_summary([0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0])
    # assert_not_called is not available in Python 3.4.
    self.assertFalse(alert_callback.called)
def testAddAlertInReverseChronologicalOrder(self):
    # Adds the alert stamped 1234 before the one stamped 1220 and checks that
    # the history still reports 1220 as the first timestamp and 1234 as the
    # last, along with the expected per-key event counts.
    device = "/job:worker/replica:0/task:0/gpu:0"
    tensor = "xent/Log:0"
    history = numerics_alert.NumericsAlertHistory()

    later_alert = numerics_alert.NumericsAlert(device, tensor, 1234, 10, 0, 10)
    history.add(later_alert)
    earlier_alert = numerics_alert.NumericsAlert(
        device, tensor, 1220, 20, 20, 0
    )
    history.add(earlier_alert)

    self.assertEqual(1220, history.first_timestamp())
    self.assertEqual(1234, history.last_timestamp())

    expected_counts = (
        (constants.NAN_KEY, 2),
        (constants.NEG_INF_KEY, 1),
        (constants.POS_INF_KEY, 1),
    )
    for key, expected in expected_counts:
        self.assertEqual(expected, history.event_count(key))
def testConstructFromOneAlert(self):
    # A history holding a single alert should report that alert's timestamp
    # as both first and last, and the expected event count for each key.
    only_alert = numerics_alert.NumericsAlert(
        "/job:worker/replica:0/task:0/gpu:0", "xent/Log:0", 1234, 10, 0, 10
    )
    history = numerics_alert.NumericsAlertHistory()
    history.add(only_alert)

    self.assertEqual(1234, history.first_timestamp())
    self.assertEqual(1234, history.last_timestamp())

    expected_counts = (
        (constants.NAN_KEY, 1),
        (constants.NEG_INF_KEY, 0),
        (constants.POS_INF_KEY, 1),
    )
    for key, expected in expected_counts:
        self.assertEqual(expected, history.event_count(key))
def testMultipleEventsFromSameDeviceAndSameTensor(self):
    # Two alerts for the same device and tensor are expected to collapse into
    # a single report row carrying timestamp 1234.
    device = "/job:worker/replica:0/task:0/gpu:0"
    tensor = "xent/Log:0"

    alerts = (
        numerics_alert.NumericsAlert(device, tensor, 1234, 0, 10, 10),
        numerics_alert.NumericsAlert(device, tensor, 1634, 5, 5, 5),
    )
    registry = numerics_alert.NumericsAlertRegistry()
    for alert in alerts:
        registry.register(alert)

    expected_rows = [
        numerics_alert.NumericsAlertReportRow(device, tensor, 1234, 1, 2, 2)
    ]
    self.assertEqual(expected_rows, registry.report())
def testFilterReport(self):
    # Registers alerts spanning two devices and two tensor names, then checks
    # that report() narrows its rows when given a device-name filter or a
    # tensor-name filter.
    task0_device = "/job:worker/replica:0/task:0/gpu:0"
    task1_device = "/job:worker/replica:0/task:1/gpu:0"

    alerts = (
        numerics_alert.NumericsAlert(
            task1_device, "xent/Log:0", 1434, 0, 1, 1
        ),
        numerics_alert.NumericsAlert(
            task0_device, "xent/Log:0", 1234, 0, 1, 1
        ),
        numerics_alert.NumericsAlert(
            task0_device, "xent/Mean:0", 1634, 1, 1, 1
        ),
    )
    registry = numerics_alert.NumericsAlertRegistry()
    for alert in alerts:
        registry.register(alert)

    # Only the task:1 device matches the device-name pattern.
    rows_for_task1 = [
        numerics_alert.NumericsAlertReportRow(
            task1_device, "xent/Log:0", 1434, 0, 1, 1
        )
    ]
    self.assertEqual(
        rows_for_task1,
        registry.report(device_name_filter=r".*\/task:1\/.*"),
    )

    # Only the "xent/Mean:0" tensor matches the tensor-name pattern.
    rows_for_mean = [
        numerics_alert.NumericsAlertReportRow(
            task0_device, "xent/Mean:0", 1634, 1, 1, 1
        )
    ]
    self.assertEqual(
        rows_for_mean,
        registry.report(tensor_name_filter=r".*Mean.*"),
    )
def testCreateJsonableRegistry(self):
    # A registry with one alert should produce exactly one JSON-able entry
    # exposing the alert's device, tensor, and a three-element history list
    # under each of the "nan", "neg_inf" and "pos_inf" keys.
    device = "/job:worker/replica:0/task:1/gpu:0"
    tensor = "xent/Log:0"

    registry = numerics_alert.NumericsAlertRegistry()
    registry.register(
        numerics_alert.NumericsAlert(device, tensor, 1434, 0, 1, 1)
    )

    entries = registry.create_jsonable_registry()
    self.assertEqual(1, len(entries))

    entry = entries[0]
    self.assertEqual(device, entry.device)
    self.assertEqual(tensor, entry.tensor)

    expected_histories = (
        ("nan", [0, -1, -1]),
        ("neg_inf", [1, 1434, 1434]),
        ("pos_inf", [1, 1434, 1434]),
    )
    for key, expected in expected_histories:
        self.assertListEqual(expected, list(entry.jsonable_history[key]))
def testSingleAlert(self):
    # A registry holding one alert should report exactly one row for that
    # alert's device and tensor.
    device = "/job:worker/replica:0/task:0/gpu:0"
    tensor = "xent/Log:0"

    single_alert = numerics_alert.NumericsAlert(
        device, tensor, 1234, 0, 10, 10
    )
    registry = numerics_alert.NumericsAlertRegistry()
    registry.register(single_alert)

    expected_rows = [
        numerics_alert.NumericsAlertReportRow(device, tensor, 1234, 0, 1, 1)
    ]
    self.assertEqual(expected_rows, registry.report())
def testRegisterBeyondCapacityObeysCapacity(self):
    # With capacity=2, the registry is expected to keep reporting only the
    # first two device/tensor pairs: an alert for a third device leaves the
    # report unchanged, while a further alert for an already-tracked device
    # still updates that device's row.
    task0_device = "/job:worker/replica:0/task:0/gpu:0"
    task1_device = "/job:worker/replica:0/task:1/gpu:0"
    task2_device = "/job:worker/replica:0/task:2/gpu:0"
    tensor = "xent/Log:0"

    task1_alert = numerics_alert.NumericsAlert(
        task1_device, tensor, 1434, 0, 1, 1
    )
    task0_alert = numerics_alert.NumericsAlert(
        task0_device, tensor, 1234, 0, 1, 1
    )
    task2_alert = numerics_alert.NumericsAlert(
        task2_device, tensor, 1634, 0, 1, 1
    )
    task0_followup_alert = numerics_alert.NumericsAlert(
        task0_device, tensor, 1834, 1, 1, 1
    )
    registry = numerics_alert.NumericsAlertRegistry(capacity=2)

    def assert_report(task0_counts):
        # The task:1 row is the same in every check; only the task:0 counts
        # differ between checks.
        self.assertEqual(
            [
                numerics_alert.NumericsAlertReportRow(
                    task0_device, tensor, 1234, *task0_counts
                ),
                numerics_alert.NumericsAlertReportRow(
                    task1_device, tensor, 1434, 0, 1, 1
                ),
            ],
            registry.report(),
        )

    # Fill the registry up to its capacity.
    registry.register(task1_alert)
    registry.register(task0_alert)
    assert_report((0, 1, 1))

    # A third device does not appear in the report.
    registry.register(task2_alert)
    assert_report((0, 1, 1))

    # A new alert for the already-tracked task:0 device is still counted.
    registry.register(task0_followup_alert)
    assert_report((1, 2, 2))