Ejemplo n.º 1
0
    def on_value_event(self, event):
        """Records a health-pill event received from the debugger.

        The event is dropped without being written when any of these holds:
          * its summary carries no value;
          * its watch key does not end with
            constants.DEBUG_NUMERIC_SUMMARY_SUFFIX;
          * its tensor is not 1-dimensional with at least
            constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH elements;
          * the watch key, minus the suffix, does not look like
            "<node name>:<output slot>".

        Otherwise the event's step is filled in, the event is handed to the
        events writer manager, and the numerics-alert callback (if one is set)
        is invoked when an alert can be extracted from the event.

        Args:
          event: The Event proto to be processed. Its `step` field is
            overwritten by this method.
        """
        if not event.summary.value:
            logger.warning("The summary of the event lacks a value.")
            return

        first_value = event.summary.value[0]

        # The node name property is actually a watch key, which is a
        # concatenation of several pieces of data.
        watch_key = first_value.node_name
        suffix = constants.DEBUG_NUMERIC_SUMMARY_SUFFIX
        if not watch_key.endswith(suffix):
            # Ignore events that lack a DebugNumericSummary.
            # NOTE(@chihuahua): We may later handle other types of debug ops.
            return

        # The suffix is stripped because it is not distinguishing: every health
        # pill entry ends with it.
        node_name_and_output_slot = watch_key[:-len(suffix)]

        shape = tensor_util.make_ndarray(first_value.tensor).shape
        if (len(shape) != 1 or
                shape[0] < constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH):
            # `shape` is a tuple, so it must be passed as a logging argument:
            # "%s" % shape would unpack it and raise TypeError for every shape
            # whose length is not exactly 1 -- precisely the case reported here.
            logger.warning(
                "Health-pill tensor either lacks a dimension or is "
                "shaped incorrectly: %s",
                shape,
            )
            return

        if not re.match(r"^(.*):(\d+)$", node_name_and_output_slot):
            logger.warning(
                ("A event with a health pill has an invalid node name and output "
                 "slot combination, (i.e., an unexpected debug op): %r"),
                node_name_and_output_slot,
            )
            return

        if self._session_run_index >= 0:
            event.step = self._session_run_index
        else:
            # Data from parameter servers (or any graphs without a master) do
            # not contain core metadata. So the session run count is missing.
            # Set its value to a microsecond epoch timestamp.
            event.step = int(time.time() * 1e6)

        # Write this event to the events file designated for data from the
        # debugger.
        self._events_writer_manager.write_event(event)

        alert = numerics_alert.extract_numerics_alert(event)
        if self._numerics_alert_callback and alert:
            self._numerics_alert_callback(alert)
Ejemplo n.º 2
0
  def on_value_event(self, event):
    """Records a health-pill event received from the debugger.

    The event is dropped without being written when any of these holds:
      * its summary carries no value;
      * its watch key does not end with constants.DEBUG_NUMERIC_SUMMARY_SUFFIX;
      * its tensor is not 1-dimensional with at least
        constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH elements;
      * the watch key, minus the suffix, does not look like
        "<node name>:<output slot>".

    Otherwise the event's step is filled in, the event is handed to the events
    writer manager, and the numerics-alert callback (if one is set) is invoked
    when an alert can be extracted from the event.

    Args:
      event: The Event proto to be processed. Its `step` field is overwritten
        by this method.
    """
    if not event.summary.value:
      # Use `warning` like the other log calls in this method.
      tf.logging.warning("The summary of the event lacks a value.")
      return

    first_value = event.summary.value[0]

    # The node name property is actually a watch key, which is a concatenation
    # of several pieces of data.
    watch_key = first_value.node_name
    suffix = constants.DEBUG_NUMERIC_SUMMARY_SUFFIX
    if not watch_key.endswith(suffix):
      # Ignore events that lack a DebugNumericSummary.
      # NOTE(@chihuahua): We may later handle other types of debug ops.
      return

    # The suffix is stripped because it is not distinguishing: every health
    # pill entry ends with it.
    node_name_and_output_slot = watch_key[:-len(suffix)]

    shape = tf.make_ndarray(first_value.tensor).shape
    if (len(shape) != 1 or
        shape[0] < constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH):
      # `shape` is a tuple, so it must be passed as a logging argument:
      # "%s" % shape would unpack it and raise TypeError for every shape whose
      # length is not exactly 1 -- precisely the case reported here.
      tf.logging.warning(
          "Health-pill tensor either lacks a dimension or is "
          "shaped incorrectly: %s",
          shape)
      return

    if not re.match(r"^(.*):(\d+)$", node_name_and_output_slot):
      tf.logging.warning(
          ("A event with a health pill has an invalid node name and output "
           "slot combination, (i.e., an unexpected debug op): %r"),
          node_name_and_output_slot)
      return

    if self._session_run_index >= 0:
      event.step = self._session_run_index
    else:
      # Data from parameter servers (or any graphs without a master) do not
      # contain core metadata. So the session run count is missing. Set its
      # value to a microsecond epoch timestamp.
      event.step = int(time.time() * 1e6)

    # Write this event to the events file designated for data from the
    # debugger.
    self._events_writer_manager.write_event(event)

    alert = numerics_alert.extract_numerics_alert(event)
    if self._numerics_alert_callback and alert:
      self._numerics_alert_callback(alert)