Beispiel #1
0
def profile(graph,
            run_meta=None,
            op_log=None,
            cmd='scope',
            options=_DEFAULT_PROFILE_OPTIONS):
    """Run the profiler on `graph` and return the resulting proto.

    https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md

    Args:
      graph: tf.Graph.
      run_meta: tensorflow::RunMetadata proto. When provided, also shows valid
        timing and memory information when the 'select' option contains
        'micros' and 'bytes'.
      op_log: tensorflow::tfprof::OpLog proto. Users can use this proto to
        group ops together and use an op_type to select the group.
      cmd: string. One of 'op', 'scope', 'graph', 'code'.
        'op' organizes output by operation type (e.g. MatMul).
        'scope' organizes output by graph node name scope.
        'graph' organizes output by graph node inputs/outputs.
        'code' organizes output by Python call stack.
      options: A dict of options. See core/profiler/g3doc/options.md.

    Returns:
      A TFGraphNodeProto when cmd is 'scope' or 'graph'; a
      TFMultiGraphNodeProto when cmd is 'op' or 'code'.
      Side effect: stdout/file/timeline.json depending on options['output'].

    Raises:
      InvalidArgumentError: if `cmd` is not one of the four known views.
    """
    if options == _DEFAULT_PROFILE_OPTIONS:
        options = TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()

    # pylint: disable=protected-access
    merged_log = tfprof_logger._merge_default_with_oplog(
        graph, op_log, run_meta, add_trace=(cmd == 'code'))
    # pylint: enable=protected-access

    built_options = _build_options(options)
    serialized_meta = b'' if not run_meta else run_meta.SerializeToString()

    # Only the container proto differs between the views; the analysis call
    # itself is the same for all of them.
    if cmd in ('code', 'op'):
        result = tfprof_output_pb2.TFMultiGraphNodeProto()
    elif cmd in ('graph', 'scope'):
        result = tfprof_output_pb2.TFGraphNodeProto()
    else:
        raise errors.InvalidArgumentError(None, None,
                                          'unknown cmd: %s\n' % cmd)

    result.ParseFromString(
        print_mdl.PrintModelAnalysis(
            graph.as_graph_def(add_shapes=True).SerializeToString(),
            serialized_meta, merged_log.SerializeToString(),
            cmd.encode('utf-8'), built_options.SerializeToString()))
    return result
Beispiel #2
0
def _assert_static(condition, data):
    """Raises a InvalidArgumentError with as much information as possible."""
    if not condition:
        data_static = [_maybe_constant_value_string(x) for x in data]
        raise errors.InvalidArgumentError(node_def=None,
                                          op=None,
                                          message='\n'.join(data_static))
Beispiel #3
0
def max_spanning_tree_gradient(mst_op, d_loss_d_max_scores, *_):
  """Returns a subgradient of the MaximumSpanningTree op.

  Note that MaximumSpanningTree is only differentiable w.r.t. its |scores| input
  and its |max_scores| output.

  Args:
    mst_op: The MaximumSpanningTree op being differentiated.
    d_loss_d_max_scores: [B] vector where entry b is the gradient of the network
      loss w.r.t. entry b of the |max_scores| output of the |mst_op|.
    *_: The gradients w.r.t. the other outputs; ignored.

  Returns:
    1. None, since the op is not differentiable w.r.t. its |num_nodes| input.
    2. [B,M,M] tensor where entry b,t,s is a subgradient of the network loss
       w.r.t. entry b,t,s of the |scores| input, with the same dtype as
       |d_loss_d_max_scores|.

  Raises:
    InvalidArgumentError: if the base dtype of |d_loss_d_max_scores| is None.
  """
  dtype = d_loss_d_max_scores.dtype.base_dtype
  if dtype is None:
    # InvalidArgumentError takes (node_def, op, message); passing only the
    # message would fail with a TypeError instead of the intended error.
    raise errors.InvalidArgumentError(
        None, None, "Expected (%s) is not None" % dtype)

  argmax_sources_bxm = mst_op.outputs[1]
  input_dim = array_ops.shape(argmax_sources_bxm)[1]  # M in the docstring

  # The one-hot argmax is a subgradient of max.  Convert the batch of maximal
  # spanning trees into 0/1 indicators, then scale them by the relevant output
  # gradients from |d_loss_d_max_scores|.  Note that |d_loss_d_max_scores| must
  # be reshaped in order for it to broadcast across the batch dimension.
  indicators_bxmxm = standard_ops.one_hot(
      argmax_sources_bxm, input_dim, dtype=dtype)
  d_loss_d_max_scores_bx1 = array_ops.expand_dims(d_loss_d_max_scores, -1)
  d_loss_d_max_scores_bx1x1 = array_ops.expand_dims(d_loss_d_max_scores_bx1, -1)
  d_loss_d_scores_bxmxm = indicators_bxmxm * d_loss_d_max_scores_bx1x1
  return None, d_loss_d_scores_bxmxm
  def most_specific_common_supertype(
      self, others: Sequence[trace.TraceType]) -> Optional[trace.TraceType]:
    """Validates `others` and reports that no common supertype exists.

    Args:
      others: A non-empty sequence of trace types to compare against.

    Returns:
      Always None: this implementation never produces a common supertype.

    Raises:
      InvalidArgumentError: if `others` is empty.
    """
    if not others:
      # InvalidArgumentError takes (node_def, op, message); passing only the
      # message would fail with a TypeError instead of the intended error.
      raise errors.InvalidArgumentError(
          None, None,
          "Argument `others` to function `most_specific_common_supertype` must "
          "be a non-empty Sequence.")

    return None
 def _assert_self_adjoint(self):
   """Groups the self-adjoint assertions of every Kronecker factor.

   Raises InvalidArgumentError when any factor does not report `is_square`.
   """
   if not all(factor.is_square for factor in self.operators):
     raise errors.InvalidArgumentError(
         node_def=None, op=None, message="All Kronecker factors must be "
         "square for the product to be self adjoint.")
   factor_checks = [factor.assert_self_adjoint() for factor in self.operators]
   return control_flow_ops.group(factor_checks)
Beispiel #6
0
def print_model_analysis(graph,
                         run_meta=None,
                         op_log=None,
                         tfprof_cmd='scope',
                         tfprof_options=TRAINABLE_VARS_PARAMS_STAT_OPTIONS):
    """Run tfprof on `graph` and return the resulting statistics proto.

    See go/tfprof or README for examples and tutorials.
    Run tfprof tool for help:
    'bazel run third_party/tensorflow/tools/tfprof help'

    Args:
      graph: tf.Graph.
      run_meta: tensorflow::RunMetadata proto. When provided, also shows valid
        timing and memory information when the 'select' option contains
        'micros' and 'bytes'.
      op_log: tensorflow::tfprof::OpLog proto. Users can use this proto to
        group ops together and use an op_type to select the group.
      tfprof_cmd: string. One of 'op', 'scope', 'graph', 'code'.
        'op' organizes output by operation type (e.g. MatMul).
        'scope' organizes output by graph node name scope.
        'graph' organizes output by graph node inputs/outputs.
        'code' organizes output by Python call stack.
      tfprof_options: See 'tfprof help' for details.

    Returns:
      A TFGraphNodeProto when tfprof_cmd is 'scope' or 'graph'; a
      TFMultiGraphNodeProto when tfprof_cmd is 'op' or 'code'.
      Side effect: stdout/file/timeline.json depending on
      tfprof_options['output'].

    Raises:
      InvalidArgumentError: if `tfprof_cmd` is not one of the known views.
    """
    # pylint: disable=protected-access
    merged_log = tfprof_logger._merge_default_with_oplog(
        graph, op_log, run_meta, add_trace=(tfprof_cmd == 'code'))
    # pylint: enable=protected-access

    built_options = _build_options(tfprof_options)
    serialized_meta = b'' if not run_meta else run_meta.SerializeToString()

    # Pick the container proto for the requested view; the analysis call that
    # fills it is shared by every view.
    if tfprof_cmd in ('code', 'op'):
        result = tfprof_output_pb2.TFMultiGraphNodeProto()
    elif tfprof_cmd in ('graph', 'scope'):
        result = tfprof_output_pb2.TFGraphNodeProto()
    else:
        raise errors.InvalidArgumentError(
            None, None, 'unknown tfprof_cmd: %s\n' % tfprof_cmd)

    result.ParseFromString(
        print_mdl.PrintModelAnalysis(
            graph.as_graph_def(add_shapes=True).SerializeToString(),
            serialized_meta, merged_log.SerializeToString(),
            tfprof_cmd.encode('utf-8'), built_options.SerializeToString()))
    return result
Beispiel #7
0
 def __init__(self, name, mode):
     """Stores `name` and `mode` and derives the read/write permission flags.

     Raises InvalidArgumentError when `mode` is not one of
     'r', 'w', 'a', 'r+', 'w+', 'a+'.
     """
     self.__name = name
     self.__mode = mode
     self._read_buf = None

     accepted_modes = ("r", "w", "a", "r+", "w+", "a+")
     if mode not in accepted_modes:
         raise errors.InvalidArgumentError(
             None, None,
             "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")

     readable_modes = ("r", "r+", "a+", "w+")
     writable_modes = ("a", "w", "r+", "a+", "w+")
     self._read_check_passed = mode in readable_modes
     self._write_check_passed = mode in writable_modes
Beispiel #8
0
    def _get_coordinatewise_learning_rate(self, grad, var):
        """Builds the learning rate for `var` from running gradient moments.

        Updates the 'first_moment' and 'second_moment' slots of `var` from
        `grad`, clips the second-moment estimate, scales it by the batch size
        and turns it into a learning rate.

        Args:
          grad: The gradient for `var`; an `ops.Tensor` or an
            `ops.IndexedSlices`.
          var: The variable whose slots are read and updated.

        Returns:
          `2 * batch_size / (total_num_examples * diag_preconditioner)`, where
          `diag_preconditioner` is reduced to its mean first when
          `self._use_single_learning_rate` is set.

        Raises:
          InvalidArgumentError: if `grad` is neither a Tensor nor an
            IndexedSlices.
        """
        # Compute the learning rate using a moving average for the diagonal of BB^T
        avg_first = self.get_slot(var, 'first_moment')
        avg_second = self.get_slot(var, 'second_moment')
        decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
        batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

        # Create an estimator for the moving average of gradient mean and variance
        # via Welford's algorithm
        if isinstance(grad, ops.Tensor):
            # Dense gradient: the whole slot is updated.
            delta = grad - avg_first
            # The step on the first moment is `delta` itself while the counter
            # is below 1, and `(1 - decay) * delta` afterwards.
            first_moment_update = avg_first.assign_add(
                array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
                                1. - decay_tensor) * delta)

            # The second moment is only touched after the first-moment update.
            with ops.control_dependencies([first_moment_update]):
                # NOTE(review): the boolean cast multiplies the whole increment,
                # so it looks non-zero only while the counter is below 1 --
                # confirm this is intended.
                second_moment_update = avg_second.assign_add(
                    math_ops.cast(self._counter < 1, var.dtype) *
                    -(1. - decay_tensor) *
                    (avg_second - decay_tensor * math_ops.square(delta)))
            # Clip the estimate to [1e-12, 1e12]; the dependency makes the
            # clipped value wait for the second-moment update.
            diag_preconditioner = control_flow_ops.with_dependencies(
                [second_moment_update],
                clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
        elif isinstance(grad, ops.IndexedSlices):
            # Sparse gradient: only the entries at `grad.indices` are updated.
            delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
            first_moment_update = state_ops.scatter_add(
                avg_first, grad.indices,
                array_ops.where(self._counter < 1, math_ops.cast(
                    1., var.dtype), 1. - decay_tensor) * delta)

            with ops.control_dependencies([first_moment_update]):
                # `avg_second` is rebound twice here: first to the result of
                # the scatter update, then to the entries gathered at
                # `grad.indices`.
                avg_second = state_ops.scatter_add(
                    avg_second, grad.indices,
                    math_ops.cast(self._counter < 1, var.dtype) *
                    -(1. - decay_tensor) *
                    (array_ops.gather_nd(avg_second, grad.indices) -
                     decay_tensor * math_ops.square(delta)))
                avg_second = array_ops.gather_nd(avg_second, grad.indices)
                # TODO (b/70783772) id:488 gh:489
                diag_preconditioner = clip_ops.clip_by_value(
                    avg_second, 1e-12, 1e12)
        else:
            raise errors.InvalidArgumentError(
                None, None, 'grad must of type Tensor or IndexedSlice')

        diag_preconditioner *= batch_size

        # Optionally collapse the per-coordinate values into a single scalar.
        if self._use_single_learning_rate:
            diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

        # From Theorem 2 Corollary 1 of Mandt et al. 2017
        return 2. * batch_size / (
            math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
            diag_preconditioner)
 def _assert_self_adjoint(self):
   """Groups the self-adjoint assertions of every Kronecker factor.

   Returns:
     A grouped op made of each factor's `assert_self_adjoint()` result.

   Raises:
     InvalidArgumentError: if any factor does not report `is_square`.
   """
   if all(operator.is_square for operator in self.operators):
     asserts = [operator.assert_self_adjoint() for operator in self.operators]
     return control_flow_ops.group(asserts)
   else:
     # The message names the property this method checks (self adjoint); it
     # previously said "invertible", copied from a different assertion.
     raise errors.InvalidArgumentError(
         node_def=None,
         op=None,
         message="All Kronecker factors must be square for the product to be "
         "self adjoint. Expected hint `is_square` to be True for every operator "
         "in argument `operators`.")
Beispiel #10
0
 def __init__(self, name, mode):
   """Stores `name`/`mode` and derives binary and read/write flags.

   A "b" anywhere in `mode` sets the binary flag and is removed before the
   remaining mode string is validated. Raises InvalidArgumentError when that
   remainder is not one of 'r', 'w', 'a', 'r+', 'w+', 'a+'.
   """
   self.__name = name
   self.__mode = mode
   self._read_buf = None
   self._writable_file = None
   self._binary_mode = "b" in mode

   base_mode = mode.replace("b", "")
   accepted_modes = ("r", "w", "a", "r+", "w+", "a+")
   if base_mode not in accepted_modes:
     raise errors.InvalidArgumentError(
         None, None, "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")

   readable_modes = ("r", "r+", "a+", "w+")
   writable_modes = ("a", "w", "r+", "a+", "w+")
   self._read_check_passed = base_mode in readable_modes
   self._write_check_passed = base_mode in writable_modes
Beispiel #11
0
def _get_graph_callable_inputs(shape_and_dtypes):
  """Maps specified shape_and_dtypes to graph inputs."""
  ret = []
  for x in shape_and_dtypes:
    if isinstance(x, ShapeAndDtype):
      ret.append(array_ops.placeholder(x.dtype, x.shape))
    elif isinstance(x, (tuple, list)):
      ret.append(_get_graph_callable_inputs(x))
    else:
      raise errors.InvalidArgumentError(
          None, None, "shape_and_dtypes not ShapeAndDtype, type: %s " % type(x))

  return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
Beispiel #12
0
 def recalculate_output_shapes(output_shapes):
   """Returns `output_shapes` with its first dimension divided by num_workers.

   Raises ValueError for a shape without dimensions, and InvalidArgumentError
   when the first dimension has a non-zero static value that num_workers does
   not divide evenly.
   """
   if len(output_shapes) == 0:
     raise ValueError("Input shape should have at least one dimension.")

   leading_value = tensor_shape.dimension_value(output_shapes[0])
   if leading_value and leading_value % num_workers != 0:
     raise errors.InvalidArgumentError(
         None, None,
         "First dim of input shape: %d is not divisible by num_workers: %d" %
         (output_shapes[0], num_workers))

   leading_dim, *trailing_dims = output_shapes.dims
   return tensor_shape.TensorShape(
       [leading_dim // num_workers] + trailing_dims)
Beispiel #13
0
def _get_graph_callable_inputs(shape_and_dtypes):
  """Maps specified shape_and_dtypes to graph inputs."""
  ret = []
  for x in shape_and_dtypes:
    if isinstance(x, ShapeAndDtype):
      ret.append(array_ops.placeholder(x.dtype, x.shape))
    elif isinstance(x, (tuple, list)):
      ret.append(_get_graph_callable_inputs(x))
    else:
      raise errors.InvalidArgumentError(
          None, None, "Expected the argument to @graph_callable to be a "
          "(possibly nested) list or tuple of ShapeAndDtype objects, "
          "but got an object of type: %s" % type(x))

  return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
Beispiel #14
0
    def seek(self, offset=None, whence=0, position=None):
        # TODO(jhseu): Delete later. Used to omit `position` from docs.
        # pylint: disable=g-doc-args
        """Seeks the read buffer to a new offset in the file.

        Args:
          offset: The byte count relative to the `whence` argument.
          whence: Reference point for `offset`:
            0: start of the file (default)
            1: the current position of the file
            2: the end of the file; `offset` is usually negative.

        Raises:
          TypeError: if neither or both of `offset` and `position` are given.
          InvalidArgumentError: if `whence` is not 0, 1, or 2.
        """
        # pylint: enable=g-doc-args
        self._preread_check()

        # `offset` is a keyword argument only for backwards-compatibility with
        # the older `position` argument; exactly one of the two must be given.
        # TODO(jhseu): Make `offset` a positional argument after `position` is
        # deleted.
        if offset is None:
            if position is None:
                raise TypeError("seek(): offset argument required")
            offset = position
        elif position is not None:
            raise TypeError("seek(): offset and position may not be set "
                            "simultaneously.")

        with errors.raise_exception_on_not_ok_status() as status:
            # Turn a relative offset into one measured from the file start.
            if whence == 1:
                offset += self.tell()
            elif whence == 2:
                offset += self.size()
            elif whence != 0:
                raise errors.InvalidArgumentError(
                    None, None,
                    "Invalid whence argument: {}. Valid values are 0, 1, or 2."
                    .format(whence))
            seek_status = self._read_buf.Seek(offset)
            pywrap_tensorflow.Set_TF_Status_from_Status(status, seek_status)
  def most_specific_common_supertype(self, others: Sequence[trace.TraceType]):
    """See base class.

    Combines the components position by position: the result is a new instance
    of this type built from the common supertype of each component.

    Args:
      others: A non-empty sequence of trace types to combine with `self`.

    Returns:
      A new instance of `type(self)`, or None when any of `others` has a
      different structure or some component has no common supertype.

    Raises:
      InvalidArgumentError: if `others` is empty.
    """
    if not others:
      # InvalidArgumentError takes (node_def, op, message); passing only the
      # message would fail with a TypeError instead of the intended error.
      raise errors.InvalidArgumentError(
          None, None,
          "Argument `others` to function `most_specific_common_supertype` must "
          "be a non-empty Sequence.")

    if not all(self._has_same_structure(other) for other in others):
      return None

    new_components = []
    for i, component in enumerate(self.components):
      common = component.most_specific_common_supertype(
          [other.components[i] for other in others])
      if common is None:
        return None
      new_components.append(common)

    return type(self)(*new_components)
  def most_specific_common_supertype(self, others: Sequence[trace.TraceType]):
    """See base class.

    Combines the mappings key by key: the result maps every key of
    `self.mapping` to the common supertype of the values stored under it.

    Args:
      others: A non-empty sequence of trace types to combine with `self`.

    Returns:
      A `DictType` over the combined mapping, or None when any of `others` has
      a different structure or some value has no common supertype.

    Raises:
      InvalidArgumentError: if `others` is empty.
    """
    if not others:
      # InvalidArgumentError takes (node_def, op, message); passing only the
      # message would fail with a TypeError instead of the intended error.
      raise errors.InvalidArgumentError(
          None, None,
          "Argument `others` to function `most_specific_common_supertype` must "
          "be a non-empty Sequence.")

    if not all(self._has_same_structure(other) for other in others):
      return None

    new_mapping = {}
    for key, value in self.mapping.items():
      common = value.most_specific_common_supertype(
          [other.mapping[key] for other in others])
      if common is None:
        return None
      new_mapping[key] = common

    return DictType(new_mapping)
Beispiel #17
0
def ngrams(data,
           width,
           axis=-1,
           reduction_type=None,
           string_separator=" ",
           name=None):
    """Create a tensor of n-grams based on the input data `data`.

    The n-grams have width `width` and are built along `axis` by combining
    windows of `width` adjacent elements of `data` with `reduction_type`. This
    op is intended to cover basic use cases; more complex combinations can be
    created using the sliding_window op.

    Args:
      data: The data to reduce.
      width: The width of the ngram window. If there is not sufficient data to
        fill out the ngram window, the resulting ngram will be empty.
      axis: The axis to create ngrams along. Note that for string join
        reductions, only axis '-1' is supported; for other reductions, any
        positive or negative axis can be used. Should be a constant.
      reduction_type: A member of the Reduction enum. Should be a constant.
        Currently supports:

        * `Reduction.SUM`: Add values in the window.
        * `Reduction.MEAN`: Average values in the window.
        * `Reduction.STRING_JOIN`: Join strings in the window.
          Note that axis must be -1 here.

      string_separator: The separator string used for `Reduction.STRING_JOIN`.
        Ignored otherwise. Must be a string constant, not a Tensor.
      name: The op name.

    Returns:
      A tensor of ngrams.

    Raises:
      InvalidArgumentError: if `reduction_type` is either None or not a
        Reduction, or if `reduction_type` is STRING_JOIN and `axis` is not -1.
    """
    with tf.name_scope(name, "NGrams", [data, width]):
        if reduction_type is None:
            raise errors.InvalidArgumentError(
                None, None, "reduction_type must be specified.")
        if not isinstance(reduction_type, Reduction):
            raise errors.InvalidArgumentError(
                None, None, "reduction_type must be a Reduction.")

        joins_strings = reduction_type is Reduction.STRING_JOIN

        # TODO(b/122967921): Lift this restriction after ragged_reduce_join is done.
        if joins_strings and axis != -1:
            raise errors.InvalidArgumentError(
                None, None,
                "%s requires that ngrams' 'axis' parameter be -1." %
                Reduction.STRING_JOIN.name)

        windows = sliding_window(data, width, axis)

        # A non-negative axis is shifted by one for the windowed data; a
        # negative axis is used as-is.
        reduction_axis = axis if axis < 0 else axis + 1

        # Ragged reduction ops work on both Tensor and RaggedTensor, so they
        # can be used regardless of the type of tensor in `windows`.
        if reduction_type is Reduction.SUM:
            return tf.reduce_sum(windows, reduction_axis)
        if reduction_type is Reduction.MEAN:
            return tf.reduce_mean(windows, reduction_axis)
        if joins_strings:
            if isinstance(data, tf.RaggedTensor):
                return tf.ragged.map_flat_values(tf.reduce_join,
                                                 windows,
                                                 axis=axis,
                                                 separator=string_separator)
            return tf.reduce_join(windows,
                                  axis=axis,
                                  separator=string_separator)
Beispiel #18
0
def profile(graph=None,
            run_meta=None,
            op_log=None,
            cmd='scope',
            options=_DEFAULT_PROFILE_OPTIONS):
    """Profile a model and return the profile proto.

    Tutorials and examples can be found in:
    https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md

    Args:
      graph: tf.Graph. If None and eager execution is not enabled, the default
        graph is used.
      run_meta: optional tensorflow.RunMetadata proto. It is necessary to
        support run time information profiling, such as time and memory.
      op_log: tensorflow.tfprof.OpLogProto proto. User can assign "types" to
        graph nodes with op_log. "types" allow user to flexibly group and
        account profiles using options['accounted_type_regexes'].
      cmd: string. One of 'op', 'scope', 'graph' or 'code'.
        'op' organizes the profile by operation type (e.g. MatMul).
        'scope' organizes the profile by graph node name scope.
        'graph' organizes the profile by graph node inputs/outputs.
        'code' organizes the profile by Python call stack.
      options: A dict of options. See core/profiler/g3doc/options.md.

    Returns:
      A GraphNodeProto when cmd is 'scope' or 'graph'; a MultiGraphNodeProto
      when cmd is 'op' or 'code'. If the returned bytes cannot be decoded, a
      message is written to stderr and the proto is returned as it stands.
      Side effect: stdout/file/timeline.json depending on options['output'].

    Raises:
      InvalidArgumentError: if `cmd` is not one of the four known views.
    """
    if not (graph or context.executing_eagerly()):
        graph = ops.get_default_graph()

    if options == _DEFAULT_PROFILE_OPTIONS:
        builder_cls = option_builder.ProfileOptionBuilder
        options = builder_cls.trainable_variables_parameter()

    # pylint: disable=protected-access
    merged_log = tfprof_logger.merge_default_with_oplog(
        graph, op_log, run_meta, add_trace=(cmd == 'code'))
    # pylint: enable=protected-access

    built_options = _build_options(options)
    serialized_meta = b'' if not run_meta else run_meta.SerializeToString()
    serialized_graph = _graph_string(graph)

    # Only the container proto depends on the view; running the analysis and
    # decoding its output is shared.
    if cmd in ('code', 'op'):
        result = tfprof_output_pb2.MultiGraphNodeProto()
    elif cmd in ('graph', 'scope'):
        result = tfprof_output_pb2.GraphNodeProto()
    else:
        raise errors.InvalidArgumentError(None, None,
                                          'unknown cmd: %s\n' % cmd)

    raw_profile = print_mdl.PrintModelAnalysis(
        serialized_graph, serialized_meta, merged_log.SerializeToString(),
        cmd.encode('utf-8'), built_options.SerializeToString())
    try:
        result.ParseFromString(raw_profile)
    except message.DecodeError as decode_error:
        # Best effort: report the failure and still return the proto.
        sys.stderr.write('Cannot parse returned proto: %s.\n' % decode_error)

    return result
 def _assert_positive_definite(self):
     """Always raises InvalidArgumentError for this operator type."""
     reason = "Householder operators are always non-positive definite."
     raise errors.InvalidArgumentError(node_def=None, op=None, message=reason)
Beispiel #20
0
    def __init__(self,
                 cluster_resolver,
                 checkpoint_or_checkpoint_manager,
                 checkpoint_dir=None,
                 termination_config=None):
        """Creates the `PreemptionCheckpointHandler`.

        Args:
          cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`
            object. You may also obtain it through the `cluster_resolver`
            attribute of the distribution strategy in use.
          checkpoint_or_checkpoint_manager: a `tf.train.CheckpointManager` or a
            `tf.train.Checkpoint`. If you are using a
            `tf.train.CheckpointManager` to manage checkpoints outside the
            `PreemptionCheckpointHandler` for backup purpose as well, pass it
            as `checkpoint_or_checkpoint_manager` argument. Otherwise, pass a
            `tf.train.Checkpoint` and the `PreemptionCheckpointHandler` will
            create a `tf.train.CheckpointManager` to manage it in the
            `checkpoint_dir`.
          checkpoint_dir: a directory where the `PreemptionCheckpointHandler`
            saves and restores checkpoints. When a
            `PreemptionCheckpointHandler` is created, the latest checkpoint in
            the `checkpoint_dir` will be restored. (This is not needed if a
            `tf.train.CheckpointManager` instead of a `tf.train.Checkpoint` is
            passed as the `checkpoint_or_checkpoint_manager` argument.)
          termination_config: optional, a
            `tf.distribute.experimental.TerminationConfig` object to configure
            for a platform other than Google Borg or GCP.

        Raises:
          InvalidArgumentError: if a `tf.train.Checkpoint` is passed without a
            `checkpoint_dir`.
          NotImplementedError: if the detected platform is a GCE TPU or GCE CPU
            device.
        """
        self._cluster_resolver = cluster_resolver
        if isinstance(checkpoint_or_checkpoint_manager,
                      checkpoint_lib.Checkpoint) and not checkpoint_dir:
            # InvalidArgumentError takes (node_def, op, message); passing only
            # the message would fail with a TypeError instead of this error.
            raise errors.InvalidArgumentError(
                None, None,
                'When a checkpoint is passed, a '
                'checkpoint_dir must be passed as well'
                '.')
        self._id_in_cluster = str(
            multi_worker_util.id_in_cluster(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type,
                self._cluster_resolver.task_id))

        # The number of calls to `PreemptionCheckpointHandler.run` when the latest
        # checkpoint was saved.
        self._checkpointed_runs = variables.Variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            trainable=False,
            name=_ITERATION_VARIABLE)

        self._maybe_create_checkpoint_manager(checkpoint_or_checkpoint_manager,
                                              checkpoint_dir, cluster_resolver)

        # Attach the run counter to both checkpoints, unless an attribute of
        # that name is already present, so it is saved and restored with them.
        if not hasattr(self._write_checkpoint_manager._checkpoint,
                       _ITERATION_VARIABLE):
            setattr(self._write_checkpoint_manager._checkpoint,
                    _ITERATION_VARIABLE, self._checkpointed_runs)

        if not hasattr(self._read_checkpoint_manager._checkpoint,
                       _ITERATION_VARIABLE):
            setattr(self._read_checkpoint_manager._checkpoint,
                    _ITERATION_VARIABLE, self._checkpointed_runs)

        self._read_checkpoint_manager.restore_or_initialize()

        # grace period countdown. Set to True for all workers once they finish
        # timing saving a checkpoint. Once entering this phase, new
        # preemption/maintenance notice will not be handled, since the whole cluster
        # goes down as the worker who first initiates the grace period goes down.
        self._final_checkpoint_countdown = False

        self._estimated_run_time = 0

        # An internal step counter that's restored to checkpointed_iterations when
        # training is restored. It increments by one every time
        # `PreemptionCheckpointHandler.run` is called. Note that in this case, the
        # user must pass a single-step training function to
        # `PreemptionCheckpointHandler.run` instead of a multiple-step one.
        self._run_counter = self._checkpointed_runs.numpy()

        # The worker itself has received preemption signal.
        self._received_own_sigterm = threading.Event()

        # Some member (could be oneself) has received preemption signal, and the
        # step number to save a checkpoint has been aligned.
        self._received_checkpoint_step = threading.Event()

        self._platform_device = gce_util.detect_platform()

        if self._platform_device in (gce_util.PlatformDevice.GCE_TPU,
                                     gce_util.PlatformDevice.GCE_CPU):
            # While running MultiWorkerMirroredStrategy training with GPUs and CPUs
            # are the same on Borg, GCE CPU VM and GPU VM are different in terms
            # of live migration, grace period, etc. We can make it work upon request.
            raise NotImplementedError(
                'PreemptionCheckpointHandler does not support '
                'training with TPU or CPU device on GCP.')

        completed_termination_config = _complete_config_for_environment(
            self._platform_device, termination_config)
        self._termination_watcher_fn = completed_termination_config.termination_watcher_fn
        self._exit_fn = completed_termination_config.exit_fn
        self._grace_period = completed_termination_config.grace_period

        # When training is interrupted, we explicitly call the cleanup methods for
        # the thread watching for local worker's termination signal and the thread
        # watching for clusterwise information before we save a checkpoint and exit.
        # In the final chapter of the training where no interruption is encountered,
        # we rely on __del__ to clean up. However, there is no guarantee when or
        # whether __del__ is executed, thus we make the threads daemon to avoid it
        # preventing program from exit.
        self._cluster_wise_termination_watcher_thread = threading.Thread(
            target=self._watch_step_to_save_key,
            name='PeerTerminationWatcher-%s' % self._id_in_cluster,
            daemon=True)
        logging.info('Start watcher for peer\'s signal.')
        self._cluster_wise_termination_watcher_thread.start()

        self._poll_termination_signal_thread = None

        # Poll through the configured watcher function when there is one;
        # otherwise fall back to watching for a signal.
        if completed_termination_config.termination_watcher_fn:
            self._start_polling_for_termination_signal()
        else:
            self._start_watching_for_signal()
Beispiel #21
0
 def _assert_non_singular(self):
     """Always raises InvalidArgumentError for this operator type."""
     reason = "Zero operators are always non-invertible."
     raise errors.InvalidArgumentError(node_def=None, op=None, message=reason)
Beispiel #22
0
def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
    """Assert that `x == y` holds for every (possibly broadcast) element pair.

    Typical use as a control dependency:

    ```python
    with tf.control_dependencies([tf.assert_equal(x, y)]):
      output = tf.reduce_sum(x)
    ```

    The condition holds when `x[i] == y[i]` for every pair of (possibly
    broadcast) elements. Two empty inputs satisfy it trivially.

    Args:
      x: Numeric `Tensor`.
      y: Numeric `Tensor`, same dtype as and broadcastable to `x`.
      data: Tensors to print when the condition is False. Defaults to an
        error message followed by the names and values of `x` and `y`.
      summarize: Print this many entries of each tensor. In eager mode a
        value of None is treated as 3.
      message: A string to prefix to the default message.
      name: Optional name for this operation. Defaults to "assert_equal".

    Returns:
      In graph mode, an op that raises `InvalidArgumentError` if `x == y` is
      False. In eager mode, None.

    Raises:
      InvalidArgumentError: if the check can be performed immediately and
        `x == y` is False. That is the case during eager execution, or when
        both `x` and `y` are statically known.
    """
    message = message or ''
    with ops.name_scope(name, 'assert_equal', [x, y, data]):
        x = ops.convert_to_tensor(x, name='x')
        y = ops.convert_to_tensor(y, name='y')

        if context.executing_eagerly():
            elementwise_eq = math_ops.equal(x, y)
            all_equal = math_ops.reduce_all(elementwise_eq)
            if all_equal:
                return None

            # The check failed: assemble a descriptive error message.
            # Like control_flow_ops.Assert (used in graph mode), show three
            # elements unless the caller asked for a different amount.
            if summarize is None:
                summarize = 3

            head_msg = ''
            if summarize:
                # reshape((-1,)) is the fastest way to get a flat array view.
                flat_x = x.numpy().reshape((-1, ))
                flat_y = y.numpy().reshape((-1, ))
                shown_x = min(flat_x.size, summarize)
                shown_y = min(flat_y.size, summarize)
                head_msg = (
                    'First %d elements of x:\n%s\n'
                    'First %d elements of y:\n%s\n' %
                    (shown_x, flat_x[:shown_x], shown_y, flat_y[:shown_y]))

            diff_msg = ''
            # Only list the differing positions when both shapes match and
            # are not scalar; with mismatched shapes the listing would be
            # more confusing than useful.
            if x.shape == y.shape and x.shape.as_list():
                mismatch = math_ops.logical_not(elementwise_eq)
                where_np = array_ops.where(mismatch).numpy()
                x_bad = array_ops.boolean_mask(x, mismatch)
                y_bad = array_ops.boolean_mask(y, mismatch)
                shown = min(summarize, where_np.shape[0])
                bad_indices = where_np[:shown]
                bad_x = x_bad.numpy().reshape((-1, ))[:shown]
                bad_y = y_bad.numpy().reshape((-1, ))[:shown]
                diff_msg = ('Indices of first %s different values:\n%s\n'
                            'Corresponding x values:\n%s\n'
                            'Corresponding y values:\n%s\n' %
                            (shown, bad_indices, bad_x, bad_y))

            full_msg = ('%s\nCondition x == y did not hold.\n%s%s' %
                        (message, diff_msg, head_msg))
            raise errors.InvalidArgumentError(node_def=None,
                                              op=None,
                                              message=full_msg)

        # Graph mode from here on.
        if data is None:
            data = [
                message,
                'Condition x == y did not hold element-wise:',
                'x (%s) = ' % x.name,
                x,
                'y (%s) = ' % y.name,
                y,
            ]
        condition = math_ops.reduce_all(math_ops.equal(x, y))

        # If both operands are known at graph-construction time, run the
        # static check right away.
        static_x = tensor_util.constant_value(x)
        static_y = tensor_util.constant_value(y)
        if static_x is not None and static_y is not None:
            _assert_static((static_x == static_y).all(), data)
        return control_flow_ops.Assert(condition, data, summarize=summarize)
Beispiel #23
0
def trace(service_addr,
          logdir,
          duration_ms,
          worker_list='',
          num_tracing_attempts=3,
          options=None):
  """Requests on-demand profiling from one or more profiler servers over gRPC.

  The calling thread blocks until every server has responded or the deadline
  expires. Single-host and multi-host profiling are both supported on CPU,
  GPU, and TPU. Each server writes its results into the given TensorBoard log
  directory (i.e. the directory where you save model checkpoints); view them
  with the TensorBoard profile plugin.

  Args:
    service_addr: Comma-delimited string of gRPC addresses of the workers to
      profile.
      e.g. service_addr='grpc://localhost:6009'
           service_addr='grpc://10.0.0.2:8466,grpc://10.0.0.3:8466'
           service_addr='grpc://localhost:12345,grpc://localhost:23456'
    logdir: Where to save the profile data, typically a TensorBoard log
      directory. Both the client and the server must be able to access it.
      e.g. logdir='gs://your_tb_dir'
    duration_ms: How long to trace or monitor, in milliseconds. Must be
      greater than zero.
    worker_list: Optional, TPU-only. The list of workers to profile in the
      current session.
    num_tracing_attempts: Optional. Number of automatic retries when no trace
      event is collected (default 3).
    options: profiler.experimental.ProfilerOptions namedtuple carrying
      miscellaneous profiler options.

  Raises:
    InvalidArgumentError: When arguments fail validation checks.
    UnavailableError: If no trace event was collected.

  Example usage (CPU/GPU):

  ```python
    # Start a profiler server before your model runs.
    tf.profiler.experimental.server.start(6009)
    # (Model code goes here).
    # Ask the profiler server for a trace of your model.
    tf.profiler.experimental.client.trace('grpc://localhost:6009',
                                          '/nfs/tb_log', 2000)
  ```

  Example usage (Multiple GPUs):

  ```python
    # Workers are at 10.0.0.2, 10.0.0.3 and 10.0.0.4; start profiling one
    # second from now and run for two seconds.
    options['delay_ms'] = 1000
    tf.profiler.experimental.client.trace(
        'grpc://10.0.0.2:8466,grpc://10.0.0.3:8466,grpc://10.0.0.4:8466',
        'gs://your_tb_dir',
        2000,
        options=options)
  ```

  Example usage (TPU):

  ```python
    # A profiler service is already running on the TPU worker at port 8466.
    # The TPU is at 10.0.0.2 and we profile for two seconds.
    tf.profiler.experimental.client.trace('grpc://10.0.0.2:8466',
                                          'gs://your_tb_dir', 2000)
  ```

  Example usage (Multiple TPUs):

  ```python
    # A profiler service is running on every TPU worker of the pod at port
    # 8466. The TPUs are at 10.0.0.2, 10.0.0.3 and 10.0.0.4 and we profile
    # for two seconds.
    tf.profiler.experimental.client.trace(
        'grpc://10.0.0.2:8466',
        'gs://your_tb_dir',
        2000,
        '10.0.0.2:8466,10.0.0.3:8466,10.0.0.4:8466')
  ```

  Launch TensorBoard and point it to the same logdir you passed to this API.

  ```shell
    # logdir can be gs://your_tb_dir as in the above examples.
    $ tensorboard --logdir=/tmp/tb_log
  ```

  Then open localhost:6006/#profile in your browser to view the results.

  """
  if duration_ms <= 0:
    raise errors.InvalidArgumentError(
        None, None, 'duration_ms must be greater than zero.')

  # The native entry point receives the options as a plain dict.
  if options is None:
    profiler_opts = {}
  else:
    profiler_opts = dict(options._asdict())

  stripped_addrs = _strip_addresses(service_addr, _GRPC_PREFIX)
  _pywrap_profiler.trace(stripped_addrs, logdir, worker_list, True,
                         duration_ms, num_tracing_attempts, profiler_opts)
Beispiel #24
0
def sliding_window(data, width, axis=-1, name=None):
    """Builds a sliding window for `data` with a specified width.

    Returns a tensor constructed from `data`, where each element in
    dimension `axis` is a slice of `data` starting at the corresponding
    position, with the given width (consecutive windows start one position
    apart).  I.e.:

    * `result.shape.ndims = data.shape.ndims + 1`
    * `result[i1..iaxis, a] = data[i1..iaxis, a:a+width]`
      (where `0 <= a < data[i1...iaxis].shape[0] - (width - 1)`).

    Note that each result row (along dimension `axis`) has `width - 1` fewer
    items than the corresponding `data` row.  If a `data` row has fewer than
    `width` items, then the corresponding `result` row will be empty.  If you
    wish for the `result` rows to be the same size as the `data` rows, you can
    use `pad_along_dimension` to add `width - 1` padding elements before
    calling this op.

    Args:
      data: `<dtype> [O1...ON, A, I1...IM]`
        A potentially ragged K-dimensional tensor with outer dimensions of
        size `O1...ON`; axis dimension of size `A`; and inner dimensions of
        size `I1...IM`.  I.e. `K = N + 1 + M`, where `N>=0` and `M>=0`.

      width: An integer constant specifying the width of the window. Must be
        greater than zero.

      axis: An integer constant specifying the axis along which sliding window
        is computed. Negative axis values from `-K` to `-1` are supported.

      name: The name for this op (optional)

    Returns:
      A `K+1` dimensional tensor with the same dtype as `data`, where:

      * `result[i1..iaxis, a]` = `data[i1..iaxis, a:a+width]`
      * `result.shape[:axis]` = `data.shape[:axis]`
      * `result.shape[axis]` = `data.shape[axis] - (width - 1)`
      * `result.shape[axis + 1]` = `width`
      * `result.shape[axis + 2:]` = `data.shape[axis + 1:]`

    Raises:
      TypeError: If `axis` or `width` is not a Python int.
      InvalidArgumentError: If `axis` is outside `[-K, K)` (checked only when
        the rank of `data` is statically known), or if `width <= 0`.

    #### Examples:

      Sliding window (width=3) across a sequence of tokens:

      ```python
      >>> # input: <string>[sequence_length]
      >>> input = tf.constant(["one", "two", "three", "four", "five", "six"])
      >>> # output: <string>[sequence_length-2, 3]
      >>> output = sliding_window(data=input, width=3, axis=0)
      >>> print(output.eval())
      [["one", "two", "three"],
       ["two", "three", "four"],
       ["three", "four", "five"],
       ["four", "five", "six"]]
      >>> print("Shape: %s -> %s" % (input.shape, output.shape))
      Shape: (6,) -> (4, 3)
      ```

      Sliding window (width=2) across the inner dimension of a ragged matrix
      containing a batch of token sequences:

      ```python
      >>> # input: <string>[num_sentences, (num_words)]
      >>> input = tf.ragged.constant(
      ...     [['Up', 'high', 'in', 'the', 'air'],
      ...      ['Down', 'under', 'water'],
      ...      ['Away', 'to', 'outer', 'space']])
      >>> # output: <string>[num_sentences, (num_words-1), 2]
      >>> output = sliding_window(input, width=2, axis=-1)
      >>> print(output.eval())
      [[['Up', 'high'], ['high', 'in'], ['in', 'the'], ['the', 'air']],
       [['Down', 'under'], ['under', 'water']],
       [['Away', 'to'], ['to', 'outer'], ['outer', 'space']]]
      >>> print("Shape: %s -> %s" % (input.shape, output.shape))
      Shape: (3, ?) -> (3, ?, 2)
      ```

      Sliding window across the second dimension of a 3-D tensor containing
      batches of sequences of embedding vectors:

      ```python
      >>> # input: <int32>[num_sequences, sequence_length, embedding_size]
      >>> input = tf.constant([
      ...     [[1, 1, 1], [2, 2, 1], [3, 3, 1], [4, 4, 1], [5, 5, 1]],
      ...     [[1, 1, 2], [2, 2, 2], [3, 3, 2], [4, 4, 2], [5, 5, 2]]])
      >>> # output: <int32>[num_sequences, sequence_length-1, 2, embedding_size]
      >>> output = sliding_window(data=input, width=2, axis=1)
      >>> print(output.eval())
      [[[[1, 1, 1], [2, 2, 1]],
        [[2, 2, 1], [3, 3, 1]],
        [[3, 3, 1], [4, 4, 1]],
        [[4, 4, 1], [5, 5, 1]]],
       [[[1, 1, 2], [2, 2, 2]],
        [[2, 2, 2], [3, 3, 2]],
        [[3, 3, 2], [4, 4, 2]],
        [[4, 4, 2], [5, 5, 2]]]]
      >>> print("Shape: %s -> %s" % (input.shape, output.shape))
      Shape: (2, 5, 3) -> (2, 4, 2, 3)
      ```
    """
    with ops.name_scope(name, "SlidingWindow", [data, axis]):
        data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data,
                                                                name="data")

        if not isinstance(axis, int):
            raise TypeError("axis must be an int")

        if not isinstance(width, int):
            raise TypeError("width must be an int")

        if data.shape.ndims is not None and (axis < -data.shape.ndims
                                             or axis >= data.shape.ndims):
            raise errors.InvalidArgumentError(
                None, None,
                "axis must be between -k <= axis <= -1 OR 0 <= axis < k")

        if width <= 0:
            raise errors.InvalidArgumentError(
                None, None, "width must be an integer greater than 0")

        # Slice number `start` holds, for every window, the element at offset
        # `start` inside that window.
        slices = []
        for start in range(width):
            # The stop is counted from the end so it works for unknown or
            # ragged row lengths. The last slice must run to the very end,
            # which needs None because a stop of 0 would yield an empty slice.
            stop = None if start - width + 1 == 0 else start - width + 1
            if axis >= 0:
                idx = [slice(None)] * axis + [slice(start, stop)]
            else:
                idx = [Ellipsis, slice(start, stop)
                       ] + [slice(None)] * (-axis - 1)
            # Index with a tuple rather than a list: a tuple is the canonical
            # form of a multi-axis index and is never mistaken for a sequence
            # of positions to gather.
            slices.append(data[tuple(idx)])

        # Stack the slices so the window dimension sits right after `axis`.
        stack_axis = axis + 1 if axis >= 0 else axis
        return array_ops.stack(slices, stack_axis)
def create_feature_bitmask(tensor, dtype=dtypes.int32, name=None):
    """Packs the innermost dimension of a boolean tensor into integers.

    The booleans `tensor[i1...iN, 0:num_bits]` are read as individual bits in
    big-endian order and combined into the integer `result[i1...iN]`.  For
    instance `[True, False, False, True, False]` becomes `0b10010 = 18`.  The
    result has type `dtype`, which defaults to `int32`.

    When `num_bits` does not fit in `dtype` an error is raised: at call time
    if `num_bits` is statically known, otherwise when the op is evaluated.

    Args:
      tensor: `<bool>[D1...DN, num_bits]` The boolean tensor whose innermost
        dimension is packed into integer values.
      dtype: The output datatype for this op (optional).
      name: The name for this op (optional).

    Returns:
      `<dtype> [D1...DN]`
        An integer tensor obtained by interpreting the innermost dimension of
        `tensor` as individual bits.

    Raises:
      ValueError: If the data to be packed is too large for the chosen data
        type.
      ValueError: If the data to be packed is not boolean.
      InvalidArgumentError: If the input is a list or tuple of tensors, or the
        dtype is not a supported integer type.

    Examples:
      ```python
      >>> assert create_feature_bitmask([True, False, False, True]) == 0b1001
      >>> create_feature_bitmask([[True, False], [False, True], [True, True]])
      [0b10, 0b01, 0b11]
      ```
    """
    with ops.name_scope(name, 'CreateFeatureBitmask', [tensor]):
        got_tensor_list = (isinstance(tensor, (list, tuple)) and tensor
                           and isinstance(tensor[0], ops.Tensor))
        if got_tensor_list:
            raise errors.InvalidArgumentError(
                None, None,
                'CreateFeatureBitmask does not support lists of tensors. Consider '
                'using tf.stack(list,-1) to create a single tensor before invoking '
                'this op.')

        tensor = ops.convert_to_tensor(tensor, dtypes.bool, 'tensor')

        if dtype not in _max_bits:
            supported = sorted(_max_bits.items(), key=lambda entry: entry[1])
            raise errors.InvalidArgumentError(
                None, None,
                'dtype must be one of: [%s], was %s' % (supported, dtype.name))

        capacity = _max_bits[dtype]
        as_integers = math_ops.cast(tensor, dtype=dtype)
        static_shape = tensor.shape
        width_is_static = (static_shape.ndims is not None
                           and static_shape.dims[-1].value is not None)

        if width_is_static:
            num_bits = static_shape.dims[-1].value
            if num_bits > 63:
                raise ValueError(
                    'data.shape[-1] must be less than 64, is %d.' % num_bits)
            if num_bits > capacity:
                raise ValueError(
                    'data.shape[-1] is too large for %s (was %d, cannot exceed %d); '
                    'consider switching condense_boolean_tensor to a larger '
                    'dtype.' % (dtype.name, num_bits, capacity))
            # Big-endian place values: 2**(num_bits-1), ..., 2, 1.
            place_values = constant_op.constant(
                [2**pos for pos in reversed(range(num_bits))], dtype)
        else:
            # The width is only known at run time: build the place values for
            # the widest supported input and trim them inside the graph.
            place_values = constant_op.constant(
                [2**pos for pos in reversed(range(capacity))], dtype)
            num_bits = array_ops.shape(tensor)[-1]
            width_check = check_ops.assert_less_equal(
                num_bits,
                capacity,
                message='data.shape[-1] is too large for %s (cannot exceed %s)'
                % (dtype.name, capacity))
            with ops.control_dependencies([width_check]):
                # The second slice ("[:num_bits]") is a no-op unless num_bits==0.
                place_values = place_values[-num_bits:][:num_bits]

        return math_ops.reduce_sum(as_integers * place_values, axis=-1)
Beispiel #26
0
def pad_along_dimension(data,
                        axis=-1,
                        left_pad=None,
                        right_pad=None,
                        name=None):
    """Add padding to the beginning and end of data in a specific dimension.

    Returns a tensor constructed from `data`, where each row in dimension
    `axis` is replaced by the concatenation of the left padding followed by
    the row followed by the right padding.  I.e., if `L=left_pad.shape[0]` and
    `R=right_pad.shape[0]`, then:

    ```python
    result[i1...iaxis, 0:L] = left_pad
    result[i1...iaxis, L:-R] = data[i1...iaxis]
    result[i1...iaxis, -R:] = right_pad
    ```

    Args:
      data: `<dtype>[O1...ON, A, I1...IM]` A potentially ragged `K` dimensional
        tensor with outer dimensions of size `O1...ON`; axis dimension of size
        `A`; and inner dimensions of size `I1...IM`.  I.e. `K = N + 1 + M`,
        where `N>=0` and `M>=0`.
      axis: An integer constant specifying the axis along which padding is
        added. Negative axis values from `-K` to `-1` are supported.
      left_pad: `<dtype>[L, I1...IM]` An `M+1` dimensional tensor that should
        be prepended to each row along dimension `axis`; or `None` if no
        padding should be added to the left side.
      right_pad: `<dtype>[R, I1...IM]` An `M+1` dimensional tensor that should
        be appended to each row along dimension `axis`; or `None` if no
        padding should be added to the right side.
      name: The name of this op (optional).

    Returns:
      `<dtype>[O1...ON, L + A + R, I1...IM]`
        A potentially ragged `K` dimensional tensor with outer dimensions of
        size `O1...ON`; padded axis dimension size `L+A+R`; and inner
        dimensions of size `I1...IM`.  If `data` is a `RaggedTensor`, then the
        returned tensor is a `RaggedTensor` with the same `ragged_rank`.
        When both `left_pad` and `right_pad` are `None`, the converted `data`
        is returned unchanged.

    Raises:
      TypeError: If `axis` is not a Python int.
      InvalidArgumentError: If `axis` is outside `[-K, K)` (checked only when
        the rank of `data` is statically known).
    """
    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name="data")

    if not isinstance(axis, int):
        raise TypeError("axis must be an int; got %s" % type(axis).__name__)

    # Nothing to add on either side.
    if left_pad is None and right_pad is None:
        return data

    with ops.name_scope(name, "PadAlongDimension", [data]):
        if data.shape.ndims is not None and (axis < -data.shape.ndims
                                             or axis >= data.shape.ndims):
            raise errors.InvalidArgumentError(
                None, None,
                "axis must be between -k <= axis <= -1 OR 0 <= axis < k")
        if isinstance(data, ragged_tensor.RaggedTensor):
            axis = _get_positive_axis(axis, data.shape.ndims)

        if left_pad is not None:
            left_pad = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                left_pad, dtype=data.dtype, name="left_pad")
        if right_pad is not None:
            # Named "right_pad" so the op is not mislabelled as the left pad.
            right_pad = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                right_pad, dtype=data.dtype, name="right_pad")

        left_padding = _padding_for_dimension(data, axis, left_pad)
        right_padding = _padding_for_dimension(data, axis, right_pad)

        # A side may be None (presumably when its pad argument was None --
        # verify against _padding_for_dimension); drop it before concatenating.
        pieces = [
            piece for piece in (left_padding, data, right_padding)
            if piece is not None
        ]
        return array_ops.concat(pieces, axis)