def profile(graph,
            run_meta=None,
            op_log=None,
            cmd='scope',
            options=_DEFAULT_PROFILE_OPTIONS):
  """Print model statistics.

  https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md

  Args:
    graph: tf.Graph.
    run_meta: tensorflow::RunMetadata proto. When provided, also shows valid
        timing and memory information when 'select' option contains
        'micros' and 'bytes'.
    op_log: tensorflow::tfprof::OpLog proto. Users can use this proto to
        group ops together and use an op_type to select the group.
    cmd: string. Either 'op', 'scope', 'graph' or 'code'.
        'op' view organizes outputs using operation type (e.g. MatMul).
        'scope' view organizes outputs using graph node name scope.
        'graph' view organizes outputs using graph node inputs/outputs.
        'code' view organizes outputs using Python call stack.
    options: A dict of options. See core/profiler/g3doc/options.md.

  Returns:
    If cmd is 'scope' or 'graph', returns a TFGraphNodeProto proto.
    If cmd is 'op' or 'code', returns a TFMultiGraphNodeProto proto.
    Side effect: writes to stdout/file/timeline.json depending on
    options['output'].
  """
  if options == _DEFAULT_PROFILE_OPTIONS:
    options = TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()

  # pylint: disable=protected-access
  op_log = tfprof_logger._merge_default_with_oplog(
      graph, op_log, run_meta, add_trace=cmd == 'code')
  # pylint: enable=protected-access

  opts = _build_options(options)

  run_meta_str = run_meta.SerializeToString() if run_meta else b''

  if cmd == 'code' or cmd == 'op':
    tfprof_node = tfprof_output_pb2.TFMultiGraphNodeProto()
    tfprof_node.ParseFromString(
        print_mdl.PrintModelAnalysis(
            graph.as_graph_def(add_shapes=True).SerializeToString(),
            run_meta_str,
            op_log.SerializeToString(),
            cmd.encode('utf-8'),
            opts.SerializeToString()))
  elif cmd == 'graph' or cmd == 'scope':
    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
    tfprof_node.ParseFromString(
        print_mdl.PrintModelAnalysis(
            graph.as_graph_def(add_shapes=True).SerializeToString(),
            run_meta_str,
            op_log.SerializeToString(),
            cmd.encode('utf-8'),
            opts.SerializeToString()))
  else:
    raise errors.InvalidArgumentError(None, None, 'unknown cmd: %s\n' % cmd)

  return tfprof_node
def _assert_static(condition, data):
  """Raises an InvalidArgumentError with as much information as possible."""
  if not condition:
    data_static = [_maybe_constant_value_string(x) for x in data]
    raise errors.InvalidArgumentError(
        node_def=None, op=None, message='\n'.join(data_static))
def max_spanning_tree_gradient(mst_op, d_loss_d_max_scores, *_):
  """Returns a subgradient of the MaximumSpanningTree op.

  Note that MaximumSpanningTree is only differentiable w.r.t. its |scores|
  input and its |max_scores| output.

  Args:
    mst_op: The MaximumSpanningTree op being differentiated.
    d_loss_d_max_scores: [B] vector where entry b is the gradient of the
        network loss w.r.t. entry b of the |max_scores| output of the
        |mst_op|.
    *_: The gradients w.r.t. the other outputs; ignored.

  Returns:
    1. None, since the op is not differentiable w.r.t. its |num_nodes| input.
    2. [B,M,M] tensor where entry b,t,s is a subgradient of the network loss
       w.r.t. entry b,t,s of the |scores| input, with the same dtype as
       |d_loss_d_max_scores|.
  """
  dtype = d_loss_d_max_scores.dtype.base_dtype
  if dtype is None:
    raise errors.InvalidArgumentError(
        None, None, "Expected |d_loss_d_max_scores| to have a known dtype.")

  argmax_sources_bxm = mst_op.outputs[1]
  input_dim = array_ops.shape(argmax_sources_bxm)[1]  # M in the docstring

  # The one-hot argmax is a subgradient of max.  Convert the batch of maximal
  # spanning trees into 0/1 indicators, then scale them by the relevant output
  # gradients from |d_loss_d_max_scores|.  Note that |d_loss_d_max_scores| must
  # be reshaped in order for it to broadcast across the batch dimension.
  indicators_bxmxm = standard_ops.one_hot(
      argmax_sources_bxm, input_dim, dtype=dtype)
  d_loss_d_max_scores_bx1 = array_ops.expand_dims(d_loss_d_max_scores, -1)
  d_loss_d_max_scores_bx1x1 = array_ops.expand_dims(d_loss_d_max_scores_bx1,
                                                    -1)
  d_loss_d_scores_bxmxm = indicators_bxmxm * d_loss_d_max_scores_bx1x1
  return None, d_loss_d_scores_bxmxm
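# A minimal standalone sketch of the same one-hot subgradient construction,
# using public TF2 ops instead of the internal standard_ops/array_ops modules.
# The batch size (B=2), node count (M=3), and values are made up for
# illustration, not taken from the op above.
import tensorflow as tf

argmax_sources_bxm = tf.constant([[0, 0, 1], [2, 2, 0]])  # argmax head per node
d_loss_d_max_scores = tf.constant([0.5, -1.0])            # [B] output gradients

indicators_bxmxm = tf.one_hot(argmax_sources_bxm, depth=3)  # 0/1 tree indicator
# Broadcasting the [B] gradients to [B,1,1] scales each batch entry's selected
# arcs by that entry's scalar output gradient.
d_loss_d_scores_bxmxm = indicators_bxmxm * d_loss_d_max_scores[:, None, None]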
def most_specific_common_supertype(
    self, others: Sequence[trace.TraceType]) -> Optional[trace.TraceType]:
  if not others:
    raise errors.InvalidArgumentError(
        "Argument `others` to function `most_specific_common_supertype` "
        "must be a non-empty Sequence.")
  return None
def _assert_self_adjoint(self):
  if all(operator.is_square for operator in self.operators):
    asserts = [operator.assert_self_adjoint() for operator in self.operators]
    return control_flow_ops.group(asserts)
  else:
    raise errors.InvalidArgumentError(
        node_def=None,
        op=None,
        message="All Kronecker factors must be "
        "square for the product to be self adjoint.")
def print_model_analysis(graph,
                         run_meta=None,
                         op_log=None,
                         tfprof_cmd='scope',
                         tfprof_options=TRAINABLE_VARS_PARAMS_STAT_OPTIONS):
  """Print model statistics.

  See go/tfprof or README for examples and tutorials. Run the tfprof tool
  for help: 'bazel run third_party/tensorflow/tools/tfprof help'

  Args:
    graph: tf.Graph.
    run_meta: tensorflow::RunMetadata proto. When provided, also shows valid
        timing and memory information when 'select' option contains
        'micros' and 'bytes'.
    op_log: tensorflow::tfprof::OpLog proto. Users can use this proto to
        group ops together and use an op_type to select the group.
    tfprof_cmd: string. Either 'op', 'scope', 'graph' or 'code'.
        'op' view organizes outputs using operation type (e.g. MatMul).
        'scope' view organizes outputs using graph node name scope.
        'graph' view organizes outputs using graph node inputs/outputs.
        'code' view organizes outputs using Python call stack.
    tfprof_options: See 'tfprof help' for details.

  Returns:
    If tfprof_cmd is 'scope' or 'graph', returns a TFGraphNodeProto proto.
    If tfprof_cmd is 'op' or 'code', returns a TFMultiGraphNodeProto proto.
    Side effect: writes to stdout/file/timeline.json depending on
    tfprof_options['output'].
  """
  # pylint: disable=protected-access
  op_log = tfprof_logger._merge_default_with_oplog(
      graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
  # pylint: enable=protected-access

  opts = _build_options(tfprof_options)

  run_meta_str = run_meta.SerializeToString() if run_meta else b''

  if tfprof_cmd == 'code' or tfprof_cmd == 'op':
    tfprof_node = tfprof_output_pb2.TFMultiGraphNodeProto()
    tfprof_node.ParseFromString(
        print_mdl.PrintModelAnalysis(
            graph.as_graph_def(add_shapes=True).SerializeToString(),
            run_meta_str,
            op_log.SerializeToString(),
            tfprof_cmd.encode('utf-8'),
            opts.SerializeToString()))
  elif tfprof_cmd == 'graph' or tfprof_cmd == 'scope':
    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
    tfprof_node.ParseFromString(
        print_mdl.PrintModelAnalysis(
            graph.as_graph_def(add_shapes=True).SerializeToString(),
            run_meta_str,
            op_log.SerializeToString(),
            tfprof_cmd.encode('utf-8'),
            opts.SerializeToString()))
  else:
    raise errors.InvalidArgumentError(
        None, None, 'unknown tfprof_cmd: %s\n' % tfprof_cmd)

  return tfprof_node
def __init__(self, name, mode):
  self.__name = name
  self.__mode = mode
  self._read_buf = None
  if mode not in ("r", "w", "a", "r+", "w+", "a+"):
    raise errors.InvalidArgumentError(
        None, None, "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")
  self._read_check_passed = mode in ("r", "r+", "a+", "w+")
  self._write_check_passed = mode in ("a", "w", "r+", "a+", "w+")
def _get_coordinatewise_learning_rate(self, grad, var):
  # Compute the learning rate using a moving average for the diagonal of BB^T.
  avg_first = self.get_slot(var, 'first_moment')
  avg_second = self.get_slot(var, 'second_moment')
  decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
  batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

  # Create an estimator for the moving average of gradient mean and variance
  # via Welford's algorithm.
  if isinstance(grad, ops.Tensor):
    delta = grad - avg_first
    first_moment_update = avg_first.assign_add(
        array_ops.where(self._counter < 1,
                        math_ops.cast(1, var.dtype),
                        1. - decay_tensor) * delta)

    with ops.control_dependencies([first_moment_update]):
      second_moment_update = avg_second.assign_add(
          math_ops.cast(self._counter < 1, var.dtype) *
          -(1. - decay_tensor) *
          (avg_second - decay_tensor * math_ops.square(delta)))
    diag_preconditioner = control_flow_ops.with_dependencies(
        [second_moment_update],
        clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
  elif isinstance(grad, ops.IndexedSlices):
    delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
    first_moment_update = state_ops.scatter_add(
        avg_first,
        grad.indices,
        array_ops.where(self._counter < 1,
                        math_ops.cast(1., var.dtype),
                        1. - decay_tensor) * delta)

    with ops.control_dependencies([first_moment_update]):
      avg_second = state_ops.scatter_add(
          avg_second,
          grad.indices,
          math_ops.cast(self._counter < 1, var.dtype) *
          -(1. - decay_tensor) *
          (array_ops.gather_nd(avg_second, grad.indices) -
           decay_tensor * math_ops.square(delta)))
      avg_second = array_ops.gather_nd(avg_second, grad.indices)
      # TODO(b/70783772)
      diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
  else:
    raise errors.InvalidArgumentError(
        None, None, 'grad must be of type Tensor or IndexedSlices')

  diag_preconditioner *= batch_size

  if self._use_single_learning_rate:
    diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

  # From Theorem 2 Corollary 1 of Mandt et al. 2017.
  return 2. * batch_size / (
      math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
      diag_preconditioner)
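# A toy numeric check of the final step-size formula from Mandt et al. 2017
# used above, lr = 2 * batch_size / (num_examples * preconditioner). The
# values below are made up for illustration only.
batch_size = 32.
total_num_examples = 50000.
diag_preconditioner = 0.04  # moving-average estimate of diag(BB^T), clipped
learning_rate = 2. * batch_size / (total_num_examples * diag_preconditioner)
print(learning_rate)  # 64 / 2000 = 0.032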
def _assert_self_adjoint(self):
  if all(operator.is_square for operator in self.operators):
    asserts = [operator.assert_self_adjoint() for operator in self.operators]
    return control_flow_ops.group(asserts)
  else:
    raise errors.InvalidArgumentError(
        node_def=None,
        op=None,
        message="All Kronecker factors must be square for the product to be "
        "self-adjoint. Expected hint `is_square` to be True for every "
        "operator in argument `operators`.")
def __init__(self, name, mode):
  self.__name = name
  self.__mode = mode
  self._read_buf = None
  self._writable_file = None
  self._binary_mode = "b" in mode
  mode = mode.replace("b", "")
  if mode not in ("r", "w", "a", "r+", "w+", "a+"):
    raise errors.InvalidArgumentError(
        None, None, "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")
  self._read_check_passed = mode in ("r", "r+", "a+", "w+")
  self._write_check_passed = mode in ("a", "w", "r+", "a+", "w+")
def _get_graph_callable_inputs(shape_and_dtypes):
  """Maps specified shape_and_dtypes to graph inputs."""
  ret = []
  for x in shape_and_dtypes:
    if isinstance(x, ShapeAndDtype):
      ret.append(array_ops.placeholder(x.dtype, x.shape))
    elif isinstance(x, (tuple, list)):
      ret.append(_get_graph_callable_inputs(x))
    else:
      raise errors.InvalidArgumentError(
          None, None,
          "shape_and_dtypes entry is not a ShapeAndDtype; got type: %s" %
          type(x))

  return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
def recalculate_output_shapes(output_shapes):
  """Recalculates the output_shapes after dividing it by num_workers."""
  if len(output_shapes) < 1:
    raise ValueError("Input shape should have at least one dimension.")
  if (tensor_shape.dimension_value(output_shapes[0]) and
      tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
    raise errors.InvalidArgumentError(
        None, None,
        "First dim of input shape: %d is not divisible by num_workers: %d" %
        (output_shapes[0], num_workers))
  output_dims = [d for d in output_shapes.dims]
  output_dims[0] = output_dims[0] // num_workers
  return tensor_shape.TensorShape(output_dims)
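# A minimal sketch of the same per-worker shape arithmetic using the public
# tf.TensorShape API; num_workers=4 and the shape are made-up example values.
import tensorflow as tf

num_workers = 4
output_shapes = tf.TensorShape([8, 224, 224, 3])  # global batch of 8
assert output_shapes[0] % num_workers == 0        # mirrors the check above
output_dims = list(output_shapes.dims)
output_dims[0] = output_dims[0] // num_workers
per_worker_shape = tf.TensorShape(output_dims)    # [2, 224, 224, 3]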
def _get_graph_callable_inputs(shape_and_dtypes):
  """Maps specified shape_and_dtypes to graph inputs."""
  ret = []
  for x in shape_and_dtypes:
    if isinstance(x, ShapeAndDtype):
      ret.append(array_ops.placeholder(x.dtype, x.shape))
    elif isinstance(x, (tuple, list)):
      ret.append(_get_graph_callable_inputs(x))
    else:
      raise errors.InvalidArgumentError(
          None, None,
          "Expected the argument to @graph_callable to be a "
          "(possibly nested) list or tuple of ShapeAndDtype objects, "
          "but got an object of type: %s" % type(x))

  return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
def seek(self, offset=None, whence=0, position=None):
  # TODO(jhseu): Delete later. Used to omit `position` from docs.
  # pylint: disable=g-doc-args
  """Seeks to the offset in the file.

  Args:
    offset: The byte count relative to the whence argument.
    whence: Valid values for whence are:
      0: start of the file (default)
      1: relative to the current position of the file
      2: relative to the end of file. offset is usually negative.
  """
  # pylint: enable=g-doc-args
  self._preread_check()

  # We needed to make offset a keyword argument for backwards-compatibility.
  # This check exists so that we can convert back to having offset be a
  # positional argument.
  # TODO(jhseu): Make `offset` a positional argument after `position` is
  # deleted.
  if offset is None and position is None:
    raise TypeError("seek(): offset argument required")
  if offset is not None and position is not None:
    raise TypeError("seek(): offset and position may not be set "
                    "simultaneously.")

  if position is not None:
    offset = position

  with errors.raise_exception_on_not_ok_status() as status:
    if whence == 0:
      pass
    elif whence == 1:
      offset += self.tell()
    elif whence == 2:
      offset += self.size()
    else:
      raise errors.InvalidArgumentError(
          None, None,
          "Invalid whence argument: {}. Valid values are 0, 1, or 2."
          .format(whence))
    ret_status = self._read_buf.Seek(offset)
    pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)
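# A short usage sketch, assuming this method is the seek() of the file object
# returned by tf.io.gfile.GFile; the path is a made-up example.
import tensorflow as tf

with tf.io.gfile.GFile('/tmp/example.txt', 'r') as f:
  f.seek(5)            # whence=0 (default): 5 bytes from the start
  tail = f.read()      # everything after the first 5 bytes
  f.seek(0, whence=1)  # no-op: stay at the current position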
def most_specific_common_supertype(self, others: Sequence[trace.TraceType]):
  """See base class."""
  if not others:
    raise errors.InvalidArgumentError(
        "Argument `others` to function `most_specific_common_supertype` "
        "must be a non-empty Sequence.")

  if not all(self._has_same_structure(other) for other in others):
    return None

  new_components = []
  for i, component in enumerate(self.components):
    common = component.most_specific_common_supertype(
        [other.components[i] for other in others])
    if common is None:
      return None
    else:
      new_components.append(common)

  return type(self)(*new_components)
def most_specific_common_supertype(self, others: Sequence[trace.TraceType]):
  """See base class."""
  if not others:
    raise errors.InvalidArgumentError(
        "Argument `others` to function `most_specific_common_supertype` "
        "must be a non-empty Sequence.")

  if not all(self._has_same_structure(other) for other in others):
    return None

  new_mapping = {}
  for key in self.mapping.keys():
    common = self.mapping[key].most_specific_common_supertype(
        [other.mapping[key] for other in others])
    if common is None:
      return None
    else:
      new_mapping[key] = common

  return DictType(new_mapping)
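# A self-contained toy illustration of the recursion above: a structured
# type's supertype is built key-by-key from its leaves, and any leaf
# disagreement collapses the whole result to None. The Leaf/ToyDictType
# classes are stand-ins, not the real tf.types.experimental.TraceType
# implementations.
class Leaf:
  def __init__(self, value):
    self.value = value

  def most_specific_common_supertype(self, others):
    # In this toy model, a leaf is a supertype only of identical leaves.
    return self if all(o.value == self.value for o in others) else None


class ToyDictType:
  def __init__(self, mapping):
    self.mapping = mapping

  def most_specific_common_supertype(self, others):
    new_mapping = {}
    for key in self.mapping:
      common = self.mapping[key].most_specific_common_supertype(
          [other.mapping[key] for other in others])
      if common is None:
        return None
      new_mapping[key] = common
    return ToyDictType(new_mapping)


a = ToyDictType({'x': Leaf(1), 'y': Leaf(2)})
b = ToyDictType({'x': Leaf(1), 'y': Leaf(2)})
c = ToyDictType({'x': Leaf(1), 'y': Leaf(3)})
assert a.most_specific_common_supertype([b]) is not None
assert a.most_specific_common_supertype([c]) is None  # 'y' leaves disagree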
def ngrams(data,
           width,
           axis=-1,
           reduction_type=None,
           string_separator=" ",
           name=None):
  """Creates a tensor of n-grams based on `data`.

  The n-grams are of width `width` and are created along axis `axis`, by
  combining windows of `width` adjacent elements from `data` using
  `reduction_type`. This op is intended to cover basic use cases; more
  complex combinations can be created using the sliding_window op.

  Args:
    data: The data to reduce.
    width: The width of the ngram window. If there is not sufficient data to
      fill out the ngram window, the resulting ngram will be empty.
    axis: The axis to create ngrams along. Note that for string join
      reductions, only axis '-1' is supported; for other reductions, any
      positive or negative axis can be used. Should be a constant.
    reduction_type: A member of the Reduction enum. Should be a constant.
      Currently supports:
      * `Reduction.SUM`: Add values in the window.
      * `Reduction.MEAN`: Average values in the window.
      * `Reduction.STRING_JOIN`: Join strings in the window.
        Note that axis must be -1 here.
    string_separator: The separator string used for `Reduction.STRING_JOIN`.
      Ignored otherwise. Must be a string constant, not a Tensor.
    name: The op name.

  Returns:
    A tensor of ngrams.

  Raises:
    InvalidArgumentError: if `reduction_type` is either None or not a
      Reduction, or if `reduction_type` is STRING_JOIN and `axis` is not -1.
  """
  with tf.name_scope(name, "NGrams", [data, width]):
    if reduction_type is None:
      raise errors.InvalidArgumentError(None, None,
                                        "reduction_type must be specified.")

    if not isinstance(reduction_type, Reduction):
      raise errors.InvalidArgumentError(None, None,
                                        "reduction_type must be a Reduction.")

    # TODO(b/122967921): Lift this restriction after ragged_reduce_join is
    # done.
    if reduction_type is Reduction.STRING_JOIN and axis != -1:
      raise errors.InvalidArgumentError(
          None, None,
          "%s requires that ngrams' 'axis' parameter be -1." %
          Reduction.STRING_JOIN.name)

    windowed_data = sliding_window(data, width, axis)

    if axis < 0:
      reduction_axis = axis
    else:
      reduction_axis = axis + 1

    # Ragged reduction ops work on both Tensor and RaggedTensor, so we can
    # use them here regardless of the type of tensor in 'windowed_data'.
    if reduction_type is Reduction.SUM:
      return tf.reduce_sum(windowed_data, reduction_axis)
    elif reduction_type is Reduction.MEAN:
      return tf.reduce_mean(windowed_data, reduction_axis)
    elif reduction_type is Reduction.STRING_JOIN:
      if isinstance(data, tf.RaggedTensor):
        return tf.ragged.map_flat_values(
            tf.reduce_join, windowed_data, axis=axis,
            separator=string_separator)
      else:
        return tf.reduce_join(
            windowed_data, axis=axis, separator=string_separator)
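# A brief usage sketch, assuming this op is exported as text.ngrams by the
# tensorflow_text package; the token strings are made-up examples.
import tensorflow as tf
import tensorflow_text as text

tokens = tf.constant(["the", "quick", "brown", "fox"])
bigrams = text.ngrams(
    tokens, width=2,
    reduction_type=text.Reduction.STRING_JOIN,
    string_separator=" ")
# -> [b"the quick", b"quick brown", b"brown fox"]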
def profile(graph=None,
            run_meta=None,
            op_log=None,
            cmd='scope',
            options=_DEFAULT_PROFILE_OPTIONS):
  """Profile model.

  Tutorials and examples can be found in:
  https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md

  Args:
    graph: tf.Graph. If None and eager execution is not enabled, use the
        default graph.
    run_meta: optional tensorflow.RunMetadata proto. It is necessary to
        support run-time information profiling, such as time and memory.
    op_log: tensorflow.tfprof.OpLogProto proto. Users can assign "types" to
        graph nodes with op_log. "types" allow users to flexibly group and
        account profiles using options['accounted_type_regexes'].
    cmd: string. Either 'op', 'scope', 'graph' or 'code'.
        'op' view organizes the profile using operation type (e.g. MatMul).
        'scope' view organizes the profile using graph node name scope.
        'graph' view organizes the profile using graph node inputs/outputs.
        'code' view organizes the profile using Python call stack.
    options: A dict of options. See core/profiler/g3doc/options.md.

  Returns:
    If cmd is 'scope' or 'graph', returns a GraphNodeProto proto.
    If cmd is 'op' or 'code', returns a MultiGraphNodeProto proto.
    Side effect: writes to stdout/file/timeline.json depending on
    options['output'].
  """
  if not graph and not context.executing_eagerly():
    graph = ops.get_default_graph()

  if options == _DEFAULT_PROFILE_OPTIONS:
    options = (option_builder.ProfileOptionBuilder
               .trainable_variables_parameter())

  # pylint: disable=protected-access
  op_log = tfprof_logger.merge_default_with_oplog(
      graph, op_log, run_meta, add_trace=cmd == 'code')
  # pylint: enable=protected-access

  opts = _build_options(options)
  run_meta_str = run_meta.SerializeToString() if run_meta else b''
  graph_str = _graph_string(graph)

  if cmd == 'code' or cmd == 'op':
    tfprof_node = tfprof_output_pb2.MultiGraphNodeProto()
    ret = print_mdl.PrintModelAnalysis(graph_str, run_meta_str,
                                       op_log.SerializeToString(),
                                       cmd.encode('utf-8'),
                                       opts.SerializeToString())
    try:
      tfprof_node.ParseFromString(ret)
    except message.DecodeError as e:
      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
  elif cmd == 'graph' or cmd == 'scope':
    tfprof_node = tfprof_output_pb2.GraphNodeProto()
    ret = print_mdl.PrintModelAnalysis(graph_str, run_meta_str,
                                       op_log.SerializeToString(),
                                       cmd.encode('utf-8'),
                                       opts.SerializeToString())
    try:
      tfprof_node.ParseFromString(ret)
    except message.DecodeError as e:
      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
  else:
    raise errors.InvalidArgumentError(None, None, 'unknown cmd: %s\n' % cmd)

  return tfprof_node
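# A brief usage sketch, assuming this function is exposed as
# tf.compat.v1.profiler.profile; it counts trainable-variable parameters of a
# TF1-style graph. The variable name and shape are made-up examples.
import tensorflow.compat.v1 as tf1

g = tf1.Graph()
with g.as_default():
  _ = tf1.get_variable('w', shape=[784, 10])

opts = tf1.profiler.ProfileOptionBuilder.trainable_variables_parameter()
profile_result = tf1.profiler.profile(g, cmd='scope', options=opts)
print(profile_result.total_parameters)  # 7840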
def _assert_positive_definite(self):
  raise errors.InvalidArgumentError(
      node_def=None,
      op=None,
      message="Householder operators are always non-positive definite.")
def __init__(self,
             cluster_resolver,
             checkpoint_or_checkpoint_manager,
             checkpoint_dir=None,
             termination_config=None):
  """Creates the `PreemptionCheckpointHandler`.

  Args:
    cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`
      object. You may also obtain it through the `cluster_resolver` attribute
      of the distribution strategy in use.
    checkpoint_or_checkpoint_manager: a `tf.train.CheckpointManager` or a
      `tf.train.Checkpoint`. If you are using a `tf.train.CheckpointManager`
      to manage checkpoints outside the `PreemptionCheckpointHandler` for
      backup purpose as well, pass it as the
      `checkpoint_or_checkpoint_manager` argument. Otherwise, pass a
      `tf.train.Checkpoint` and the `PreemptionCheckpointHandler` will create
      a `tf.train.CheckpointManager` to manage it in the `checkpoint_dir`.
    checkpoint_dir: a directory where the `PreemptionCheckpointHandler` saves
      and restores checkpoints. When a `PreemptionCheckpointHandler` is
      created, the latest checkpoint in the `checkpoint_dir` will be
      restored. (This is not needed if a `tf.train.CheckpointManager` instead
      of a `tf.train.Checkpoint` is passed as the
      `checkpoint_or_checkpoint_manager` argument.)
    termination_config: optional, a
      `tf.distribute.experimental.TerminationConfig` object to configure for
      a platform other than Google Borg or GCP.
  """
  self._cluster_resolver = cluster_resolver
  if isinstance(checkpoint_or_checkpoint_manager,
                checkpoint_lib.Checkpoint) and not checkpoint_dir:
    raise errors.InvalidArgumentError(
        'When a checkpoint is passed, a checkpoint_dir must be passed as '
        'well.')
  self._id_in_cluster = str(
      multi_worker_util.id_in_cluster(
          self._cluster_resolver.cluster_spec(),
          self._cluster_resolver.task_type,
          self._cluster_resolver.task_id))

  # The number of calls to `PreemptionCheckpointHandler.run` when the latest
  # checkpoint was saved.
  self._checkpointed_runs = variables.Variable(
      initial_value=constant_op.constant(0, dtype=dtypes.int64),
      trainable=False,
      name=_ITERATION_VARIABLE)

  self._maybe_create_checkpoint_manager(checkpoint_or_checkpoint_manager,
                                        checkpoint_dir, cluster_resolver)

  if not hasattr(self._write_checkpoint_manager._checkpoint,
                 _ITERATION_VARIABLE):
    setattr(self._write_checkpoint_manager._checkpoint, _ITERATION_VARIABLE,
            self._checkpointed_runs)

  if not hasattr(self._read_checkpoint_manager._checkpoint,
                 _ITERATION_VARIABLE):
    setattr(self._read_checkpoint_manager._checkpoint, _ITERATION_VARIABLE,
            self._checkpointed_runs)

  self._read_checkpoint_manager.restore_or_initialize()

  # Grace period countdown. Set to True for all workers once they finish
  # timing saving a checkpoint. Once entering this phase, new
  # preemption/maintenance notices will not be handled, since the whole
  # cluster goes down as the worker who first initiates the grace period
  # goes down.
  self._final_checkpoint_countdown = False

  self._estimated_run_time = 0

  # An internal step counter that's restored to checkpointed_iterations when
  # training is restored. It increments by one every time
  # `PreemptionCheckpointHandler.run` is called. Note that in this case, the
  # user must pass a single-step training function to
  # `PreemptionCheckpointHandler.run` instead of a multiple-step one.
  self._run_counter = self._checkpointed_runs.numpy()

  # The worker itself has received a preemption signal.
  self._received_own_sigterm = threading.Event()

  # Some member (could be oneself) has received a preemption signal, and the
  # step number to save a checkpoint has been aligned.
  self._received_checkpoint_step = threading.Event()

  self._platform_device = gce_util.detect_platform()
  if self._platform_device in (gce_util.PlatformDevice.GCE_TPU,
                               gce_util.PlatformDevice.GCE_CPU):
    # While running MultiWorkerMirroredStrategy training with GPUs and CPUs
    # is the same on Borg, GCE CPU VMs and GPU VMs differ in terms of live
    # migration, grace period, etc. We can make it work upon request.
    raise NotImplementedError('PreemptionCheckpointHandler does not support '
                              'training with TPU or CPU device on GCP.')

  completed_termination_config = _complete_config_for_environment(
      self._platform_device, termination_config)
  self._termination_watcher_fn = (
      completed_termination_config.termination_watcher_fn)
  self._exit_fn = completed_termination_config.exit_fn
  self._grace_period = completed_termination_config.grace_period

  # When training is interrupted, we explicitly call the cleanup methods for
  # the thread watching for the local worker's termination signal and the
  # thread watching for cluster-wise information before we save a checkpoint
  # and exit. In the final chapter of the training where no interruption is
  # encountered, we rely on __del__ to clean up. However, there is no
  # guarantee when or whether __del__ is executed, thus we make the threads
  # daemons to avoid them preventing the program from exiting.
  self._cluster_wise_termination_watcher_thread = threading.Thread(
      target=self._watch_step_to_save_key,
      name='PeerTerminationWatcher-%s' % self._id_in_cluster,
      daemon=True)
  logging.info('Start watcher for peer\'s signal.')
  self._cluster_wise_termination_watcher_thread.start()

  self._poll_termination_signal_thread = None

  if completed_termination_config.termination_watcher_fn:
    self._start_polling_for_termination_signal()
  else:
    self._start_watching_for_signal()
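# A brief usage sketch, assuming this class is exposed as
# tf.distribute.experimental.PreemptionCheckpointHandler (TF 2.10+); the
# model, dataset, and checkpoint directory are made-up placeholders.
import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy()
with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
dataset = tf.data.Dataset.from_tensors([1.]).repeat(64).batch(8)

checkpoint = tf.train.Checkpoint(model=model)
handler = tf.distribute.experimental.PreemptionCheckpointHandler(
    strategy.cluster_resolver, checkpoint, '/tmp/ckpt_dir')

@tf.function
def train_step(batch):
  pass  # single-step training body goes here

for batch in dataset:
  # Runs one step, and arranges a cluster-wide checkpoint save and exit if a
  # preemption signal has been received.
  handler.run(train_step, batch)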
def _assert_non_singular(self):
  raise errors.InvalidArgumentError(
      node_def=None,
      op=None,
      message="Zero operators are always non-invertible.")
def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
  """Assert the condition `x == y` holds element-wise.

  Example of adding a dependency to an operation:

  ```python
  with tf.control_dependencies([tf.assert_equal(x, y)]):
    output = tf.reduce_sum(x)
  ```

  This condition holds if for every pair of (possibly broadcast) elements
  `x[i]`, `y[i]`, we have `x[i] == y[i]`.
  If both `x` and `y` are empty, this is trivially satisfied.

  Args:
    x: Numeric `Tensor`.
    y: Numeric `Tensor`, same dtype as and broadcastable to `x`.
    data: The tensors to print out if the condition is False. Defaults to
      error message and first few entries of `x`, `y`.
    summarize: Print this many entries of each tensor.
    message: A string to prefix to the default message.
    name: A name for this operation (optional). Defaults to "assert_equal".

  Returns:
    Op that raises `InvalidArgumentError` if `x == y` is False.
    @compatibility(eager)
    returns None
    @end_compatibility

  Raises:
    InvalidArgumentError: if the check can be performed immediately and
      `x == y` is False. The check can be performed immediately during eager
      execution or if `x` and `y` are statically known.
  """
  message = message or ''
  with ops.name_scope(name, 'assert_equal', [x, y, data]):
    x = ops.convert_to_tensor(x, name='x')
    y = ops.convert_to_tensor(y, name='y')

    if context.executing_eagerly():
      eq = math_ops.equal(x, y)
      condition = math_ops.reduce_all(eq)
      if not condition:
        # Prepare a message with the first elements of x and y.
        summary_msg = ''
        # Default to printing 3 elements like control_flow_ops.Assert (used
        # by graph mode) does.
        summarize = 3 if summarize is None else summarize
        if summarize:
          # reshape((-1,)) is the fastest way to get a flat array view.
          x_np = x.numpy().reshape((-1,))
          y_np = y.numpy().reshape((-1,))
          x_sum = min(x_np.size, summarize)
          y_sum = min(y_np.size, summarize)
          summary_msg = ('First %d elements of x:\n%s\n'
                         'First %d elements of y:\n%s\n' %
                         (x_sum, x_np[:x_sum], y_sum, y_np[:y_sum]))

        index_and_values_str = ''
        if x.shape == y.shape and x.shape.as_list():
          # If the shapes of x and y are the same (and not scalars), get the
          # values that actually differed and their indices. If the shapes
          # are different this information is more confusing than useful.
          mask = math_ops.logical_not(eq)
          indices = array_ops.where(mask)
          indices_np = indices.numpy()
          x_vals = array_ops.boolean_mask(x, mask)
          y_vals = array_ops.boolean_mask(y, mask)
          summarize = min(summarize, indices_np.shape[0])
          index_and_values_str = (
              'Indices of first %s different values:\n%s\n'
              'Corresponding x values:\n%s\n'
              'Corresponding y values:\n%s\n' %
              (summarize, indices_np[:summarize],
               x_vals.numpy().reshape((-1,))[:summarize],
               y_vals.numpy().reshape((-1,))[:summarize]))

        raise errors.InvalidArgumentError(
            node_def=None,
            op=None,
            message='%s\nCondition x == y did not hold.\n%s%s' %
            (message, index_and_values_str, summary_msg))
      return

    if data is None:
      data = [
          message, 'Condition x == y did not hold element-wise:',
          'x (%s) = ' % x.name, x,
          'y (%s) = ' % y.name, y
      ]
    condition = math_ops.reduce_all(math_ops.equal(x, y))
    x_static = tensor_util.constant_value(x)
    y_static = tensor_util.constant_value(y)
    if x_static is not None and y_static is not None:
      condition_static = (x_static == y_static).all()
      _assert_static(condition_static, data)
    return control_flow_ops.Assert(condition, data, summarize=summarize)
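# A quick demonstration of the eager path above, assuming the public alias
# tf.compat.v1.assert_equal; mismatched inputs raise InvalidArgumentError
# immediately instead of returning an assert op.
import tensorflow as tf

tf.compat.v1.assert_equal(tf.constant([1, 2]), tf.constant([1, 2]))  # passes
try:
  tf.compat.v1.assert_equal(tf.constant([1, 2]), tf.constant([1, 3]),
                            message='batch mismatch')
except tf.errors.InvalidArgumentError as e:
  print(e.message)  # includes the differing indices and values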
def trace(service_addr,
          logdir,
          duration_ms,
          worker_list='',
          num_tracing_attempts=3,
          options=None):
  """Sends gRPC requests to profiler servers for on-demand profiling.

  This method will block the calling thread until it receives responses from
  all servers or until the deadline expires. Both single-host and
  multiple-host profiling are supported on CPU, GPU, and TPU. The profiled
  results will be saved by each server to the specified TensorBoard log
  directory (i.e. the directory you save your model checkpoints). Use the
  TensorBoard profile plugin to view the visualization and analysis results.

  Args:
    service_addr: A comma delimited string of gRPC addresses of the workers
      to profile.
      e.g. service_addr='grpc://localhost:6009'
      service_addr='grpc://10.0.0.2:8466,grpc://10.0.0.3:8466'
      service_addr='grpc://localhost:12345,grpc://localhost:23456'
    logdir: Path to save profile data to, typically a TensorBoard log
      directory. This path must be accessible to both the client and server.
      e.g. logdir='gs://your_tb_dir'
    duration_ms: Duration of tracing or monitoring in milliseconds. Must be
      greater than zero.
    worker_list: An optional TPU-only configuration. The list of workers to
      profile in the current session.
    num_tracing_attempts: Optional. Automatically retry N times when no trace
      event is collected (default 3).
    options: profiler.experimental.ProfilerOptions namedtuple for
      miscellaneous profiler options.

  Raises:
    InvalidArgumentError: For when arguments fail validation checks.
    UnavailableError: If no trace event was collected.

  Example usage (CPU/GPU):

  ```python
  # Start a profiler server before your model runs.
  tf.profiler.experimental.server.start(6009)
  # (Model code goes here).
  # Send gRPC request to the profiler server to collect a trace of your
  # model.
  tf.profiler.experimental.client.trace('grpc://localhost:6009',
                                        '/nfs/tb_log', 2000)
  ```

  Example usage (Multiple GPUs):

  ```python
  # E.g. your worker IP addresses are 10.0.0.2, 10.0.0.3, 10.0.0.4, and you
  # would like to schedule start of profiling 1 second from now, for a
  # duration of 2 seconds.
  options = tf.profiler.experimental.ProfilerOptions(delay_ms=1000)
  tf.profiler.experimental.client.trace(
      'grpc://10.0.0.2:8466,grpc://10.0.0.3:8466,grpc://10.0.0.4:8466',
      'gs://your_tb_dir',
      2000,
      options=options)
  ```

  Example usage (TPU):

  ```python
  # Send gRPC request to a TPU worker to collect a trace of your model. A
  # profiler service has been started in the TPU worker at port 8466.
  # E.g. your TPU IP address is 10.0.0.2 and you want to profile for 2
  # seconds.
  tf.profiler.experimental.client.trace('grpc://10.0.0.2:8466',
                                        'gs://your_tb_dir', 2000)
  ```

  Example usage (Multiple TPUs):

  ```python
  # Send gRPC request to a TPU pod to collect a trace of your model on
  # multiple TPUs. A profiler service has been started in all the TPU
  # workers at the port 8466.
  # E.g. your TPU IP addresses are 10.0.0.2, 10.0.0.3, 10.0.0.4, and you
  # want to profile for 2 seconds.
  tf.profiler.experimental.client.trace(
      'grpc://10.0.0.2:8466',
      'gs://your_tb_dir',
      2000,
      '10.0.0.2:8466,10.0.0.3:8466,10.0.0.4:8466')
  ```

  Launch TensorBoard and point it to the same logdir you provided to this
  API.

  ```shell
  # logdir can be gs://your_tb_dir as in the above examples.
  $ tensorboard --logdir=/tmp/tb_log
  ```

  Open your browser and go to localhost:6006/#profile to view profiling
  results.
""" if duration_ms <= 0: raise errors.InvalidArgumentError(None, None, 'duration_ms must be greater than zero.') opts = dict(options._asdict()) if options is not None else {} _pywrap_profiler.trace( _strip_addresses(service_addr, _GRPC_PREFIX), logdir, worker_list, True, duration_ms, num_tracing_attempts, opts)
def sliding_window(data, width, axis=-1, name=None):
  """Builds a sliding window for `data` with a specified width.

  Returns a tensor constructed from `data`, where each element in
  dimension `axis` is a slice of `data` starting at the corresponding
  position, with the given width.  I.e.:

  * `result.shape.ndims = data.shape.ndims + 1`
  * `result[i1..iaxis, a] = data[i1..iaxis, a:a+width]`
    (where `0 <= a < data[i1...iaxis].shape[0] - (width - 1)`).

  Note that each result row (along dimension `axis`) has `width - 1` fewer
  items than the corresponding `data` row.  If a `data` row has fewer than
  `width` items, then the corresponding `result` row will be empty.  If you
  wish for the `result` rows to be the same size as the `data` rows, you can
  use `pad_along_dimension` to add `width - 1` padding elements before
  calling this op.

  Args:
    data: `<dtype> [O1...ON, A, I1...IM]`
      A potentially ragged K-dimensional tensor with outer dimensions of
      size `O1...ON`; axis dimension of size `A`; and inner dimensions of
      size `I1...IM`.  I.e. `K = N + 1 + M`, where `N>=0` and `M>=0`.
    width: An integer constant specifying the width of the window. Must be
      greater than zero.
    axis: An integer constant specifying the axis along which the sliding
      window is computed. Negative axis values from `-K` to `-1` are
      supported.
    name: The name for this op (optional).

  Returns:
    A `K+1` dimensional tensor with the same dtype as `data`, where:

    * `result[i1..iaxis, a]` = `data[i1..iaxis, a:a+width]`
    * `result.shape[:axis]` = `data.shape[:axis]`
    * `result.shape[axis]` = `data.shape[axis] - (width - 1)`
    * `result.shape[axis + 1]` = `width`
    * `result.shape[axis + 2:]` = `data.shape[axis + 1:]`

  #### Examples:

  Sliding window (width=3) across a sequence of tokens:

  ```python
  >>> # input: <string>[sequence_length]
  >>> input = tf.constant(["one", "two", "three", "four", "five", "six"])
  >>> # output: <string>[sequence_length-2, 3]
  >>> output = sliding_window(data=input, width=3, axis=0)
  >>> print(output.eval())
  [["one", "two", "three"],
   ["two", "three", "four"],
   ["three", "four", "five"],
   ["four", "five", "six"]]
  >>> print("Shape: %s -> %s" % (input.shape, output.shape))
  Shape: (6,) -> (4, 3)
  ```

  Sliding window (width=2) across the inner dimension of a ragged matrix
  containing a batch of token sequences:

  ```python
  >>> # input: <string>[num_sentences, (num_words)]
  >>> input = tf.ragged.constant(
  ...     [['Up', 'high', 'in', 'the', 'air'],
  ...      ['Down', 'under', 'water'],
  ...      ['Away', 'to', 'outer', 'space']])
  >>> # output: <string>[num_sentences, (num_words-1), 2]
  >>> output = sliding_window(input, width=2, axis=-1)
  >>> print(output.eval())
  [[['Up', 'high'], ['high', 'in'], ['in', 'the'], ['the', 'air']],
   [['Down', 'under'], ['under', 'water']],
   [['Away', 'to'], ['to', 'outer'], ['outer', 'space']]]
  >>> print("Shape: %s -> %s" % (input.shape, output.shape))
  Shape: (3, ?) -> (3, ?, 2)
  ```

  Sliding window across the second dimension of a 3-D tensor containing
  batches of sequences of embedding vectors:

  ```python
  >>> # input: <int32>[num_sequences, sequence_length, embedding_size]
  >>> input = tf.constant([
  ...     [[1, 1, 1], [2, 2, 1], [3, 3, 1], [4, 4, 1], [5, 5, 1]],
  ...     [[1, 1, 2], [2, 2, 2], [3, 3, 2], [4, 4, 2], [5, 5, 2]]])
  >>> # output: <int32>[num_sequences, sequence_length-1, 2, embedding_size]
  >>> output = sliding_window(data=input, width=2, axis=1)
  >>> print(output.eval())
  [[[[1, 1, 1], [2, 2, 1]], [[2, 2, 1], [3, 3, 1]],
    [[3, 3, 1], [4, 4, 1]], [[4, 4, 1], [5, 5, 1]]],
   [[[1, 1, 2], [2, 2, 2]], [[2, 2, 2], [3, 3, 2]],
    [[3, 3, 2], [4, 4, 2]], [[4, 4, 2], [5, 5, 2]]]]
  >>> print("Shape: %s -> %s" % (input.shape, output.shape))
  Shape: (2, 5, 3) -> (2, 4, 2, 3)
  ```
  """
  with ops.name_scope(name, "SlidingWindow", [data, axis]):
    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name="data")

    if not isinstance(axis, int):
      raise TypeError("axis must be an int")

    if not isinstance(width, int):
      raise TypeError("width must be an int")

    if data.shape.ndims is not None and (axis < -data.shape.ndims or
                                         axis >= data.shape.ndims):
      raise errors.InvalidArgumentError(
          None, None,
          "axis must be between -k <= axis <= -1 OR 0 <= axis < k")

    if width <= 0:
      raise errors.InvalidArgumentError(
          None, None, "width must be an integer greater than 0")

    slices = []
    for start in range(width):
      stop = None if start - width + 1 == 0 else start - width + 1
      if axis >= 0:
        idx = [slice(None)] * axis + [slice(start, stop)]
      else:
        idx = [Ellipsis, slice(start, stop)] + [slice(None)] * (-axis - 1)
      slices.append(data[idx])

    # Stack the slices.
    stack_axis = axis + 1 if axis >= 0 else axis
    return array_ops.stack(slices, stack_axis)
def create_feature_bitmask(tensor, dtype=dtypes.int32, name=None):
  """Packs the innermost dimension of a boolean tensor into integer values.

  `result[i1...iN]` is the integer formed by interpreting the booleans
  `tensor[i1...iN, 0:num_bits]` as individual bits, in big-endian order.
  E.g., if `tensor[i1...iN, 0:num_bits] = [True, False, False, True, False]`,
  then `result[i1...iN] = 0b10010 = 18`.

  The returned tensor is of type `dtype`, if specified; if `dtype` is not
  set, `int32` will be used.

  If `num_bits` is too large to fit in `dtype`, then an exception is raised
  when this op is called (if `num_bits` is statically known) or when it is
  evaluated (if `num_bits` is not statically known).

  Args:
    tensor: `<bool>[D1...DN, num_bits]` The boolean tensor whose innermost
      dimension should be packed to form integer values.
    dtype: The datatype to output for this op (optional).
    name: The name for this op (optional).

  Returns:
    `<dtype>[D1...DN]` An integer tensor formed by interpreting the innermost
    dimension of `tensor` as individual bits.

  Raises:
    ValueError: If the data to be packed is too large for the chosen data
      type.
    ValueError: If the data to be packed is not boolean.
    InvalidArgumentError: If the input tensor is a list, or the dtype is not
      a supported integer type.

  Examples:

  ```python
  >>> assert create_feature_bitmask([True, False, False, True]) == 0b1001
  >>> create_feature_bitmask([[True, False], [False, True], [True, True]])
  [0b10, 0b01, 0b11]
  ```
  """
  with ops.name_scope(name, 'CreateFeatureBitmask', [tensor]):
    if (isinstance(tensor, (list, tuple)) and tensor and
        isinstance(tensor[0], ops.Tensor)):
      raise errors.InvalidArgumentError(
          None, None,
          'CreateFeatureBitmask does not support lists of tensors. Consider '
          'using tf.stack(list, -1) to create a single tensor before '
          'invoking this op.')

    tensor = ops.convert_to_tensor(tensor, dtypes.bool, 'tensor')

    if dtype not in _max_bits.keys():
      raise errors.InvalidArgumentError(
          None, None, 'dtype must be one of: [%s], was %s' %
          (sorted(_max_bits.items(), key=lambda kv: kv[1]), dtype.name))

    integer_data = math_ops.cast(tensor, dtype=dtype)
    shape = tensor.shape
    if shape.ndims is not None and shape.dims[-1].value is not None:
      num_bits = shape.dims[-1].value
      if num_bits > 63:
        raise ValueError(
            'data.shape[-1] must be less than 64, is %d.' % num_bits)
      elif num_bits > _max_bits[dtype]:
        raise ValueError(
            'data.shape[-1] is too large for %s (was %d, cannot exceed %d); '
            'consider switching condense_boolean_tensor to a larger '
            'dtype.' % (dtype.name, num_bits, _max_bits[dtype]))
      bit_masks = constant_op.constant(
          [2**pos for pos in range(num_bits - 1, -1, -1)], dtype)
    else:
      bit_masks = constant_op.constant(
          [2**pos for pos in range(_max_bits[dtype] - 1, -1, -1)], dtype)
      num_bits = array_ops.shape(tensor)[-1]
      with ops.control_dependencies([
          check_ops.assert_less_equal(
              num_bits,
              _max_bits[dtype],
              message='data.shape[-1] is too large for %s (cannot exceed %s)'
              % (dtype.name, _max_bits[dtype]))
      ]):
        # The second slice ("[:num_bits]") is a no-op unless num_bits==0.
        bit_masks = bit_masks[-num_bits:][:num_bits]
    return math_ops.reduce_sum(integer_data * bit_masks, axis=-1)
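# A quick pure-Python check of the big-endian packing described above:
# [True, False, False, True, False] -> 1*16 + 0*8 + 0*4 + 1*2 + 0*1 = 18.
bits = [True, False, False, True, False]
packed = sum(2**i for i, b in enumerate(reversed(bits)) if b)
assert packed == 0b10010 == 18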
def pad_along_dimension(data,
                        axis=-1,
                        left_pad=None,
                        right_pad=None,
                        name=None):
  """Add padding to the beginning and end of data in a specific dimension.

  Returns a tensor constructed from `data`, where each row in dimension
  `axis` is replaced by the concatenation of the left padding followed by the
  row followed by the right padding.  I.e., if `L=left_pad.shape[0]` and
  `R=right_pad.shape[0]`, then:

  ```python
  result[i1...iaxis, 0:L] = left_pad
  result[i1...iaxis, L:-R] = data[i1...iaxis]
  result[i1...iaxis, -R:] = right_pad
  ```

  Args:
    data: `<dtype>[O1...ON, A, I1...IM]` A potentially ragged `K`
      dimensional tensor with outer dimensions of size `O1...ON`; axis
      dimension of size `A`; and inner dimensions of size `I1...IM`.  I.e.
      `K = N + 1 + M`, where `N>=0` and `M>=0`.
    axis: An integer constant specifying the axis along which padding is
      added. Negative axis values from `-K` to `-1` are supported.
    left_pad: `<dtype>[L, I1...IM]` An `M+1` dimensional tensor that should
      be prepended to each row along dimension `axis`; or `None` if no
      padding should be added to the left side.
    right_pad: `<dtype>[R, I1...IM]` An `M+1` dimensional tensor that should
      be appended to each row along dimension `axis`; or `None` if no
      padding should be added to the right side.
    name: The name of this op (optional).

  Returns:
    `<dtype>[O1...ON, L + A + R, I1...IM]`
    A potentially ragged `K` dimensional tensor with outer dimensions of
    size `O1...ON`; padded axis dimension size `L+A+R`; and inner dimensions
    of size `I1...IM`.  If `data` is a `RaggedTensor`, then the returned
    tensor is a `RaggedTensor` with the same `ragged_rank`.
  """
  data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name="data")

  if not isinstance(axis, int):
    raise TypeError("axis must be an int; got %s" % type(axis).__name__)

  if left_pad is None and right_pad is None:
    return data

  with ops.name_scope(name, "PadAlongDimension", [data]):
    if data.shape.ndims is not None and (axis < -data.shape.ndims or
                                         axis >= data.shape.ndims):
      raise errors.InvalidArgumentError(
          None, None,
          "axis must be between -k <= axis <= -1 OR 0 <= axis < k")
    if isinstance(data, ragged_tensor.RaggedTensor):
      axis = _get_positive_axis(axis, data.shape.ndims)

    if left_pad is not None:
      left_pad = ragged_tensor.convert_to_tensor_or_ragged_tensor(
          left_pad, dtype=data.dtype, name="left_pad")
    if right_pad is not None:
      right_pad = ragged_tensor.convert_to_tensor_or_ragged_tensor(
          right_pad, dtype=data.dtype, name="right_pad")

    left_padding = _padding_for_dimension(data, axis, left_pad)
    right_padding = _padding_for_dimension(data, axis, right_pad)

    pieces = [left_padding, data, right_padding]
    # array_ops.concat handles both dense and ragged inputs here, so the two
    # cases collapse into a single return.
    return array_ops.concat([p for p in pieces if p is not None], axis)
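# A brief usage sketch, assuming this op is exposed as
# text.pad_along_dimension in the tensorflow_text package; the sentinel
# strings are made-up examples.
import tensorflow as tf
import tensorflow_text as text

tokens = tf.ragged.constant([['hello', 'world'], ['foo']])
padded = text.pad_along_dimension(
    tokens, axis=-1, left_pad=['<s>'], right_pad=['</s>'])
# -> [['<s>', 'hello', 'world', '</s>'], ['<s>', 'foo', '</s>']]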