def testWithTensorDependencies(self):
  with self.test_session():
    v = tf.Variable(0.0)
    c1 = tf.constant(10)
    c2 = tf.constant(20)

    # c1_with_init_v depends on the init op for v.
    c1_with_init_v = control_flow_ops.with_dependencies(
        name="c1_with_init_v", output_tensor=c1, dependencies=[v.initializer])
    # c2_with_c1_dep depends on the value of c1_with_init_v.
    c2_with_c1_dep = control_flow_ops.with_dependencies(
        name="c2_with_c1_dep", output_tensor=c2,
        dependencies=[c1_with_init_v])

    # Fetching v directly will result in an uninitialized error.
    with self.assertRaisesOpError("Attempting to use uninitialized value"):
      v.eval()

    # Get the value of 'c2_with_c1_dep', which should cause 'v'
    # to be initialized.
    self.assertAllEqual(20, c2_with_c1_dep.eval())

    # Ensure that 'v' is now initialized.
    self.assertAllClose(0.0, v.eval())
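# The contract exercised by the test above: `with_dependencies(dependencies,
# output_tensor)` returns a tensor that yields `output_tensor`'s value but
# only after every op in `dependencies` has run. A minimal standalone sketch
# of that contract, assuming a TF 1.x runtime (names are illustrative, not
# from the test suite):
import tensorflow as tf  # assumes TensorFlow 1.x
from tensorflow.python.ops import control_flow_ops

v = tf.Variable(0.0)
c = tf.constant(20)
# c_checked evaluates to 20, but forces v.initializer to run first.
c_checked = control_flow_ops.with_dependencies([v.initializer], c)

with tf.Session() as sess:
    print(sess.run(c_checked))  # 20; as a side effect, v is now initialized
    print(sess.run(v))          # 0.0, no "uninitialized value" error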
def testIndexedSlices(self):
  for v1_first in [True, False]:
    with self.test_session():
      v1 = tf.Variable(np.array([[0.0, 1.0], [10.0, 11.0],
                                 [20.0, 21.0]]).astype(np.float32))
      v1_at_1 = tf.IndexedSlices(
          control_flow_ops.with_dependencies([v1.initializer], v1.ref()),
          tf.constant([1]))
      v2 = tf.Variable(np.array([[0.1, 1.1], [10.1, 11.1],
                                 [20.1, 21.1]]).astype(np.float32))
      v2_at_1 = tf.IndexedSlices(
          control_flow_ops.with_dependencies([v2.initializer], v2.ref()),
          tf.constant([1]))

      st1, st2 = control_flow_ops.tuple([v1_at_1, v2_at_1])
      g1 = tf.gather(st1.values, st1.indices)
      g2 = tf.gather(st2.values, st2.indices)

      # v1 is not initialized.
      with self.assertRaisesOpError("Attempting to use uninitialized value"):
        v1.eval()
      # v2 is not initialized.
      with self.assertRaisesOpError("Attempting to use uninitialized value"):
        v2.eval()

      if v1_first:
        # Getting g1 initializes v2.
        self.assertAllClose([[10.0, 11.0]], g1.eval())
        self.assertAllClose([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]],
                            v2.eval())
      else:
        # Getting g2 initializes v1.
        self.assertAllClose([[10.1, 11.1]], g2.eval())
        self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
                            v1.eval())
def _check_shapes_dynamic(self, operator, v, diag):
  """Return (v, diag) with Assert dependencies, which check shape."""
  checks = []
  with ops.op_scope([operator, v, diag], 'check_shapes'):
    s_v = array_ops.shape(v)
    r_op = operator.rank()
    r_v = array_ops.rank(v)
    if diag is not None:
      s_d = array_ops.shape(diag)
      r_d = array_ops.rank(diag)

    # Check tensor rank.
    checks.append(check_ops.assert_rank(v, r_op))
    if diag is not None:
      checks.append(check_ops.assert_rank(diag, r_op - 1))

    # Check batch shape.
    checks.append(check_ops.assert_equal(
        operator.batch_shape(), array_ops.slice(s_v, [0], [r_v - 2])))
    if diag is not None:
      checks.append(check_ops.assert_equal(
          operator.batch_shape(), array_ops.slice(s_d, [0], [r_d - 1])))

    # Check event shape.
    checks.append(check_ops.assert_equal(
        operator.vector_space_dimension(), array_ops.gather(s_v, r_v - 2)))
    if diag is not None:
      checks.append(check_ops.assert_equal(
          array_ops.gather(s_v, r_v - 1), array_ops.gather(s_d, r_d - 1)))

    v = control_flow_ops.with_dependencies(checks, v)
    if diag is not None:
      diag = control_flow_ops.with_dependencies(checks, diag)
    return v, diag
def testTensors(self):
  for v1_first in [True, False]:
    with self.test_session():
      v1 = tf.Variable([1.0])
      add1 = tf.add(
          control_flow_ops.with_dependencies([v1.initializer], v1.ref()),
          2.0)
      v2 = tf.Variable([10.0])
      add2 = tf.add(
          control_flow_ops.with_dependencies([v2.initializer], v2.ref()),
          20.0)
      t1, _, t2 = control_flow_ops.tuple([add1, None, add2])

      # v1 is not initialized.
      with self.assertRaisesOpError("Attempting to use uninitialized value"):
        v1.eval()
      # v2 is not initialized.
      with self.assertRaisesOpError("Attempting to use uninitialized value"):
        v2.eval()

      if v1_first:
        # Getting t1 initializes v2.
        self.assertAllClose([3.0], t1.eval())
        self.assertAllClose([10.0], v2.eval())
      else:
        # Getting t2 initializes v1.
        self.assertAllClose([30.0], t2.eval())
        self.assertAllClose([1.0], v1.eval())
def kl_multivariate_normal(loc_one, scale_one, loc_two=0.0, scale_two=1.0):
  """Calculate the KL of multivariate normal distributions with
  diagonal covariances.

  Parameters
  ----------
  loc_one : tf.Tensor
      A 0-D tensor, 1-D tensor of length n, or 2-D tensor of shape M x n
      where each row represents the mean of a n-dimensional Gaussian.
  scale_one : tf.Tensor
      A tensor of same shape as ``loc_one``, representing the
      standard deviation.
  loc_two : tf.Tensor, optional
      A tensor of same shape as ``loc_one``, representing the
      mean of another Gaussian.
  scale_two : tf.Tensor, optional
      A tensor of same shape as ``loc_one``, representing the
      standard deviation of another Gaussian.

  Returns
  -------
  tf.Tensor
      For 0-D or 1-D tensor inputs, outputs the 0-D tensor
      ``KL( N(z; loc_one, scale_one) || N(z; loc_two, scale_two) )``
      For 2-D tensor inputs, outputs the 1-D tensor
      ``[KL( N(z; loc_one[m,:], scale_one[m,:]) ||
      N(z; loc_two[m,:], scale_two[m,:]) )]_{m=1}^M``

  Raises
  ------
  InvalidArgumentError
      If the location variables have Inf or NaN values, or if the scale
      variables are not positive.
  """
  dependencies = [tf.verify_tensor_all_finite(loc_one, msg=''),
                  tf.verify_tensor_all_finite(loc_two, msg=''),
                  tf.assert_positive(scale_one),
                  tf.assert_positive(scale_two)]
  loc_one = control_flow_ops.with_dependencies(dependencies, loc_one)
  scale_one = control_flow_ops.with_dependencies(dependencies, scale_one)
  loc_one = tf.cast(loc_one, tf.float32)
  scale_one = tf.cast(scale_one, tf.float32)

  if loc_two == 0.0 and scale_two == 1.0:
    # With default arguments, we can avoid some intermediate computation.
    out = tf.square(scale_one) + tf.square(loc_one) - \
        1.0 - 2.0 * tf.log(scale_one)
  else:
    loc_two = control_flow_ops.with_dependencies(dependencies, loc_two)
    scale_two = control_flow_ops.with_dependencies(dependencies, scale_two)
    loc_two = tf.cast(loc_two, tf.float32)
    scale_two = tf.cast(scale_two, tf.float32)
    out = tf.square(scale_one / scale_two) + \
        tf.square((loc_two - loc_one) / scale_two) - \
        1.0 + 2.0 * tf.log(scale_two) - 2.0 * tf.log(scale_one)

  if len(out.get_shape()) <= 1:  # scalar or vector
    return 0.5 * tf.reduce_sum(out)
  else:  # matrix
    return 0.5 * tf.reduce_sum(out, 1)
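# A minimal usage sketch for the helper above, assuming the Edward-era
# TF 1.x imports used in this file are in scope; the values are illustrative.
# Closed form per dimension for KL(N(0, 1) || N(1, 2)) is
# log(2) + (1 + 1)/(2 * 4) - 1/2 ~= 0.4431, so the two-dimensional KL below
# should come out ~= 0.8863.
import tensorflow as tf  # assumes TensorFlow 1.x

loc_one = tf.constant([0.0, 0.0])
scale_one = tf.constant([1.0, 1.0])
kl = kl_multivariate_normal(loc_one, scale_one,
                            loc_two=tf.constant([1.0, 1.0]),
                            scale_two=tf.constant([2.0, 2.0]))
with tf.Session() as sess:
    print(sess.run(kl))  # ~0.8863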
def _verify_input(tensor_list, labels, probs_list):
  """Verify that batched inputs are well-formed."""
  checked_probs_list = []
  for probs in probs_list:
    # Since number of classes shouldn't change at runtime, probabilities
    # shape should be fully defined.
    probs.get_shape().assert_is_fully_defined()

    # Probabilities must be 1D.
    probs.get_shape().assert_has_rank(1)

    # Probabilities must be nonnegative and sum to one.
    tol = 1e-6
    prob_sum = math_ops.reduce_sum(probs)
    checked_probs = control_flow_ops.with_dependencies([
        check_ops.assert_non_negative(probs),
        check_ops.assert_less(prob_sum, 1.0 + tol),
        check_ops.assert_less(1.0 - tol, prob_sum)
    ], probs)
    checked_probs_list.append(checked_probs)

  # All probabilities should be the same length.
  prob_length = checked_probs_list[0].get_shape().num_elements()
  for checked_prob in checked_probs_list:
    if checked_prob.get_shape().num_elements() != prob_length:
      raise ValueError('Probability parameters must have the same length.')

  # Labels tensor should only have batch dimension.
  labels.get_shape().assert_has_rank(1)

  for tensor in tensor_list:
    # Data tensor should have a batch dimension.
    shape = tensor.get_shape().with_rank_at_least(1)

    # Data and label batch dimensions must be compatible.
    tensor_shape.dimension_at_index(shape, 0).assert_is_compatible_with(
        labels.get_shape()[0])

  # Data and labels must have the same, strictly positive batch size. Since
  # we can't assume we know the batch size at graph creation, add runtime
  # checks.
  labels_batch_size = array_ops.shape(labels)[0]
  lbl_assert = check_ops.assert_positive(labels_batch_size)

  # Make each tensor depend on its own checks.
  labels = control_flow_ops.with_dependencies([lbl_assert], labels)
  tensor_list = [
      control_flow_ops.with_dependencies([
          lbl_assert,
          check_ops.assert_equal(array_ops.shape(x)[0], labels_batch_size)
      ], x) for x in tensor_list
  ]

  # Label's classes must be integers 0 <= x < num_classes.
  labels = control_flow_ops.with_dependencies([
      check_ops.assert_integer(labels),
      check_ops.assert_non_negative(labels),
      check_ops.assert_less(labels, math_ops.cast(prob_length, labels.dtype))
  ], labels)

  return tensor_list, labels, checked_probs_list
def _maybe_attach_assertion(x):
  if not validate_args:
    return x
  if assert_positive:
    return control_flow_ops.with_dependencies([
        tf.assert_positive(x, message="diagonal part must be positive"),
    ], x)
  return control_flow_ops.with_dependencies([
      tf.assert_none_equal(
          x,
          tf.zeros([], x.dtype),
          message="diagonal part must be non-zero")
  ], x)
def rbf(X, X2=None, lengthscale=1.0, variance=1.0):
  """Radial basis function kernel, also known as the squared
  exponential or exponentiated quadratic. It is defined as

  $k(x, x') = \sigma^2 \exp\Big( -\\frac{1}{2} \sum_{d=1}^D
  \\frac{1}{\ell_d^2} (x_d - x'_d)^2 \Big)$

  for output variance $\sigma^2$ and lengthscale $\ell^2$.

  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.
  If `X2` is not specified, then it evaluates over all pairs of rows in
  `X`, `k(X[i, ], X[j, ])`. The output is a matrix where each entry
  (i, j) is the kernel over the ith and jth rows.

  Args:
    X: tf.Tensor.
      N x D matrix of N data points each with D features.
    X2: tf.Tensor.
      N x D matrix of N data points each with D features.
    lengthscale: tf.Tensor.
      Lengthscale parameter, a positive scalar or D-dimensional vector.
    variance: tf.Tensor.
      Output variance parameter, a positive scalar.

  #### Examples

  ```python
  X = tf.random_normal([100, 5])
  K = ed.rbf(X)
  assert K.shape == (100, 100)
  ```
  """
  lengthscale = tf.convert_to_tensor(lengthscale)
  variance = tf.convert_to_tensor(variance)
  dependencies = [tf.assert_positive(lengthscale),
                  tf.assert_positive(variance)]
  lengthscale = control_flow_ops.with_dependencies(dependencies, lengthscale)
  variance = control_flow_ops.with_dependencies(dependencies, variance)

  X = tf.convert_to_tensor(X)
  X = X / lengthscale
  Xs = tf.reduce_sum(tf.square(X), 1)
  if X2 is None:
    X2 = X
    X2s = Xs
  else:
    X2 = tf.convert_to_tensor(X2)
    X2 = X2 / lengthscale
    X2s = tf.reduce_sum(tf.square(X2), 1)

  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \
      2 * tf.matmul(X, X2, transpose_b=True)
  output = variance * tf.exp(-square / 2)
  return output
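# The `square` term above relies on the standard expansion
# ||x - x'||^2 = x.x + x'.x' - 2 x.x'. A small NumPy check of that identity
# (illustrative only, not part of the library):
import numpy as np

X = np.random.rand(4, 3)
Xs = np.sum(X ** 2, axis=1)
square = Xs[:, None] + Xs[None, :] - 2 * X.dot(X.T)
brute = np.array([[np.sum((a - b) ** 2) for b in X] for a in X])
assert np.allclose(square, brute)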
def kl_multivariate_normal(loc_one, scale_one, loc_two=0.0, scale_two=1.0):
  """Calculate the KL of multivariate normal distributions with
  diagonal covariances.

  Parameters
  ----------
  loc_one : tf.Tensor
      n-dimensional vector, or M x n-dimensional matrix where each
      row represents the mean of a n-dimensional Gaussian
  scale_one : tf.Tensor
      n-dimensional vector, or M x n-dimensional matrix where each
      row represents the standard deviation of a n-dimensional Gaussian
  loc_two : tf.Tensor, optional
      n-dimensional vector, or M x n-dimensional matrix where each
      row represents the mean of a n-dimensional Gaussian
  scale_two : tf.Tensor, optional
      n-dimensional vector, or M x n-dimensional matrix where each
      row represents the standard deviation of a n-dimensional Gaussian

  Returns
  -------
  tf.Tensor
      for scalar or vector inputs, outputs the scalar
      ``KL( N(z; loc_one, scale_one) || N(z; loc_two, scale_two) )``
      for matrix inputs, outputs the vector
      ``[KL( N(z; loc_one[m,:], scale_one[m,:]) ||
      N(z; loc_two[m,:], scale_two[m,:]) )]_{m=1}^M``

  Raises
  ------
  InvalidArgumentError
      If the location variables have Inf or NaN values, or if the scale
      variables are not positive.
  """
  dependencies = [tf.verify_tensor_all_finite(loc_one, msg=''),
                  tf.verify_tensor_all_finite(loc_two, msg=''),
                  tf.assert_positive(scale_one),
                  tf.assert_positive(scale_two)]
  loc_one = control_flow_ops.with_dependencies(dependencies, loc_one)
  scale_one = control_flow_ops.with_dependencies(dependencies, scale_one)

  if loc_two == 0.0 and scale_two == 1.0:
    return 0.5 * tf.reduce_sum(
        tf.square(scale_one) + tf.square(loc_one) -
        1.0 - 2.0 * tf.log(scale_one))
  else:
    # Only wrap loc_two and scale_two after the default-argument check
    # above; wrapping them first would turn the Python floats into tensors
    # and the `== 0.0` comparison would never be true.
    loc_two = control_flow_ops.with_dependencies(dependencies, loc_two)
    scale_two = control_flow_ops.with_dependencies(dependencies, scale_two)
    return 0.5 * tf.reduce_sum(
        tf.square(scale_one / scale_two) +
        tf.square((loc_two - loc_one) / scale_two) -
        1.0 + 2.0 * tf.log(scale_two) - 2.0 * tf.log(scale_one), 1)
def _check_domain_range_possibly_add_asserts(self):
  """Static check of init arg `num_rows`, possibly add asserts."""
  # Possibly add asserts.
  if self._assert_proper_shapes:
    self._num_rows = control_flow_ops.with_dependencies([
        check_ops.assert_rank(
            self._num_rows,
            0,
            message="Argument num_rows must be a 0-D Tensor."),
        check_ops.assert_non_negative(
            self._num_rows,
            message="Argument num_rows must be non-negative."),
    ], self._num_rows)
    self._num_columns = control_flow_ops.with_dependencies([
        check_ops.assert_rank(
            self._num_columns,
            0,
            message="Argument num_columns must be a 0-D Tensor."),
        check_ops.assert_non_negative(
            self._num_columns,
            message="Argument num_columns must be non-negative."),
    ], self._num_columns)

  # Static checks.
  if not self._num_rows.dtype.is_integer:
    raise TypeError("Argument num_rows must be integer type. Found:"
                    " %s" % self._num_rows)
  if not self._num_columns.dtype.is_integer:
    raise TypeError("Argument num_columns must be integer type. Found:"
                    " %s" % self._num_columns)

  num_rows_static = self._num_rows_static
  num_columns_static = self._num_columns_static

  if num_rows_static is not None:
    if num_rows_static.ndim != 0:
      raise ValueError("Argument num_rows must be a 0-D Tensor. Found:"
                       " %s" % num_rows_static)
    if num_rows_static < 0:
      raise ValueError("Argument num_rows must be non-negative. Found:"
                       " %s" % num_rows_static)

  if num_columns_static is not None:
    if num_columns_static.ndim != 0:
      raise ValueError("Argument num_columns must be a 0-D Tensor. Found:"
                       " %s" % num_columns_static)
    if num_columns_static < 0:
      raise ValueError("Argument num_columns must be non-negative. Found:"
                       " %s" % num_columns_static)
def setUpClass(cls):
  cls._dump_root = tempfile.mkdtemp()

  cls._is_gpu_available = test.is_gpu_available()
  if cls._is_gpu_available:
    cls._main_device = "/job:localhost/replica:0/task:0/gpu:0"
  else:
    cls._main_device = "/job:localhost/replica:0/task:0/cpu:0"

  with session.Session() as sess:
    x_init_val = np.array([5.0, 3.0])
    x_init = constant_op.constant(x_init_val, shape=[2])
    x = variables.Variable(x_init, name="control_deps/x")

    y = math_ops.add(x, x, name="control_deps/y")
    y = control_flow_ops.with_dependencies(
        [x], y, name="control_deps/ctrl_dep_y")

    z = math_ops.mul(x, y, name="control_deps/z")
    z = control_flow_ops.with_dependencies(
        [x, y], z, name="control_deps/ctrl_dep_z")

    x.initializer.run()

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugIdentity"],
        debug_urls="file://%s" % cls._dump_root)

    # Invoke Session.run().
    run_metadata = config_pb2.RunMetadata()
    sess.run(z, options=run_options, run_metadata=run_metadata)

    debug_dump = debug_data.DebugDumpDir(
        cls._dump_root, partition_graphs=run_metadata.partition_graphs)

    # Construct the analyzer.
    analyzer = analyzer_cli.DebugAnalyzer(debug_dump)

    # Construct the handler registry.
    cls._registry = debugger_cli_common.CommandHandlerRegistry()

    # Register command handlers.
    cls._registry.register_command_handler(
        "node_info", analyzer.node_info, analyzer.get_help("node_info"),
        prefix_aliases=["ni"])
    cls._registry.register_command_handler(
        "list_inputs", analyzer.list_inputs,
        analyzer.get_help("list_inputs"), prefix_aliases=["li"])
    cls._registry.register_command_handler(
        "list_outputs", analyzer.list_outputs,
        analyzer.get_help("list_outputs"), prefix_aliases=["lo"])
def _initialize_variables(self, data, initial_means=None):
  """Initializes variables.

  Args:
    data: a list of Tensors with data, each row is a new example.
    initial_means: a Tensor with a matrix of means.
  """
  first_shard = data[0]
  # Initialize means: num_classes X 1 X dimensions.
  if initial_means is not None:
    means = array_ops.expand_dims(initial_means, 1)
  else:
    # Sample data randomly.
    means = array_ops.expand_dims(
        _init_clusters_random(data, self._num_classes, self._random_seed),
        1)

  # Initialize covariances.
  if self._covariance_type == FULL_COVARIANCE:
    cov = _covariance(first_shard, False) + self._min_var
    # A matrix per class, num_classes X dimensions X dimensions.
    covs = array_ops.tile(
        array_ops.expand_dims(cov, 0), [self._num_classes, 1, 1])
  elif self._covariance_type == DIAG_COVARIANCE:
    cov = _covariance(first_shard, True) + self._min_var
    # A diagonal per row, num_classes X dimensions.
    covs = array_ops.tile(
        array_ops.expand_dims(array_ops.diag_part(cov), 0),
        [self._num_classes, 1])

  with ops.colocate_with(self._cluster_centers_initialized):
    initialized = control_flow_ops.with_dependencies(
        [means, covs],
        array_ops.identity(self._cluster_centers_initialized))
  self._init_ops = []
  with ops.colocate_with(self._means):
    init_means = state_ops.assign(self._means, means, validate_shape=False)
    init_means = control_flow_ops.with_dependencies(
        [init_means],
        state_ops.assign(self._cluster_centers_initialized, True))
    self._init_ops.append(control_flow_ops.cond(initialized,
                                                control_flow_ops.no_op,
                                                lambda: init_means).op)
  with ops.colocate_with(self._covs):
    init_covs = state_ops.assign(self._covs, covs, validate_shape=False)
    init_covs = control_flow_ops.with_dependencies(
        [init_covs],
        state_ops.assign(self._cluster_centers_initialized, True))
    self._init_ops.append(control_flow_ops.cond(initialized,
                                                control_flow_ops.no_op,
                                                lambda: init_covs).op)
def _verify_input(data, labels, probs_list):
  """Verify that batched inputs are well-formed."""
  checked_probs_list = []
  for probs in probs_list:
    # Probabilities must be able to be converted to non-object numpy array.
    np_probs = np.asarray(probs)
    if np_probs.dtype == np.dtype('object'):
      raise ValueError('Probabilities must be able to be converted to a '
                       'numpy array.')
    checked_probs_list.append(np_probs)

    # Probabilities must sum to one.
    # TODO(joelshor): Investigate whether logits should be passed instead of
    # probs.
    if not np.isclose(np.sum(probs), 1.0):
      raise ValueError('Probabilities must sum to one.')

  # All probabilities should be the same length.
  if not np.array_equiv([probs.shape for probs in checked_probs_list],
                        checked_probs_list[0].shape):
    raise ValueError('Probability parameters must have the same length.')

  # Labels tensor should only have batch dimension.
  labels.get_shape().assert_has_rank(1)

  # Data tensor should have a batch dimension.
  data_shape = data.get_shape().with_rank_at_least(1)

  # Data and label batch dimensions must be compatible.
  data_shape[0].assert_is_compatible_with(labels.get_shape()[0])

  # Data and labels must have the same, strictly positive batch size. Since
  # we can't assume we know the batch size at graph creation, add runtime
  # checks.
  data_batch_size = array_ops.shape(data)[0]
  labels_batch_size = array_ops.shape(labels)[0]

  data = control_flow_ops.with_dependencies(
      [check_ops.assert_positive(data_batch_size),
       check_ops.assert_equal(data_batch_size, labels_batch_size)],
      data)

  # Label's classes must be integers 0 <= x < num_classes.
  labels = control_flow_ops.with_dependencies(
      [check_ops.assert_integer(labels),
       check_ops.assert_non_negative(labels),
       check_ops.assert_less(labels,
                             math_ops.cast(len(probs), labels.dtype))],
      labels)

  return data, labels, checked_probs_list
def _maybe_attach_assertion(x):
  if not validate_args:
    return x
  if assert_positive:
    return control_flow_ops.with_dependencies([
        check_ops.assert_positive(
            array_ops.matrix_diag_part(x),
            message="diagonal part must be positive"),
    ], x)
  return control_flow_ops.with_dependencies([
      check_ops.assert_none_equal(
          array_ops.matrix_diag_part(x),
          array_ops.zeros([], x.dtype),
          message="diagonal part must be non-zero"),
  ], x)
def _maybe_attach_assertion(x):
  if not validate_args:
    return x
  if assert_positive:
    return control_flow_ops.with_dependencies([
        check_ops.assert_positive(
            x, message="diagonal part must be positive"),
    ], x)
  # TODO(b/35157376): Use `assert_none_equal` once it exists.
  return control_flow_ops.with_dependencies([
      check_ops.assert_greater(
          math_ops.abs(x),
          array_ops.zeros([], x.dtype),
          message="diagonal part must be non-zero"),
  ], x)
def _check_alpha(self, alpha):
  alpha = ops.convert_to_tensor(alpha, name='alpha')
  if not self.strict:
    return alpha
  return control_flow_ops.with_dependencies(
      [check_ops.assert_rank_at_least(alpha, 1),
       check_ops.assert_positive(alpha)], alpha)
def _check_diag(self, diag):
  """Verify that `diag` is positive."""
  diag = ops.convert_to_tensor(diag, name="diag")
  if not self.verify_pd:
    return diag
  deps = [check_ops.assert_positive(diag)]
  return control_flow_ops.with_dependencies(deps, diag)
def log_prob(self, x, name="log_prob"):
  """Log prob of observations in `x` under these Gamma distribution(s).

  Args:
    x: tensor of dtype `dtype`, must be broadcastable with `alpha` and
      `beta`.
    name: The name to give this op.

  Returns:
    log_prob: tensor of dtype `dtype`, the log-PDFs of `x`.

  Raises:
    TypeError: if `x` and `alpha` are different dtypes.
  """
  with ops.name_scope(self.name):
    with ops.op_scope([self._alpha, self._beta, x], name):
      alpha = self._alpha
      beta = self._beta
      x = ops.convert_to_tensor(x)
      x = control_flow_ops.with_dependencies(
          [check_ops.assert_positive(x)] if self.strict else [], x)
      contrib_tensor_util.assert_same_float_dtype(tensors=[x],
                                                  dtype=self.dtype)

      return (alpha * math_ops.log(beta) + (alpha - 1) * math_ops.log(x) -
              beta * x - math_ops.lgamma(self._alpha))
def mode(self, name="mode"):
  """Mode of each batch member.

  The mode of a gamma distribution is `(alpha - 1) / beta` when
  `alpha > 1`, and `NaN` otherwise. If `self.strict_statistics` is `True`,
  an exception will be raised rather than returning `NaN`.

  Args:
    name: A name to give this op.

  Returns:
    The mode for every batch member, a `Tensor` with same `dtype` as self.
  """
  alpha = self._alpha
  beta = self._beta
  with ops.name_scope(self.name):
    with ops.op_scope([alpha, beta], name):
      mode_if_defined = (alpha - 1.0) / beta
      if self.strict_statistics:
        one = ops.convert_to_tensor(1.0, dtype=self.dtype)
        return control_flow_ops.with_dependencies(
            [check_ops.assert_less(one, alpha)], mode_if_defined)
      else:
        alpha_ge_1 = alpha >= 1.0
        nan = np.nan * self._ones()
        return math_ops.select(alpha_ge_1, mode_if_defined, nan)
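# Quick numeric sanity check of the `(alpha - 1) / beta` mode formula above,
# using SciPy rather than TF (illustrative only; SciPy's `gamma` takes
# `scale = 1 / beta`):
import numpy as np
from scipy.stats import gamma

alpha, beta = 3.0, 2.0
mode = (alpha - 1.0) / beta  # = 1.0
dist = gamma(a=alpha, scale=1.0 / beta)
# The density at the mode should dominate nearby points.
assert dist.pdf(mode) > dist.pdf(mode - 0.1)
assert dist.pdf(mode) > dist.pdf(mode + 0.1)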
def maybe_check_quadrature_param(param, name, validate_args):
  """Helper which checks validity of `loc` and `scale` init args."""
  with ops.name_scope(name="check_" + name, values=[param]):
    assertions = []
    if param.shape.ndims is not None:
      if param.shape.ndims == 0:
        raise ValueError("Mixing params must be a (batch of) vector; "
                         "{}.rank={} is not at least one.".format(
                             name, param.shape.ndims))
    elif validate_args:
      assertions.append(check_ops.assert_rank_at_least(
          param, 1,
          message=("Mixing params must be a (batch of) vector; "
                   "{}.rank is not at least one.".format(name))))

    # TODO(jvdillon): Remove once we support k-mixtures.
    if param.shape.with_rank_at_least(1)[-1] is not None:
      if param.shape[-1].value != 1:
        raise NotImplementedError(
            "Currently only bimixtures are supported; "
            "{}.shape[-1]={} is not 1.".format(name, param.shape[-1].value))
    elif validate_args:
      assertions.append(check_ops.assert_equal(
          array_ops.shape(param)[-1], 1,
          message=("Currently only bimixtures are supported; "
                   "{}.shape[-1] is not 1.".format(name))))

    if assertions:
      return control_flow_ops.with_dependencies(assertions, param)
    return param
def mode(self, name="mode"):
  """Mode of the distribution.

  Note that the mode for the Dirichlet distribution is only defined when
  `alpha > 1`. This returns the mode when `alpha > 1`, and NaN otherwise.
  If `self.allow_nan_stats` is `False`, an exception will be raised rather
  than returning `NaN`.

  Args:
    name: The name for this op.

  Returns:
    Mode of the Dirichlet distribution.
  """
  with ops.name_scope(self.name):
    with ops.op_scope([self._alpha, self._alpha_0], name):
      one = constant_op.constant(1, self.dtype)
      mode = (self._alpha - 1) / (
          array_ops.expand_dims(self._alpha_0, -1) - math_ops.cast(
              self.event_shape()[0], self.dtype))
      if self.allow_nan_stats:
        return math_ops.select(
            math_ops.greater(self._alpha, 1),
            mode,
            (constant_op.constant(float("NaN"), dtype=self.dtype) *
             array_ops.ones_like(self._alpha, dtype=self.dtype)))
      else:
        return control_flow_ops.with_dependencies([
            check_ops.assert_less(
                one, self._alpha,
                message="mode not defined for components of alpha <= 1")
        ], mode)
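# The closed form implemented above is mode_i = (alpha_i - 1) / (alpha_0 - K)
# with alpha_0 = sum(alpha) and K the number of components. A NumPy
# cross-check against SciPy's Dirichlet density (illustrative only):
import numpy as np
from scipy.stats import dirichlet

alpha = np.array([2.0, 3.0, 5.0])
mode = (alpha - 1.0) / (alpha.sum() - len(alpha))
# Perturb within the simplex; the density at the mode should be higher.
perturbed = mode + np.array([0.01, -0.005, -0.005])
assert dirichlet.pdf(mode, alpha) > dirichlet.pdf(perturbed, alpha)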
def testShape(self):
  with ops.Graph().as_default():
    tensor = tf.constant([1.0, 2.0])
    self.assertEquals([2], tensor.get_shape())
    self.assertEquals([2],
                      control_flow_ops.with_dependencies(
                          [tf.constant(1.0)], tensor).get_shape())
def _maybe_assert_valid_x(self, x):
  if not self.validate_args:
    return x
  is_valid = check_ops.assert_non_negative(
      x,
      message="Forward transformation input must be at least {}.".format(0))
  return control_flow_ops.with_dependencies([is_valid], x)
def testComputeMovingVars(self):
  height, width = 3, 3
  with self.test_session() as sess:
    image_shape = (10, height, width, 3)
    image_values = np.random.rand(*image_shape)
    expected_mean = np.mean(image_values, axis=(0, 1, 2))
    expected_var = np.var(image_values, axis=(0, 1, 2))
    images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
    output = ops.batch_norm(images, decay=0.1)
    update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION)
    with tf.control_dependencies(update_ops):
      barrier = tf.no_op(name='gradient_barrier')
      output = control_flow_ops.with_dependencies([barrier], output)
    # Initialize all variables.
    sess.run(tf.global_variables_initializer())
    moving_mean = variables.get_variables('BatchNorm/moving_mean')[0]
    moving_variance = variables.get_variables('BatchNorm/moving_variance')[0]
    mean, variance = sess.run([moving_mean, moving_variance])
    # After initialization moving_mean == 0 and moving_variance == 1.
    self.assertAllClose(mean, [0] * 3)
    self.assertAllClose(variance, [1] * 3)
    for _ in range(10):
      sess.run([output])
    mean = moving_mean.eval()
    variance = moving_variance.eval()
    # After 10 updates with decay 0.1 moving_mean == expected_mean and
    # moving_variance == expected_var.
    self.assertAllClose(mean, expected_mean)
    self.assertAllClose(variance, expected_var)
def testReuseVars(self):
  height, width = 3, 3
  with self.test_session() as sess:
    image_shape = (10, height, width, 3)
    image_values = np.random.rand(*image_shape)
    expected_mean = np.mean(image_values, axis=(0, 1, 2))
    expected_var = np.var(image_values, axis=(0, 1, 2))
    images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
    output = ops.batch_norm(images, decay=0.1, is_training=False)
    update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION)
    with tf.control_dependencies(update_ops):
      barrier = tf.no_op(name='gradient_barrier')
      output = control_flow_ops.with_dependencies([barrier], output)
    # Initialize all variables.
    sess.run(tf.global_variables_initializer())
    moving_mean = variables.get_variables('BatchNorm/moving_mean')[0]
    moving_variance = variables.get_variables('BatchNorm/moving_variance')[0]
    mean, variance = sess.run([moving_mean, moving_variance])
    # After initialization moving_mean == 0 and moving_variance == 1.
    self.assertAllClose(mean, [0] * 3)
    self.assertAllClose(variance, [1] * 3)
    # Simulate assignment from saver restore.
    init_assigns = [tf.assign(moving_mean, expected_mean),
                    tf.assign(moving_variance, expected_var)]
    sess.run(init_assigns)
    for _ in range(10):
      sess.run([output], {images: np.random.rand(*image_shape)})
    mean = moving_mean.eval()
    variance = moving_variance.eval()
    # Although we feed different images, the moving_mean and moving_variance
    # shouldn't change.
    self.assertAllClose(mean, expected_mean)
    self.assertAllClose(variance, expected_var)
def cumprod(xs):
  """Cumulative product of a tensor along its outer dimension.

  https://github.com/tensorflow/tensorflow/issues/813

  Parameters
  ----------
  xs : tf.Tensor
      A 1-D or higher tensor.

  Returns
  -------
  tf.Tensor
      A tensor with `cumprod` applied along its outer dimension.

  Raises
  ------
  InvalidArgumentError
      If the input has Inf or NaN values.
  """
  dependencies = [tf.verify_tensor_all_finite(xs, msg='')]
  xs = control_flow_ops.with_dependencies(dependencies, xs)
  xs = tf.cast(xs, dtype=tf.float32)

  values = tf.unpack(xs)
  out = []
  prev = tf.ones_like(values[0])
  for val in values:
    s = prev * val
    out.append(s)
    prev = s

  result = tf.pack(out)
  return result
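# A minimal usage sketch for the workaround above, assuming a pre-1.0 TF
# build where `tf.unpack`/`tf.pack` still exist (they were later renamed
# `tf.unstack`/`tf.stack`); values are illustrative:
import tensorflow as tf

x = tf.constant([1.0, 2.0, 3.0, 4.0])
with tf.Session() as sess:
    print(sess.run(cumprod(x)))  # [ 1.  2.  6. 24.]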
def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
  """Instantiates the `AbsoluteValue` bijector.

  Args:
    event_ndims: Python scalar indicating the number of dimensions
      associated with a particular draw from the distribution. Currently
      only zero is supported.
    validate_args: Python `bool` indicating whether arguments should be
      checked for correctness.
    name: Python `str` name given to ops managed by this object.

  Raises:
    ValueError: If `event_ndims` is not zero.
  """
  self._graph_parents = []
  self._name = name

  event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
  event_ndims_const = tensor_util.constant_value(event_ndims)
  if event_ndims_const is not None and event_ndims_const not in (0,):
    raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
  else:
    if validate_args:
      event_ndims = control_flow_ops.with_dependencies(
          [check_ops.assert_equal(
              event_ndims, 0, message="event_ndims was not 0")],
          event_ndims)

  with self._name_scope("init"):
    super(AbsoluteValue, self).__init__(
        event_ndims=event_ndims,
        validate_args=validate_args,
        name=name)
def _entropy(self):
  probs = self._probs
  if self.validate_args:
    probs = control_flow_ops.with_dependencies(
        [check_ops.assert_less(
            probs,
            constant_op.constant(1., probs.dtype),
            message="Entropy is undefined when logits = inf or probs = 1.")],
        probs)
  # Claim: entropy(p) = softplus(s)/p - s
  # where s=logits and p=probs.
  #
  # Proof:
  #
  # entropy(p)
  # := -[(1-p)log(1-p) + plog(p)]/p
  # = -[log(1-p) + plog(p/(1-p))]/p
  # = -[-softplus(s) + ps]/p
  # = softplus(s)/p - s
  #
  # since,
  # log[1-sigmoid(s)]
  # = log[1/(1+exp(s))]
  # = -log[1+exp(s)]
  # = -softplus(s)
  #
  # using the fact that,
  # 1-sigmoid(s) = sigmoid(-s) = 1/(1+exp(s))
  return nn.softplus(self.logits) / probs - self.logits
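# The claim in the comment above is easy to verify numerically. A NumPy
# check of entropy(p) = softplus(s)/p - s with s = logit(p), over a grid of
# p strictly inside (0, 1) (illustrative only):
import numpy as np

p = np.linspace(0.05, 0.95, 19)
s = np.log(p / (1 - p))            # logits
softplus = np.log1p(np.exp(s))     # softplus(s) = log(1 + exp(s))
lhs = -((1 - p) * np.log(1 - p) + p * np.log(p)) / p
rhs = softplus / p - s
assert np.allclose(lhs, rhs)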
def _check_batch_shape_possibly_add_asserts(self):
  """Static check of init arg `batch_shape`, possibly add asserts."""
  if self._batch_shape_arg is None:
    return

  # Possibly add asserts.
  if self._assert_proper_shapes:
    self._batch_shape_arg = control_flow_ops.with_dependencies(
        [
            check_ops.assert_rank(
                self._batch_shape_arg,
                1,
                message="Argument batch_shape must be a 1-D Tensor."),
            check_ops.assert_non_negative(
                self._batch_shape_arg,
                message="Argument batch_shape must be non-negative."),
        ],
        self._batch_shape_arg)

  # Static checks.
  if not self._batch_shape_arg.dtype.is_integer:
    raise TypeError("Argument batch_shape must be integer type. Found:"
                    " %s" % self._batch_shape_arg)

  if self._batch_shape_static is None:
    return  # Cannot do any other static checks.

  if self._batch_shape_static.ndim != 1:
    raise ValueError("Argument batch_shape must be a 1-D Tensor. Found:"
                     " %s" % self._batch_shape_static)

  if np.any(self._batch_shape_static < 0):
    raise ValueError("Argument batch_shape must be non-negative. Found:"
                     "%s" % self._batch_shape_static)
def log_sum_exp(input_tensor, reduction_indices=None, keep_dims=False):
  """Compute the ``log_sum_exp`` of elements in a tensor, taking
  the sum across axes given by ``reduction_indices``.

  Parameters
  ----------
  input_tensor : tf.Tensor
      The tensor to reduce. Should have numeric type.
  reduction_indices : int or list of int, optional
      The dimensions to reduce. If `None` (the default), reduces all
      dimensions.
  keep_dims : bool, optional
      If true, retains reduced dimensions with length 1.

  Returns
  -------
  tf.Tensor
      The reduced tensor.

  Raises
  ------
  InvalidArgumentError
      If the input has Inf or NaN values.
  """
  dependencies = [tf.verify_tensor_all_finite(input_tensor, msg='')]
  input_tensor = control_flow_ops.with_dependencies(dependencies,
                                                    input_tensor)
  input_tensor = tf.cast(input_tensor, dtype=tf.float32)

  x_max = tf.reduce_max(input_tensor, reduction_indices, keep_dims=True)
  return tf.squeeze(x_max) + tf.log(tf.reduce_sum(
      tf.exp(input_tensor - x_max), reduction_indices, keep_dims=keep_dims))
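# A minimal usage sketch, assuming a TF 1.x session. The max-shift is the
# standard trick: log(sum(exp(x))) overflows for large x, while
# log(sum(exp(x - max))) + max does not (values are illustrative):
import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x

x = tf.constant([1000.0, 1000.0])
with tf.Session() as sess:
    print(sess.run(log_sum_exp(x)))                   # ~1000.693, stable
    print(np.log(np.sum(np.exp([1000.0, 1000.0]))))   # inf (naive version)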
def __init__(self,
             df,
             scale_operator,
             cholesky_input_output_matrices=False,
             validate_args=False,
             allow_nan_stats=True,
             name=None):
  """Construct Wishart distributions.

  Args:
    df: `float` or `double` tensor, the degrees of freedom of the
      distribution(s). `df` must be greater than or equal to `k`.
    scale_operator: `float` or `double` instance of `LinearOperator`.
    cholesky_input_output_matrices: Python `bool`. Any function whose input
      or output is a matrix assumes the input is Cholesky and returns a
      Cholesky factored matrix. Example: `log_prob` input takes a Cholesky
      and `sample_n` returns a Cholesky when
      `cholesky_input_output_matrices=True`.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading
      runtime performance. When `False` invalid inputs may silently render
      incorrect outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    TypeError: if scale is not floating-type
    TypeError: if scale.dtype != df.dtype
    ValueError: if df < k, where scale operator event shape is `(k, k)`
  """
  parameters = dict(locals())
  self._cholesky_input_output_matrices = cholesky_input_output_matrices
  with ops.name_scope(name) as name:
    with ops.name_scope("init", values=[df, scale_operator]):
      if not scale_operator.dtype.is_floating:
        raise TypeError(
            "scale_operator.dtype=%s is not a floating-point type" %
            scale_operator.dtype)
      if not scale_operator.is_square:
        raise ValueError("scale_operator must be square.")

      self._scale_operator = scale_operator
      self._df = ops.convert_to_tensor(
          df, dtype=scale_operator.dtype, name="df")
      contrib_tensor_util.assert_same_float_dtype(
          (self._df, self._scale_operator))
      if (self._scale_operator.shape.ndims is None or
          self._scale_operator.shape.dims[-1].value is None):
        self._dimension = math_ops.cast(
            self._scale_operator.domain_dimension_tensor(),
            dtype=self._scale_operator.dtype, name="dimension")
      else:
        self._dimension = ops.convert_to_tensor(
            self._scale_operator.shape.dims[-1].value,
            dtype=self._scale_operator.dtype, name="dimension")
      df_val = tensor_util.constant_value(self._df)
      dim_val = tensor_util.constant_value(self._dimension)
      if df_val is not None and dim_val is not None:
        df_val = np.asarray(df_val)
        if not df_val.shape:
          df_val = [df_val]
        if any(df_val < dim_val):
          raise ValueError(
              "Degrees of freedom (df = %s) cannot be less than "
              "dimension of scale matrix (scale.dimension = %s)"
              % (df_val, dim_val))
      elif validate_args:
        assertions = check_ops.assert_less_equal(
            self._dimension, self._df,
            message=("Degrees of freedom (df = %s) cannot be "
                     "less than dimension of scale matrix "
                     "(scale.dimension = %s)" %
                     (self._dimension, self._df)))
        self._df = control_flow_ops.with_dependencies(
            [assertions], self._df)
      super(_WishartLinearOperator, self).__init__(
          dtype=self._scale_operator.dtype,
          validate_args=validate_args,
          allow_nan_stats=allow_nan_stats,
          reparameterization_type=distribution.FULLY_REPARAMETERIZED,
          parameters=parameters,
          graph_parents=([self._df, self._dimension] +
                         self._scale_operator.graph_parents),
          name=name)
def create_clones(batch_queue):
  with tf.device('/cpu:0'):
    global_step = slim.create_global_step()
    learning_rate = tf.constant(FLAGS.learning_rate, name='learning_rate')
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           momentum=FLAGS.momentum,
                                           name='Momentum')

  # Place clones.
  seglink_loss = 0  # for summary only
  gradients = []
  for clone_idx, gpu in enumerate(config.gpus):
    do_summary = clone_idx == 0  # only summary on the first clone
    # The variables have already been created in config.init_config.
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      with tf.name_scope(config.clone_scopes[clone_idx]) as clone_scope:
        with tf.device(gpu) as clone_device:
          b_image, b_seg_label, b_seg_loc, b_link_label = \
              batch_queue.dequeue()
          net = seglink_symbol.SegLinkNet(inputs=b_image,
                                          data_format=config.data_format)

          # Build the seglink loss.
          net.build_loss(seg_labels=b_seg_label,
                         seg_offsets=b_seg_loc,
                         link_labels=b_link_label,
                         do_summary=do_summary)

          # Gather seglink losses.
          losses = tf.get_collection(tf.GraphKeys.LOSSES, clone_scope)
          # 3 is the number of seglink losses: seg_cls, seg_loc, link_cls.
          assert len(losses) == 3
          total_clone_loss = tf.add_n(losses) / config.num_clones
          seglink_loss = seglink_loss + total_clone_loss

          # Gather regularization loss and add to clone_0 only.
          if clone_idx == 0:
            regularization_loss = tf.add_n(
                tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            total_clone_loss = total_clone_loss + regularization_loss

          # Compute clone gradients; all variables will be updated.
          clone_gradients = optimizer.compute_gradients(total_clone_loss)
          gradients.append(clone_gradients)

  tf.summary.scalar('seglink_loss', seglink_loss)
  tf.summary.scalar('regularization_loss', regularization_loss)

  # Add all gradients together. Note that the gradients do not need to be
  # averaged, because the average operation has already been done on the
  # loss.
  averaged_gradients = sum_gradients(gradients)

  update_op = optimizer.apply_gradients(averaged_gradients,
                                        global_step=global_step)
  train_ops = [update_op]

  # Moving average.
  if FLAGS.using_moving_average:
    tf.logging.info('using moving average in training, with decay = %f' %
                    FLAGS.moving_average_decay)
    ema = tf.train.ExponentialMovingAverage(FLAGS.moving_average_decay)
    ema_op = ema.apply(tf.trainable_variables())
    with tf.control_dependencies([update_op]):  # ema after updating
      train_ops.append(tf.group(ema_op))

  train_op = control_flow_ops.with_dependencies(train_ops, seglink_loss,
                                                name='train_op')
  return train_op
learning_rate = tf.train.exponential_decay(
    args.learning_rate,
    global_step,
    args.learning_rate_decay_steps,
    args.learning_rate_decay,
    staircase=True,
    name='exponential_decay_learning_rate')

optimizer = tf.train.AdamOptimizer(learning_rate)
variables_to_train = tf.trainable_variables()

total_loss, clones_gradients = model_deploy.optimize_clones(
    clones, optimizer, var_list=variables_to_train)
grad_updates = optimizer.apply_gradients(clones_gradients,
                                         global_step=global_step)
update_ops.append(grad_updates)
update_op = tf.group(*update_ops)
train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                  name='train_op')

with tf.name_scope('summaries'):
  end_points = clones[0].outputs
  for end_point in end_points:
    x = end_points[end_point]
    summaries.add(tf.histogram_summary('activations/' + end_point, x))
    summaries.add(
        tf.scalar_summary('sparsity/' + end_point, tf.nn.zero_fraction(x)))
  for variable in slim.get_model_variables():
    summaries.add(tf.histogram_summary(variable.op.name, variable))
  summaries.add(
      tf.scalar_summary('learning_rate', learning_rate,
                        name='learning_rate'))
def _log_prob(self, x):
  x = control_flow_ops.with_dependencies(
      [check_ops.assert_positive(x)] if self.validate_args else [], x)
  return (self.alpha * math_ops.log(self.beta) -
          math_ops.lgamma(self.alpha) -
          (self.alpha + 1.) * math_ops.log(x) - self.beta / x)
def _cdf(self, x):
  x = control_flow_ops.with_dependencies(
      [check_ops.assert_positive(x)] if self.validate_args else [], x)
  # Note that igammac returns the upper regularized incomplete gamma
  # function Q(a, x), which is what we want for the CDF.
  return math_ops.igammac(self.alpha, self.beta / x)
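# Cross-check of the identity used above, InverseGamma CDF(x) = Q(alpha,
# beta / x), against SciPy (SciPy's `invgamma` takes `scale = beta`);
# illustrative only:
import numpy as np
from scipy.stats import invgamma
from scipy.special import gammaincc  # upper regularized incomplete gamma Q

alpha, beta = 3.0, 2.0
x = np.array([0.5, 1.0, 2.0])
assert np.allclose(invgamma.cdf(x, a=alpha, scale=beta),
                   gammaincc(alpha, beta / x))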
def build_graph(top_k):
  keep_prob = tf.placeholder(dtype=tf.float32, shape=[],
                             name='keep_prob')  # dropout keep probability
  images = tf.placeholder(dtype=tf.float32, shape=[None, 64, 64, 1],
                          name='image_batch')
  labels = tf.placeholder(dtype=tf.int64, shape=[None], name='label_batch')
  is_training = tf.placeholder(dtype=tf.bool, shape=[], name='train_flag')
  with tf.device(tf.test.gpu_device_name()):
    # network: conv2d->max_pool2d->conv2d->max_pool2d->conv2d->max_pool2d
    # ->conv2d->conv2d->max_pool2d->fully_connected->fully_connected
    # Provide default args (batch_norm) for slim.conv2d and
    # slim.fully_connected.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        normalizer_fn=slim.batch_norm,
                        normalizer_params={'is_training': is_training}):
      conv3_1 = slim.conv2d(images, 64, [3, 3], 1, padding='SAME',
                            scope='conv3_1')
      max_pool_1 = slim.max_pool2d(conv3_1, [2, 2], [2, 2], padding='SAME',
                                   scope='pool1')
      conv3_2 = slim.conv2d(max_pool_1, 128, [3, 3], padding='SAME',
                            scope='conv3_2')
      max_pool_2 = slim.max_pool2d(conv3_2, [2, 2], [2, 2], padding='SAME',
                                   scope='pool2')
      conv3_3 = slim.conv2d(max_pool_2, 256, [3, 3], padding='SAME',
                            scope='conv3_3')
      max_pool_3 = slim.max_pool2d(conv3_3, [2, 2], [2, 2], padding='SAME',
                                   scope='pool3')
      conv3_4 = slim.conv2d(max_pool_3, 512, [3, 3], padding='SAME',
                            scope='conv3_4')
      conv3_5 = slim.conv2d(conv3_4, 512, [3, 3], padding='SAME',
                            scope='conv3_5')
      max_pool_4 = slim.max_pool2d(conv3_5, [2, 2], [2, 2], padding='SAME',
                                   scope='pool4')

      flatten = slim.flatten(max_pool_4)
      fc1 = slim.fully_connected(slim.dropout(flatten, keep_prob), 1000,
                                 activation_fn=tf.nn.relu, scope='fc1')
      logits = slim.fully_connected(slim.dropout(fc1, keep_prob),
                                    FLAGS.charset_size, activation_fn=None,
                                    scope='fc2')
    # Labels are not one-hot encoded, so use
    # sparse_softmax_cross_entropy_with_logits.
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=labels))
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(logits, 1), labels), tf.float32))

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
      updates = tf.group(*update_ops)
      loss = control_flow_ops.with_dependencies([updates], loss)

    global_step = tf.get_variable("step", [],
                                  initializer=tf.constant_initializer(0.0),
                                  trainable=False)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
    train_op = slim.learning.create_train_op(loss, optimizer,
                                             global_step=global_step)
    probabilities = tf.nn.softmax(logits)

    # Record loss and accuracy curves.
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)
    merged_summary_op = tf.summary.merge_all()
    # Return the top-k predictions with their probabilities, plus the
    # top-k accuracy.
    predicted_val_top_k, predicted_index_top_k = tf.nn.top_k(probabilities,
                                                             k=top_k)
    accuracy_in_top_k = tf.reduce_mean(
        tf.cast(tf.nn.in_top_k(probabilities, labels, top_k), tf.float32))

  return {
      'images': images,
      'labels': labels,
      'keep_prob': keep_prob,
      'top_k': top_k,
      'global_step': global_step,
      'train_op': train_op,
      'loss': loss,
      'is_training': is_training,
      'accuracy': accuracy,
      'accuracy_top_k': accuracy_in_top_k,
      'merged_summary_op': merged_summary_op,
      'predicted_distribution': probabilities,
      'predicted_index_top_k': predicted_index_top_k,
      'predicted_val_top_k': predicted_val_top_k
  }
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError(
        'You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step.
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir,
        splits_to_sizes={'train': FLAGS.num_train_samples,
                         'validation': FLAGS.num_validation_samples,
                         'test': FLAGS.num_test_samples})

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name, is_training=True)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label, bbox] = provider.get(['image', 'label', 'bbox'])
      label -= FLAGS.labels_offset
      bbx = tf.reshape(bbox, [1, 1, 4])

      train_image_size = FLAGS.train_image_size or \
          network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size,
                                     train_image_size, bbox=bbx)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      if 'AuxLogits' in end_points:
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weight=0.4,
            scope='aux_loss')
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weight=1.0)
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn,
                                        [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                   first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.histogram_summary('activations/' + end_point, x))
      summaries.add(
          tf.scalar_summary('sparsity/' + end_point,
                            tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.histogram_summary(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples,
                                               global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(
          tf.scalar_summary('learning_rate', learning_rate,
                            name='learning_rate'))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the
      # chief queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables,
          replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
          total_num_replicas=FLAGS.worker_replicas)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # and returns a train_tensor and summary_op
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones, optimizer, var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(
        tf.scalar_summary('total_loss', total_loss, name='total_loss'))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    update_op = tf.group(*update_ops)
    train_tensor = control_flow_ops.with_dependencies([update_op],
                                                      total_loss,
                                                      name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or
    # _gather_clone_loss().
    summaries |= set(
        tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.merge_summary(list(summaries), name='summary_op')

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def __init__(self,
             df,
             scale=None,
             scale_tril=None,
             input_output_cholesky=False,
             validate_args=False,
             allow_nan_stats=True,
             name="Wishart"):
  """Construct Wishart distributions.

  Args:
    df: `float` or `double` `Tensor`. Degrees of freedom, must be greater
      than or equal to dimension of the scale matrix.
    scale: `float` or `double` `Tensor`. The symmetric positive definite
      scale matrix of the distribution. Exactly one of `scale` and
      `scale_tril` must be passed.
    scale_tril: `float` or `double` `Tensor`. The Cholesky factorization
      of the symmetric positive definite scale matrix of the distribution.
      Exactly one of `scale` and `scale_tril` must be passed.
    input_output_cholesky: Python `bool`. If `True`, functions whose input
      or output have the semantics of samples assume inputs are in
      Cholesky form and return outputs in Cholesky form. In particular,
      if this flag is `True`, input to `log_prob` is presumed of Cholesky
      form and output from `sample`, `mean`, and `mode` are of Cholesky
      form. Setting this argument to `True` is purely a computational
      optimization and does not change the underlying distribution; for
      instance, `mean` returns the Cholesky of the mean, not the mean of
      Cholesky factors. The `variance` and `stddev` methods are unaffected
      by this flag.
      Default value: `False` (i.e., input/output does not have Cholesky
      semantics).
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading
      runtime performance. When `False` invalid inputs may silently render
      incorrect outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    ValueError: if zero or both of `scale` and `scale_tril` are passed in.
  """
  parameters = dict(locals())

  with tf.name_scope(name, values=[scale, scale_tril]) as name:
    with tf.name_scope("init", values=[scale, scale_tril]):
      if (scale is None) == (scale_tril is None):
        raise ValueError("Must pass scale or scale_tril, but not both.")

      if scale is not None:
        scale = tf.convert_to_tensor(scale)
        if validate_args:
          scale = distribution_util.assert_symmetric(scale)
        scale_tril = tf.cholesky(scale)
      else:  # scale_tril is not None
        scale_tril = tf.convert_to_tensor(scale_tril)
        if validate_args:
          scale_tril = control_flow_ops.with_dependencies([
              tf.assert_positive(
                  tf.matrix_diag_part(scale_tril),
                  message="scale_tril must be positive definite"),
              tf.assert_equal(
                  tf.shape(scale_tril)[-1],
                  tf.shape(scale_tril)[-2],
                  message="scale_tril must be square")
          ], scale_tril)

      super(Wishart, self).__init__(
          df=df,
          scale_operator=tf.linalg.LinearOperatorLowerTriangular(
              tril=scale_tril,
              is_non_singular=True,
              is_positive_definite=True,
              is_square=True),
          input_output_cholesky=input_output_cholesky,
          validate_args=validate_args,
          allow_nan_stats=allow_nan_stats,
          name=name)
  self._parameters = parameters
def __init__(self,
             df,
             scale_operator,
             input_output_cholesky=False,
             validate_args=False,
             allow_nan_stats=True,
             name=None):
  """Construct Wishart distributions.

  Args:
    df: `float` or `double` tensor, the degrees of freedom of the
      distribution(s). `df` must be greater than or equal to `k`.
    scale_operator: `float` or `double` instance of `LinearOperator`.
    input_output_cholesky: Python `bool`. If `True`, functions whose input
      or output have the semantics of samples assume inputs are in
      Cholesky form and return outputs in Cholesky form. In particular,
      if this flag is `True`, input to `log_prob` is presumed of Cholesky
      form and output from `sample`, `mean`, and `mode` are of Cholesky
      form. Setting this argument to `True` is purely a computational
      optimization and does not change the underlying distribution; for
      instance, `mean` returns the Cholesky of the mean, not the mean of
      Cholesky factors. The `variance` and `stddev` methods are unaffected
      by this flag.
      Default value: `False` (i.e., input/output does not have Cholesky
      semantics).
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading
      runtime performance. When `False` invalid inputs may silently render
      incorrect outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    TypeError: if scale is not floating-type
    TypeError: if scale.dtype != df.dtype
    ValueError: if df < k, where scale operator event shape is `(k, k)`
  """
  parameters = dict(locals())
  self._input_output_cholesky = input_output_cholesky
  with tf.name_scope(name) as name:
    with tf.name_scope("init", values=[df, scale_operator]):
      if not scale_operator.dtype.is_floating:
        raise TypeError(
            "scale_operator.dtype=%s is not a floating-point type" %
            scale_operator.dtype)
      if not scale_operator.is_square:
        raise ValueError("scale_operator must be square.")

      self._scale_operator = scale_operator
      self._df = tf.convert_to_tensor(df, dtype=scale_operator.dtype,
                                      name="df")
      contrib_tensor_util.assert_same_float_dtype(
          (self._df, self._scale_operator))
      if (self._scale_operator.shape.ndims is None or
          self._scale_operator.shape[-1].value is None):
        self._dimension = tf.cast(
            self._scale_operator.domain_dimension_tensor(),
            dtype=self._scale_operator.dtype, name="dimension")
      else:
        self._dimension = tf.convert_to_tensor(
            self._scale_operator.shape[-1].value,
            dtype=self._scale_operator.dtype, name="dimension")
      df_val = tensor_util.constant_value(self._df)
      dim_val = tensor_util.constant_value(self._dimension)
      if df_val is not None and dim_val is not None:
        df_val = np.asarray(df_val)
        if not df_val.shape:
          df_val = [df_val]
        if any(df_val < dim_val):
          raise ValueError(
              "Degrees of freedom (df = %s) cannot be less than "
              "dimension of scale matrix (scale.dimension = %s)"
              % (df_val, dim_val))
      elif validate_args:
        assertions = tf.assert_less_equal(
            self._dimension, self._df,
            message=("Degrees of freedom (df = %s) cannot be "
                     "less than dimension of scale matrix "
                     "(scale.dimension = %s)" %
                     (self._dimension, self._df)))
        self._df = control_flow_ops.with_dependencies([assertions],
                                                      self._df)
      super(_WishartLinearOperator, self).__init__(
          dtype=self._scale_operator.dtype,
          validate_args=validate_args,
          allow_nan_stats=allow_nan_stats,
          reparameterization_type=tf.distributions.FULLY_REPARAMETERIZED,
          parameters=parameters,
          graph_parents=([self._df, self._dimension] +
                         self._scale_operator.graph_parents),
          name=name)
def squeeze(input: ragged_tensor.Ragged, axis=None, name=None):  # pylint: disable=redefined-builtin
  """Ragged compatible squeeze.

  If `input` is a `tf.Tensor`, then this calls `tf.squeeze`.

  If `input` is a `tf.RaggedTensor`, then this operation takes `O(N)` time,
  where `N` is the number of elements in the squeezed dimensions.

  Args:
    input: A potentially ragged tensor. The input to squeeze.
    axis: An optional list of ints. Defaults to `None`. If the `input` is
      ragged, it only squeezes the dimensions listed. It fails if `input`
      is ragged and axis is []. If `input` is not ragged it calls
      tf.squeeze. Note that it is an error to squeeze a dimension that is
      not 1. It must be in the range of [-rank(input), rank(input)).
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor. Contains the same data as input, but has
    one or more dimensions of size 1 removed.
  """
  with ops.name_scope(name, 'RaggedSqueeze', [input]):
    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    if isinstance(input, ops.Tensor):
      return array_ops.squeeze(input, axis, name)

    if axis is None:
      raise ValueError('Ragged.squeeze must have an axis argument.')
    if isinstance(axis, int):
      axis = [axis]
    elif ((not isinstance(axis, (list, tuple))) or
          (not all(isinstance(d, int) for d in axis))):
      raise TypeError('Axis must be a list or tuple of integers.')

    dense_dims = []
    ragged_dims = []
    # Normalize all the dims in axis to be positive.
    axis = [
        array_ops.get_positive_axis(d, input.shape.ndims, 'axis[%d]' % i,
                                    'rank(input)')
        for i, d in enumerate(axis)
    ]
    for dim in axis:
      if dim > input.ragged_rank:
        dense_dims.append(dim - input.ragged_rank)
      else:
        ragged_dims.append(dim)

    # Make sure the specified ragged dimensions are squeezable.
    assertion_list = []
    scalar_tensor_one = constant_op.constant(1, dtype=input.row_splits.dtype)
    for i, r in enumerate(input.nested_row_lengths()):
      if i + 1 in ragged_dims:
        assertion_list.append(
            control_flow_ops.Assert(
                math_ops.reduce_all(math_ops.equal(r, scalar_tensor_one)),
                ['the given axis (axis = %d) is not squeezable!' % (i + 1)]))
    if 0 in ragged_dims:
      scalar_tensor_two = constant_op.constant(2, dtype=dtypes.int32)
      assertion_list.append(
          control_flow_ops.Assert(
              math_ops.equal(
                  array_ops.size(input.row_splits), scalar_tensor_two),
              ['the given axis (axis = 0) is not squeezable!']))

    # Till now, we are sure that the ragged dimensions are squeezable.
    squeezed_rt = control_flow_ops.with_dependencies(assertion_list,
                                                     input.flat_values)

    if dense_dims:
      # Gives error if the dense dimension is not squeezable.
      squeezed_rt = array_ops.squeeze(squeezed_rt, dense_dims)

    remaining_row_splits = []
    for i, row_split in enumerate(input.nested_row_splits):
      # Each row_splits tensor is for dimension #(i+1).
      if (i + 1) not in ragged_dims:
        remaining_row_splits.append(row_split)
    # Take care of the first row if it is to be squeezed.
    if remaining_row_splits and 0 in ragged_dims:
      remaining_row_splits.pop(0)

    squeezed_rt = RaggedTensor.from_nested_row_splits(squeezed_rt,
                                                      remaining_row_splits)

    # Corner case: when removing all the ragged dimensions and the output
    # is a scalar tensor e.g. ragged.squeeze(ragged.constant([[[1]]])).
    if set(range(0, input.ragged_rank + 1)).issubset(set(ragged_dims)):
      squeezed_rt = array_ops.squeeze(squeezed_rt, [0], name)

    return squeezed_rt
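# A usage sketch of the semantics implemented above, assuming a TF 2.x
# build where `tf.squeeze` dispatches to the ragged implementation for
# RaggedTensor inputs (the input values are illustrative):
import tensorflow as tf

rt = tf.ragged.constant([[[1, 2], [3, 4, 5]]])  # shape [1, None, None]
squeezed = tf.squeeze(rt, axis=[0])             # outer dim has size 1
print(squeezed)  # <tf.RaggedTensor [[1, 2], [3, 4, 5]]>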
def detection_model(features, labels, mode, params): num_classes = params['num_classes'] initial_weights_path = params.get('initial_weights_path', '') log_dir = params['log_dir'] collect_priors_summary = params['collect_priors_summary'] data_format = params.get('data_format', 'NHWC') depth_multiplier = params.get('depth_multiplier', 1.0) priors_rule = params.get('priors_rule', 'caffe') custom_priors = params.get('priors', []) learning_rate = params.get('learning_rate', 0.01) steps_per_epoch = params.get('steps_per_epoch', 1) mobilenet_version = params.get('mobilenet_version', 'v2') weight_regularization = params.get('weight_regularization', 4e-5) optimizer_func = params.get( 'optimizer', lambda learning_rate: tf.train.AdagradOptimizer( learning_rate=learning_rate)) # Override default FileWriter. Don't store the graph definition. # pylint: disable=protected-access tf.summary.FileWriterCache._cache[log_dir] = tf.summary.FileWriter( log_dir, graph=None) if callable(learning_rate): learning_rate = learning_rate() is_training = mode == tf.estimator.ModeKeys.TRAIN ssd = MobileNetSSD( input_tensor=features, num_classes=num_classes, depth_multiplier=depth_multiplier, is_training=is_training, data_format=data_format, priors_rule=priors_rule, priors=custom_priors, mobilenet_version=mobilenet_version, weight_regularization=weight_regularization) # 1. Build model if mode == tf.estimator.ModeKeys.PREDICT: decoded_predictions = ssd.detection_output( use_plain_caffe_format=False) return tf.estimator.EstimatorSpec(mode, predictions=decoded_predictions) assert mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL) targets = ssd.create_targets(labels) # 2. Build GT from annotation if collect_priors_summary: with tf.name_scope('summary/'): assigned_priors = create_tensors_and_streaming_ops_for_assigned_priors( targets, ssd.priors_info, num_classes) detailed_assigned_priors = get_detailed_assigned_priors_summary_tf( assigned_priors, ssd.priors_info) loss_func = MultiboxLoss(neg_pos_ratio=3.0) # 3. 
Build loss-object eval_iteration = tf.get_variable('eval_iteration', initializer=0, dtype=tf.int32, trainable=False) if mode == tf.estimator.ModeKeys.EVAL: eval_print_steps = steps_per_epoch // 50 eval_print_steps = 1 if eval_print_steps == 0 else eval_print_steps every_eval_print_steps = tf.equal( tf.mod(eval_iteration + 1, eval_print_steps), 0) eval_iteration = tf.assign(eval_iteration, eval_iteration + 1) targets = with_dependencies([eval_iteration], targets) loss = loss_func.eval_summary(targets, ssd.predictions) loss = tf.cond( every_eval_print_steps, lambda: tf.Print(loss, [ tf.round(100 * eval_iteration / steps_per_epoch), loss ], '[%][loss]: '), lambda: loss) eval_metric_ops = {} for key, val in loss_func.eval_tensors.items(): eval_metric_ops['loss_function/' + key] = tf.metrics.mean(val) if collect_priors_summary: for key, metric_ops in assigned_priors.items( ): # We need only update ops eval_metric_ops[key] = metric_ops for key, assigned_priors_tensor in detailed_assigned_priors.items( ): eval_metric_ops['prior_histogram/' + key] = (assigned_priors_tensor, tf.no_op()) decoded_predictions = ssd.detection_output( use_plain_caffe_format=False) eval_metric_ops['predictions'] = tf.contrib.metrics.streaming_concat( decoded_predictions) return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops) assert mode == tf.estimator.ModeKeys.TRAIN if initial_weights_path: tf.logging.info('Initialize from: ' + initial_weights_path) ssd.load_weights(initial_weights_path) bboxes = ssd._decode_boxes(ssd.predictions['locs'], priors=ssd.priors[0, 0], variance=ssd.priors[0, 1]) loss = loss_func.loss(targets, ssd.predictions, bboxes) # 4. Compute loss with NMS if collect_priors_summary: with tf.name_scope('summary/'): loss = with_dependencies( [operation for key, (_, operation) in assigned_priors.items()], loss) for name, assigned_priors_tensor in detailed_assigned_priors.items(): tf.summary.scalar(name, tf.reduce_sum(assigned_priors_tensor)) py_func_ops = [] priors_dir = os.path.join(log_dir, 'priors') with tf.name_scope('write_histogram'): every_epoch = tf.equal( tf.mod(tf.train.get_global_step() + 1, steps_per_epoch), 0) for name, (group, _) in assigned_priors.items(): def write_hist2d(): # pylint: disable=cell-var-from-loop return tf.py_func(write_histogram_2d_tf, [ group, pickle.dumps(ssd.priors_info), name, tf.train.get_global_step(), priors_dir ], tf.bool) write_hist2d_once_per_epoch = tf.cond(every_epoch, write_hist2d, tf.no_op) py_func_ops.append(write_hist2d_once_per_epoch) loss = with_dependencies(py_func_ops, loss) optimizer = optimizer_func(learning_rate) tf.summary.scalar('learning_rate', learning_rate) regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) regularization_loss = tf.add_n( regularization_losses, name='loss_function/regularization_losses_sum') total_loss = tf.add(loss, regularization_loss, name='loss_function/total_loss') tf.summary.scalar('loss_function/regularization_loss', regularization_loss) with tf.variable_scope('train_loop'): train_op = optimizer.minimize(total_loss, global_step=tf.train.get_global_step()) return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
def _setup_model_loss(self, update_ops=None, num_classes=6):
  # Use distinct placeholder names; the original code gave both placeholders
  # the same name, which TensorFlow silently uniquifies.
  self.learning_rate_d = tf.placeholder(
      tf.float32, shape=[], name="learning_rate_d_placeholder")
  self.learning_rate_g = tf.placeholder(
      tf.float32, shape=[], name="learning_rate_g_placeholder")

  d_optimizer = self._optimizer(
      self.learning_rate_d,
      optname=self.cnf.get('optname', 'momentum'),
      **self.cnf.get('opt_kwargs', {'decay': 0.9}))
  g_optimizer = self._optimizer(
      self.learning_rate_g,
      optname=self.cnf.get('optname', 'momentum'),
      **self.cnf.get('opt_kwargs', {'decay': 0.9}))
  # Get images and labels for ImageNet and split the batch across GPUs.
  assert self.cnf['batch_size_train'] % self.cnf.get('num_gpus', 1) == 0, (
      'Batch size must be divisible by number of GPUs')

  self.inputs = tf.placeholder(
      tf.float32,
      shape=(None, self.model.image_size[0], self.model.image_size[0], 3),
      name="input")
  self.labels = tf.placeholder(tf.int32, shape=(None,))

  self._tower_loss_semi_supervised(
      self.inputs, self.labels, num_classes=num_classes)

  global_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
  if update_ops is None:
    update_ops = global_update_ops
  else:
    update_ops = set(update_ops)
  # Make sure update_ops are computed before total_loss.
  if update_ops:
    with tf.control_dependencies(update_ops):
      barrier = tf.no_op(name='update_barrier')
      self.d_losses[-1] = control_flow_ops.with_dependencies(
          [barrier], self.d_losses[-1])
      self.g_losses[-1] = control_flow_ops.with_dependencies(
          [barrier], self.g_losses[-1])
      self.d_loss_real = control_flow_ops.with_dependencies(
          [barrier], self.d_loss_real)
      self.d_loss_fake = control_flow_ops.with_dependencies(
          [barrier], self.d_loss_fake)
      self.d_loss_class = control_flow_ops.with_dependencies(
          [barrier], self.d_loss_class)
  t_vars = self._get_vars_semi_supervised()
  if self.clip_by_global_norm:
    self.capped_d_grads = self._clip_grad_global_norms(
        t_vars['d_vars'], self.d_losses[-1], d_optimizer,
        gradient_noise_scale=0.0)
    self.capped_g_grads = self._clip_grad_global_norms(
        t_vars['g_vars'], self.g_losses[-1], g_optimizer,
        gradient_noise_scale=0.0)
  else:
    self.capped_d_grads = self._clip_grad_norms(
        d_optimizer.compute_gradients(self.d_losses[-1], t_vars['d_vars']))
    self.capped_g_grads = self._clip_grad_norms(
        g_optimizer.compute_gradients(self.g_losses[-1], t_vars['g_vars']))
  global_step = tf.get_variable(
      'global_step', [],
      initializer=tf.constant_initializer(0),
      trainable=False)
  if self.gradient_multipliers is not None:
    with tf.name_scope('multiply_grads'):
      self.capped_d_grads = self._multiply_gradients(
          self.capped_d_grads, self.gradient_multipliers)
  apply_d_gradient_op = d_optimizer.apply_gradients(
      self.capped_d_grads, global_step=global_step)
  apply_g_gradient_op = g_optimizer.apply_gradients(
      self.capped_g_grads, global_step=global_step)
  self.train_op_d = control_flow_ops.with_dependencies(
      [apply_d_gradient_op], self.d_losses[-1])
  self.train_op_g = control_flow_ops.with_dependencies(
      [apply_g_gradient_op], self.g_losses[-1])
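# Minimal sketch of the update-barrier idiom used above, with stand-in
# tensors: group the UPDATE_OPS collection behind a no_op "barrier" and gate
# the loss on it, so fetching the loss first runs e.g. batch-norm
# moving-average updates.
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

loss = tf.reduce_mean(tf.random_normal([8]))  # stand-in for a model loss
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if update_ops:
  with tf.control_dependencies(update_ops):
    barrier = tf.no_op(name='update_barrier')
  loss = control_flow_ops.with_dependencies([barrier], loss)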
def create_train_op(total_loss,
                    optimizer,
                    global_step=None,
                    update_ops=None,
                    variables_to_train=None,
                    clip_gradient_norm=0,
                    summarize_gradients=False,
                    gate_gradients=tf_optimizer.Optimizer.GATE_OP,
                    aggregation_method=None,
                    colocate_gradients_with_ops=False,
                    gradient_multipliers=None):
  """Creates an `Operation` that evaluates the gradients and returns the loss.

  Args:
    total_loss: A `Tensor` representing the total loss.
    optimizer: A tf.Optimizer to use for computing the gradients.
    global_step: A `Tensor` representing the global step variable. If left as
      `None`, then slim.variables.global_step() is used.
    update_ops: An optional list of updates to execute. Note that the
      update_ops that are used are the union of those update_ops passed to
      the function and the value of slim.ops.GetUpdateOps(). Therefore, if
      `update_ops` is None, then the value of slim.ops.GetUpdateOps() is
      still used.
    variables_to_train: An optional list of variables to train. If None, it
      will default to all tf.trainable_variables().
    clip_gradient_norm: If greater than 0 then the gradients will be clipped
      by it.
    summarize_gradients: Whether or not to add summaries for each gradient.
    gate_gradients: How to gate the computation of gradients. See
      tf.Optimizer.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: Whether or not to try colocating the
      gradients with the ops that generated them.
    gradient_multipliers: A dictionary of either `Variables` or `Variable` op
      names to the coefficient by which the associated gradient should be
      scaled.

  Returns:
    A `Tensor` that, when evaluated, computes the gradients and returns the
      total loss value.
  """
  if global_step is None:
    global_step = variables.get_or_create_global_step()

  # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None.
  global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
  if update_ops is None:
    update_ops = global_update_ops
  else:
    update_ops = set(update_ops)
  if not global_update_ops.issubset(update_ops):
    logging.warning('update_ops in create_train_op does not contain all the '
                    'update_ops in GraphKeys.UPDATE_OPS')

  # Make sure update_ops are computed before total_loss.
  if update_ops:
    with ops.control_dependencies(update_ops):
      barrier = control_flow_ops.no_op(name='update_barrier')
    total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

  if variables_to_train is None:
    # Default to tf.trainable_variables().
    variables_to_train = tf_variables.trainable_variables()
  else:
    # Make sure that variables_to_train are in tf.trainable_variables().
    for v in variables_to_train:
      assert v in tf_variables.trainable_variables()

  assert variables_to_train

  # Create the gradients. Note that apply_gradients adds the gradient
  # computation to the current graph.
  grads = optimizer.compute_gradients(
      total_loss,
      variables_to_train,
      gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)

  # Scale gradients.
  if gradient_multipliers:
    grads = multiply_gradients(grads, gradient_multipliers)

  # Clip gradients.
  if clip_gradient_norm > 0:
    grads = clip_gradient_norms(grads, clip_gradient_norm)

  # Summarize gradients.
  if summarize_gradients:
    add_gradients_summaries(grads)

  # Create gradient updates.
  grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

  # Make sure total_loss is valid.
  total_loss = array_ops.check_numerics(total_loss,
                                        'LossTensor is inf or nan')

  # Ensure the train_tensor computes grad_updates.
  return control_flow_ops.with_dependencies([grad_updates], total_loss)
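# Hedged usage sketch for create_train_op above: each session.run(train_op)
# applies one gradient step and returns the (numerics-checked) loss value.
# `build_loss` is a hypothetical helper, not defined in this file.
import tensorflow as tf

loss = build_loss()  # hypothetical: returns a 0-D loss tensor
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
train_op = create_train_op(loss, optimizer, clip_gradient_norm=5.0)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(100):
    loss_value = sess.run(train_op)  # runs grad_updates, then returns loss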
def _create_scale_operator(self, identity_multiplier, diag, tril,
                           perturb_diag, perturb_factor, shift,
                           validate_args):
  """Construct `scale` from various components.

  Args:
    identity_multiplier: floating point rank 0 `Tensor` representing a
      scaling done to the identity matrix.
    diag: Floating-point `Tensor` representing the diagonal matrix.
      `scale_diag` has shape [N1, N2, ... k], which represents a k x k
      diagonal matrix.
    tril: Floating-point `Tensor` representing the lower triangular matrix.
      `scale_tril` has shape [N1, N2, ... k, k], which represents a k x k
      lower triangular matrix.
    perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
      the low rank update.
    perturb_factor: Floating-point `Tensor` representing factor matrix.
    shift: Floating-point `Tensor` representing the `shift` in
      `scale @ X + shift`.
    validate_args: Python `bool` indicating whether arguments should be
      checked for correctness.

  Returns:
    scale. In the case of scaling by a constant, scale is a floating point
    `Tensor`. Otherwise, scale is a `LinearOperator`.

  Raises:
    ValueError: if all of `tril`, `diag` and `identity_multiplier` are
      `None`.
  """
  identity_multiplier = _as_tensor(identity_multiplier,
                                   "identity_multiplier")
  diag = _as_tensor(diag, "diag")
  tril = _as_tensor(tril, "tril")
  perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
  perturb_factor = _as_tensor(perturb_factor, "perturb_factor")

  # If possible, use the low rank update to infer the shape of
  # the identity matrix, when scale represents a scaled identity matrix
  # with a low rank update.
  shape_hint = None
  if perturb_factor is not None:
    shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)

  if self._is_only_identity_multiplier:
    if validate_args:
      return control_flow_ops.with_dependencies(
          [check_ops.assert_none_equal(
              identity_multiplier,
              array_ops.zeros([], identity_multiplier.dtype),
              ["identity_multiplier should be non-zero."])],
          identity_multiplier)
    return identity_multiplier

  scale = distribution_util.make_tril_scale(
      loc=shift,
      scale_tril=tril,
      scale_diag=diag,
      scale_identity_multiplier=identity_multiplier,
      validate_args=validate_args,
      assert_positive=False,
      shape_hint=shape_hint)

  if perturb_factor is not None:
    return linalg.LinearOperatorLowRankUpdate(
        scale,
        u=perturb_factor,
        diag_update=perturb_diag,
        is_diag_update_positive=perturb_diag is None,
        is_non_singular=True,  # Implied by is_positive_definite=True.
        is_self_adjoint=True,
        is_positive_definite=True,
        is_square=True)

  return scale
def resize_image_bboxes_with_crop_or_pad(image, bboxes,
                                         target_height, target_width):
  """Crops and/or pads an image to a target width and height.

  Resizes an image to a target width and height by either centrally
  cropping the image or padding it evenly with zeros. Bounding boxes are
  adjusted so they keep referring to the same image content.

  If `width` or `height` is greater than the specified `target_width` or
  `target_height` respectively, this op centrally crops along that dimension.
  If `width` or `height` is smaller than the specified `target_width` or
  `target_height` respectively, this op centrally pads with 0 along that
  dimension.

  Args:
    image: 3-D tensor of shape `[height, width, channels]`.
    bboxes: Bounding boxes associated with `image`; they are cropped or
      padded consistently with the image.
    target_height: Target height.
    target_width: Target width.

  Raises:
    ValueError: if `target_height` or `target_width` are zero or negative.

  Returns:
    Cropped and/or padded image of shape
    `[target_height, target_width, channels]`, together with the
    correspondingly adjusted bounding boxes.
  """
  with tf.name_scope('resize_with_crop_or_pad'):
    image = ops.convert_to_tensor(image, name='image')

    assert_ops = []
    assert_ops += _Check3DImage(image, require_static=False)
    assert_ops += _assert(target_width > 0, ValueError,
                          'target_width must be > 0.')
    assert_ops += _assert(target_height > 0, ValueError,
                          'target_height must be > 0.')

    image = control_flow_ops.with_dependencies(assert_ops, image)
    # `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
    # Make sure our checks come first, so that error messages are clearer.
    if _is_tensor(target_height):
      target_height = control_flow_ops.with_dependencies(
          assert_ops, target_height)
    if _is_tensor(target_width):
      target_width = control_flow_ops.with_dependencies(
          assert_ops, target_width)

    def max_(x, y):
      if _is_tensor(x) or _is_tensor(y):
        return math_ops.maximum(x, y)
      else:
        return max(x, y)

    def min_(x, y):
      if _is_tensor(x) or _is_tensor(y):
        return math_ops.minimum(x, y)
      else:
        return min(x, y)

    def equal_(x, y):
      if _is_tensor(x) or _is_tensor(y):
        return math_ops.equal(x, y)
      else:
        return x == y

    height, width, _ = _ImageDimensions(image)
    width_diff = target_width - width
    offset_crop_width = max_(-width_diff // 2, 0)
    offset_pad_width = max_(width_diff // 2, 0)

    height_diff = target_height - height
    offset_crop_height = max_(-height_diff // 2, 0)
    offset_pad_height = max_(height_diff // 2, 0)

    # Maybe crop if needed.
    height_crop = min_(target_height, height)
    width_crop = min_(target_width, width)
    cropped = tf.image.crop_to_bounding_box(image, offset_crop_height,
                                            offset_crop_width,
                                            height_crop, width_crop)
    bboxes = bboxes_crop_or_pad(bboxes,
                                height, width,
                                -offset_crop_height, -offset_crop_width,
                                height_crop, width_crop)
    # Maybe pad if needed.
    resized = tf.image.pad_to_bounding_box(cropped, offset_pad_height,
                                           offset_pad_width,
                                           target_height, target_width)
    bboxes = bboxes_crop_or_pad(bboxes,
                                height_crop, width_crop,
                                offset_pad_height, offset_pad_width,
                                target_height, target_width)

    # In theory all the checks below are redundant.
    if resized.get_shape().ndims is None:
      raise ValueError('resized contains no shape.')

    resized_height, resized_width, _ = _ImageDimensions(resized)

    assert_ops = []
    assert_ops += _assert(equal_(resized_height, target_height), ValueError,
                          'resized height is not correct.')
    assert_ops += _assert(equal_(resized_width, target_width), ValueError,
                          'resized width is not correct.')

    resized = control_flow_ops.with_dependencies(assert_ops, resized)
    return resized, bboxes
def body(i): new_u = state_ops.assign_add(u, v) new_i = math_ops.add(i, 1) op = control_flow_ops.group(new_u) new_i = control_flow_ops.with_dependencies([op], new_i) return [new_i]
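# Sketch of how the loop body above is driven by a while loop. Only `i` is a
# loop variable; with_dependencies forces the assign_add on `u` to run each
# iteration anyway. The original test defines `u` and `v` before `body`,
# e.g. as below, so the closure resolves them.
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

u = tf.Variable(0.0)
v = tf.constant(1.0)
r = control_flow_ops.while_loop(lambda i: i < 5, body, [tf.constant(0)])
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(r)
  print(sess.run(u))  # 5.0: the gated assign_add ran once per iteration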
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training
      step.
    optimizer: string, class or optimizer instance, used as trainer.
      string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'.
        Full list in OPTIMIZER_CLS_NAMES constant.
      class should be sub-class of tf.Optimizer that implements
        `compute_gradients` and `apply_gradients` functions.
      optimizer instance should be an instantiation of a tf.Optimizer
        sub-class and have `compute_gradients` and `apply_gradients`
        functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by
      this value.
    gradient_multipliers: dict of variables or variable names to floats.
      If present, gradients for specified variables will be multiplied by
      given constant.
    clip_gradients: float or `None`, clips gradients by this value.
    moving_average_decay: float or None; if not None, a moving average of the
      loss with this decay is used, which smooths learning in the presence of
      outliers.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
      `Tensor`s, returns `Tensor`. Can be used to implement any learning rate
      decay function. For example: tf.train.exponential_decay.
    update_ops: list of update `Operation`s to execute at each step. If
      `None`, uses elements of UPDATE_OPS collection.
    variables: list of variables to optimize or `None` to use all trainable
      variables.
    name: The name for this operation. Used to scope operations and
      summaries.

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  with vs.variable_op_scope([loss, global_step], name, "OptimizeLoss"):
    # Update ops take the UPDATE_OPS collection if not provided.
    update_ops = (set(update_ops or []) or
                  set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)))
    # Make sure update ops are run before computing loss.
    if update_ops:
      with ops.control_dependencies(update_ops):
        barrier = control_flow_ops.no_op(name="update_barrier")
      loss = control_flow_ops.with_dependencies([barrier], loss)

    # Moving average of the loss with decay.
    if moving_average_decay is not None:
      # Generate moving averages of the loss.
      loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                     name="avg")
      loss_averages_op = loss_averages.apply([loss])
      logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
      loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

    # Learning rate variable, with possible decay.
    if (isinstance(learning_rate, ops.Tensor) and
        learning_rate.get_shape().ndims == 0):
      lr = learning_rate
    elif isinstance(learning_rate, float):
      lr = vs.get_variable(
          "learning_rate", [],
          trainable=False,
          initializer=init_ops.constant_initializer(learning_rate))
    else:
      raise ValueError("Learning rate should be 0d Tensor or float. "
                       "Got %s of type %s" %
                       (str(learning_rate), str(type(learning_rate))))
    if learning_rate_decay_fn is not None:
      lr = learning_rate_decay_fn(lr, global_step)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s."
% (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr) elif isinstance(optimizer, type) and issubclass( optimizer, optimizer_.Optimizer): opt = optimizer(learning_rate=lr) elif isinstance(optimizer, optimizer_.Optimizer): opt = optimizer else: raise ValueError("Unrecognized optimizer: should be string, " "subclass of Optimizer or instance of " "subclass of Optimizer. Got %s." % str(optimizer)) # All trainable variables, if specific variables are not specified. if variables is None: variables = vars_.trainable_variables() # Compute gradients. gradients = opt.compute_gradients(loss, variables) # Optionally add gradient noise. if gradient_noise_scale is not None: gradients = _add_scaled_noise_to_gradients(gradients, gradient_noise_scale) # Multiply some gradients. if gradient_multipliers is not None: gradients = _multiply_gradients(gradients, gradient_multipliers) # Optionally clip gradients by global norm. if clip_gradients is not None: gradients = _clip_gradients_by_norm(gradients, clip_gradients) # Add scalar summary for loss. logging_ops.scalar_summary("loss", loss) # Add histograms for variables, gradients and gradient norms. for gradient, variable in gradients: if isinstance(gradient, ops.IndexedSlices): grad_values = gradient.values else: grad_values = gradient if grad_values is not None: logging_ops.histogram_summary(variable.name, variable) logging_ops.histogram_summary(variable.name + "/gradients", grad_values) logging_ops.histogram_summary( variable.name + "/gradient_norm", clip_ops.global_norm([grad_values])) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, global_step=global_step, name="train") # Make sure total_loss is valid. final_loss = array_ops.check_numerics(loss, "Loss is inf or nan") # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss) return train_tensor
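# Hedged usage sketch for optimize_loss above; `my_loss` is a stand-in 0-D
# loss tensor, the string name is assumed to be a key of OPTIMIZER_CLS_NAMES,
# and tf.train.get_or_create_global_step is assumed available in this
# TensorFlow version.
import tensorflow as tf

my_loss = tf.reduce_mean(tf.square(tf.random_normal([8])))
train_op = optimize_loss(
    loss=my_loss,
    global_step=tf.train.get_or_create_global_step(),
    learning_rate=0.1,
    optimizer="SGD",
    clip_gradients=5.0,
    learning_rate_decay_fn=lambda lr, step: tf.train.exponential_decay(
        lr, step, decay_steps=1000, decay_rate=0.96))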
def main(_): if not FLAGS.dataset_dir: raise ValueError('You must supply the dataset directory with --dataset_dir') tf.logging.set_verbosity(tf.logging.DEBUG) with tf.Graph().as_default(): # Config model_deploy. Keep TF Slim Models structure. # Useful if want to need multiple GPUs and/or servers in the future. deploy_config = model_deploy.DeploymentConfig( num_clones=FLAGS.num_clones, clone_on_cpu=FLAGS.clone_on_cpu, replica_id=0, num_replicas=1, num_ps_tasks=0) # Create global_step. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() # Select the dataset. dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) # Get the SSD network and its anchors. ssd_class = nets_factory.get_network(FLAGS.model_name) ssd_params = ssd_class.default_params._replace(num_classes=FLAGS.num_classes) ssd_net = ssd_class(ssd_params) ssd_shape = ssd_net.params.img_shape ssd_anchors = ssd_net.anchors(ssd_shape) # Select the preprocessing function. preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=True) tf_utils.print_configuration(FLAGS.__flags, ssd_params, dataset.data_sources, FLAGS.train_dir) # =================================================================== # # Create a dataset provider and batches. # =================================================================== # with tf.device(deploy_config.inputs_device()): with tf.name_scope(FLAGS.dataset_name + '_data_provider'): provider = slim.dataset_data_provider.DatasetDataProvider( dataset, num_readers=FLAGS.num_readers, common_queue_capacity=20 * FLAGS.batch_size, common_queue_min=10 * FLAGS.batch_size, shuffle=True) # Get for SSD network: image, labels, bboxes. [image, shape, glabels, gbboxes] = provider.get(['image', 'shape', 'object/label', 'object/bbox']) # Pre-processing image, labels and bboxes. image, glabels, gbboxes = \ image_preprocessing_fn(image, glabels, gbboxes, out_shape=ssd_shape, data_format=DATA_FORMAT) # Encode groundtruth labels and bboxes. gclasses, glocalisations, gscores = \ ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors) batch_shape = [1] + [len(ssd_anchors)] * 3 # Training batches and queue. r = tf.train.batch( tf_utils.reshape_list([image, gclasses, glocalisations, gscores]), batch_size=FLAGS.batch_size, num_threads=FLAGS.num_preprocessing_threads, capacity=5 * FLAGS.batch_size) b_image, b_gclasses, b_glocalisations, b_gscores = \ tf_utils.reshape_list(r, batch_shape) # Intermediate queueing: unique batch computation pipeline for all # GPUs running the training. batch_queue = slim.prefetch_queue.prefetch_queue( tf_utils.reshape_list([b_image, b_gclasses, b_glocalisations, b_gscores]), capacity=2 * deploy_config.num_clones) # =================================================================== # # Define the model running on every GPU. # =================================================================== # def clone_fn(batch_queue): """Allows data parallelism by creating multiple clones of network_fn.""" # Dequeue batch. b_image, b_gclasses, b_glocalisations, b_gscores = \ tf_utils.reshape_list(batch_queue.dequeue(), batch_shape) # Construct SSD network. arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay, data_format=DATA_FORMAT) with slim.arg_scope(arg_scope): predictions, localisations, logits, end_points = \ ssd_net.net(b_image, is_training=True) # Add loss function. 
ssd_net.losses(logits, localisations, b_gclasses, b_glocalisations, b_gscores, match_threshold=FLAGS.match_threshold, negative_ratio=FLAGS.negative_ratio, alpha=FLAGS.loss_alpha, label_smoothing=FLAGS.label_smoothing) return end_points # Gather initial summaries. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) # =================================================================== # # Add summaries from first clone. # =================================================================== # clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue]) first_clone_scope = deploy_config.clone_scope(0) # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by network_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) # Add summaries for end_points. end_points = clones[0].outputs for end_point in end_points: x = end_points[end_point] summaries.add(tf.summary.histogram('activations/' + end_point, x)) summaries.add(tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x))) # Add summaries for losses and extra losses. for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope): summaries.add(tf.summary.scalar(loss.op.name, loss)) for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope): summaries.add(tf.summary.scalar(loss.op.name, loss)) # Add summaries for variables. for variable in slim.get_model_variables(): summaries.add(tf.summary.histogram(variable.op.name, variable)) # =================================================================== # # Configure the moving averages. # =================================================================== # if FLAGS.moving_average_decay: moving_average_variables = slim.get_model_variables() variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, global_step) else: moving_average_variables, variable_averages = None, None # =================================================================== # # Configure the optimization procedure. # =================================================================== # with tf.device(deploy_config.optimizer_device()): learning_rate = tf_utils.configure_learning_rate(FLAGS, dataset.num_samples, global_step) optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate) summaries.add(tf.summary.scalar('learning_rate', learning_rate)) if FLAGS.moving_average_decay: # Update ops executed locally by trainer. update_ops.append(variable_averages.apply(moving_average_variables)) # Variables to train. variables_to_train = tf_utils.get_variables_to_train(FLAGS) # and returns a train_tensor and summary_op total_loss, clones_gradients = model_deploy.optimize_clones( clones, optimizer, var_list=variables_to_train) # Add total_loss to summary. summaries.add(tf.summary.scalar('total_loss', total_loss)) # Create gradient updates. grad_updates = optimizer.apply_gradients(clones_gradients, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) train_tensor = control_flow_ops.with_dependencies([update_op], total_loss, name='train_op') # Add the summaries from the first clone. These contain the summaries summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # =================================================================== # # Kicks off the training. 
# =================================================================== # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction) config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options) saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=1.0, write_version=2, pad_step_number=False) slim.learning.train( train_tensor, logdir=FLAGS.train_dir, master='', is_chief=True, init_fn=tf_utils.get_init_fn(FLAGS), summary_op=summary_op, number_of_steps=FLAGS.max_number_of_steps, log_every_n_steps=FLAGS.log_every_n_steps, save_summaries_secs=FLAGS.save_summaries_secs, saver=saver, save_interval_secs=FLAGS.save_interval_secs, session_config=config, sync_optimizer=None)
def deploy(config,
           model_fn,
           args=None,
           kwargs=None,
           optimizer=None,
           summarize_gradients=False):
  """Deploys a Slim-constructed model across multiple clones.

  The deployment options are specified by the config object and support
  deploying one or several clones on different GPUs and one or several
  replicas of such clones.

  The argument `model_fn` is called `config.num_clones` times to create the
  model clones as `model_fn(*args, **kwargs)`.

  The optional argument `optimizer` is an `Optimizer` object. If not `None`,
  the deployed model is configured for training with that optimizer.

  If `config` specifies deployment on multiple replicas then the default
  tensorflow device is set appropriately for each call to `model_fn` and for
  the slim variable creation functions: model and global variables will be
  created on the `ps` device, the clone operations will be on the `worker`
  device.

  Args:
    config: A `DeploymentConfig` object.
    model_fn: A callable. Called as `model_fn(*args, **kwargs)`.
    args: Optional list of arguments to pass to `model_fn`.
    kwargs: Optional dict of keyword arguments to pass to `model_fn`.
    optimizer: Optional `Optimizer` object. If passed the model is deployed
      for training with that optimizer.
    summarize_gradients: Whether or not to add summaries to the gradients.

  Returns:
    A `DeployedModel` namedtuple.
  """
  # Gather initial summaries.
  summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

  # Create Clones.
  clones = create_clones(config, model_fn, args, kwargs)
  first_clone = clones[0]

  # Gather update_ops from the first clone. These contain, for example,
  # the updates for the batch_norm variables created by model_fn.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone.scope)

  train_op = None
  total_loss = None
  with tf.device(config.optimizer_device()):
    if optimizer:
      # Place the global step on the device storing the variables.
      with tf.device(config.variables_device()):
        global_step = slim.get_or_create_global_step()

      # Compute the gradients for the clones.
      total_loss, clones_gradients = optimize_clones(clones, optimizer)

      if clones_gradients:
        if summarize_gradients:
          # Add summaries to the gradients.
          summaries |= set(_add_gradients_summaries(clones_gradients))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_op = control_flow_ops.with_dependencies([update_op],
                                                      total_loss,
                                                      name='train_op')
    else:
      clones_losses = []
      regularization_losses = tf.get_collection(
          tf.GraphKeys.REGULARIZATION_LOSSES)
      for clone in clones:
        with tf.name_scope(clone.scope):
          clone_loss = _gather_clone_loss(clone, len(clones),
                                          regularization_losses)
          if clone_loss is not None:
            clones_losses.append(clone_loss)
          # Only use regularization_losses for the first clone.
          regularization_losses = None
      if clones_losses:
        total_loss = tf.add_n(clones_losses, name='total_loss')

  # Add the summaries from the first clone. These contain the summaries
  # created by model_fn and either optimize_clones() or _gather_clone_loss().
  summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                     first_clone.scope))

  if total_loss is not None:
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

  if summaries:
    # Merge all summaries together.
    summary_op = tf.merge_summary(list(summaries), name='summary_op')
  else:
    summary_op = None

  return DeployedModel(train_op, summary_op, total_loss, clones)
def _forward_log_det_jacobian(self, x): # Let Y be a symmetric, positive definite matrix and write: # Y = X X.T # where X is lower-triangular. # # Observe that, # dY[i,j]/dX[a,b] # = d/dX[a,b] { X[i,:] X[j,:] } # = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] } # # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is # symmetric and X is lower-triangular, we need vectors of dimension: # d = p (p + 1) / 2 # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e., # k = { i (i + 1) / 2 + j i>=j # { undef i<j # and assume zero-based indexes. When k is undef, the element is dropped. # Example: # j k # 0 1 2 3 / # 0 [ 0 . . . ] # i 1 [ 1 2 . . ] # 2 [ 3 4 5 . ] # 3 [ 6 7 8 9 ] # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With # slight abuse: k(i,j)=undef means the element is dropped.) # # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b. # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since: # (1) j<=i<a thus i,j!=a. # (2) i=a>j thus i,j!=a. # # Since the Jacobian is lower-triangular, we need only compute the product # of diagonal elements: # d vec[Y] / d vec[X] @[k(i,j), k(i,j)] # = X[j,j] + I[i=j] X[i,j] # = 2 X[j,j]. # Since there is a 2 X[j,j] term for every lower-triangular element of X we # conclude: # |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}. diag = array_ops.matrix_diag_part(x) # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the # output is unchanged. diag = self._make_columnar(diag) if self.validate_args: is_matrix = check_ops.assert_rank_at_least( x, 2, message="Input must be a (batch of) matrix.") shape = array_ops.shape(x) is_square = check_ops.assert_equal( shape[-2], shape[-1], message="Input must be a (batch of) square matrix.") # Assuming lower-triangular means we only need check diag>0. is_positive_definite = check_ops.assert_positive( diag, message="Input must be positive definite.") x = control_flow_ops.with_dependencies( [is_matrix, is_square, is_positive_definite], x) # Create a vector equal to: [p, p-1, ..., 2, 1]. if x.get_shape().ndims is None or x.get_shape().dims[-1].value is None: p_int = array_ops.shape(x)[-1] p_float = math_ops.cast(p_int, dtype=x.dtype) else: p_int = x.get_shape().dims[-1].value p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype) exponents = math_ops.linspace(p_float, 1., p_int) sum_weighted_log_diag = array_ops.squeeze(math_ops.matmul( math_ops.log(diag), exponents[..., array_ops.newaxis]), axis=-1) fldj = p_float * np.log(2.) + sum_weighted_log_diag # We finally need to undo adding an extra column in non-scalar cases # where there is a single matrix as input. if x.get_shape().ndims is not None: if x.get_shape().ndims == 2: fldj = array_ops.squeeze(fldj, axis=-1) return fldj shape = array_ops.shape(fldj) maybe_squeeze_shape = array_ops.concat([ shape[:-1], distribution_util.pick_vector(math_ops.equal(array_ops.rank(x), 2), np.array([], dtype=np.int32), shape[-1:]) ], 0) return array_ops.reshape(fldj, maybe_squeeze_shape)
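# Hedged numeric check (plain NumPy) of the closed form derived in the
# comments above: |Jac(d vec[Y] / d vec[X])| = 2^p * prod_j X[j,j]^(p - j)
# with zero-based j, so the forward log-det-jacobian reduces to
# p * log(2) + sum_j (p - j) * log(X[j,j]).
import numpy as np

p = 4
x = np.tril(np.random.rand(p, p)) + np.eye(p)  # lower triangular, diag > 0
diag = np.diag(x)
exponents = np.linspace(p, 1., p)  # [p, p-1, ..., 1], mirroring the code
fldj = p * np.log(2.) + np.sum(exponents * np.log(diag))
# `fldj` matches what the TF graph above computes for this single matrix.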
def random_crop(image_list, crop_height, crop_width): """Crops the given list of images. The function applies the same crop to each image in the list. This can be effectively applied when there are multiple image inputs of the same dimension such as: image, depths, normals = _random_crop([image, depths, normals], 120, 150) Args: image_list: a list of image tensors of the same dimension but possibly varying channel. crop_height: the new height. crop_width: the new width. Returns: the image_list with cropped images. Raises: ValueError: if there are multiple image inputs provided with different size or the images are smaller than the crop dimensions. """ if not image_list: raise ValueError('Empty image_list.') # Compute the rank assertions. rank_assertions = [] for i in range(len(image_list)): image_rank = tf.rank(image_list[i]) rank_assert = tf.Assert(tf.equal(image_rank, 3), [ 'Wrong rank for tensor %s [expected] [actual]', image_list[i].name, 3, image_rank ]) rank_assertions.append(rank_assert) image_shape = control_flow_ops.with_dependencies([rank_assertions[0]], tf.shape(image_list[0])) image_height = image_shape[0] image_width = image_shape[1] crop_size_assert = tf.Assert( tf.logical_and(tf.greater_equal(image_height, crop_height), tf.greater_equal(image_width, crop_width)), ['Crop size greater than the image size.']) asserts = [rank_assertions[0], crop_size_assert] for i in range(1, len(image_list)): image = image_list[i] asserts.append(rank_assertions[i]) shape = control_flow_ops.with_dependencies([rank_assertions[i]], tf.shape(image)) height = shape[0] width = shape[1] height_assert = tf.Assert(tf.equal(height, image_height), [ 'Wrong height for tensor %s [expected][actual]', image.name, height, image_height ]) width_assert = tf.Assert(tf.equal(width, image_width), [ 'Wrong width for tensor %s [expected][actual]', image.name, width, image_width ]) asserts.extend([height_assert, width_assert]) # Create a random bounding box. max_offset_height = control_flow_ops.with_dependencies( asserts, tf.reshape(image_height - crop_height + 1, [])) max_offset_width = control_flow_ops.with_dependencies( asserts, tf.reshape(image_width - crop_width + 1, [])) offset_height = tf.random_uniform([], maxval=max_offset_height, dtype=tf.int32) offset_width = tf.random_uniform([], maxval=max_offset_width, dtype=tf.int32) return [ crop(image, offset_height, offset_width, crop_height, crop_width) for image in image_list ]
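# Hedged usage sketch for random_crop above: the same randomly sampled
# offsets are applied to every tensor in the list, so spatially aligned
# inputs (image/depth/normals) stay aligned. `crop` is the helper the
# function calls; shapes below are illustrative.
import tensorflow as tf

image = tf.placeholder(tf.float32, [240, 320, 3])
depth = tf.placeholder(tf.float32, [240, 320, 1])
cropped_image, cropped_depth = random_crop([image, depth], 120, 150)
# Both crops use the identical offset_height/offset_width sampled above.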
def __init__(self,
             mix_loc,
             temperature,
             distribution,
             loc=None,
             scale=None,
             quadrature_size=8,
             quadrature_fn=quadrature_scheme_softmaxnormal_quantiles,
             validate_args=False,
             allow_nan_stats=True,
             name="VectorDiffeomixture"):
  """Constructs the VectorDiffeomixture on `R^d`.

  The vector diffeomixture (VDM) approximates the compound distribution

  ```none
  p(x) = int p(x | z) p(z) dz,
  where z is in the K-simplex, and
  p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
  ```

  Args:
    mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`. In terms
      of samples, larger `mix_loc[..., k]` ==> `Z` is more likely to put more
      weight on its `kth` component.
    temperature: `float`-like `Tensor`. Broadcastable with `mix_loc`. In
      terms of samples, smaller `temperature` means one component is more
      likely to dominate. I.e., smaller `temperature` makes the VDM look more
      like a standard mixture of `K` components.
    distribution: `tf.Distribution`-like instance. Distribution from which
      `d` iid samples are used as input to the selected affine
      transformation. Must be a scalar-batch, scalar-event distribution.
      Typically `distribution.reparameterization_type =
      FULLY_REPARAMETERIZED` or it is a function of non-trainable parameters.
      WARNING: If you backprop through a VectorDiffeomixture sample and the
      `distribution` is not `FULLY_REPARAMETERIZED` yet is a function of
      trainable variables, then the gradient will be incorrect!
    loc: Length-`K` list of `float`-type `Tensor`s. The `k`-th element
      represents the `shift` used for the `k`-th affine transformation. If
      the `k`-th item is `None`, `loc` is implicitly `0`. When specified,
      must have shape `[B1, ..., Bb, d]` where `b >= 0` and `d` is the event
      size.
    scale: Length-`K` list of `LinearOperator`s. Each should be
      positive-definite and operate on a `d`-dimensional vector space. The
      `k`-th element represents the `scale` used for the `k`-th affine
      transformation. `LinearOperator`s must have shape
      `[B1, ..., Bb, d, d]`, `b >= 0`, i.e., characterizes `b`-batches of
      `d x d` matrices.
    quadrature_size: Python `int` scalar representing number of quadrature
      points. Larger `quadrature_size` means `q_N(x)` better approximates
      `p(x)`.
    quadrature_fn: Python callable taking `normal_loc`, `normal_scale`,
      `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
      representing the SoftmaxNormal grid and corresponding normalized
      weight.
      Default value: `quadrature_scheme_softmaxnormal_quantiles`.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    ValueError: if `not scale or len(scale) < 2`.
    ValueError: if `len(loc) != len(scale)`.
    ValueError: if `quadrature_grid_and_probs is not None` and
      `len(quadrature_grid_and_probs[0]) !=
      len(quadrature_grid_and_probs[1])`.
    ValueError: if `validate_args` and any not scale.is_positive_definite.
    TypeError: if any scale.dtype != scale[0].dtype.
    TypeError: if any loc.dtype != scale[0].dtype.
    NotImplementedError: if `len(scale) != 2`.
    ValueError: if `not distribution.is_scalar_batch`.
ValueError: if `not distribution.is_scalar_event`. """ parameters = dict(locals()) with ops.name_scope(name, values=[mix_loc, temperature]) as name: if not scale or len(scale) < 2: raise ValueError("Must specify list (or list-like object) of scale " "LinearOperators, one for each component with " "num_component >= 2.") if loc is None: loc = [None]*len(scale) if len(loc) != len(scale): raise ValueError("loc/scale must be same-length lists " "(or same-length list-like objects).") dtype = scale[0].dtype.base_dtype loc = [ops.convert_to_tensor(loc_, dtype=dtype, name="loc{}".format(k)) if loc_ is not None else None for k, loc_ in enumerate(loc)] for k, scale_ in enumerate(scale): if validate_args and not scale_.is_positive_definite: raise ValueError("scale[{}].is_positive_definite = {} != True".format( k, scale_.is_positive_definite)) if scale_.dtype.base_dtype != dtype: raise TypeError( "dtype mismatch; scale[{}].base_dtype=\"{}\" != \"{}\"".format( k, scale_.dtype.base_dtype.name, dtype.name)) self._endpoint_affine = [ AffineLinearOperator(shift=loc_, scale=scale_, validate_args=validate_args, name="endpoint_affine_{}".format(k)) for k, (loc_, scale_) in enumerate(zip(loc, scale))] # TODO(jvdillon): Remove once we support k-mixtures. # We make this assertion here because otherwise `grid` would need to be a # vector not a scalar. if len(scale) != 2: raise NotImplementedError("Currently only bimixtures are supported; " "len(scale)={} is not 2.".format(len(scale))) mix_loc = ops.convert_to_tensor( mix_loc, dtype=dtype, name="mix_loc") temperature = ops.convert_to_tensor( temperature, dtype=dtype, name="temperature") self._grid, probs = tuple(quadrature_fn( mix_loc / temperature, 1. / temperature, quadrature_size, validate_args)) # Note: by creating the logits as `log(prob)` we ensure that # `self.mixture_distribution.logits` is equivalent to # `math_ops.log(self.mixture_distribution.probs)`. self._mixture_distribution = categorical_lib.Categorical( logits=math_ops.log(probs), validate_args=validate_args, allow_nan_stats=allow_nan_stats) asserts = distribution_util.maybe_check_scalar_distribution( distribution, dtype, validate_args) if asserts: self._grid = control_flow_ops.with_dependencies( asserts, self._grid) self._distribution = distribution self._interpolated_affine = [ AffineLinearOperator(shift=loc_, scale=scale_, validate_args=validate_args, name="interpolated_affine_{}".format(k)) for k, (loc_, scale_) in enumerate(zip( interpolate_loc(self._grid, loc), interpolate_scale(self._grid, scale)))] [ self._batch_shape_, self._batch_shape_tensor_, self._event_shape_, self._event_shape_tensor_, ] = determine_batch_event_shapes(self._grid, self._endpoint_affine) super(VectorDiffeomixture, self).__init__( dtype=dtype, # We hard-code `FULLY_REPARAMETERIZED` because when # `validate_args=True` we verify that indeed # `distribution.reparameterization_type == FULLY_REPARAMETERIZED`. A # distribution which is a function of only non-trainable parameters # also implies we can use `FULLY_REPARAMETERIZED`. However, we cannot # easily test for that possibility thus we use `validate_args=False` # as a "back-door" to allow users a way to use non # `FULLY_REPARAMETERIZED` distribution. In such cases IT IS THE USERS # RESPONSIBILITY to verify that the base distribution is a function of # non-trainable parameters. 
reparameterization_type=distribution_lib.FULLY_REPARAMETERIZED, validate_args=validate_args, allow_nan_stats=allow_nan_stats, parameters=parameters, graph_parents=( distribution._graph_parents # pylint: disable=protected-access + [loc_ for loc_ in loc if loc_ is not None] + [p for scale_ in scale for p in scale_.graph_parents]), name=name)
def model_fn(self, features, mode, config): """Model function for the estimator. Note that this does not take a `labels` arg. This works, but `input_fn` must return either `features` or, equivalently, `(features, None)`. Args: features: The input points. See `tf.estimator.Estimator`. mode: See `tf.estimator.Estimator`. config: See `tf.estimator.Estimator`. Returns: A `tf.estimator.EstimatorSpec` (see `tf.estimator.Estimator`) specifying this behavior: * `train_op`: Execute one mini-batch or full-batch run of Lloyd's algorithm. * `loss`: The sum of the squared distances from each input point to its closest center. * `eval_metric_ops`: Maps `SCORE` to `loss`. * `predictions`: Maps `ALL_DISTANCES` to the distance from each input point to each cluster center; maps `CLUSTER_INDEX` to the index of the closest cluster center for each input point. """ # input_points is a single Tensor. Therefore, the sharding functionality # in clustering_ops is unused, and some of the values below are lists of a # single item. input_points = _parse_features_if_necessary(features, self._feature_columns) # Let N = the number of input_points. # all_distances: A list of one matrix of shape (N, num_clusters). Each value # is the distance from an input point to a cluster center. # model_predictions: A list of one vector of shape (N). Each value is the # cluster id of an input point. # losses: Similar to cluster_idx but provides the distance to the cluster # center. # is_initialized: scalar indicating whether the initial cluster centers # have been chosen; see init_op. # init_op: an op to choose the initial cluster centers. A single worker # repeatedly executes init_op until is_initialized becomes True. # training_op: an op that runs an iteration of training, either an entire # Lloyd iteration or a mini-batch of a Lloyd iteration. Multiple workers # may execute this op, but only after is_initialized becomes True. (all_distances, model_predictions, losses, is_initialized, init_op, training_op) = clustering_ops.KMeans( inputs=input_points, num_clusters=self._num_clusters, initial_clusters=self._initial_clusters, distance_metric=self._distance_metric, use_mini_batch=self._use_mini_batch, mini_batch_steps_per_iteration=self. _mini_batch_steps_per_iteration, random_seed=self._random_seed, kmeans_plus_plus_num_retries=self._kmeans_plus_plus_num_retries ).training_graph() loss = math_ops.reduce_sum(losses) summary.scalar('loss/raw', loss) incr_step = state_ops.assign_add(training_util.get_global_step(), 1) training_op = control_flow_ops.with_dependencies( [training_op, incr_step], loss) training_hooks = [ _InitializeClustersHook(init_op, is_initialized, config.is_chief) ] if self._relative_tolerance is not None: training_hooks.append( _LossRelativeChangeHook(loss, self._relative_tolerance)) export_outputs = { KMeansClustering.ALL_DISTANCES: export_output.PredictOutput(all_distances[0]), KMeansClustering.CLUSTER_INDEX: export_output.PredictOutput(model_predictions[0]), signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: export_output.PredictOutput(model_predictions[0]) } return model_fn_lib.EstimatorSpec( mode=mode, predictions={ KMeansClustering.ALL_DISTANCES: all_distances[0], KMeansClustering.CLUSTER_INDEX: model_predictions[0], }, loss=loss, train_op=training_op, eval_metric_ops={KMeansClustering.SCORE: metrics.mean(loss)}, training_hooks=training_hooks, export_outputs=export_outputs)
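# Minimal sketch of the step-counting idiom used above, with stand-in ops:
# gating the loss on both the training op and the global-step increment means
# a single fetch of `train_tensor` trains, bumps the step, and reports loss
# (the two control dependencies run in no particular order relative to each
# other, but both complete before the loss is returned).
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops, state_ops

loss = tf.constant(1.0)   # stand-in for the summed k-means loss
training_op = tf.no_op()  # stand-in for one (mini-batch) Lloyd iteration
incr_step = state_ops.assign_add(tf.train.get_or_create_global_step(), 1)
train_tensor = control_flow_ops.with_dependencies(
    [training_op, incr_step], loss)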
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError(
        'You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)

  #######################
  # Config model_deploy #
  #######################
  deploy_config = model_deploy.DeploymentConfig(
      num_clones=FLAGS.num_clones,
      clone_on_cpu=FLAGS.clone_on_cpu,
      replica_id=FLAGS.task,
      num_replicas=FLAGS.worker_replicas,
      num_ps_tasks=FLAGS.num_ps_tasks)

  # Create global_step
  with tf.device(deploy_config.variables_device()):
    global_step = tf.train.get_or_create_global_step()

  ######################
  # Select the dataset #
  ######################
  # dataset = dataset_factory.get_dataset(
  #     FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
  dataset = ucf11.get_split(FLAGS.dataset_split_name, FLAGS.dataset_dir)

  ######################
  # Select the network #
  ######################
  network_fn = nets_factory.get_network_fn(
      FLAGS.model_name,
      num_classes=(dataset.num_classes - FLAGS.labels_offset),
      weight_decay=FLAGS.weight_decay,
      is_training=True)

  #####################################
  # Select the preprocessing function #
  #####################################
  preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
  image_preprocessing_fn = preprocessing_factory.get_preprocessing(
      preprocessing_name, is_training=True)

  ##############################################################
  # Create a dataset provider that loads data from the dataset #
  ##############################################################
  train_image_size = FLAGS.train_image_size or network_fn.default_image_size

  # `image` avoids shadowing the Python builtin `input`.
  image, label = ucf11.build_data(dataset)
  image = image_preprocessing_fn(image, train_image_size, train_image_size)

  inputs, labels = tf.train.batch(
      [image, label],
      batch_size=FLAGS.batch_size,
      num_threads=FLAGS.num_preprocessing_threads,
      capacity=5 * FLAGS.batch_size)
  labels = slim.one_hot_encoding(
      labels, dataset.num_classes - FLAGS.labels_offset)
  batch_queue = slim.prefetch_queue.prefetch_queue(
      [inputs, labels], capacity=2 * deploy_config.num_clones)

  ####################
  # Define the model #
  ####################
  def clone_fn(batch_queue):
    """Allows data parallelism by creating multiple clones of network_fn."""
    inputs, labels = batch_queue.dequeue()
    # Unstack the time dimension and feed only the first frame.
    images = tf.unstack(inputs, axis=1)
    logits, end_points = network_fn(images[0])

    #############################
    # Specify the loss function #
    #############################
    tf.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=labels,
        label_smoothing=FLAGS.label_smoothing, weights=1.0)
    return end_points

  clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
  first_clone_scope = deploy_config.clone_scope(0)
  # Gather update_ops from the first clone. These contain, for example,
  # the updates for the batch_norm variables created by network_fn.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

  # Gather initial summaries.
  summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

  # Add summaries for end_points.
  end_points = clones[0].outputs
  for end_point in end_points:
    x = end_points[end_point]
    summaries.add(tf.summary.histogram('activations/' + end_point, x))
    summaries.add(
        tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x)))

  # Add summaries for losses.
  for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
    summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

  # Add summaries for variables.
  for variable in slim.get_model_variables():
    summaries.add(tf.summary.histogram(variable.op.name, variable))

  # Accuracy metrics (kept for reference; not currently wired in):
  # logits = end_points['Logits']
  # predictions = tf.argmax(tf.nn.softmax(logits), axis=1)
  # truth = tf.squeeze(tf.argmax(labels, axis=1))
  #
  # Either the bare tf.metrics version:
  # accuracy, accuracy_update = tf.metrics.accuracy(truth, predictions)
  # update_ops.append(accuracy_update)
  # summaries.add(tf.summary.scalar('Accuracy', accuracy))
  #
  # or the slim.metrics aggregate version:
  # names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
  #     'Accuracy': slim.metrics.streaming_accuracy(predictions, truth),
  #     'Recall_5': slim.metrics.streaming_recall_at_k(logits, truth, 5),
  # })
  # for name, value in names_to_values.items():
  #   summary_name = 'train/%s' % name
  #   op = tf.summary.scalar(summary_name, value, collections=[])
  #   op = tf.Print(op, [value], summary_name)
  #   summaries.add(op)
  # update_ops.append(names_to_updates)

  #################################
  # Configure the moving averages #
  #################################
  if FLAGS.moving_average_decay:
    moving_average_variables = slim.get_model_variables()
    variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.moving_average_decay, global_step)
  else:
    moving_average_variables, variable_averages = None, None

  #########################################
  # Configure the optimization procedure. #
  #########################################
  with tf.device(deploy_config.optimizer_device()):
    learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
    optimizer = _configure_optimizer(learning_rate)
    summaries.add(tf.summary.scalar('learning_rate', learning_rate))

  if FLAGS.sync_replicas:
    # If sync_replicas is enabled, the averaging will be done in the chief
    # queue runner.
    optimizer = tf.train.SyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        variable_averages=variable_averages,
        variables_to_average=moving_average_variables,
        replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
        total_num_replicas=FLAGS.worker_replicas)
  elif FLAGS.moving_average_decay:
    # Update ops executed locally by trainer.
    update_ops.append(variable_averages.apply(moving_average_variables))

  # Variables to train.
  variables_to_train = _get_variables_to_train()

  # Returns a train_tensor and summary_op.
  total_loss, clones_gradients = model_deploy.optimize_clones(
      clones, optimizer, var_list=variables_to_train)
  # Add total_loss to summary.
  summaries.add(tf.summary.scalar('total_loss', total_loss))

  # Create gradient updates.
  grad_updates = optimizer.apply_gradients(
      clones_gradients, global_step=global_step)
  update_ops.append(grad_updates)

  update_op = tf.group(*update_ops)
  train_tensor = control_flow_ops.with_dependencies(
      [update_op], total_loss, name='train_op')

  # Add the summaries from the first clone. These contain the summaries
  # created by model_fn and either optimize_clones() or _gather_clone_loss().
  summaries |= set(
      tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

  # Merge all summaries together.
  summary_op = tf.summary.merge(list(summaries), name='summary_op')

  ###########################
  # Kicks off the training. #
  ###########################
  slim.learning.train(
      train_tensor,
      logdir=FLAGS.train_dir,
      master=FLAGS.master,
      is_chief=(FLAGS.task == 0),
      init_fn=_get_init_fn(),
      summary_op=summary_op,
      number_of_steps=FLAGS.max_number_of_steps,
      log_every_n_steps=FLAGS.log_every_n_steps,
      save_summaries_secs=FLAGS.save_summaries_secs,
      save_interval_secs=FLAGS.save_interval_secs,
      sync_optimizer=optimizer if FLAGS.sync_replicas else None,
      session_config=session_config)
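# A minimal, self-contained sketch (assuming TF 1.x) of the pattern the
# trainer above relies on: `with_dependencies` makes the grouped update ops
# (batch-norm updates, gradient application) prerequisites of the returned
# loss, so slim.learning.train only has to fetch `train_tensor`. The variable
# and constant here are illustrative stand-ins, not part of the trainer.
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

v = tf.Variable(0.0)
total_loss = tf.constant(3.0)
update_op = tf.assign_add(v, 1.0)  # stands in for tf.group(*update_ops)

train_tensor = control_flow_ops.with_dependencies(
    [update_op], total_loss, name='train_op')

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(train_tensor))  # 3.0 -- and update_op ran first
  print(sess.run(v))             # 1.0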
def split(value: ragged_tensor.Ragged,
          num_or_size_splits,
          axis=0,
          num=None,
          name=None):
  """Splits a RaggedTensor `value` into a list of sub RaggedTensors.

  If `num_or_size_splits` is an `int`, then it splits `value` along the
  dimension `axis` into `num_or_size_splits` smaller RaggedTensors. This
  requires that `value.shape[axis]` is divisible by `num_or_size_splits`.

  If `num_or_size_splits` is a 1-D Tensor (or list), then `value` is split
  into `len(num_or_size_splits)` elements. The shape of the `i`-th element has
  the same size as `value` except along dimension `axis` where the size is
  `num_or_size_splits[i]`.

  Splitting along a ragged dimension is not allowed.

  For example:

  >>> rt = tf.RaggedTensor.from_row_lengths(
  ...     np.arange(6 * 3).reshape(6, 3), row_lengths=[1, 2, 2, 1])
  >>> rt.shape
  TensorShape([4, None, 3])
  >>>
  >>> rt1, rt2 = tf.split(rt, 2)  # uniform splits
  >>> rt1.shape
  TensorShape([2, None, 3])
  >>> rt2.shape
  TensorShape([2, None, 3])
  >>>
  >>> rt3, rt4, rt5 = tf.split(rt, [1, 2, 1])  # ragged splits
  >>> rt3.shape
  TensorShape([1, None, 3])
  >>> rt4.shape
  TensorShape([2, None, 3])
  >>> rt5.shape
  TensorShape([1, None, 3])
  >>>
  >>> rt6, rt7 = tf.split(rt, [1, 2], axis=2)  # splits along axis 2
  >>> rt6.shape
  TensorShape([4, None, 1])
  >>> rt7.shape
  TensorShape([4, None, 2])

  Args:
    value: The `RaggedTensor` to split.
    num_or_size_splits: Either an `int` indicating the number of splits along
      `axis` or a 1-D integer `Tensor` or Python list containing the sizes of
      each output tensor along `axis`. If a Python int, then it must evenly
      divide `value.shape[axis]`; otherwise the sum of sizes along the split
      axis must match that of `value`.
    axis: An `int` or scalar `int32` `Tensor`. The dimension along which to
      split. Must be in the range `[-rank(value), rank(value))`. Defaults
      to 0.
    num: An `int` used to specify the number of outputs when
      `num_or_size_splits` is a 1-D list or `Tensor` and its length is
      statically unknown, e.g., when specifying `tf.TensorSpec(None)` with the
      `input_signature` argument of `tf.function` (optional).
    name: A name for the operation (optional).

  Returns:
    if `num_or_size_splits` is an `int` returns a list of `num_or_size_splits`
    `RaggedTensor` objects; if `num_or_size_splits` is a 1-D Tensor returns
    `num_or_size_splits.shape[0]` `RaggedTensor` objects resulting from
    splitting `value`.

  Raises:
    ValueError: If the dimension `axis` of `value` is a ragged dimension.
    ValueError: If `num` is unspecified and cannot be inferred.
    ValueError: If `num` is specified but doesn't match the length of
      `num_or_size_splits`.
    ValueError: If `num_or_size_splits` is an `int` and less than 1.
    TypeError: If `num_or_size_splits` is not an `int` or 1-D list or 1-D
      `Tensor`.
    InvalidArgumentError: If the dimension `axis` of `value` cannot be split
      evenly by `num_or_size_splits`.
    InvalidArgumentError: If `num_or_size_splits` contains negative integers.
    InvalidArgumentError: If `num_or_size_splits`'s static shape is unknown
      and its dynamic shape is inconsistent with `num`.
    InvalidArgumentError: If `num_or_size_splits`'s static rank is unknown and
      `axis` is a negative integer.
""" with ops.name_scope(name, 'RaggedSplit'): if isinstance(num_or_size_splits, int) and num_or_size_splits == 1: return [value] # static assert check_ops.assert_integer_v2( num_or_size_splits, message=('`num_or_size_splits` must be an `int` or 1-D list or ' '`Tensor` of integers.')) value_shape = ragged_shape.RaggedShape.from_tensor(value) axis = array_ops.get_positive_axis(axis, value_shape.rank) try: dim_size = value_shape[axis] except ValueError: raise ValueError( 'Cannot split a ragged dimension. Got `value` with ' f'shape {value_shape} and `axis` {axis}.') if isinstance(num_or_size_splits, int): # Uniform split num_splits = num_or_size_splits if num_splits < 1: raise ValueError( '`num_or_size_splits` must be >=1 if it is an `int`.' f'Received {num_or_size_splits}.') split_length = math_ops.floordiv(dim_size, num_splits) split_lengths = array_ops.repeat(split_length, num_splits) else: # Ragged split num_splits = None split_lengths = ops.convert_to_tensor(num_or_size_splits) if split_lengths.shape.ndims is not None: if split_lengths.shape.ndims != 1: raise TypeError( '`num_or_size_splits` must be an `int` or 1-D list ' f'or `Tensor`. Received {num_or_size_splits}.') num_splits = tensor_shape.dimension_value( split_lengths.shape[0]) if num_splits is None: if num is None: raise ValueError( '`num` must be specified as an `int` when the ' 'size of `num_or_size_split` is statically ' f'unknown. Received `num`: {num} and ' f'`num_or_size_split`: {num_or_size_splits}.') num_splits = num else: if num is not None and num != num_splits: raise ValueError( '`num` does not match the size of ' f'`num_or_size_split`. Received `num`: {num} and ' f'size of `num_or_size_split`: {num_splits}.') splits = array_ops.concat([[0], math_ops.cumsum(split_lengths)], axis=0) checks = [] checks.append( check_ops.assert_non_negative_v2( num_or_size_splits, message='`num_or_size_splits` must be non-negative.')) checks.append( check_ops.assert_equal_v2( num_splits, array_ops.shape(split_lengths)[0], message= '`num` is inconsistent with `num_or_size_split.shape[0]`.')) checks.append( check_ops.assert_equal_v2( math_ops.cast(dim_size, splits.dtype), splits[-1], message=( 'Cannot exactly split the `axis` dimension of `value` ' 'with the given `num_or_size_split`.'))) splits = control_flow_ops.with_dependencies(checks, splits) splited_rts = [] slices = [slice(None)] * (axis + 1) for i in range(num_splits): slices[-1] = slice(splits[i], splits[i + 1]) splited_rts.append(value[tuple(slices)]) return splited_rts
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  dtype=tf.float32,
                  clip_gradients=None,
                  summaries=None,
                  larc_params=None,
                  loss_scaling=1.0,
                  loss_scaling_params=None,
                  on_horovod=False,
                  iter_size=1,
                  skip_update_ph=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Scalar `Tensor`.
    optimizer: string or class of optimizer, used as trainer. string should be
        name of optimizer, like 'SGD', 'Adam', 'Adagrad'. Full list in
        OPTIMIZER_CLS_NAMES constant. class should be sub-class of
        `tf.Optimizer` that implements `compute_gradients` and
        `apply_gradients` functions.
    optimizer_params: parameters of the optimizer.
    learning_rate_decay_fn: function, takes `global_step` `Tensor`s, returns
        `Tensor`. Can be used to implement any learning rate decay function.
        For example: `tf.train.exponential_decay`. Ignored if `learning_rate`
        is not supplied.
    dtype: model dtype (tf.float16, tf.float32 or "mixed").
    clip_gradients: float, max gradient norm to clip to.
    summaries: List of internal quantities to visualize on tensorboard. If not
        set, only the loss and the learning rate will be reported. The
        complete list is in OPTIMIZER_SUMMARIES.
    larc_params: If not None, LARC re-scaling will be applied with
        corresponding parameters.
    loss_scaling: could be float or string. If float, static loss scaling is
        applied. If string, the corresponding automatic loss scaling algorithm
        is used. Must be one of 'Backoff' or 'LogMax' (case insensitive). Only
        used when dtype="mixed".
    loss_scaling_params: parameters for the automatic loss scaler.
    on_horovod: whether the model is run on horovod.
    iter_size: number of steps to accumulate gradients over before applying an
        update (only used with Horovod).
    skip_update_ph: boolean placeholder; when True, gradients are accumulated
        but not applied.

  Returns:
    training op.
  """
  if summaries is None:
    summaries = ["learning_rate", "global_gradient_norm", "loss_scale"]
  else:
    for summ in summaries:
      if summ not in OPTIMIZER_SUMMARIES:
        raise ValueError(
            "Summaries should be one of [{}], you provided {}.".format(
                ", ".join(OPTIMIZER_SUMMARIES), summ,
            )
        )
  if clip_gradients is not None and larc_params is not None:
    raise AttributeError(
        "LARC and gradient norm clipping should not be used together"
    )

  global_step = tf.train.get_or_create_global_step()
  lr = learning_rate_decay_fn(global_step)
  if "learning_rate" in summaries:
    tf.summary.scalar("learning_rate", lr)

  with tf.variable_scope("Loss_Optimization"):
    update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [{}], you provided {}.".format(
                ", ".join(OPTIMIZER_CLS_NAMES), optimizer
            )
        )
      optimizer = OPTIMIZER_CLS_NAMES[optimizer]
    opt = optimizer(learning_rate=lr, **optimizer_params)

    if isinstance(loss_scaling, six.string_types):
      loss_scaling = AutomaticLossScaler(
          algorithm=loss_scaling,
          params=loss_scaling_params
      )
      if "loss_scale" in summaries:
        tf.summary.scalar("loss_scale", loss_scaling.loss_scale)

    if dtype == 'mixed':
      opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling)

    # Compute gradients.
    grads_and_vars = opt.compute_gradients(
        loss, colocate_gradients_with_ops=True,
    )

    if on_horovod:
      if iter_size > 1:
        grads_and_vars_accum = []
        accum_ops = []
        for grad, var in grads_and_vars:
          # necessary to use tf.Variable directly to instantiate cudnn rnn
          # cells which don't have explicit shape.
          grad_accum = tf.Variable(
              initial_value=tf.zeros_like(var),
              name=grad.name.split(":")[0] + "_accum",
              expected_shape=var.shape,
              dtype=grad.dtype,
              trainable=False,
              validate_shape=bool(var.get_shape())
          )
          if isinstance(grad, tf.IndexedSlices):
            add_grads = tf.scatter_nd_add(grad_accum, grad.indices,
                                          grad.values / iter_size)
          else:
            add_grads = grad_accum + grad / iter_size

          accum_ops.append(tf.assign(grad_accum, add_grads))
          grads_and_vars_accum.append((grad_accum, var))

        accum_op = tf.group(accum_ops)

        def update_and_clear_op():
          with tf.control_dependencies([accum_op]):
            red_grad_updates = opt.apply_gradients(
                post_process_gradients(
                    reduce_gradients(grads_and_vars_accum, on_horovod=True),
                    lr=lr,
                    clip_gradients=clip_gradients,
                    larc_params=larc_params,
                    summaries=summaries,
                ),
                global_step=global_step,
            )
          with tf.control_dependencies([red_grad_updates]):
            return tf.group([tf.assign(g, tf.zeros_like(g))
                             for g, v in grads_and_vars_accum])

        grad_updates = tf.cond(
            pred=skip_update_ph,
            true_fn=lambda: accum_op,
            false_fn=update_and_clear_op,
        )
      else:
        grad_updates = opt.apply_gradients(
            post_process_gradients(
                reduce_gradients(grads_and_vars, on_horovod=True),
                lr=lr,
                clip_gradients=clip_gradients,
                larc_params=larc_params,
                summaries=summaries,
            ),
            global_step=global_step,
        )
    else:
      grad_updates = opt.apply_gradients(
          post_process_gradients(
              grads_and_vars,
              lr=lr,
              clip_gradients=clip_gradients,
              larc_params=larc_params,
              summaries=summaries,
          ),
          global_step=global_step,
      )

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
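# A minimal sketch (assuming TF 1.x graph mode) of the `iter_size`
# gradient-accumulation scheme above: buffer gradients into a non-trainable
# variable each step, and either stop there (`skip_update` is True) or apply
# the buffered gradient and zero the buffer. All names here are illustrative;
# this is not the OpenSeq2Seq implementation itself.
import tensorflow as tf

x = tf.Variable(2.0)
loss = tf.square(x)
opt = tf.train.GradientDescentOptimizer(0.1)
(grad, var), = opt.compute_gradients(loss)

iter_size = 4
accum = tf.Variable(tf.zeros_like(var), trainable=False)
accum_op = tf.assign_add(accum, grad / iter_size)

def accumulate_only():
  # Just buffer the (scaled) gradient for this step.
  with tf.control_dependencies([accum_op]):
    return tf.identity(loss)

def apply_and_clear():
  # Buffer this step's gradient, apply the buffered sum, then zero the buffer.
  with tf.control_dependencies([accum_op]):
    train = opt.apply_gradients([(accum, var)])
  with tf.control_dependencies([train]):
    clear = tf.assign(accum, tf.zeros_like(accum))
  with tf.control_dependencies([clear]):
    return tf.identity(loss)

skip_update = tf.placeholder(tf.bool, shape=[])
train_tensor = tf.cond(skip_update, accumulate_only, apply_and_clear)
# Feed skip_update=True for the first iter_size - 1 steps and False on the
# last one to emulate a batch that is iter_size times larger.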
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  var_list=None,
                  dtype=tf.float32,
                  clip_gradients=None,
                  summaries=None,
                  larc_params=None,
                  loss_scaling=1.0,
                  loss_scaling_params=None,
                  iter_size=1,
                  skip_update_ph=None,
                  model=None):
  """Given loss and parameters for optimizer, returns a training op."""
  if summaries is None:
    summaries = ["learning_rate", "global_gradient_norm", "loss_scale"]
  else:
    for summ in summaries:
      if summ not in OPTIMIZER_SUMMARIES:
        raise ValueError(
            "Summaries should be one of [{}], you provided {}.".format(
                ", ".join(OPTIMIZER_SUMMARIES), summ))

  if clip_gradients is not None and larc_params is not None:
    raise AttributeError(
        "LARC and gradient norm clipping should not be used together")

  global_step = tf.train.get_or_create_global_step()
  lr = learning_rate_decay_fn(global_step)
  if "learning_rate" in summaries:
    tf.summary.scalar("learning_rate", lr)

  with tf.variable_scope("LossOptimization"):
    update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    # control_flow_ops.with_dependencies(dependencies, output_tensor,
    # name=None) enforces dependency control between graph nodes: the update
    # ops must run before the loss value is produced.
    loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    if optimizer == "AdamW":
      optimizer_params["weight_decay"] = optimizer_params["weight_decay"] * lr

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [{}], you provided {}".format(
                ", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      optimizer = OPTIMIZER_CLS_NAMES[optimizer]
    opt = optimizer(learning_rate=lr, **optimizer_params)

    if isinstance(loss_scaling, six.string_types):
      loss_scaling = AutomaticLossScaler(
          algorithm=loss_scaling, params=loss_scaling_params)
    # if "loss_scale" in summaries:
    #   tf.summary.scalar("loss_scale", loss_scaling.loss_scale)
    # if dtype == "mixed":
    #   opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling)

    # Compute gradients.
    # var_list: a list or tuple of tf.Variable to update to minimize loss;
    # defaults to the variables collected in the graph under the key
    # GraphKeys.TRAINABLE_VARIABLES. Returns a list of (gradient, variable)
    # pairs; the variable is always present but the gradient can be None.
    grads_and_vars = opt.compute_gradients(
        loss, colocate_gradients_with_ops=True, var_list=var_list)

    # apply_gradients returns an Operation that applies the gradients.
    # grads_and_vars: list of (gradient, variable) pairs as returned by
    # compute_gradients(). If global_step is not None, that operation also
    # increments global_step.
    grad_updates = opt.apply_gradients(
        post_process_gradients(
            grads_and_vars,
            lr=lr,
            clip_gradients=clip_gradients,
            larc_params=larc_params,
            summaries=summaries),
        global_step=global_step)

    # Ensure the train tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)
    return train_tensor, grads_and_vars
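# Sketch (assuming TF 1.x) of the UPDATE_OPS folding that both
# `optimize_loss` variants perform before computing gradients: anything
# registered in tf.GraphKeys.UPDATE_OPS (e.g. batch-norm statistics) becomes
# a prerequisite of the loss tensor. The counter variable is illustrative.
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

counter = tf.Variable(0, trainable=False)
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, tf.assign_add(counter, 1))

loss = tf.constant(1.5)
update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
loss = control_flow_ops.with_dependencies(list(update_ops), loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(loss)            # also runs the collected update op
  print(sess.run(counter))  # 1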
def __init__(self,
             loc=None,
             covariance_matrix=None,
             validate_args=False,
             allow_nan_stats=True,
             name="MultivariateNormalFullCovariance"):
  """Construct Multivariate Normal distribution on `R^k`.

  The `batch_shape` is the broadcast shape between `loc` and
  `covariance_matrix` arguments.

  The `event_shape` is given by last dimension of the matrix implied by
  `covariance_matrix`. The last dimension of `loc` (if provided) must
  broadcast with this.

  A non-batch `covariance_matrix` matrix is a `k x k` symmetric positive
  definite matrix. In other words it is (real) symmetric with all eigenvalues
  strictly positive.

  Additional leading dimensions (if any) will index batches.

  Args:
    loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
      implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
      `b >= 0` and `k` is the event size.
    covariance_matrix: Floating-point, symmetric positive definite `Tensor`
      of same `dtype` as `loc`. The strict upper triangle of
      `covariance_matrix` is ignored, so if `covariance_matrix` is not
      symmetric no error will be raised (unless `validate_args is True`).
      `covariance_matrix` has shape `[B1, ..., Bb, k, k]` where `b >= 0` and
      `k` is the event size.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    ValueError: if neither `loc` nor `covariance_matrix` are specified.
  """
  parameters = locals()

  # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
  with ops.name_scope(name) as name:
    with ops.name_scope("init", values=[loc, covariance_matrix]):
      if covariance_matrix is None:
        scale_tril = None
      else:
        covariance_matrix = ops.convert_to_tensor(
            covariance_matrix, name="covariance_matrix")
        if validate_args:
          covariance_matrix = control_flow_ops.with_dependencies([
              check_ops.assert_near(
                  covariance_matrix,
                  array_ops.matrix_transpose(covariance_matrix),
                  message="Matrix was not symmetric")
          ], covariance_matrix)
        # No need to validate that covariance_matrix is non-singular.
        # LinearOperatorLowerTriangular has an assert_non_singular method
        # that is called by the Bijector.
        # However, cholesky() ignores the upper triangular part, so we do
        # need to separately assert symmetric.
        scale_tril = linalg_ops.cholesky(covariance_matrix)
      super(MultivariateNormalFullCovariance, self).__init__(
          loc=loc,
          scale_tril=scale_tril,
          validate_args=validate_args,
          allow_nan_stats=allow_nan_stats,
          name=name)
  self._parameters = parameters
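# Hedged sketch (assuming TF 1.x) of the `validate_args` path above: the
# symmetry assertion is attached to the covariance tensor via
# `with_dependencies`, so the Cholesky factorization only runs after the
# check passes. The 2x2 matrix is illustrative.
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

cov = tf.constant([[2.0, 0.5],
                   [0.5, 1.0]])
cov = control_flow_ops.with_dependencies([
    tf.assert_near(cov, tf.matrix_transpose(cov),
                   message="Matrix was not symmetric")
], cov)
scale_tril = tf.cholesky(cov)

with tf.Session() as sess:
  print(sess.run(scale_tril))  # lower-triangular factor; assertion ran first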
def __init__(self,
             loc,
             scale,
             skewness=None,
             tailweight=None,
             distribution=None,
             validate_args=False,
             allow_nan_stats=True,
             name="SinhArcsinh"):
  """Construct SinhArcsinh distribution on `(-inf, inf)`.

  Arguments `(loc, scale, skewness, tailweight)` must have broadcastable
  shape (indexing batch dimensions). They must all have the same `dtype`.

  Args:
    loc: Floating-point `Tensor`.
    scale: `Tensor` of same `dtype` as `loc`.
    skewness: Skewness parameter. Default is `0.0` (no skew).
    tailweight: Tailweight parameter. Default is `1.0` (unchanged
      tailweight).
    distribution: `tf.Distribution`-like instance. Distribution that is
      transformed to produce this distribution. Default is
      `tf.distributions.Normal(0., 1.)`. Must be a scalar-batch,
      scalar-event distribution. Typically
      `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it
      is a function of non-trainable parameters. WARNING: If you backprop
      through a `SinhArcsinh` sample and `distribution` is not
      `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
      the gradient will be incorrect!
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.
  """
  parameters = dict(locals())

  with tf.name_scope(name,
                     values=[loc, scale, skewness, tailweight]) as name:
    loc = tf.convert_to_tensor(loc, name="loc")
    dtype = loc.dtype
    scale = tf.convert_to_tensor(scale, name="scale", dtype=dtype)
    tailweight = 1. if tailweight is None else tailweight
    has_default_skewness = skewness is None
    skewness = 0. if skewness is None else skewness
    tailweight = tf.convert_to_tensor(
        tailweight, name="tailweight", dtype=dtype)
    skewness = tf.convert_to_tensor(skewness, name="skewness", dtype=dtype)

    batch_shape = distribution_util.get_broadcast_shape(
        loc, scale, tailweight, skewness)

    # Recall, with Z a random variable,
    #   Y := loc + C * F(Z),
    #   F(Z) := Sinh( (Arcsinh(Z) + skewness) * tailweight )
    #   F_0(Z) := Sinh( Arcsinh(Z) * tailweight )
    #   C := 2 * scale / F_0(2)
    if distribution is None:
      distribution = tf.distributions.Normal(
          loc=tf.zeros([], dtype=dtype),
          scale=tf.ones([], dtype=dtype),
          allow_nan_stats=allow_nan_stats)
    else:
      asserts = distribution_util.maybe_check_scalar_distribution(
          distribution, dtype, validate_args)
      if asserts:
        loc = control_flow_ops.with_dependencies(asserts, loc)

    # Make the SAS bijector, 'F'.
    f = bijectors.SinhArcsinh(skewness=skewness, tailweight=tailweight)
    if has_default_skewness:
      f_noskew = f
    else:
      f_noskew = bijectors.SinhArcsinh(
          skewness=skewness.dtype.as_numpy_dtype(0.),
          tailweight=tailweight)

    # Make the AffineScalar bijector, Z --> loc + scale * Z (2 / F_0(2))
    c = 2 * scale / f_noskew.forward(tf.convert_to_tensor(2, dtype=dtype))
    affine = bijectors.AffineScalar(
        shift=loc, scale=c, validate_args=validate_args)

    bijector = bijectors.Chain([affine, f])

    super(SinhArcsinh, self).__init__(
        distribution=distribution,
        bijector=bijector,
        batch_shape=batch_shape,
        validate_args=validate_args,
        name=name)
  self._parameters = parameters
  self._loc = loc
  self._scale = scale
  self._tailweight = tailweight
  self._skewness = skewness
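# Numerical sketch of the transform documented in the comment above:
#   Y = loc + C * F(Z),  F(Z) = sinh((arcsinh(Z) + skewness) * tailweight),
#   C = 2 * scale / F_0(2),  F_0(Z) = sinh(arcsinh(Z) * tailweight).
# Plain NumPy, for intuition only; the constants are made up.
import numpy as np

loc, scale, skewness, tailweight = 1.0, 2.0, 0.5, 1.5
z = np.array([-1.0, 0.0, 1.0])
f = lambda x: np.sinh((np.arcsinh(x) + skewness) * tailweight)
f_0 = lambda x: np.sinh(np.arcsinh(x) * tailweight)
c = 2 * scale / f_0(2.0)
y = loc + c * f(z)
print(y)  # a skewed, heavier-tailed transform of z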
def ragged_assert_compatible_and_get_flat_values(values, mask=None):
  """If ragged, it checks the compatibility and then returns the flat_values.

  Note: If two tensors are dense, it does not check their compatibility.
  Note: Although two ragged tensors with different ragged ranks could have
        identical overall rank and dimension sizes and hence be compatible,
        we do not support those cases.

  Args:
    values: A list of potentially ragged tensors of the same ragged_rank.
    mask: A potentially ragged tensor of the same ragged_rank as the elements
      in `values`.

  Returns:
    A tuple in which the first element is the list of tensors and the second
    is the mask tensor. ([values], mask). The mask and the elements in
    `values` are equal to the flat_values of the input arguments (if they
    were ragged).
  """
  if isinstance(values, list):
    is_all_ragged = all(
        isinstance(rt, ragged_tensor.RaggedTensor) for rt in values)
    is_any_ragged = any(
        isinstance(rt, ragged_tensor.RaggedTensor) for rt in values)
  else:
    is_all_ragged = isinstance(values, ragged_tensor.RaggedTensor)
    is_any_ragged = is_all_ragged
  if (is_all_ragged and
      ((mask is None) or isinstance(mask, ragged_tensor.RaggedTensor))):
    to_be_stripped = False
    if not isinstance(values, list):
      values = [values]
      to_be_stripped = True

    # NOTE: we leave the flat_values compatibility to
    # tf.TensorShape `assert_is_compatible_with` check if both dynamic
    # dimensions are equal and then use the flat_values.
    nested_row_split_list = [rt.nested_row_splits for rt in values]
    assertion_list = _assert_splits_match(nested_row_split_list)

    # If both are ragged, sample_weights also should be ragged with the same
    # dims.
    if isinstance(mask, ragged_tensor.RaggedTensor):
      assertion_list_for_mask = _assert_splits_match(
          [nested_row_split_list[0], mask.nested_row_splits])
      tmp = control_flow_ops.with_dependencies(assertion_list_for_mask,
                                               mask.flat_values)
      mask = array_ops.expand_dims(tmp, -1)

    # values has at least 1 element.
    flat_values = []
    for value in values:
      tmp = control_flow_ops.with_dependencies(assertion_list,
                                               value.flat_values)
      flat_values.append(array_ops.expand_dims(tmp, -1))

    values = flat_values[0] if to_be_stripped else flat_values

  elif is_any_ragged:
    raise TypeError('One of the inputs does not have acceptable types.')
  # values are empty or value are not ragged and mask is ragged.
  elif isinstance(mask, ragged_tensor.RaggedTensor):
    raise TypeError('Ragged mask is not allowed with non-ragged inputs.')

  return values, mask
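# Hedged sketch (assuming TF 1.x graph mode, with ragged support available) of
# the compatibility check above, written without the private
# `_assert_splits_match` helper: assert that two ragged tensors share
# row_splits, then work on their flat_values. Names and data are illustrative.
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

rt1 = tf.ragged.constant([[1.0, 2.0], [3.0]])
rt2 = tf.ragged.constant([[10.0, 20.0], [30.0]])

checks = [tf.assert_equal(rt1.row_splits, rt2.row_splits,
                          message='Ragged structures do not match.')]
flat1 = control_flow_ops.with_dependencies(checks, rt1.flat_values)
flat2 = control_flow_ops.with_dependencies(checks, rt2.flat_values)

with tf.Session() as sess:
  print(sess.run([flat1, flat2]))  # [[1. 2. 3.], [10. 20. 30.]]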