def train(self, sentences):
    """Run one gradient step of the next-token language model.

    The sentences are tokenized, mapped to vocabulary ids, embedded and fed
    through the RNN layer; the logits at every position are scored against
    the following token. Every variable watched by the gradient tape is then
    updated in place with `assign_sub(gradient)` (i.e. a step size of 1).

    Args:
      sentences: Input passed straight to `self._tokenize`.

    Returns:
      The scalar loss: the masked mean of the per-token cross entropies.
    """
    indices, values, dense_shape = self._tokenize(sentences)
    sparse_tokens = tf.sparse.SparseTensor(
        indices=indices, values=values, dense_shape=dense_shape)
    dense_tokens = tf.sparse.to_dense(sparse_tokens, default_value="")

    sparse_ids = tf.sparse.SparseTensor(
        indices=sparse_tokens.indices,
        values=self._words_to_indices(sparse_tokens.values),
        dense_shape=sparse_tokens.dense_shape)
    dense_ids = tf.sparse.to_dense(sparse_ids, default_value=0)

    # Each position predicts its successor: inputs drop the last column,
    # targets drop the first one.
    input_ids = dense_ids[:, :-1]
    target_ids = dense_ids[:, 1:]
    input_tokens = dense_tokens[:, :-1]

    # Only positions holding a real, non-terminal token contribute to the
    # loss: padding ("") and the end marker ("<E>") are masked out.
    not_padding = tf.logical_not(tf.equal(input_tokens, ""))
    not_terminal = tf.logical_not(tf.equal(input_tokens, "<E>"))
    loss_mask = tf.cast(tf.logical_and(not_padding, not_terminal), tf.int32)

    with tf.GradientTape() as tape:
        embedded = tf.nn.embedding_lookup(self._embeddings, input_ids)
        initial_state = self._lstm_cell.get_initial_state(embedded)
        rnn_output = self._rnn_layer(
            inputs=embedded, initial_state=initial_state)

        # Flatten so that every row is the RNN output of a single token.
        flat_output = tf.reshape(
            rnn_output, [-1, self._lstm_cell.output_size])
        logits = self._logit_layer(flat_output)

        flat_targets = tf.reshape(target_ids, [-1])
        flat_weights = tf.cast(tf.reshape(loss_mask, [-1]), tf.float32)

        token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=flat_targets, logits=logits)

        # Weighted mean over the unmasked tokens.
        final_loss = tf.math.divide(
            tf.reduce_sum(tf.multiply(token_losses, flat_weights)),
            tf.reduce_sum(flat_weights),
            name="final_loss")

    variables = tape.watched_variables()
    gradients = tape.gradient(final_loss, variables)
    for variable, gradient in zip(variables, gradients):
        variable.assign_sub(gradient)

    return final_loss
def train_step(inputs):
    """Build `step_fn` for efficientnet learning.

    Runs one optimisation step on a batch: the batch is tiled
    `FLAGS.ensemble_size` times, the loss (cross entropy plus an L2 penalty)
    is differentiated, the gradients are applied through `optimizer`, and the
    training metrics are updated.

    `FLAGS`, `strategy`, `model`, `optimizer` and `metrics` are not
    parameters; they are resolved from the enclosing scope.

    Args:
      inputs: A pair `(images, labels)`. Labels are used as dense targets of
        `categorical_crossentropy` and are arg-maxed for the ECE metric, so
        they are presumably one-hot -- confirm against the input pipeline.

    Returns:
      A dict with the per-replica scaled negative log-likelihood
      (`'loss/negative_log_likelihood'`) and total loss
      (`'loss/total_loss'`).
    """
    images, labels = inputs
    images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
    labels = tf.tile(labels, [FLAGS.ensemble_size, 1])

    num_replicas = tf.cast(strategy.num_replicas_in_sync, tf.float32)
    l2_coeff = tf.cast(FLAGS.l2, tf.float32)

    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        logits = tf.cast(logits, tf.float32)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(
                labels,
                logits,
                from_logits=True,
                label_smoothing=FLAGS.label_smoothing))

        # Apply l2 on the slow weights and bias terms. This excludes BN
        # parameters and fast weight approximate posterior/prior parameters,
        # but pay caution to their naming scheme.
        filtered_variables = [
            tf.reshape(var, (-1,))
            for var in model.trainable_variables
            if 'kernel' in var.name or 'bias' in var.name
        ]
        # `tf.nn.l2_loss` returns sum(w**2) / 2, hence the factor 2. The
        # coefficient is applied exactly once, below; multiplying by
        # `FLAGS.l2` here as well would square the regularisation strength.
        l2_loss = 2 * tf.nn.l2_loss(tf.concat(filtered_variables, axis=0))
        loss = negative_log_likelihood + l2_coeff * l2_loss
        # Scale so that summing gradients over replicas yields the mean.
        scaled_loss = loss / num_replicas

    trainable_variables = model.trainable_variables
    grads = tape.gradient(scaled_loss, trainable_variables)

    # Separate learning rate implementation.
    if FLAGS.fast_weight_lr_multiplier != 1.0:
        grads_and_vars = []
        for grad, var in zip(grads, trainable_variables):
            # Apply different learning rate on the fast weights. This
            # excludes BN and slow weights, but pay caution to the naming
            # scheme.
            if 'batch_norm' not in var.name and 'kernel' not in var.name:
                grads_and_vars.append(
                    (grad * FLAGS.fast_weight_lr_multiplier, var))
            else:
                grads_and_vars.append((grad, var))
        optimizer.apply_gradients(grads_and_vars)
    else:
        optimizer.apply_gradients(zip(grads, trainable_variables))

    sparse_labels = tf.cast(
        tf.math.argmax(labels, axis=-1, output_type=tf.int32), tf.float32)
    probs = tf.nn.softmax(logits)
    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, logits)
    metrics['train/ece'].update_state(sparse_labels, probs)

    step_info = {
        'loss/negative_log_likelihood': negative_log_likelihood / num_replicas,
        'loss/total_loss': scaled_loss,
    }
    return step_info
def _call(self, r):
    """Return `(mean, variance, grad_mean)` for an exponential mean function.

    The mean is `exp(r)`; because the derivative of `exp` is itself, the same
    tensor is returned as the gradient of the mean. The variance is
    `mean + mean**2 / total_count`, with `self._total_count` cast to the
    dtype of `r`.
    """
    exp_r = tf.math.exp(r)
    total_count = tf.cast(self._total_count, r.dtype)
    variance = exp_r + exp_r**2 / total_count
    return exp_r, variance, exp_r
def _call(self, r):
    """Return `(mean, variance, grad_mean)` for a reciprocal mean function.

    The mean is `1 / r` and its derivative is `-1 / r**2`. The variance does
    not depend on `r`: it is `self._scale**2` broadcast to the shape of `r`.
    """
    inverse = tf.math.reciprocal(r)
    d_inverse = -(r**-2)
    scale = tf.cast(self._scale, r.dtype)
    constant_variance = tf.fill(tf.shape(r), scale**2.)
    return inverse, constant_variance, d_inverse
def _as_distribution(self, r):
    """Build the `tfd.Gamma` distribution parameterized by `r`.

    The concentration is `self._concentration` cast (lazily, through a
    `DeferredTensor`) to the dtype of `r`; the rate is `exp(-r)`, likewise
    deferred.
    """
    deferred_concentration = DeferredTensor(
        self._concentration,
        lambda x: tf.cast(x, r.dtype),
        dtype=r.dtype)
    deferred_rate = DeferredTensor(r, lambda x: tf.math.exp(-x))
    return tfd.Gamma(
        concentration=deferred_concentration, rate=deferred_rate)
def _as_distribution(self, r):
    """Build the `tfd.Binomial` distribution whose logits are `r`.

    `self._total_count` is wrapped in a `DeferredTensor` that casts it to the
    dtype of `r` when it is read.
    """
    deferred_total_count = DeferredTensor(
        self._total_count,
        lambda x: tf.cast(x, r.dtype),
        dtype=r.dtype)
    return tfd.Binomial(total_count=deferred_total_count, logits=r)
def sparse_read(self, indices, name=None):
    """Sparsely read the wrapped variable and cast the result.

    Delegates to `self._variable.sparse_read` and casts what it returns to
    `self._cast_dtype`.
    """
    return tf.cast(
        self._variable.sparse_read(indices, name=name), self._cast_dtype)
def value(self):
    """Return the wrapped variable's value, cast only when required.

    When `self._should_cast()` is false the value of `self._variable` is
    returned untouched; otherwise it is cast to `self._cast_dtype`.
    """
    raw = self._variable.value()
    return tf.cast(raw, self._cast_dtype) if self._should_cast() else raw
def mixup_or_cutmix(batch):
    """Apply either `my_mixup` or `my_cutmix` to `batch`, chosen at random.

    A random integer in {0, 1} is drawn; 1 selects `my_mixup`, 0 selects
    `my_cutmix`.
    """
    logging.info('Randomly applying cutmix or mixup with 50% chance!')
    coin = tf.random.uniform([], maxval=2, dtype=tf.int32)
    use_mixup = tf.cast(coin, tf.bool)
    return tf.cond(
        use_mixup,
        lambda: my_mixup(batch),
        lambda: my_cutmix(batch))
def cast_fn(batch):
    """Return a shallow copy of `batch` with `'images'` cast to a new dtype.

    The target dtype is `_to_tf_dtype(dtype)`, where `dtype` comes from the
    enclosing scope. The input mapping itself is left unmodified.
    """
    converted = dict(**batch)
    converted['images'] = tf.cast(converted['images'], _to_tf_dtype(dtype))
    return converted
def _milstein_step(*, dim, i, written_count, current_state, result, drift_fn,
                   volatility_fn, grad_volatility_fn, wiener_mean,
                   num_samples, times, dt, sqrt_dt, keep_mask, random_type,
                   seed, normal_draws, input_gradients, stratonovich_order,
                   aux_normal_draws):
    """Performs one step of Milstein scheme.

    Advances `current_state` by one grid step and, where `keep_mask[i + 1]`
    is set, writes the new state into `result` at position `written_count`
    along axis 1.

    The Wiener increment is read from `normal_draws[i]` when precomputed
    draws are supplied and sampled otherwise. Three sets of auxiliary normal
    draws for the Stratonovich integrals are obtained the same way from
    `aux_normal_draws`.

    Returns:
      A tuple `(i + 1, written_count, next_state, result)`.
    """
    current_time = times[i + 1]
    written_count = tf.cast(written_count, tf.int32)

    if normal_draws is None:
        dw = random.mv_normal_sample((num_samples,),
                                     mean=wiener_mean,
                                     random_type=random_type,
                                     seed=seed)
    else:
        dw = normal_draws[i]

    # Three sets of normal draws for stratonovich integrals.
    if aux_normal_draws is None:
        stratonovich_draws = [
            random.mv_normal_sample(
                (num_samples,),
                mean=tf.zeros((dim, stratonovich_order),
                              dtype=current_state.dtype,
                              name='stratonovich_draws_{}'.format(j)),
                random_type=random_type,
                seed=seed)
            for j in range(3)
        ]
    else:
        stratonovich_draws = [
            tf.reshape(aux_normal_draws[j][i],
                       [num_samples, dim, stratonovich_order])
            for j in range(3)
        ]

    # Both the scalar and the multi-dimensional update start from the same
    # drift and volatility evaluations.
    drift = drift_fn(current_time, current_state)
    vol = volatility_fn(current_time, current_state)

    if dim == 1:
        grad_vol = grad_volatility_fn(current_time, current_state,
                                      tf.ones_like(current_state))
        next_state = _milstein_1d(dw=dw,
                                  dt=dt[i],
                                  sqrt_dt=sqrt_dt[i],
                                  current_state=current_state,
                                  drift=drift,
                                  vol=vol,
                                  grad_vol=grad_vol)
    else:
        # This is a list of size equal to the dimension of the state space
        # `dim`. It contains tensors of shape [num_samples, dim, wiener_dim]
        # representing the gradient of the volatility function. In our case,
        # the dimension of the wiener process `wiener_dim` is equal to the
        # state dimension `dim`.
        grad_vol = [
            grad_volatility_fn(current_time, current_state, direction)
            for direction in input_gradients
        ]
        next_state = _milstein_nd(dim=dim,
                                  num_samples=num_samples,
                                  dw=dw,
                                  dt=dt[i],
                                  sqrt_dt=sqrt_dt[i],
                                  current_state=current_state,
                                  drift=drift,
                                  vol=vol,
                                  grad_vol=grad_vol,
                                  stratonovich_draws=stratonovich_draws,
                                  stratonovich_order=stratonovich_order)

    keep_this_step = keep_mask[i + 1]
    result = utils.maybe_update_along_axis(
        tensor=result,
        do_update=keep_this_step,
        ind=written_count,
        axis=1,
        new_tensor=tf.expand_dims(next_state, axis=1))
    written_count += tf.cast(keep_this_step, dtype=tf.int32)
    return i + 1, written_count, next_state, result
def sample(*,
           dim,
           drift_fn,
           volatility_fn,
           times,
           time_step=None,
           num_time_steps=None,
           num_samples=1,
           initial_state=None,
           grad_volatility_fn=None,
           random_type=None,
           seed=None,
           swap_memory=True,
           skip=0,
           precompute_normal_draws=True,
           watch_params=None,
           stratonovich_order=5,
           dtype=None,
           name=None):
    r"""Returns sample paths from the process using the Milstein method.

    For an Ito process,

    ```
      dX = a(t, X_t) dt + b(t, X_t) dW_t
    ```
    given drift `a`, volatility `b` and derivative of volatility `b'`, the
    Milstein method generates a sequence {Y_n} approximating X

    ```
    Y_{n+1} = Y_n + a(t_n, Y_n) dt + b(t_n, Y_n) dW_n + \frac{1}{2} b(t_n, Y_n)
    b'(t_n, Y_n) ((dW_n)^2 - dt)
    ```
    where `dt = t_{n+1} - t_n`, `dW_n = (N(0, t_{n+1}) - N(0, t_n))` and `N`
    is a sample from the Normal distribution.

    In higher dimensions, when `a(t, X_t)` is a d-dimensional vector valued
    function and `W_t` is a d-dimensional Wiener process, we have for the kth
    element of the expansion:

    ```
    Y_{n+1}[k] = Y_n[k] + a(t_n, Y_n)[k] dt + \sum_{j=1}^d b(t_n, Y_n)[k, j]
    dW_n[j] + \sum_{j_1=1}^d \sum_{j_2=1}^d L_{j_1} b(t_n, Y_n)[k, j_2]
    I(j_1, j_2)
    ```
    where `L_{j} = \sum_{i=1}^d b(t_n, Y_n)[i, j] \frac{\partial}{\partial x^i}`
    is an operator and `I(j_1, j_2) = \int_{t_n}^{t_{n+1}} \int_{t_n}^{s_1}
    dW_{s_2}[j_1] dW_{s_1}[j_2]` is a multiple Ito integral.

    See [1] and [2] for details.

    #### References
    [1]: Wikipedia. Milstein method:
    https://en.wikipedia.org/wiki/Milstein_method
    [2]: Peter E. Kloeden, Eckhard Platen. Numerical Solution of Stochastic
      Differential Equations. Springer. 1992

    Args:
      dim: Python int greater than or equal to 1. The dimension of the Ito
        Process.
      drift_fn: A Python callable to compute the drift of the process. The
        callable should accept two real `Tensor` arguments of the same dtype.
        The first argument is the scalar time t, the second argument is the
        value of Ito process X - tensor of shape `batch_shape + [dim]`. The
        result is value of drift a(t, X). The return value of the callable is
        a real `Tensor` of the same dtype as the input arguments and of shape
        `batch_shape + [dim]`.
      volatility_fn: A Python callable to compute the volatility of the
        process. The callable should accept two real `Tensor` arguments of
        the same dtype as `times`. The first argument is the scalar time t,
        the second argument is the value of Ito process X - tensor of shape
        `batch_shape + [dim]`. The result is value of volatility b(t, X). The
        return value of the callable is a real `Tensor` of the same dtype as
        the input arguments and of shape `batch_shape + [dim, dim]`.
      times: Rank 1 `Tensor` of increasing positive real values. The times at
        which the path points are to be evaluated.
      time_step: An optional scalar real `Tensor` - maximal distance between
        points in grid in Milstein schema. Either this or `num_time_steps`
        should be supplied.
        Default value: `None`.
      num_time_steps: An optional Scalar integer `Tensor` - a total number of
        time steps performed by the algorithm. The maximal distance between
        points in grid is bounded by
        `times[-1] / (num_time_steps - times.shape[0])`.
        Either this or `time_step` should be supplied.
        Default value: `None`.
      num_samples: Positive scalar `int`. The number of paths to draw.
        Default value: 1.
      initial_state: `Tensor` of shape `[dim]`. The initial state of the
        process.
        Default value: None which maps to a zero initial state.
      grad_volatility_fn: An optional python callable to compute the gradient
        of `volatility_fn`. The callable should accept three real `Tensor`
        arguments of the same dtype as `times`. The first argument is the
        scalar time t. The second argument is the value of Ito process X -
        tensor of shape `batch_shape + [dim]`. The third argument is a tensor
        of input gradients of shape `batch_shape + [dim]` to pass to
        `gradient.fwd_gradient`. The result is a list of values corresponding
        to the forward gradient of volatility b(t, X) with respect to X. The
        return value of the callable is a list of size `dim` containing real
        `Tensor`s of the same dtype as the input arguments and of shape
        `batch_shape + [dim, dim]`. Each index of the list corresponds to a
        dimension of the state. If `None`, the gradient is computed from
        `volatility_fn` using forward differentiation.
      random_type: Enum value of `RandomType`. The type of (quasi)-random
        number generator to use to generate the paths.
        Default value: None which maps to the standard pseudo-random numbers.
      seed: Seed for the random number generator. The seed is only relevant
        if `random_type` is one of `[STATELESS, PSEUDO, HALTON_RANDOMIZED,
        PSEUDO_ANTITHETIC, STATELESS_ANTITHETIC]`. For `PSEUDO`,
        `PSEUDO_ANTITHETIC` and `HALTON_RANDOMIZED` the seed should be a
        Python integer. For `STATELESS` and `STATELESS_ANTITHETIC` it must be
        supplied as an integer `Tensor` of shape `[2]`.
        Default value: `None` which means no seed is set.
      swap_memory: A Python bool. Whether GPU-CPU memory swap is enabled for
        this op. See an equivalent flag in `tf.while_loop` documentation for
        more details. Useful when computing a gradient of the op since
        `tf.while_loop` is used to propagate stochastic process in time.
        Default value: True.
      skip: `int32` 0-d `Tensor`. The number of initial points of the Sobol
        or Halton sequence to skip. Used only when `random_type` is 'SOBOL',
        'HALTON', or 'HALTON_RANDOMIZED', otherwise ignored.
        Default value: `0`.
      precompute_normal_draws: Python bool. Indicates whether the noise
        increments `N(0, t_{n+1}) - N(0, t_n)` are precomputed. For `HALTON`
        and `SOBOL` random types the increments are always precomputed. While
        the resulting graph consumes more memory, the performance gains might
        be significant.
        Default value: `True`.
      watch_params: An optional list of zero-dimensional `Tensor`s of the
        same `dtype` as `initial_state`. If provided, specifies `Tensor`s
        with respect to which the differentiation of the sampling function
        will happen. A more efficient algorithm is used when `watch_params`
        are specified. Note that the function becomes differentiable only wrt
        to these `Tensor`s and the `initial_state`. The gradient wrt any
        other `Tensor` is set to be zero.
      stratonovich_order: A positive integer. The number of terms to use when
        calculating the approximate Stratonovich integrals in the
        multidimensional scheme. Stratonovich integrals are an alternative to
        Ito integrals, and can be used interchangeably when defining the
        higher order terms in the update equation. We use Stratonovich
        integrals here because they have a convenient approximation scheme
        for calculating cross terms involving different components of the
        Wiener process. See Eq. 8.10 in Section 5.8 of [2].
        Default value: `5`.
      dtype: `tf.Dtype`. If supplied the dtype for the input and output
        `Tensor`s.
        Default value: None which means that the dtype implied by `times` is
        used.
      name: Python string. The name to give this op.
        Default value: `None` which maps to `milstein_sample`.

    Returns:
      Whatever the private `_sample` helper returns for the prepared
      arguments.

    Raises:
      ValueError: If `stratonovich_order` is not positive, if both
        `num_time_steps` and `time_step` are supplied, or if neither is.
    """
    with tf.name_scope(name or 'milstein_sample'):
        if stratonovich_order <= 0:
            raise ValueError(
                '`stratonovich_order` must be a positive integer.')

        times = tf.convert_to_tensor(times, dtype=dtype)
        if dtype is None:
            dtype = times.dtype
        if initial_state is None:
            initial_state = tf.zeros(dim, dtype=dtype)
        initial_state = tf.convert_to_tensor(
            initial_state, dtype=dtype, name='initial_state')
        num_requested_times = tf.shape(times)[0]

        # Create a time grid for the Milstein scheme: exactly one of
        # `num_time_steps` and `time_step` has to be given.
        if num_time_steps is not None and time_step is not None:
            raise ValueError(
                'Only one of either `num_time_steps` or `time_step` '
                'should be defined but not both')
        if num_time_steps is None and time_step is None:
            raise ValueError(
                'Either `num_time_steps` or `time_step` should be '
                'defined.')
        if time_step is None:
            num_time_steps = tf.convert_to_tensor(
                num_time_steps, dtype=tf.int32, name='num_time_steps')
            time_step = times[-1] / tf.cast(num_time_steps, dtype=dtype)
        else:
            time_step = tf.convert_to_tensor(
                time_step, dtype=dtype, name='time_step')

        times, keep_mask, time_indices = utils.prepare_grid(
            times=times,
            time_step=time_step,
            num_time_steps=num_time_steps,
            dtype=dtype)

        if watch_params is not None:
            watch_params = [
                tf.convert_to_tensor(param, dtype=dtype)
                for param in watch_params
            ]

        if grad_volatility_fn is None:
            # Fall back to forward-mode differentiation of `volatility_fn`.
            def _forward_grad_volatility(current_time, current_state,
                                         input_gradients):
                return gradient.fwd_gradient(
                    functools.partial(volatility_fn, current_time),
                    current_state,
                    input_gradients=input_gradients,
                    unconnected_gradients=tf.UnconnectedGradients.ZERO)

            grad_volatility_fn = _forward_grad_volatility

        # For dim > 1, one broadcast unit vector per state dimension serves
        # as the direction passed to `grad_volatility_fn`.
        input_gradients = None
        if dim > 1:
            input_gradients = [
                tf.broadcast_to(unit_vector, [num_samples, dim])
                for unit_vector in tf.unstack(tf.eye(dim, dtype=dtype))
            ]

        return _sample(dim=dim,
                       drift_fn=drift_fn,
                       volatility_fn=volatility_fn,
                       grad_volatility_fn=grad_volatility_fn,
                       times=times,
                       time_step=time_step,
                       keep_mask=keep_mask,
                       num_requested_times=num_requested_times,
                       num_samples=num_samples,
                       initial_state=initial_state,
                       random_type=random_type,
                       seed=seed,
                       swap_memory=swap_memory,
                       skip=skip,
                       precompute_normal_draws=precompute_normal_draws,
                       watch_params=watch_params,
                       time_indices=time_indices,
                       input_gradients=input_gradients,
                       stratonovich_order=stratonovich_order,
                       dtype=dtype)
def find_interval_index(query_xs,
                        interval_lower_xs,
                        last_interval_is_closed=False,
                        dtype=None,
                        name=None):
    """Function to find the index of the interval where query points lies.

    Given a list of adjacent half-open intervals [x_0, x_1), [x_1, x_2), ...,
    [x_{n-1}, x_n), [x_n, inf), described by a list [x_0, x_1, ..., x_{n-1},
    x_n]. Return the index where the input query points lie. If x >= x_n, n
    is returned, and if x < x_0, -1 is returned. If `last_interval_is_closed`
    is set to `True`, the last interval [x_{n-1}, x_n] is interpreted as
    closed (including x_n), so a query equal to x_n maps to n - 1.

    ### Example

    ```python
    interval_lower_xs = [0.25, 0.5, 1.0, 2.0, 3.0]
    query_xs = [0.25, 3.0, 5.0, 0.0, 0.5, 0.8]
    result = find_interval_index(query_xs, interval_lower_xs)
    # result == [0, 4, 4, -1, 1, 1]
    ```

    Args:
      query_xs: Real `Tensor`, the x coordinates for which the interval index
        is to be found. As the example shows, it does not need to be sorted.
      interval_lower_xs: `Tensor` of the same dtype as `query_xs`. The values
        x_0, ..., x_n that define the interval starts. They are searched with
        `tf.searchsorted`, so they have to be sorted in increasing order
        along the last axis.
      last_interval_is_closed: If set to `True`, the last interval is
        interpreted as closed.
      dtype: Optional `tf.Dtype`. If supplied, the dtype for `query_xs` and
        `interval_lower_xs`.
        Default value: None which maps to the default dtype inferred by
        TensorFlow.
      name: Optional name of the operation.

    Returns:
      An integer tensor with the index of the interval containing each query
      point. `-1` means the query point lies before all intervals and `n`
      means it lies in `[x_n, inf)` (or strictly to the right of `x_n` when
      `last_interval_is_closed` is `True`).
    """
    with tf.compat.v1.name_scope(
            name,
            default_name='find_interval_index',
            values=[query_xs, interval_lower_xs, last_interval_is_closed]):
        # TODO(b/138988951): add ability to validate that intervals are
        # increasing.
        # TODO(b/138988951): validate that if last_interval_is_closed, input
        # size must be > 1.
        query_xs = tf.convert_to_tensor(query_xs, dtype=dtype)
        interval_lower_xs = tf.convert_to_tensor(interval_lower_xs,
                                                 dtype=dtype)

        # Answer under the assumption that every interval is half-open.
        half_open_indices = tf.searchsorted(
            interval_lower_xs, query_xs, side='right') - 1

        # Closed-last-interval correction: a query that landed on the last
        # index while being <= x_n (i.e. equal to x_n) belongs to the
        # interval before it.
        last_index = tf.shape(interval_lower_xs)[-1] - 1
        last_x = tf.gather(interval_lower_xs, [last_index], axis=-1)
        on_right_boundary = tf.logical_and(
            tf.equal(half_open_indices, last_index),
            tf.less_equal(query_xs, last_x))
        # The cap is last_index - 1 on the boundary and last_index elsewhere.
        caps = last_index - tf.cast(on_right_boundary, dtype=tf.dtypes.int32)
        closed_indices = tf.minimum(half_open_indices, caps)

        return tf.compat.v1.where(last_interval_is_closed, closed_indices,
                                  half_open_indices)
def fold_in(seed, axes):
    """Fold the index along each named axis into `seed`.

    For every name in `axes`, in order, the axis index returned by
    `get_axis_index` is cast to int32 and folded into the running seed with
    `samplers.fold_in`. With no axes the seed is returned unchanged.
    """
    for axis_name in axes:
        index = tf.cast(get_axis_index(axis_name), tf.int32)
        seed = samplers.fold_in(seed, index)
    return seed
def _head(self, neck_outputs):
    """Score visual/text similarity and wrap it in an `AgentOutput`.

    Token-level attention between the visual features of every timestep and
    the text features is reduced to one score per (batch, time) entry; the
    scores are then combined over time with mask-aware weights, tiled back
    to every timestep and passed through the affine transform
    `self.affine_a * x + self.affine_b`.

    Args:
      neck_outputs: Mapping with the keys `'visual_feature'`,
        `'text_feature'` and `constants.DISC_MASK`.

    Returns:
      A `common.AgentOutput` whose `policy_logits` are the transformed
      similarities and whose `baseline` is the pair of affine parameters,
      each tiled to `[time, batch_size]`.
    """
    num_timesteps = self._current_num_timesteps
    batch_size = self._current_batch_size

    # <tf.float32>[time * batch_size, 1, hidden_dim]
    flat_visual = neck_outputs['visual_feature']
    # <tf.float32>[time * batch_size, num_tokens, hidden_dim]
    flat_text = neck_outputs['text_feature']

    # <tf.float32>[time, batch_size, 1, hidden_dim]
    visual_feature = tf.reshape(
        flat_visual,
        [num_timesteps, batch_size] + flat_visual.shape[1:].as_list())
    # Drop the singleton axis, then go batch-major:
    # <tf.float32>[batch_size, time, hidden_dim]
    visual_feature = tf.squeeze(visual_feature, axis=2)
    visual_feature = tf.transpose(visual_feature, [1, 0, 2])

    # [time, batch_size]
    disc_mask = tf.reshape(neck_outputs[constants.DISC_MASK],
                           [num_timesteps, batch_size])
    first_true = utils.get_first_true_column(disc_mask)

    def _text_at_first_true():
        return tf.boolean_mask(flat_text, tf.reshape(first_true, [-1]))

    def _text_at_first_timestep():
        unflattened = tf.reshape(
            flat_text,
            [num_timesteps, batch_size] + flat_text.shape[1:].as_list())
        return unflattened[0, :, :, :]

    # <tf.float32>[batch_size, num_tokens, hidden_dim]
    text_feature = tf.cond(
        tf.keras.backend.any(first_true),
        _text_at_first_true,
        _text_at_first_timestep)

    # L2 normalization of both features along axis 2 is currently disabled.

    # <tf.float32>[batch_size, time, num_tokens]
    alpha_i_j = tf.matmul(visual_feature,
                          tf.transpose(text_feature, perm=[0, 2, 1]))
    # <tf.float32>[batch_size, time, num_tokens]
    ealpha_i_j = tf.exp(alpha_i_j)
    token_totals = tf.expand_dims(tf.reduce_sum(ealpha_i_j, 2), 2)
    sum_i_j = tf.tile(token_totals, [1, 1, tf.shape(ealpha_i_j)[2]])

    # <tf.float32>[batch, time]
    mask = tf.cast(tf.transpose(disc_mask, perm=[1, 0]), tf.float32)

    # Attention weights over the tokens: <tf.float32>[batch, time, num_tokens]
    c_i_j = tf.divide(ealpha_i_j, sum_i_j)
    # <tf.float32>[batch, time]
    score = tf.reduce_sum(c_i_j * alpha_i_j, 2)

    # Weights over time, restricted to the masked-in timesteps.
    escore = tf.exp(-1 * score) * mask
    time_totals = tf.expand_dims(tf.reduce_sum(escore, 1), 1)
    sum_escore = tf.tile(time_totals, [1, tf.shape(escore)[1]])
    score_weight = tf.divide(escore, sum_escore)

    similarities = tf.reduce_sum(mask * score * score_weight, 1)
    similarities = tf.expand_dims(similarities, axis=0)
    # [time_step, batch_size]
    similarities = tf.tile(similarities, [num_timesteps, 1])

    # Apply an affine transform.
    similarities = similarities * self.affine_a + self.affine_b

    tile_shape = [num_timesteps, batch_size]
    output_a = tf.tile(
        tf.reshape(tf.convert_to_tensor(self.affine_a), [1, 1]), tile_shape)
    output_b = tf.tile(
        tf.reshape(tf.convert_to_tensor(self.affine_b), [1, 1]), tile_shape)

    return common.AgentOutput(
        policy_logits=similarities, baseline=(output_a, output_b))
def spline_slope_constraint(s, dtype=tf.float32):
    """Maps `s` to strictly positive slopes, dropping the last position.

    The last entry along the innermost axis is sliced off, the remainder is
    cast to `dtype` and passed through a softplus, and a floor of `1e-2` is
    added so that every returned slope is larger than that minimum.
    """
    # Slice off a position since this is nknots - 2 vs nknots - 1 for bin
    # sizes.
    min_slope = 1e-2
    interior = tf.cast(s[..., :-1], dtype)
    return tf.math.softplus(interior) + min_slope
def _preprocess_train_image(image, mean_rgb, stddev_rgb):
    """Cast a training image to float32, augment it and standardize it.

    Standardization subtracts `mean_rgb` and divides by `stddev_rgb` after
    `_augment_image` has been applied.
    """
    augmented = _augment_image(tf.cast(image, tf.float32))
    return (augmented - mean_rgb) / stddev_rgb
def read_value(self):
    """Read the wrapped variable and cast the result to `self._cast_dtype`."""
    return tf.cast(self._variable.read_value(), self._cast_dtype)
def _preprocess_eval_image(image, mean_rgb, stddev_rgb):
    """Cast an evaluation image to float32 and standardize it.

    Standardization subtracts `mean_rgb` and divides by `stddev_rgb`; no
    augmentation is applied.
    """
    as_float = tf.cast(image, tf.float32)
    return (as_float - mean_rgb) / stddev_rgb
def gather_nd(self, indices, name=None):
    """Gather slices of the wrapped variable and cast the result.

    Delegates to `self._variable.gather_nd` and casts what it returns to
    `self._cast_dtype`.
    """
    return tf.cast(
        self._variable.gather_nd(indices, name=name), self._cast_dtype)
def nelder_mead_one_step(current_simplex,
                         current_objective_values,
                         objective_function=None,
                         dim=None,
                         func_tolerance=None,
                         position_tolerance=None,
                         batch_evaluate_objective=False,
                         reflection=None,
                         expansion=None,
                         contraction=None,
                         shrinkage=None,
                         name=None):
    """A single iteration of the Nelder Mead algorithm.

    The worst vertex is reflected through the centroid of the opposite face
    and the objective is evaluated at the reflected point. The first
    predicate that holds, in this order, decides the update: already
    converged, accept the reflection, expansion, outside contraction; if none
    holds an inside contraction is performed.

    Returns:
      A tuple `(converged, next_simplex, next_objective_at_simplex,
      num_evaluations)` where `num_evaluations` counts the reflection
      evaluation plus the evaluations made by the selected branch.
    """
    with tf1.name_scope(name, 'nelder_mead_one_step'):
        domain_dtype = current_simplex.dtype.base_dtype

        # Rank the vertices from best (lowest objective) to worst.
        order = tf.argsort(
            current_objective_values, direction='ASCENDING', stable=True)
        best_index = order[0]
        worst_index = order[-1]
        second_worst_index = order[-2]

        worst_vertex = current_simplex[worst_index]
        best_objective_value = current_objective_values[best_index]
        worst_objective_value = current_objective_values[worst_index]
        second_worst_objective_value = current_objective_values[
            second_worst_index]

        # Compute the centroid of the face opposite the worst vertex.
        face_centroid = tf.reduce_sum(
            input_tensor=current_simplex, axis=0) - worst_vertex
        face_centroid /= tf.cast(dim, domain_dtype)

        # Reflect the worst vertex through the opposite face.
        reflected = face_centroid + reflection * (face_centroid - worst_vertex)
        objective_at_reflected = objective_function(reflected)
        reflection_evaluations = 1

        has_converged = _check_convergence(current_simplex,
                                           current_simplex[best_index],
                                           best_objective_value,
                                           worst_objective_value,
                                           func_tolerance,
                                           position_tolerance)

        def _converged_fn():
            return (True, current_simplex, current_objective_values, 0)

        accept_reflected = (
            (objective_at_reflected < second_worst_objective_value) &
            (objective_at_reflected >= best_objective_value))
        accept_reflected_fn = _accept_reflected_fn(current_simplex,
                                                   current_objective_values,
                                                   worst_index,
                                                   reflected,
                                                   objective_at_reflected)

        do_expansion = objective_at_reflected < best_objective_value
        expansion_fn = _expansion_fn(objective_function,
                                     current_simplex,
                                     current_objective_values,
                                     worst_index,
                                     reflected,
                                     objective_at_reflected,
                                     face_centroid,
                                     expansion)

        do_outside_contraction = (
            (objective_at_reflected < worst_objective_value) &
            (objective_at_reflected >= second_worst_objective_value))
        outside_contraction_fn = _outside_contraction_fn(
            objective_function,
            current_simplex,
            current_objective_values,
            face_centroid,
            best_index,
            worst_index,
            reflected,
            objective_at_reflected,
            contraction,
            shrinkage,
            batch_evaluate_objective)

        inside_contraction_fn = _inside_contraction_fn(
            objective_function,
            current_simplex,
            current_objective_values,
            face_centroid,
            best_index,
            worst_index,
            worst_objective_value,
            contraction,
            shrinkage,
            batch_evaluate_objective)

        # The cases are not mutually exclusive; their order sets priority.
        cases = [
            (has_converged, _converged_fn),
            (accept_reflected, accept_reflected_fn),
            (do_expansion, expansion_fn),
            (do_outside_contraction, outside_contraction_fn),
        ]
        (converged,
         next_simplex,
         next_objective_at_simplex,
         case_evals) = prefer_static.case(
             cases, default=inside_contraction_fn, exclusive=False)

        next_simplex.set_shape(current_simplex.shape)
        next_objective_at_simplex.set_shape(current_objective_values.shape)
        return (converged,
                next_simplex,
                next_objective_at_simplex,
                reflection_evaluations + case_evals)
def _call(self, r):
    """Return `(mean, variance, grad_mean)` for a scaled exponential mean.

    The mean is `concentration * exp(r)`, which is also its own derivative
    with respect to `r`, so the same tensor is returned as the gradient. The
    variance is `exp(r) * mean`.
    """
    concentration = tf.cast(self._concentration, r.dtype)
    exp_r = tf.math.exp(r)
    mean = exp_r * concentration
    variance = exp_r * mean
    return mean, variance, mean
def _at_least_x_are_equal(a, b, x):
    """Return whether at least `x` elements of `a` and `b` are equal.

    Elementwise equality is counted over all positions and the count is
    compared against `x`.
    """
    num_equal = tf.reduce_sum(tf.cast(tf.equal(a, b), tf.int32))
    return tf.greater_equal(num_equal, x)
def _call(self, r):
    """Return `(mean, variance, grad_mean)` for an identity mean function.

    The mean is `r` itself, so its gradient is a tensor of ones. The variance
    does not depend on `r`: it is `self._scale**2` broadcast to the shape of
    `r`.
    """
    identity_mean = tf.identity(r)
    unit_gradient = tf.ones_like(r)
    scale = tf.cast(self._scale, r.dtype)
    constant_variance = tf.fill(tf.shape(r), scale**2.)
    return identity_mean, constant_variance, unit_gradient
def equal32(x, y):
    """Return the elementwise equality of `x` and `y` as a float32 tensor."""
    is_equal = tf.equal(x, y)
    return tf.cast(is_equal, tf.float32)
def _as_distribution(self, r):
    """Build the `tfd.Normal` distribution parameterized by `r`.

    The location is the (deferred) reciprocal of `r`; the scale is
    `self._scale` cast lazily to the dtype of `r`.
    """
    deferred_scale = DeferredTensor(
        self._scale,
        lambda x: tf.cast(x, r.dtype),
        dtype=r.dtype)
    deferred_loc = DeferredTensor(r, tf.math.reciprocal)
    return tfd.Normal(loc=deferred_loc, scale=deferred_scale)
def compute_loss_and_metrics(mu,
                             log_sigma_sq,
                             regression_targets,
                             labels,
                             task_type,
                             model_uncertainty,
                             loss_config,
                             regularization_loss=0.,
                             confidence_interval=95,
                             mode='train'):
    """Computes loss statistics and other metrics.

    For the classification task the loss is the mean sparse softmax cross
    entropy of `mu` against `labels`. Otherwise it is a mean squared error
    against `regression_targets` (optionally normalized when
    `loss_config['mse_normalize']` is truthy); with `model_uncertainty` the
    error is additionally scaled by `exp(-log_sigma_sq)` and the mean of
    `log_sigma_sq` is added. `regularization_loss` is added to the total in
    every case.

    Returns:
      A pair `(scalars_to_log, vectors_to_log)` of dicts. The total loss is
      stored in `scalars_to_log` under the key `str(mode) + '_loss'`.

    Raises:
      AssertionError: If the logged scalar names differ from the ones
        returned by `get_all_metric_names`, or if normalization is requested
        for an unsupported task type.
    """
    scalars_to_log = {'regularization_loss': regularization_loss}
    vectors_to_log = {'mu': mu}

    if task_type == TASK_CLASSIFICATION:
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=mu, labels=labels, name='cross_entropy')
        classification_loss = tf.reduce_mean(cross_entropy, name='class_loss')
        total_loss = classification_loss
        scalars_to_log['classification_loss'] = classification_loss

        predicted_labels = tf.argmax(mu, axis=1)
        correct_predictions = equal32(predicted_labels, labels)
    else:
        regression_loss = mse_loss(mu, regression_targets)
        if 'mse_normalize' in loss_config and loss_config['mse_normalize']:
            assert task_type in [
                TASK_GROUNDED_UNNORMALIZED_REGRESSION,
                TASK_NORMALIZED_REGRESSION
            ]
            regression_loss = normalize_regression_loss(regression_loss, mu)

        avg_regression_loss = tf.reduce_mean(regression_loss)
        vectors_to_log['regression_loss'] = regression_loss
        scalars_to_log['regression_loss'] = avg_regression_loss

        scalars_to_log['avg_mu'] = tf.reduce_mean(mu)
        scalars_to_log['var_mu'] = tf.reduce_mean(
            mse_loss(mu, tf.reduce_mean(mu)))

        # A positive prediction counts as label 1, anything else as label 0.
        predicted_labels = tf.cast(mu > 0, tf.int64)
        correct_predictions = equal32(predicted_labels, labels)

        if not model_uncertainty:
            total_loss = avg_regression_loss
        else:
            # This implements Eq. (1) in https://arxiv.org/pdf/1612.01474.pdf
            inv_sigma_sq = tf.math.exp(-log_sigma_sq)
            scaled_regression_loss = tf.reduce_mean(
                regression_loss * inv_sigma_sq)
            uncertainty_loss = tf.reduce_mean(log_sigma_sq)
            total_loss = uncertainty_loss + scaled_regression_loss

            scalars_to_log['uncertainty_loss'] = uncertainty_loss
            scalars_to_log['scaled_regression_loss'] = scaled_regression_loss
            scalars_to_log['uncertainty_plus_scaled_regression'] = total_loss

            sigma = tf.math.exp(log_sigma_sq / 2.)
            vectors_to_log['sigma'] = sigma
            scalars_to_log['avg_sigma'] = tf.reduce_mean(sigma)
            scalars_to_log['var_sigma'] = tf.reduce_mean(
                mse_loss(sigma, tf.reduce_mean(sigma)))

            # Compute # of labels that fall into the confidence interval.
            std_factor = get_std_factor_from_confidence_percent(
                confidence_interval)
            lower_bound = mu - std_factor * sigma
            upper_bound = mu + std_factor * sigma
            in_interval = tf.logical_and(
                tf.greater(regression_targets, lower_bound),
                tf.less(regression_targets, upper_bound))
            percent_in_conf_interval = tf.reduce_mean(
                tf.cast(in_interval, tf.float32))
            scalars_to_log['percent_in_conf_interval'] = (
                percent_in_conf_interval * 100)

            scalars_to_log['error_sigma_correlation'] = (
                tfp.stats.correlation(
                    x=regression_loss, y=sigma, event_axis=None))

            dists = tfp.distributions.Normal(mu, sigma)
            probs = dists.prob(regression_targets)
            scalars_to_log['avg_prob'] = tf.reduce_mean(probs)

    loss_name = str(mode) + '_loss'
    total_loss = tf.add(total_loss, regularization_loss, name=loss_name)
    scalars_to_log[loss_name] = total_loss

    vectors_to_log['correct_predictions'] = correct_predictions
    scalars_to_log['prediction_accuracy'] = tf.reduce_mean(
        correct_predictions)

    # Validate that metrics outputted are exactly what is expected
    expected = get_all_metric_names(task_type, model_uncertainty, loss_config,
                                    mode, False)
    assert set(expected) == set(scalars_to_log.keys())

    return scalars_to_log, vectors_to_log
def _call(self, r):
    """Return `(mean, variance, grad_mean)` for a softplus mean function.

    The mean is `softplus(r)` and its derivative is `sigmoid(r)`. The
    variance is `mean + mean**2 / total_count`, with `self._total_count` cast
    to the dtype of `r`.
    """
    softplus_r = tf.math.softplus(r)
    sigmoid_r = tf.math.sigmoid(r)
    total_count = tf.cast(self._total_count, r.dtype)
    variance = softplus_r + softplus_r**2 / total_count
    return softplus_r, variance, sigmoid_r
epochs=1, validation_data=(test_images, test_labels)) converter = tf.lite.TFLiteConverter.from_keras_model(model) tflite_model = converter.convert() tflite_models_dir = pathlib.Path("/tmp/mnist_tflite_models/") tflite_models_dir.mkdir(exist_ok=True, parents=True) tflite_model_file = tflite_models_dir / "mnist_model.tflite" tflite_model_file.write_bytes(tflite_model) converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE] mnist_train, _ = tf.keras.datasets.mnist.load_data() images = tf.cast(mnist_train[0], tf.float32) / 255.0 mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1) def representative_data_gen(): for input_value in mnist_ds.take(100): yield [input_value] converter.representative_dataset = representative_data_gen tflite_model_quant = converter.convert() tflite_model_quant_file = tflite_models_dir / "mnist_model_quant.tflite" tflite_model_quant_file.write_bytes(tflite_model_quant) # converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
def spline_bin_size_constraint(x, lo=-1, hi=1, dtype=tf.float32):
    """Turn unconstrained values into bin sizes that fill `[lo, hi]`.

    A softmax is taken over the innermost axis of `x`; the result is rescaled
    and shifted so that every entry is at least `1e-2` and the entries along
    that axis add up to `hi - lo` (provided `hi - lo` exceeds the number of
    bins times `1e-2`, otherwise the scale factor is not positive).

    Args:
      x: Tensor of unconstrained values; the size of its last axis is the
        number of bins.
      lo: Lower end of the interval covered by the bins.
      hi: Upper end of the interval covered by the bins.
      dtype: dtype that `x` and the bin count are cast to.

    Returns:
      A tensor with the same shape as `x`, of dtype `dtype`.
    """
    floor = 1e-2  # Smallest size any single bin may take.

    logits = tf.cast(x, dtype)
    bin_count = tf.cast(tf.shape(x)[-1], dtype)

    # Length left to distribute once every bin has received its floor.
    free_length = hi - lo - bin_count * floor

    fractions = tf.math.softmax(logits)
    return fractions * free_length + floor
def main(argv):
    """Evaluate an ensemble of ResNet-50 checkpoints on ImageNet test sets.

    The function runs in two phases:

    1. For every checkpoint found under `FLAGS.checkpoint_dir` and every test
       dataset (the clean set plus each corruption type/intensity), the
       member's logits are computed and saved as
       `<output_dir>/<dataset>_<member>.npy`. Files that already exist are
       not recomputed, so an interrupted run can be resumed.
    2. The saved logits of all members are loaded per dataset, the member
       softmax outputs are averaged, and NLL / accuracy / ECE (plus Gibbs
       cross entropy for the clean set) are accumulated and logged.

    Args:
      argv: Unused command-line arguments.

    Raises:
      ValueError: If `FLAGS.use_gpu` is false, if `FLAGS.num_cores > 1`, or
        if no checkpoint index file is found under `FLAGS.checkpoint_dir`.
    """
    del argv  # unused arg
    if not FLAGS.use_gpu:
        raise ValueError('Only GPU is currently supported.')
    if FLAGS.num_cores > 1:
        raise ValueError('Only a single accelerator is currently supported.')
    tf.enable_v2_behavior()
    tf.random.set_seed(FLAGS.seed)
    tf.io.gfile.makedirs(FLAGS.output_dir)

    batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
    steps_per_eval = IMAGENET_VALIDATION_IMAGES // batch_size

    dataset_test = utils.ImageNetInput(
        is_training=False,
        data_dir=FLAGS.data_dir,
        batch_size=FLAGS.per_core_batch_size,
        use_bfloat16=False).input_fn()
    test_datasets = {'clean': dataset_test}
    corruption_types, max_intensity = utils.load_corrupted_test_info()
    for name in corruption_types:
        for intensity in range(1, max_intensity + 1):
            dataset_name = '{0}_{1}'.format(name, intensity)
            test_datasets[dataset_name] = utils.load_corrupted_test_dataset(
                name=name,
                intensity=intensity,
                batch_size=FLAGS.per_core_batch_size,
                drop_remainder=True,
                use_bfloat16=False)

    model = deterministic_model.resnet50(
        input_shape=(224, 224, 3), num_classes=NUM_CLASSES)
    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    # Search for checkpoints from their index file; then remove the index
    # suffix. The list is sorted because the position `m` of a checkpoint is
    # part of its cached prediction filename: a stable order keeps cached
    # files attached to the same member when a run is resumed.
    ensemble_filenames = tf.io.gfile.glob(
        os.path.join(FLAGS.checkpoint_dir, '**/*.index'))
    ensemble_filenames = sorted(
        filename[:-6] for filename in ensemble_filenames)
    ensemble_size = len(ensemble_filenames)
    if not ensemble_filenames:
        # Without this guard the prediction loop is silently skipped and the
        # evaluation loop fails later with an opaque indexing error.
        raise ValueError(
            'No checkpoints (*.index files) found under {}.'.format(
                FLAGS.checkpoint_dir))
    logging.info('Ensemble size: %s', ensemble_size)
    logging.info('Ensemble number of weights: %s',
                 ensemble_size * model.count_params())
    logging.info('Ensemble filenames: %s', str(ensemble_filenames))
    checkpoint = tf.train.Checkpoint(model=model)

    # Write model predictions to files.
    num_datasets = len(test_datasets)
    for m, ensemble_filename in enumerate(ensemble_filenames):
        checkpoint.restore(ensemble_filename)
        for n, (name, test_dataset) in enumerate(test_datasets.items()):
            filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
            filename = os.path.join(FLAGS.output_dir, filename)
            # Existing files are treated as a cache from an earlier run.
            if not tf.io.gfile.exists(filename):
                logits = []
                test_iterator = iter(test_dataset)
                for _ in range(steps_per_eval):
                    features, _ = next(test_iterator)
                    logits.append(model(features, training=False))

                logits = tf.concat(logits, axis=0)
                # `np.save` emits bytes; open in binary mode to mirror the
                # 'rb' read in the evaluation phase.
                with tf.io.gfile.GFile(filename, 'wb') as f:
                    np.save(f, logits.numpy())
            percent = (m * num_datasets + (n + 1)) / (
                ensemble_size * num_datasets)
            message = (
                '{:.1%} completion for prediction: ensemble member {:d}/{:d}. '
                'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size,
                                           n + 1, num_datasets))
            logging.info(message)

    metrics = {
        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
        'test/gibbs_cross_entropy': tf.keras.metrics.Mean(),
        'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/ece': ed.metrics.ExpectedCalibrationError(
            num_bins=FLAGS.num_bins),
    }
    corrupt_metrics = {}
    for name in test_datasets:
        corrupt_metrics['test/nll_{}'.format(name)] = tf.keras.metrics.Mean()
        corrupt_metrics['test/accuracy_{}'.format(name)] = (
            tf.keras.metrics.SparseCategoricalAccuracy())
        corrupt_metrics['test/ece_{}'.format(name)] = (
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins))

    # Evaluate model predictions.
    for n, (name, test_dataset) in enumerate(test_datasets.items()):
        # Stack the saved logits of every member along a new leading axis.
        logits_dataset = []
        for m in range(ensemble_size):
            filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
            filename = os.path.join(FLAGS.output_dir, filename)
            with tf.io.gfile.GFile(filename, 'rb') as f:
                logits_dataset.append(np.load(f))

        logits_dataset = tf.convert_to_tensor(logits_dataset)
        test_iterator = iter(test_dataset)
        for step in range(steps_per_eval):
            # Only the labels are needed here; the logits come from disk and
            # are sliced to line up with the current batch.
            _, labels = next(test_iterator)
            logits = logits_dataset[
                :, (step * batch_size):((step + 1) * batch_size)]
            labels = tf.cast(tf.reshape(labels, [-1]), tf.int32)
            negative_log_likelihood = tf.reduce_mean(
                ensemble_negative_log_likelihood(labels, logits))
            # Ensemble prediction: mean over members of per-member softmax.
            per_probs = tf.nn.softmax(logits)
            probs = tf.reduce_mean(per_probs, axis=0)
            if name == 'clean':
                gibbs_ce = tf.reduce_mean(gibbs_cross_entropy(labels, logits))
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/gibbs_cross_entropy'].update_state(gibbs_ce)
                metrics['test/accuracy'].update_state(labels, probs)
                metrics['test/ece'].update_state(labels, probs)
            else:
                corrupt_metrics['test/nll_{}'.format(name)].update_state(
                    negative_log_likelihood)
                corrupt_metrics['test/accuracy_{}'.format(name)].update_state(
                    labels, probs)
                corrupt_metrics['test/ece_{}'.format(name)].update_state(
                    labels, probs)

        message = (
            '{:.1%} completion for evaluation: dataset {:d}/{:d}'.format(
                (n + 1) / num_datasets, n + 1, num_datasets))
        logging.info(message)

    corrupt_results = utils.aggregate_corrupt_metrics(
        corrupt_metrics, corruption_types, max_intensity,
        FLAGS.alexnet_errors_path)
    total_results = {name: metric.result() for name, metric in metrics.items()}
    total_results.update(corrupt_results)
    logging.info('Metrics: %s', total_results)