def step_fn(inputs):
    """Evaluates one batch on a single replica and updates the test metrics.

    The images are repeated `FLAGS.num_models` times along the first axis so
    that the model emits one block of predictions per ensemble member. Each
    block updates that member's NLL / accuracy metrics, and the mean of the
    member probabilities updates the ensemble-level metrics.

    Args:
      inputs: an `(images, labels)` pair.
    """
    images, labels = inputs
    ensemble_size = FLAGS.num_models

    tiled_images = tf.tile(images, [ensemble_size, 1, 1, 1])
    logits = model(tiled_images, training=False)
    if FLAGS.use_bfloat16:
        # Do the softmax and the losses in float32.
        logits = tf.cast(logits, tf.float32)
    member_probs_list = tf.split(
        tf.nn.softmax(logits), num_or_size_splits=ensemble_size, axis=0)

    # Per-member metrics.
    for member, member_probs in enumerate(member_probs_list):
        test_nlls[member].update_state(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, member_probs))
        test_accs[member].update_state(labels, member_probs)

    # Ensemble metrics, computed from the averaged probabilities.
    ensemble_probs = tf.reduce_mean(member_probs_list, axis=0)
    ensemble_nll = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            labels, ensemble_probs))
    test_nll.update_state(ensemble_nll)
    test_accuracy.update_state(labels, ensemble_probs)
def __call__(self, mask_outputs, mask_targets, select_class_targets):
    """Computes the Mask-RCNN mask loss.

    The reference model produces `num_classes` masks per RoI, expands
    `mask_targets` to the shape of `mask_outputs`, and keeps only the target
    with which the RoI overlaps most.
    (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py)  # pylint: disable=line-too-long

    This implementation instead expects `mask_outputs` to have already been
    selected by `class_targets`, so `mask_targets` is never expanded. That
    selection happens in the post-processing of mask_rcnn_fn in
    mask_rcnn_architecture.py.

    Args:
      mask_outputs: a float tensor representing the prediction for each mask,
        with a shape of [batch_size, num_masks, mask_height, mask_width].
      mask_targets: a float tensor representing the binary mask of ground
        truth labels for each mask with a shape of
        [batch_size, num_masks, mask_height, mask_width].
      select_class_targets: a tensor with a shape of [batch_size, num_masks],
        representing the foreground mask targets.

    Returns:
      mask_loss: a float tensor representing total mask loss.
    """
    with tf.compat.v1.name_scope('mask_loss'):
        batch_size, num_masks, mask_height, mask_width = (
            mask_outputs.get_shape().as_list())

        # Only masks whose class target is positive contribute to the loss.
        is_foreground = tf.greater(select_class_targets, 0)
        per_mask_weights = tf.reshape(
            is_foreground, [batch_size, num_masks, 1, 1])
        # Spread each per-mask weight over every pixel of that mask.
        weights = tf.tile(per_mask_weights,
                          [1, 1, mask_height, mask_width])

        return tf.compat.v1.losses.sigmoid_cross_entropy(
            mask_targets,
            mask_outputs,
            weights=weights,
            reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
                        normalizer=1.0, delta=1.):
    """Computes the Huber box-regression loss over foreground boxes.

    Args:
      box_outputs: predicted box regression values.
      box_targets: box regression targets.
      class_targets: class targets; only entries greater than 0 contribute.
      normalizer: extra divisor applied to the reduced loss.
      delta: Huber loss delta. It is typically around the mean value of the
        regression targets; for instance, the regression targets of a 512x512
        input with 6 anchors on a P2-P6 pyramid are about
        [0.1, 0.1, 0.2, 0.2].

    Returns:
      The box loss, reduced by the number of non-zero weights and then
      divided by `normalizer`.
    """
    with tf.compat.v1.name_scope('fast_rcnn_box_loss'):
        # One weight per box, repeated across its 4 regression values.
        is_foreground = tf.expand_dims(tf.greater(class_targets, 0), axis=2)
        mask = tf.tile(is_foreground, [1, 1, 4])

        # SUM_BY_NONZERO_WEIGHTS normalizes by the number of active entries;
        # the caller-supplied normalizer is applied on top of that.
        reduced_loss = tf.compat.v1.losses.huber_loss(
            box_targets,
            box_outputs,
            weights=mask,
            delta=delta,
            reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
        return reduced_loss / normalizer
def class_loss(self, cls_outputs, cls_targets, num_positives,
               ignore_label=-2):
    """Computes the RetinaNet classification (focal) loss.

    Args:
      cls_outputs: predicted class logits passed to `focal_loss`.
      cls_targets: integer class targets. Their one-hot encoding is rank 5,
        and its last two axes are merged before the focal loss is computed.
      num_positives: normalizer forwarded to `focal_loss`.
      ignore_label: target value whose loss contribution is zeroed out.

    Returns:
      The scalar sum of the focal loss over all non-ignored entries.
    """
    # One-hot encode the labels and fold the last two axes together.
    one_hot_targets = tf.one_hot(cls_targets, self._num_classes)
    bs, height, width, _, _ = one_hot_targets.get_shape().as_list()
    one_hot_targets = tf.reshape(one_hot_targets, [bs, height, width, -1])

    loss = focal_loss(cls_outputs, one_hot_targets,
                      self._focal_loss_alpha, self._focal_loss_gamma,
                      num_positives)

    # 0.0 where the target equals `ignore_label`, 1.0 everywhere else.
    keep_weights = tf.where(
        tf.equal(cls_targets, ignore_label),
        tf.zeros_like(cls_targets, dtype=tf.float32),
        tf.ones_like(cls_targets, dtype=tf.float32),
    )
    # Repeat each weight across the class axis, then match the loss layout.
    keep_weights = tf.expand_dims(keep_weights, -1)
    keep_weights = tf.tile(keep_weights, [1, 1, 1, 1, self._num_classes])
    keep_weights = tf.reshape(keep_weights, tf.shape(input=loss))

    return tf.reduce_sum(input_tensor=keep_weights * loss)
def _sample_n(self, n, seed=None, conditional_input=None, training=False):
    """Samples from the distribution, with optional conditional input.

    Pixels are sampled one at a time in raster order: each iteration runs the
    network on the current images and overwrites a single pixel with a fresh
    sample.

    Args:
      n: `int`, number of samples desired.
      seed: `int`, seed for RNG. Setting a random seed enforces
        reproducibility of the samples between sessions (not within a single
        session).
      conditional_input: `Tensor` on which to condition the distribution
        (e.g. class labels), or `None`.
      training: `bool` or `None`. If `bool`, it controls the dropout layer,
        where `True` implies dropout is active. If `None`, it defers to
        Keras' handling of train/eval status.

    Returns:
      samples: a `Tensor` of shape `[n, height, width, num_channels]`.
    """
    if conditional_input is not None:
        conditional_input = tf.convert_to_tensor(conditional_input,
                                                 dtype=self.dtype)
        conditional_event_rank = tensorshape_util.rank(
            self.conditional_shape)
        conditional_input_shape = prefer_static.shape(conditional_input)
        conditional_sample_rank = (
            prefer_static.rank(conditional_input) - conditional_event_rank)

        # If `conditional_input` has no sample dimensions, prepend a sample
        # dimension.
        if conditional_sample_rank == 0:
            conditional_input = conditional_input[tf.newaxis, ...]
            conditional_sample_rank = 1
            # The shape must be refreshed after prepending the axis.
            # Otherwise the slices below pair the new rank with the old
            # shape: the event-shape check sees a truncated shape and
            # `repeat` is derived from an event dimension rather than from
            # the (size-1) sample dimension.
            conditional_input_shape = prefer_static.shape(conditional_input)

        # Assert that the conditional event shape in the `PixelCnnNetwork` is
        # the same as that implied by `conditional_input`.
        conditional_event_shape = conditional_input_shape[
            conditional_sample_rank:]
        with tf.control_dependencies([
                tf.assert_equal(self.conditional_shape,
                                conditional_event_shape)
        ]):
            conditional_sample_shape = conditional_input_shape[
                :conditional_sample_rank]
            # Number of times the conditioning rows are repeated to reach
            # `n` rows.
            repeat = n // prefer_static.reduce_prod(conditional_sample_shape)
            # Flatten all sample dimensions into one leading axis...
            h = tf.reshape(
                conditional_input,
                prefer_static.concat([(-1,), self.conditional_shape],
                                     axis=0))
            # ...and tile along that axis only (event axes get multiple 1).
            h = tf.tile(
                h,
                prefer_static.pad([repeat],
                                  paddings=[[0, conditional_event_rank]],
                                  constant_values=1))

    # Start from uniform noise in [-1, 1) and take one full sampling pass.
    samples_0 = tf.random.uniform(
        prefer_static.concat([(n,), self.event_shape], axis=0),
        minval=-1.,
        maxval=1.,
        dtype=self.dtype,
        seed=seed)
    inputs = samples_0 if conditional_input is None else [samples_0, h]
    params_0 = self.network(inputs, training=training)
    samples_0 = self._sample_channels(*params_0, seed=seed)

    image_height, image_width, _ = tensorshape_util.as_list(
        self.event_shape)

    def loop_body(index, samples):
        """Loop for iterative pixel sampling.

        Args:
          index: 0D `Tensor` of type `int32`. Index of the current pixel.
          samples: 4D `Tensor`. Images with pixels sampled in raster order,
            up to pixel `[index]`, with dimensions
            `[batch_size, height, width, num_channels]`.

        Returns:
          samples: 4D `Tensor`. Images with pixels sampled in raster order,
            up to and including pixel `[index]`, with dimensions
            `[batch_size, height, width, num_channels]`.
        """
        inputs = samples if conditional_input is None else [samples, h]
        params = self.network(inputs, training=training)
        samples_new = self._sample_channels(*params, seed=seed)

        # Update the current pixel. Height and width are moved to the front
        # so a single (row, col) index addresses that pixel for every batch
        # member and channel at once.
        samples = tf.transpose(samples, [1, 2, 3, 0])
        samples_new = tf.transpose(samples_new, [1, 2, 3, 0])
        row, col = index // image_width, index % image_width
        updates = samples_new[row, col, ...][tf.newaxis, ...]
        samples = tf.tensor_scatter_nd_update(samples, [[row, col]], updates)
        samples = tf.transpose(samples, [3, 0, 1, 2])

        return index + 1, samples

    index0 = tf.zeros([], dtype=tf.int32)

    # Construct the while loop for sampling.
    total_pixels = image_height * image_width
    loop_cond = lambda ind, _: tf.less(ind, total_pixels)  # noqa: E731
    init_vars = (index0, samples_0)
    _, samples = tf.while_loop(loop_cond, loop_body, init_vars,
                               parallel_iterations=1)

    # Map from [-1, 1] to [low, high] and snap to integer values.
    transformed_samples = (
        self._low + 0.5 * (self._high - self._low) * (samples + 1.))
    return tf.round(transformed_samples)
def testExplicitBlocks(self, dynamic_shape, batch_shape):
    """Checks `Blockwise` with explicit `block_sizes` against manual slicing.

    A 6-element event is split into blocks of sizes [2, 1, 3] handled by
    `Exp`, `Softplus` and `Affine`. Forward / inverse values and log-det
    Jacobians from `Blockwise` must match those obtained by applying each
    bijector to its own slice.

    Args:
      dynamic_shape: if True, inputs are fed through placeholders with
        unknown static shape and the static-shape assertions are skipped.
      batch_shape: list of batch dimensions to prepend to the input.
    """
    block_sizes = tf.convert_to_tensor(value=[2, 1, 3])
    block_sizes = tf1.placeholder_with_default(
        block_sizes,
        shape=([None] * len(block_sizes.shape)
               if dynamic_shape else block_sizes.shape))
    exp = tfb.Exp()
    sp = tfb.Softplus()
    aff = tfb.Affine(scale_diag=[2., 3., 4.])
    blockwise = tfb.Blockwise(bijectors=[exp, sp, aff],
                              block_sizes=block_sizes,
                              maybe_changes_size=False)

    x = tf.cast([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=tf.float32)
    # Each iteration prepends a new leading axis, so the dimensions must be
    # added innermost-first for `x` to end up with shape
    # `batch_shape + [6]`. Iterating forwards would yield the reversed batch
    # shape for any batch of rank > 1.
    for s in reversed(batch_shape):
        x = tf.expand_dims(x, 0)
        x = tf.tile(x, [s] + [1] * (tensorshape_util.rank(x.shape) - 1))
    x = tf1.placeholder_with_default(
        x, shape=None if dynamic_shape else x.shape)

    # Identity to break the caching.
    blockwise_y = tf.identity(blockwise.forward(x))
    blockwise_fldj = blockwise.forward_log_det_jacobian(x, event_ndims=1)
    blockwise_x = blockwise.inverse(blockwise_y)
    blockwise_ildj = blockwise.inverse_log_det_jacobian(blockwise_y,
                                                        event_ndims=1)

    # Static shapes are only known when no shapeless placeholder was used.
    if not dynamic_shape:
        self.assertEqual(blockwise_y.shape, batch_shape + [6])
        self.assertEqual(blockwise_fldj.shape, batch_shape + [])
        self.assertEqual(blockwise_x.shape, batch_shape + [6])
        self.assertEqual(blockwise_ildj.shape, batch_shape + [])
    self.assertAllEqual(self.evaluate(tf.shape(blockwise_y)),
                        batch_shape + [6])
    self.assertAllEqual(self.evaluate(tf.shape(blockwise_fldj)),
                        batch_shape + [])
    self.assertAllEqual(self.evaluate(tf.shape(blockwise_x)),
                        batch_shape + [6])
    self.assertAllEqual(self.evaluate(tf.shape(blockwise_ildj)),
                        batch_shape + [])

    # Reference values: apply each bijector to its own slice by hand.
    expl_y = tf.concat([
        exp.forward(x[..., :2]),
        sp.forward(x[..., 2:3]),
        aff.forward(x[..., 3:]),
    ], axis=-1)
    expl_fldj = sum([
        exp.forward_log_det_jacobian(x[..., :2], event_ndims=1),
        sp.forward_log_det_jacobian(x[..., 2:3], event_ndims=1),
        aff.forward_log_det_jacobian(x[..., 3:], event_ndims=1)
    ])
    expl_x = tf.concat([
        exp.inverse(expl_y[..., :2]),
        sp.inverse(expl_y[..., 2:3]),
        aff.inverse(expl_y[..., 3:])
    ], axis=-1)
    expl_ildj = sum([
        exp.inverse_log_det_jacobian(expl_y[..., :2], event_ndims=1),
        sp.inverse_log_det_jacobian(expl_y[..., 2:3], event_ndims=1),
        aff.inverse_log_det_jacobian(expl_y[..., 3:], event_ndims=1)
    ])

    self.assertAllClose(self.evaluate(expl_y), self.evaluate(blockwise_y))
    self.assertAllClose(self.evaluate(expl_fldj),
                        self.evaluate(blockwise_fldj))
    self.assertAllClose(self.evaluate(expl_x), self.evaluate(blockwise_x))
    self.assertAllClose(self.evaluate(expl_ildj),
                        self.evaluate(blockwise_ildj))
def _sample_n(self, n, seed=None):
    """Draws `n` samples from the mixture.

    Two strategies are used, selected by `self._use_static_graph`:

    * Static graph: every component is sampled `n` times and a one-hot mask
      built from the categorical draws keeps the chosen component.
    * Otherwise: the categorical draws partition the sample slots by
      component, each component is sampled only as often as it was chosen,
      and the results are stitched back into their original slots.

    Args:
      n: number of samples to draw.
      seed: seed for the categorical draw and, through a `SeedStream` salted
        with 'Mixture', for the component draws.

    Returns:
      A `Tensor` of samples whose shape is the shape of the categorical draws
      followed by the event shape.
    """
    if self._use_static_graph:
        # This sampling approach is almost the same as the approach used by
        # `MixtureSameFamily`. The differences are due to having a list of
        # `Distribution` objects rather than a single object, and maintaining
        # random seed management that is consistent with the non-static code
        # path.
        component_ids = self.cat.sample(n, seed=seed)
        stream = SeedStream(seed, salt='Mixture')
        per_component = [
            self.components[c].sample(n, seed=stream())
            for c in range(self.num_components)
        ]

        event_ndims = tensorshape_util.rank(self._static_event_shape)
        stack_axis = -1 - event_ndims
        stacked = tf.stack(per_component, axis=stack_axis)  # [n, B, k, E]
        npdt = dtype_util.as_numpy_dtype(stacked.dtype)
        selector = tf.one_hot(
            indices=component_ids,  # [n, B]
            depth=self._num_components,  # == k
            on_value=npdt(1),
            off_value=npdt(0))  # [n, B, k]
        selector = distribution_util.pad_mixture_dimensions(
            selector, self, self._cat, event_ndims)  # [n, B, k, [1]*e]
        return tf.reduce_sum(stacked * selector, axis=stack_axis)  # [n, B, E]

    n = tf.convert_to_tensor(n, name='n')
    static_n = tf.get_static_value(n)
    if static_n is not None:
        n = int(static_n)
    component_ids = self.cat.sample(n, seed=seed)

    # Prefer static shape information wherever it is fully known.
    static_samples_shape = component_ids.shape
    if tensorshape_util.is_fully_defined(static_samples_shape):
        samples_shape = tensorshape_util.as_list(static_samples_shape)
        samples_size = tensorshape_util.num_elements(static_samples_shape)
    else:
        samples_shape = tf.shape(component_ids)
        samples_size = tf.size(component_ids)

    static_batch_shape = self.batch_shape
    if tensorshape_util.is_fully_defined(static_batch_shape):
        batch_shape = tensorshape_util.as_list(static_batch_shape)
        batch_size = tensorshape_util.num_elements(static_batch_shape)
    else:
        batch_shape = tf.shape(component_ids)[1:]
        batch_size = tf.reduce_prod(batch_shape)

    static_event_shape = self.event_shape
    if tensorshape_util.is_fully_defined(static_event_shape):
        event_shape = np.array(
            tensorshape_util.as_list(static_event_shape), dtype=np.int32)
    else:
        # Resolved from the first component's samples inside the loop below.
        event_shape = None

    # Positions into the raw categorical sample tensor. They are needed to
    # stitch sample values back into place after sampling within the
    # component partitions.
    slot_positions = tf.reshape(tf.range(0, samples_size), samples_shape)

    # Partition those positions by component so that `dynamic_stitch` can
    # later reconstruct the samples from the known partitions.
    slots_by_component = tf.dynamic_partition(
        data=slot_positions,
        partitions=component_ids,
        num_partitions=self.num_components)

    # Copy the batch indices n times; they tell us which rows to pull out of
    # each component's samples.
    batch_positions = tf.reshape(
        tf.tile(tf.range(0, batch_size), [n]), samples_shape)

    # Explanation of the dynamic partitioning below:
    #   batch indices are i.e., [0, 1, 0, 1, 0, 1]
    # Suppose partitions are:
    #     [1 1 0 0 1 1]
    # After partitioning, batch indices are cut as:
    #     [batch_indices[x] for x in 2, 3]
    #     [batch_indices[x] for x in 0, 1, 4, 5]
    # i.e.
    #     [1 1] and [0 0 0 0]
    # Now we sample n=2 from part 0 and n=4 from part 1.
    # For part 0 we want samples from batch entries 1, 1 (samples 0, 1),
    # and for part 1 we want samples from batch entries 0, 0, 0, 0
    #   (samples 0, 1, 2, 3).
    batch_by_component = tf.dynamic_partition(
        data=batch_positions,
        partitions=component_ids,
        num_partitions=self.num_components)

    stream = SeedStream(seed, salt='Mixture')
    gathered_by_component = []
    for c in range(self.num_components):
        n_class = tf.size(slots_by_component[c])
        draws = self.components[c].sample(n_class, seed=stream())
        if event_shape is None:
            batch_ndims = prefer_static.rank_from_shape(batch_shape)
            event_shape = tf.shape(draws)[1 + batch_ndims:]

        # Pull out the correct batch entries from each index.
        # To do this, we may have to flatten the batch shape.
        #
        # For sample s, batch element b of component c, we get the
        # partitioned batch indices from batch_by_component[c]; and shift
        # each element by the sample index. The final lookup can be thought
        # of as a matrix gather along locations (s, b) in `draws`, where the
        # n_class rows correspond to samples within this component and the
        # batch_size columns correspond to batch elements within the
        # component.
        #
        # Thus the lookup index is
        #   lookup[c, i] = batch_size * s[i] + b[c, i]
        # for i = 0 ... n_class[c] - 1.
        lookup = batch_size * tf.range(n_class) + batch_by_component[c]
        flat_draws = tf.reshape(
            draws, tf.concat([[n_class * batch_size], event_shape], 0))
        gathered_by_component.append(
            tf.gather(flat_draws, lookup, name='samples_class_c_gather'))

    # Stitch back together the samples across the components.
    flat_result = tf.dynamic_stitch(indices=slots_by_component,
                                    data=gathered_by_component)
    # Reshape back to proper sample, batch, and event shape.
    result = tf.reshape(flat_result,
                        tf.concat([samples_shape, event_shape], 0))
    tensorshape_util.set_shape(
        result,
        tensorshape_util.concatenate(static_samples_shape, self.event_shape))
    return result
def step_fn(inputs):
    """Runs one training step on a single replica.

    The images are repeated `FLAGS.ensemble_size` times along the first axis,
    a batch of lambdas is sampled, the loss (NLL plus scaled L2) is
    differentiated, gradients of fast weights are rescaled, and the training
    metrics are updated.

    Args:
      inputs: dict with the keys 'features' (images) and 'labels'.
    """
    images = inputs['features']
    labels = inputs['labels']
    images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])

    # Generate lambdas.
    lambdas = log_uniform_sample(per_core_batch_size, lambda_parameters)
    lambdas = tf.reshape(
        lambdas,
        (FLAGS.ensemble_size * per_core_batch_size, lambdas_config.dim))

    with tf.GradientTape() as tape:
        logits = model([images, lambdas], training=True)

        # The logits hold `ensemble_size` copies of the batch, so anything
        # compared element-wise against them (the Gibbs CE and the metrics
        # below) needs labels tiled the same way. They are kept separate from
        # `labels` because the ensemble CE takes the untiled labels.
        tiled_labels = tf.tile(labels, [FLAGS.ensemble_size])

        if FLAGS.use_gibbs_ce:
            # Average of single model CEs.
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    tiled_labels, logits, from_logits=True))
        else:
            # Ensemble CE uses no tiling of the labels.
            negative_log_likelihood = ensemble_crossentropy(
                labels, logits, FLAGS.ensemble_size)

        # Note: Divide l2_loss by sample_size (this differs from
        # uncertainty_baselines implementation.)
        l2_loss = sum(model.losses) / train_sample_size
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)

    # Separate learning rate for fast weights: variables named alpha/gamma
    # (excluding batch-norm ones) get their gradient multiplied.
    grads_and_vars = []
    for grad, var in zip(grads, model.trainable_variables):
        is_fast_weight = (('alpha' in var.name or 'gamma' in var.name)
                          and 'batch_norm' not in var.name)
        if is_fast_weight:
            grad = grad * FLAGS.fast_weight_lr_multiplier
        grads_and_vars.append((grad, var))
    optimizer.apply_gradients(grads_and_vars)

    probs = tf.nn.softmax(logits)
    per_probs = tf.split(probs,
                         num_or_size_splits=FLAGS.ensemble_size,
                         axis=0)
    per_probs_stacked = tf.stack(per_probs, axis=0)

    # `probs` / `logits` cover every ensemble member, hence the tiled labels.
    metrics['train/ece'].add_batch(probs, label=tiled_labels)
    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(tiled_labels, logits)

    diversity = rm.metrics.AveragePairwiseDiversity()
    diversity.add_batch(per_probs_stacked, num_models=FLAGS.ensemble_size)
    diversity_results = diversity.result()
    for k, v in diversity_results.items():
        metrics['train/' + k].update_state(v)
def sigmoid_metric_transform(metrics: tf.Tensor):
    """Applies a sigmoid to the metric columns selected by the mask.

    `sigmoid_metric_mask` is repeated once per batch row to form a
    `[batch_size, len(sigmoid_metric_mask)]` mask. Entries where the mask is
    True are replaced by their sigmoid; the rest pass through unchanged.

    Args:
      metrics: metric values; the batch size is read from the leading
        dimension of its first flattened element.

    Returns:
      `metrics` with the sigmoid applied where the mask is True.
    """
    first_leaf = tf.nest.flatten(metrics)[0]
    batch_size = tf.shape(first_leaf)[0]
    num_metrics = len(sigmoid_metric_mask)

    # Repeat the per-metric mask for every row, then lay it out row by row.
    repeated_mask = tf.tile(sigmoid_metric_mask, [batch_size])
    sigmoid_batch_mask = tf.reshape(repeated_mask, [batch_size, num_metrics])

    return tf.where(sigmoid_batch_mask, tf.sigmoid(metrics), metrics)
def sample_halton_sequence(dim,
                           num_results=None,
                           sequence_indices=None,
                           dtype=tf.float32,
                           randomized=True,
                           seed=None,
                           name=None):
    r"""Returns a sample from the `dim` dimensional Halton sequence.

    Warning: The sequence elements take values only between 0 and 1. Care
    must be taken to appropriately transform the domain of a function if it
    differs from the unit cube before evaluating integrals using Halton
    samples. It is also important to remember that quasi-random numbers
    without randomization are not a replacement for pseudo-random numbers in
    every context. Quasi random numbers are completely deterministic and
    typically have significant negative autocorrelation unless randomization
    is used.

    Computes the members of the low discrepancy Halton sequence in dimension
    `dim`. The `dim`-dimensional sequence takes values in the unit hypercube
    in `dim` dimensions. Currently, only dimensions up to 1000 are supported.
    The prime base for the k-th axes is the k-th prime starting from 2. For
    example, if `dim` = 3, then the bases will be [2, 3, 5] respectively and
    the first element of the non-randomized sequence will be:
    [0.5, 0.333, 0.2]. For a more complete description of the Halton
    sequences see
    [here](https://en.wikipedia.org/wiki/Halton_sequence). For low
    discrepancy sequences and their applications see
    [here](https://en.wikipedia.org/wiki/Low-discrepancy_sequence).

    If `randomized` is true, this function produces a scrambled version of
    the Halton sequence introduced by [Owen (2017)][1]. For the advantages of
    randomization of low discrepancy sequences see [here](
    https://en.wikipedia.org/wiki/Quasi-Monte_Carlo_method#Randomization_of_quasi-Monte_Carlo).

    The number of samples produced is controlled by the `num_results` and
    `sequence_indices` parameters. The user must supply either `num_results`
    or `sequence_indices` but not both.
    The former is the number of samples to produce starting from the first
    element. If `sequence_indices` is given instead, the specified elements
    of the sequence are generated. For example, sequence_indices=tf.range(10)
    is equivalent to specifying num_results=10.

    #### Examples

    ```python
    import tensorflow as tf
    import tensorflow_probability as tfp

    # Produce the first 1000 members of the Halton sequence in 3 dimensions.
    num_results = 1000
    dim = 3
    sample = tfp.mcmc.sample_halton_sequence(
      dim,
      num_results=num_results,
      seed=127)

    # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three
    # dimensional hypercube.
    powers = tf.range(1.0, limit=dim + 1)
    integral = tf.reduce_mean(tf.reduce_prod(sample ** powers, axis=-1))
    true_value = 1.0 / tf.reduce_prod(powers + 1.0)
    with tf.Session() as session:
      values = session.run((integral, true_value))

    # Produces a relative absolute error of 1.7%.
    print ("Estimated: %f, True Value: %f" % values)

    # Now skip the first 1000 samples and recompute the integral with the
    # next thousand samples. The sequence_indices argument can be used to do
    # this.

    sequence_indices = tf.range(start=1000, limit=1000 + num_results,
                                dtype=tf.int32)
    sample_leaped = tfp.mcmc.sample_halton_sequence(
        dim,
        sequence_indices=sequence_indices,
        seed=111217)

    integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
                                                    axis=-1))
    with tf.Session() as session:
      values = session.run((integral_leaped, true_value))
    # Now produces a relative absolute error of 0.05%.
    print ("Leaped Estimated: %f, True Value: %f" % values)
    ```

    Args:
      dim: Positive Python `int` representing each sample's `event_size.`
        Must not be greater than 1000.
      num_results: (Optional) Positive scalar `Tensor` of dtype int32. The
        number of samples to generate. Either this parameter or
        sequence_indices must be specified but not both. If this parameter
        is None, then the behaviour is determined by the `sequence_indices`.
        Default value: `None`.
      sequence_indices: (Optional) `Tensor` of dtype int32 and rank 1. The
        elements of the sequence to compute specified by their position in
        the sequence. The entries index into the Halton sequence starting
        with 0 and hence, must be whole numbers. For example,
        sequence_indices=[0, 5, 6] will produce the first, sixth and seventh
        elements of the sequence. If this parameter is None, then the
        `num_results` parameter must be specified which gives the number of
        desired samples starting from the first sample.
        Default value: `None`.
      dtype: (Optional) The dtype of the sample. One of: `float16`,
        `float32` or `float64`.
        Default value: `tf.float32`.
      randomized: (Optional) bool indicating whether to produce a randomized
        Halton sequence. If True, applies the randomization described in
        [Owen (2017)][1].
        Default value: `True`.
      seed: PRNG seed; see `tfp.random.sanitize_seed` for details. Only used
        if `randomized` is True. If not supplied and `randomized` is True, no
        seed is set.
        Default value: `None`.
      name: (Optional) Python `str` describing ops managed by this function.
        If not supplied, the name scope 'sample' is used.
        Default value: `None`.

    Returns:
      halton_elements: Elements of the Halton sequence. `Tensor` of supplied
        dtype and `shape` `[num_results, dim]` if `num_results` was specified
        or shape `[s, dim]` where s is the size of `sequence_indices` if
        `sequence_indices` were specified.

    Raises:
      ValueError: if both or neither of `sequence_indices` and `num_results`
        were specified, if dimension `dim` is less than 1 or greater than
        1000, or if `dtype` is not a floating type.

    #### References

    [1]: Art B. Owen. A randomized Halton algorithm in R. _arXiv preprint
         arXiv:1706.02808_, 2017. https://arxiv.org/abs/1706.02808
    """
    if dim < 1 or dim > _MAX_DIMENSION:
        raise ValueError(
            'Dimension must be between 1 and {}. Supplied {}'.format(
                _MAX_DIMENSION, dim))

    has_num_results = num_results is not None
    has_sequence_indices = sequence_indices is not None
    if has_num_results == has_sequence_indices:
        raise ValueError('Either `num_results` or `sequence_indices` must be'
                         ' specified but not both.')

    if not dtype_util.is_floating(dtype):
        raise ValueError('dtype must be of `float`-type')

    with tf.name_scope(name or 'sample'):
        # Here and in the following, the shape layout is as follows:
        # [sample dimension, event dimension, coefficient dimension].
        # The coefficient dimension is an intermediate axes which will hold
        # the weights of the starting integer when expressed in the (prime)
        # base for an event dimension.
        if has_num_results:
            num_results = tf.convert_to_tensor(num_results)
        if has_sequence_indices:
            sequence_indices = tf.convert_to_tensor(sequence_indices)
        indices = _get_indices(num_results, sequence_indices, dtype)
        bases = tf.constant(_PRIMES[0:dim], dtype=dtype, shape=[dim, 1])

        digits_per_axis = _base_expansion_size(tf.reduce_max(indices), bases)
        max_digits = tf.reduce_max(digits_per_axis)

        # The powers of the bases that we will need. Note that there is a bit
        # of an excess here. Suppose we need the place value coefficients of
        # 7 in base 2 and 3. For 2, we will have 3 digits but we only need 2
        # digits for base 3. However, we can only create rectangular tensors
        # so we store both expansions in a [2, 3] tensor. This leads to the
        # problem that we might end up attempting to raise large numbers to
        # large powers. For example, base 2 expansion of 1024 has 10 digits.
        # If we were in 10 dimensions, then the 10th prime (29) we will end
        # up computing 29^10 even though we don't need it. We avoid this by
        # setting the exponents for each axes to 0 beyond the maximum value
        # needed for that dimension.
        exponents = tf.tile([tf.range(max_digits)], [dim, 1])

        # True for the digit positions that are actually needed on each
        # axis; False marks the padding positions.
        is_needed = exponents < digits_per_axis
        zero_exponent = tf.constant(0, exponents.dtype)
        capped_exponents = tf.where(is_needed, exponents, zero_exponent)
        place_values = bases**capped_exponents

        # The following computes the base b expansion of the indices.
        # Suppose, x = a0 + a1*b + a2*b^2 + ... Then, performing a floor div
        # of x with the vector (1, b, b^2, b^3, ...) will produce
        # (a0 + s1 * b, a1 + s2 * b, ...) where s_i are coefficients we don't
        # care about. Noting that all a_i < b by definition of place value
        # expansion, we see that taking the elements mod b of the above
        # vector produces the place value expansion coefficients.
        needed_as_float = tf.cast(is_needed, dtype)
        digits = tf.math.floordiv(indices, place_values)
        digits = digits * needed_as_float
        digits = digits % bases

        if not randomized:
            digits = digits / bases
            return tf.reduce_sum(digits / place_values, axis=-1)

        shuffle_seed, zero_correction_seed = samplers.split_seed(
            seed, salt='MCMCSampleHaltonSequence')
        digits = _randomize(digits, bases, seed=shuffle_seed)

        # Remove the contribution from randomizing the trailing zero for the
        # axes where digits_per_axis < max_digits. This will be accounted
        # for separately below (using zero_correction).
        digits = digits * tf.cast(is_needed, dtype)
        digits = digits / bases
        base_values = tf.reduce_sum(digits / place_values, axis=-1)

        # The randomization used in Owen (2017) does not leave 0 invariant.
        # While we have accounted for the randomization of the first
        # `digits_per_axis` coefficients, we still need to correct for the
        # trailing zeros. Luckily, this is equivalent to adding a uniform
        # random value scaled so the first `digits_per_axis` coefficients are
        # zero. The following statements perform this correction.
        zero_correction = samplers.uniform([dim, 1],
                                           seed=zero_correction_seed,
                                           dtype=dtype)
        zero_correction = zero_correction / bases**digits_per_axis
        return base_values + tf.reshape(zero_correction, [-1])
def sample(dim,
           num_results=None,
           sequence_indices=None,
           randomized=True,
           randomization_params=None,
           seed=None,
           validate_args=False,
           dtype=None,
           name=None):
  r"""Returns a sample from the `dim` dimensional Halton sequence.

  Warning: The sequence elements take values only between 0 and 1. Care must be
  taken to appropriately transform the domain of a function if it differs from
  the unit cube before evaluating integrals using Halton samples. It is also
  important to remember that quasi-random numbers without randomization are not
  a replacement for pseudo-random numbers in every context. Quasi random numbers
  are completely deterministic and typically have significant negative
  autocorrelation unless randomization is used.

  Computes the members of the low discrepancy Halton sequence in dimension
  `dim`. The `dim`-dimensional sequence takes values in the unit hypercube in
  `dim` dimensions. Currently, only dimensions up to 1000 are supported. The
  prime base for the k-th axes is the k-th prime starting from 2. For example,
  if `dim` = 3, then the bases will be [2, 3, 5] respectively and the first
  element of the non-randomized sequence will be: [0.5, 0.333, 0.2]. For a more
  complete description of the Halton sequences see
  [here](https://en.wikipedia.org/wiki/Halton_sequence). For low discrepancy
  sequences and their applications see
  [here](https://en.wikipedia.org/wiki/Low-discrepancy_sequence).

  If `randomized` is true, this function produces a scrambled version of the
  Halton sequence introduced by [Owen (2017)][1]. For the advantages of
  randomization of low discrepancy sequences see [here](
  https://en.wikipedia.org/wiki/Quasi-Monte_Carlo_method#Randomization_of_quasi-Monte_Carlo).

  The number of samples produced is controlled by the `num_results` and
  `sequence_indices` parameters. The user must supply either `num_results` or
  `sequence_indices` but not both.
  The former is the number of samples to produce starting from the first
  element. If `sequence_indices` is given instead, the specified elements of
  the sequence are generated. For example, sequence_indices=tf.range(10) is
  equivalent to specifying num_results=10.

  #### Examples

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp

  # Produce the first 1000 members of the Halton sequence in 3 dimensions.
  num_results = 1000
  dim = 3
  sample, params = qmc.halton.sample(
    dim,
    num_results=num_results,
    seed=127)

  # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
  # hypercube.
  powers = tf.range(1.0, limit=dim + 1)
  integral = tf.reduce_mean(tf.reduce_prod(sample ** powers, axis=-1))
  true_value = 1.0 / tf.reduce_prod(powers + 1.0)
  with tf.Session() as session:
    values = session.run((integral, true_value))

  # Produces a relative absolute error of 1.7%.
  print ("Estimated: %f, True Value: %f" % values)

  # Now skip the first 1000 samples and recompute the integral with the next
  # thousand samples. The sequence_indices argument can be used to do this.

  sequence_indices = tf.range(start=1000, limit=1000 + num_results,
                              dtype=tf.int32)
  sample_leaped, _ = qmc.halton.sample(
      dim,
      sequence_indices=sequence_indices,
      randomization_params=params)

  integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
                                                  axis=-1))
  with tf.Session() as session:
    values = session.run((integral_leaped, true_value))
  # Now produces a relative absolute error of 0.05%.
  print ("Leaped Estimated: %f, True Value: %f" % values)
  ```

  Args:
    dim: Positive Python `int` representing each sample's `event_size.` Must
      not be greater than 1000.
    num_results: (Optional) Positive scalar `Tensor` of dtype int32. The number
      of samples to generate. Either this parameter or sequence_indices must
      be specified but not both. If this parameter is None, then the behaviour
      is determined by the `sequence_indices`.
      Default value: `None`.
    sequence_indices: (Optional) `Tensor` of dtype int32 and rank 1. The
      elements of the sequence to compute specified by their position in the
      sequence. The entries index into the Halton sequence starting with 0 and
      hence, must be whole numbers. For example, sequence_indices=[0, 5, 6] will
      produce the first, sixth and seventh elements of the sequence. If this
      parameter is None, then the `num_results` parameter must be specified
      which gives the number of desired samples starting from the first sample.
      Default value: `None`.
    randomized: (Optional) bool indicating whether to produce a randomized
      Halton sequence. If True, applies the randomization described in
      [Owen (2017)][1]. For reproducible output supply either `seed` or
      `randomization_params`.
      Default value: `True`.
    randomization_params: (Optional) Instance of `HaltonParams` that fully
      describes the randomization behavior. If provided and randomized is True,
      its permutations and zero correction are reused instead of being computed
      from scratch. If randomized is False, this parameter has no effect.
      Default value: `None`. In this case with randomized = True, the necessary
        randomization parameters will be computed from scratch.
    seed: (Optional) Python integer to seed the random number generator. It is
      forwarded unchanged to `_randomize`. When it is `None` and no zero
      correction was supplied through `randomization_params`, the zero
      correction is drawn with the stateful `tf.random.uniform`; otherwise it
      is drawn with `tf.random.stateless_uniform` seeded by `(seed, seed)`.
      Ignored if randomized is False.
      Default value: `None`.
    validate_args: If True, adds runtime checks that the maximum sequence index
      supported by `dtype` is not exceeded and that `dim` is at least 1 and at
      most `_MAX_DIMENSION`.
      Default value: `False`.
    dtype: Optional `dtype`. The dtype of the output `Tensor` (either `float32`
      or `float64`).
      Default value: `None` which maps to the `float32`.
    name: (Optional) Python `str` describing ops managed by this function. If
      not supplied the name of this function is used.
      Default value: "halton_sample".

  Returns:
    halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
      and `shape` `[num_results, dim]` if `num_results` was specified or shape
      `[s, dim]` where s is the size of `sequence_indices` if `sequence_indices`
      were specified.
    randomization_params: None if randomized is False. Otherwise an instance of
      `HaltonParams` built from the permutations returned by `_randomize` and
      the (already scaled and flattened) zero correction that was used, so that
      it can be passed back in to reproduce the same randomization.

  Raises:
    ValueError: if both or neither of `sequence_indices` and `num_results`
      were specified.
    InvalidArgumentError: if `validate_args` is True and the maximum supported
      sequence index is exceeded.

  #### References

  [1]: Art B. Owen. A randomized Halton algorithm in R. _arXiv preprint
       arXiv:1706.02808_, 2017. https://arxiv.org/abs/1706.02808
  """
  if (num_results is None) == (sequence_indices is None):
    raise ValueError('Either `num_results` or `sequence_indices` must be'
                     ' specified but not both.')

  dtype = dtype or tf.float32

  with tf.compat.v1.name_scope(
      name, 'halton_sample', values=[num_results, sequence_indices]):
    # Here and in the following, the shape layout is as follows:
    # [sample dimension, event dimension, coefficient dimension].
    # The coefficient dimension is an intermediate axes which will hold the
    # weights of the starting integer when expressed in the (prime) base for
    # an event dimension.
    if num_results is not None:
      num_results = tf.convert_to_tensor(
          value=num_results, dtype=tf.int32, name='num_results')
    if sequence_indices is not None:
      sequence_indices = tf.convert_to_tensor(
          value=sequence_indices, dtype=tf.int32, name='sequence_indices')
    indices = _get_indices(num_results, sequence_indices, dtype)

    runtime_assertions = []
    if validate_args:
      runtime_assertions.append(
          tf.compat.v1.assert_less_equal(
              tf.reduce_max(indices),
              tf.constant(_MAX_INDEX_BY_DTYPE[dtype], dtype=dtype),
              message=
              ('Maximum sequence index exceeded. Maximum index for dtype %s '
               'is %d.' % (dtype, _MAX_INDEX_BY_DTYPE[dtype]))))
      # The check is `dim >= 1`, so the message says "at least 1".
      runtime_assertions.append(
          tf.compat.v1.assert_greater_equal(
              dim, 1, message='`dim` should be at least 1'))
      runtime_assertions.append(
          tf.compat.v1.assert_less_equal(
              dim,
              _MAX_DIMENSION,
              message='`dim` should be less or equal than %d' % _MAX_DIMENSION))

    with tf.compat.v1.control_dependencies(runtime_assertions):
      radixes = tf.convert_to_tensor(_PRIMES, dtype=dtype, name='radixes')
      radixes = tf.reshape(radixes[0:dim], shape=[dim, 1])

      max_sizes_by_axes = tf.convert_to_tensor(
          _MAX_SIZES_BY_AXES[dtype], dtype=dtype,
          name='max_sizes_by_axes')[:dim]
      max_size = tf.reduce_max(max_sizes_by_axes)

      # The powers of the radixes that we will need. Note that there is a bit
      # of an excess here. Suppose we need the place value coefficients of 7
      # in base 2 and 3. For 2, we will have 3 digits but we only need 2 digits
      # for base 3. However, we can only create rectangular tensors so we
      # store both expansions in a [2, 3] tensor. This leads to the problem that
      # we might end up attempting to raise large numbers to large powers. For
      # example, base 2 expansion of 1024 has 10 digits. If we were in 10
      # dimensions, then the 10th prime (29) we will end up computing 29^10 even
      # though we don't need it. We avoid this by setting the exponents for each
      # axes to 0 beyond the maximum value needed for that dimension.
      exponents_by_axes = tf.tile([tf.range(max_size, dtype=dtype)], [dim, 1])

      # The mask is true for those coefficients that are irrelevant.
      weight_mask = exponents_by_axes >= max_sizes_by_axes
      capped_exponents = tf.where(weight_mask,
                                  tf.zeros_like(exponents_by_axes),
                                  exponents_by_axes)
      weights = radixes**capped_exponents

      # The following computes the base b expansion of the indices. Suppose,
      # x = a0 + a1*b + a2*b^2 + ... Then, performing a floor div of x with
      # the vector (1, b, b^2, b^3, ...) will produce
      # (a0 + s1 * b, a1 + s2 * b, ...) where s_i are coefficients we don't care
      # about. Noting that all a_i < b by definition of place value expansion,
      # we see that taking the elements mod b of the above vector produces the
      # place value expansion coefficients.
      coeffs = tf.compat.v1.floor_div(indices, weights)
      coeffs *= 1. - tf.cast(weight_mask, dtype)
      coeffs %= radixes
      if not randomized:
        coeffs /= radixes
        return tf.reduce_sum(input_tensor=coeffs / weights, axis=-1), None

      if randomization_params is None:
        perms, zero_correction = None, None
      else:
        perms, zero_correction = randomization_params
      coeffs, perms = _randomize(coeffs, radixes, seed, perms=perms)
      # Remove the contribution from randomizing the trailing zero for the
      # axes where max_size_by_axes < max_size. This will be accounted
      # for separately below (using zero_correction).
      coeffs *= 1. - tf.cast(weight_mask, dtype)
      coeffs /= radixes
      base_values = tf.reduce_sum(input_tensor=coeffs / weights, axis=-1)

      # The randomization used in Owen (2017) does not leave 0 invariant. While
      # we have accounted for the randomization of the first `max_size_by_axes`
      # coefficients, we still need to correct for the trailing zeros. Luckily,
      # this is equivalent to adding a uniform random value scaled so the first
      # `max_size_by_axes` coefficients are zero. The following statements
      # perform this correction.
      if zero_correction is None:
        if seed is None:
          zero_correction = tf.random.uniform([dim, 1], dtype=dtype)
        else:
          zero_correction = tf.random.stateless_uniform([dim, 1],
                                                        seed=(seed, seed),
                                                        dtype=dtype)
        # Scaling and flattening happen only for a freshly drawn correction: a
        # correction taken from `randomization_params` is the value returned
        # by an earlier call and has already been scaled and flattened.
        zero_correction /= radixes**max_sizes_by_axes
        zero_correction = tf.reshape(zero_correction, [-1])

      return base_values + zero_correction, HaltonParams(
          perms, zero_correction)
def solve_nu_zeta(self,
                  dataset: dataset_lib.OffpolicyDataset,
                  target_policy: tf_policy.TFPolicy,
                  regularizer: float = 1e-6):
  """Performs one damped Newton update of the two nu/zeta solution pairs.

  On the first call (detected by the absence of `self._td_mat`) the whole
  dataset is read and the tabular quantities derived from it are cached on
  `self`. Every call then:
    1. evaluates a per-sample saddle loss from the current `_nu`/`_zeta` and
       `_nu2`/`_zeta2`,
    2. bisects `self._alpha` (16 steps on [-8, 32]) against
       `self._two_sided_limit` using the divergence of the resulting weights,
    3. takes a Newton step (step size 0.1) on `_nu` and `_nu2`, and
    4. moves `_zeta` and `_zeta2` a fraction 0.1 towards their new targets.

  Side effects: reassigns `self._nu`, `self._nu2`, `self._zeta`, `self._zeta2`,
  assigns `self._alpha`, writes `tf.summary` scalars and prints diagnostics.

  Args:
    dataset: The dataset to sample experience from. Only read on the first
      call.
    target_policy: The policy whose value we want to estimate. Only queried on
      the first call.
    regularizer: A small constant multiplied by the identity and added to the
      Hessians before the linear solve.

  Returns:
    A list `[avg_saddle_loss * sign, avg_loss * sign, divergence]` where `sign`
    is `self._algae_alpha_sign`.
  """
  if not hasattr(self, '_td_mat'):
    # One-time setup: everything in this branch is cached on `self` at the end
    # of the branch and reused by later calls.
    # Set up env_steps.
    episodes, valid_steps = dataset.get_all_episodes(
        limit=self._limit_episodes)
    total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1
    num_episodes = tf.shape(valid_steps)[0]
    num_samples = num_episodes * total_num_steps_per_episode
    valid_and_not_last = tf.logical_and(valid_steps, episodes.discount > 0)
    # Flat indices of the (episode, step) pairs that are kept; the last step
    # of every episode is dropped by the `[:, :-1]` slice.
    valid_indices = tf.squeeze(
        tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

    # First step of each episode, repeated once per step so that it lines up
    # with `env_step` below.
    initial_env_step = tf.nest.map_structure(
        lambda t: tf.squeeze(
            tf.reshape(
                tf.repeat(t[:, 0:1, ...],
                          axis=1,
                          repeats=total_num_steps_per_episode),
                [num_samples, -1])), episodes)
    initial_env_step = tf.nest.map_structure(
        lambda t: tf.gather(t, valid_indices), initial_env_step)
    tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep(
        initial_env_step)

    # Steps 0 .. T-1 of every episode, flattened.
    env_step = tf.nest.map_structure(
        lambda t: tf.squeeze(
            tf.reshape(t[:, 0:total_num_steps_per_episode, ...],
                       [num_samples, -1])), episodes)
    env_step = tf.nest.map_structure(
        lambda t: tf.gather(t, valid_indices), env_step)
    tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(
        env_step)

    # Steps 1 .. T of every episode, flattened (successors of `env_step`).
    next_env_step = tf.nest.map_structure(
        lambda t: tf.squeeze(
            tf.reshape(t[:, 1:total_num_steps_per_episode + 1, ...],
                       [num_samples, -1])), episodes)
    next_env_step = tf.nest.map_structure(
        lambda t: tf.gather(t, valid_indices), next_env_step)
    tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep(
        next_env_step)

    # get probabilities
    initial_target_probs = target_policy.distribution(
        tfagents_initial_env_step).action.probs_parameter()
    next_target_probs = target_policy.distribution(
        tfagents_next_env_step).action.probs_parameter()

    # First, get the nu_loss and data weights
    #current_nu_loss = self._get_nu_loss(initial_env_step, env_step,
    #                                    next_env_step, target_policy)
    #data_weight, _ = self._get_weights(current_nu_loss)

    # # debug only and to reproduce dual dice result, DELETE
    # data_weight = tf.ones_like(data_weight)

    state_action_count = self._get_state_action_counts(env_step)
    # NOTE(review): `counts` is not read anywhere else in this method.
    counts = tf.reduce_sum(
        tf.one_hot(state_action_count, self._dimension), 0)
    gamma_sample = tf.pow(self._gamma,
                          tf.cast(env_step.step_num, tf.float32))

    # # debug only and to reproduce dual dice result, DELETE
    # gamma_sample = tf.ones_like(gamma_sample)

    # now we need to expand_dims to include action space in extra dimensions
    #data_weights = tf.reshape(data_weight, [-1, self._num_limits])
    # both are data sample weights for L2 problem, needs to be normalized later
    #gamma_data_weights = tf.reshape(gamma_sample, [-1, 1]) * data_weights

    # Pair every initial observation with every action id.
    initial_states = tf.tile(
        tf.reshape(initial_env_step.observation, [-1, 1]),
        [1, self._num_actions])
    initial_actions = tf.tile(
        tf.reshape(tf.range(self._num_actions), [1, -1]),
        [initial_env_step.observation.shape[0], 1])
    initial_nu_indices = self._get_index(initial_states, initial_actions)

    # linear term w.r.t. initial distribution
    #b_vec_2 = tf.stack([
    #    tf.reduce_sum(
    #        tf.reshape(
    #            data_weights[:, itr] / tf.reduce_sum(data_weights[:, itr]),
    #            [-1, 1]) * tf.reduce_sum(
    #                tf.one_hot(initial_nu_indices, self._dimension) *
    #                (1 - self._gamma) *
    #                tf.expand_dims(initial_target_probs, axis=-1),
    #                axis=1),
    #        axis=0) for itr in range(self._num_limits)
    #],
    #                   axis=0)

    # Pair every next observation with every action id.
    next_states = tf.tile(
        tf.reshape(next_env_step.observation, [-1, 1]),
        [1, self._num_actions])
    next_actions = tf.tile(
        tf.reshape(tf.range(self._num_actions), [1, -1]),
        [next_env_step.observation.shape[0], 1])
    next_nu_indices = self._get_index(next_states, next_actions)
    # Absorbing next steps get index -1; `tf.one_hot` of -1 is an all-zero
    # row, so they contribute nothing to `a_vec` below.
    next_nu_indices = tf.where(
        tf.expand_dims(next_env_step.is_absorbing(), -1),
        -1 * tf.ones_like(next_nu_indices), next_nu_indices)

    nu_indices = self._get_index(env_step.observation, env_step.action)

    target_log_probabilities = target_policy.distribution(
        tfagents_env_step).action.log_prob(env_step.action)
    if not self._solve_for_state_action_ratio:
      policy_ratio = tf.exp(target_log_probabilities -
                            env_step.get_log_probability())
    else:
      policy_ratio = tf.ones([
          target_log_probabilities.shape[0],
      ])
    policy_ratios = tf.tile(tf.reshape(policy_ratio, [-1, 1]),
                            [1, self._num_actions])

    # the tabular feature vector
    a_vec = tf.one_hot(nu_indices, self._dimension) - tf.reduce_sum(
        self._gamma *
        tf.expand_dims(next_target_probs * policy_ratios, axis=-1) *
        tf.one_hot(next_nu_indices, self._dimension),
        axis=1)

    # linear term w.r.t. reward
    #b_vec_1 = tf.stack([
    #    tf.reduce_sum(
    #        tf.reshape(
    #            (gamma_data_weights[:, itr] /
    #             tf.reduce_sum(gamma_data_weights[:, itr])) * self._reward_fn(env_step),  #/
    #            #tf.cast(state_action_count, tf.float32),
    #            [-1, 1]) * a_vec,
    #        axis=0) for itr in range(self._num_limits)
    #],
    #                   axis=0)

    # quadratic term of feature
    # Get weighted outer product by using einsum to save computing resource!
    #a_mat = tf.stack([
    #    tf.einsum(
    #        'ai, a, aj -> ij', a_vec,
    #        #1.0 / tf.cast(state_action_count, tf.float32),
    #        gamma_data_weights[:, itr] /
    #        tf.reduce_sum(gamma_data_weights[:, itr]),
    #        a_vec)
    #    for itr in range(self._num_limits)
    #],
    #                 axis=0)

    # Count-normalised sum over samples of one_hot(nu_indices) x a_vec.
    td_mat = tf.einsum('ai, a, aj -> ij',
                       tf.one_hot(nu_indices, self._dimension),
                       1.0 / tf.cast(state_action_count, tf.float32),
                       a_vec)
    weighted_rewards = policy_ratio * self._reward_fn(env_step)

    bias = tf.reduce_sum(
        tf.one_hot(nu_indices, self._dimension) *
        tf.reshape(weighted_rewards, [-1, 1]) * 1.0 /
        tf.cast(state_action_count, tf.float32)[:, None],
        axis=0)

    # Initialize
    self._nu = np.ones_like(self._nu) * bias[:, None]
    self._nu2 = np.ones_like(self._nu2) * bias[:, None]

    self._a_vec = a_vec
    self._td_mat = td_mat
    self._bias = bias
    self._weighted_rewards = weighted_rewards
    self._state_action_count = state_action_count
    self._nu_indices = nu_indices
    self._initial_nu_indices = initial_nu_indices
    self._initial_target_probs = initial_target_probs
    self._gamma_sample = gamma_sample
    # NOTE(review): this immediately overwrites the discount weights stored on
    # the previous line with all-ones -- confirm that this is intended.
    self._gamma_sample = tf.ones_like(gamma_sample)

  # Per-sample saddle loss for the first (nu, zeta) pair.
  saddle_bellman_residuals = (tf.matmul(self._a_vec, self._nu) -
                              self._weighted_rewards[:, None])
  saddle_bellman_residuals *= -1 * self._algae_alpha_sign
  saddle_zetas = tf.gather(self._zeta, self._nu_indices)
  saddle_initial_nu_values = tf.reduce_sum(  # Average over actions.
      self._initial_target_probs[:, :, None] *
      tf.gather(self._nu, self._initial_nu_indices),
      axis=1)
  saddle_init_nu_loss = ((1 - self._gamma) * saddle_initial_nu_values *
                         self._algae_alpha_sign)

  # Same quantities for the second pair, with the opposite sign convention.
  saddle_bellman_residuals2 = (tf.matmul(self._a_vec, self._nu2) -
                               self._weighted_rewards[:, None])
  saddle_bellman_residuals2 *= 1 * self._algae_alpha_sign
  saddle_zetas2 = tf.gather(self._zeta2, self._nu_indices)
  saddle_initial_nu_values2 = tf.reduce_sum(  # Average over actions.
      self._initial_target_probs[:, :, None] *
      tf.gather(self._nu2, self._initial_nu_indices),
      axis=1)
  saddle_init_nu_loss2 = ((1 - self._gamma) * saddle_initial_nu_values2 *
                          -1 * self._algae_alpha_sign)

  saddle_loss = 0.5 * (
      saddle_init_nu_loss + saddle_bellman_residuals * saddle_zetas +
      -tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas) +
      -saddle_init_nu_loss2 + -saddle_bellman_residuals2 * saddle_zetas2 +
      tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas2))

  # Binary search to find best alpha.
  # The interval is moved up while the divergence still exceeds the limit.
  left = tf.constant([-8., -8.])
  right = tf.constant([32., 32.])
  for _ in range(16):
    mid = 0.5 * (left + right)
    self._alpha.assign(mid)
    weights, log_weights = self._get_weights(
        saddle_loss * self._gamma_sample[:, None])

    divergence = self._compute_divergence(weights, log_weights)
    divergence_violation = divergence - self._two_sided_limit
    left = tf.where(divergence_violation > 0., mid, left)
    right = tf.where(divergence_violation > 0., right, mid)
  self._alpha.assign(0.5 * (left + right))
  weights, log_weights = self._get_weights(saddle_loss *
                                           self._gamma_sample[:, None])

  # The sample weights are treated as constants by the gradient computations
  # below.
  gamma_data_weights = tf.stop_gradient(weights *
                                        self._gamma_sample[:, None])
  #print(tf.concat([gamma_data_weights, saddle_loss], axis=-1))
  avg_saddle_loss = (
      tf.reduce_sum(gamma_data_weights * saddle_loss, axis=0) /
      tf.reduce_sum(gamma_data_weights, axis=0))

  # Total weight per index, then looked up again per sample.
  weighted_state_action_count = tf.reduce_sum(
      tf.one_hot(self._nu_indices, self._dimension)[:, :, None] *
      weights[:, None, :],
      axis=0)
  weighted_state_action_count = tf.gather(weighted_state_action_count,
                                          self._nu_indices)
  # Re-weighted counterparts of `td_mat` and `bias` from the setup branch,
  # with an extra leading axis taken from the trailing axis of `weights`.
  my_td_mat = tf.einsum(
      'ai, ab, ab, aj -> bij',
      tf.one_hot(self._nu_indices, self._dimension),
      #1.0 / tf.cast(self._state_action_count, tf.float32),
      1.0 / weighted_state_action_count,
      weights,
      self._a_vec)
  my_bias = tf.reduce_sum(
      tf.transpose(weights)[:, :, None] *
      tf.one_hot(self._nu_indices, self._dimension)[None, :, :] *
      tf.reshape(self._weighted_rewards, [1, -1, 1]) *
      #1.0 / tf.cast(self._state_action_count, tf.float32)[None, :, None],
      1.0 / tf.transpose(weighted_state_action_count)[:, :, None],
      axis=1)

  #print('hello', saddle_initial_nu_values[:1], saddle_zetas[:3],
  #      self._nu[:2], my_bias[:, :2], saddle_loss[:4])

  # The tape is persistent because it is queried several times (two gradients
  # and two Jacobians). The gradients are taken inside the context so that the
  # Jacobians of those gradients can be taken afterwards.
  with tf.GradientTape(
      watch_accessed_variables=False, persistent=True) as tape:
    tape.watch([self._nu, self._nu2, self._alpha])
    bellman_residuals = tf.matmul(
        my_td_mat,
        tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
    bellman_residuals = tf.transpose(tf.squeeze(bellman_residuals, -1))
    bellman_residuals = tf.gather(bellman_residuals, self._nu_indices)
    initial_nu_values = tf.reduce_sum(  # Average over actions.
        self._initial_target_probs[:, :, None] *
        tf.gather(self._nu, self._initial_nu_indices),
        axis=1)

    bellman_residuals *= self._algae_alpha_sign

    init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                    self._algae_alpha_sign)

    nu_loss = (tf.math.square(bellman_residuals) / 2.0 +
               tf.math.abs(self._algae_alpha) * init_nu_loss)

    loss = (gamma_data_weights * nu_loss /
            tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

    # Same construction for the second solution, with flipped signs.
    bellman_residuals2 = tf.matmul(
        my_td_mat,
        tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
    bellman_residuals2 = tf.transpose(
        tf.squeeze(bellman_residuals2, -1))
    bellman_residuals2 = tf.gather(bellman_residuals2, self._nu_indices)
    initial_nu_values2 = tf.reduce_sum(  # Average over actions.
        self._initial_target_probs[:, :, None] *
        tf.gather(self._nu2, self._initial_nu_indices),
        axis=1)

    bellman_residuals2 *= -1 * self._algae_alpha_sign

    init_nu_loss2 = ((1 - self._gamma) * initial_nu_values2 * -1 *
                     self._algae_alpha_sign)

    nu_loss2 = (tf.math.square(bellman_residuals2) / 2.0 +
                tf.math.abs(self._algae_alpha) * init_nu_loss2)

    loss2 = (gamma_data_weights * nu_loss2 /
             tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

    divergence = self._compute_divergence(weights, log_weights)
    divergence_violation = divergence - self._two_sided_limit

    # NOTE(review): `alpha_loss` is only referenced by the commented-out
    # optimizer code further down.
    alpha_loss = (-tf.exp(self._alpha) *
                  tf.stop_gradient(divergence_violation))

    # Extra penalty on the squared last row of nu / nu2.
    extra_loss = tf.reduce_sum(tf.math.square(self._nu[-1, :]))
    extra_loss2 = tf.reduce_sum(tf.math.square(self._nu2[-1, :]))
    nu_grad = tape.gradient(loss + extra_loss, [self._nu])[0]
    nu_grad2 = tape.gradient(loss2 + extra_loss2, [self._nu2])[0]
  avg_loss = tf.reduce_sum(0.5 * (loss - loss2) /
                           tf.math.abs(self._algae_alpha),
                           axis=0)
  # Keep only the blocks of the Jacobian where both trailing indices agree,
  # stacked along a new leading axis of size `self._num_limits`.
  nu_jacob = tape.jacobian(nu_grad, [self._nu])[0]
  nu_hess = tf.stack(
      [nu_jacob[:, i, :, i] for i in range(self._num_limits)], axis=0)

  nu_jacob2 = tape.jacobian(nu_grad2, [self._nu2])[0]
  nu_hess2 = tf.stack(
      [nu_jacob2[:, i, :, i] for i in range(self._num_limits)], axis=0)

  for idx, div in enumerate(divergence):
    tf.summary.scalar('divergence%d' % idx, div)

  #alpha_grads = tape.gradient(alpha_loss, [self._alpha])
  #alpha_grad_op = self._alpha_optimizer.apply_gradients(
  #    zip(alpha_grads, [self._alpha]))
  #self._alpha.assign(tf.minimum(8., tf.maximum(-8., self._alpha)))

  #print(self._alpha, tf.concat([weights, nu_loss], -1))
  #regularizer = 0.1
  # Newton direction: solve (H + regularizer * I) x = -grad, then take a step
  # of size 0.1 along it.
  nu_transformed = tf.transpose(
      tf.squeeze(
          tf.linalg.solve(
              nu_hess + regularizer * tf.eye(self._dimension),
              tf.expand_dims(-tf.transpose(nu_grad), axis=-1))))
  self._nu = self._nu + 0.1 * nu_transformed
  nu_transformed2 = tf.transpose(
      tf.squeeze(
          tf.linalg.solve(
              nu_hess2 + regularizer * tf.eye(self._dimension),
              tf.expand_dims(-tf.transpose(nu_grad2), axis=-1))))
  self._nu2 = self._nu2 + 0.1 * nu_transformed2

  # Diagnostic output on every call.
  print(avg_loss * self._algae_alpha_sign,
        avg_saddle_loss * self._algae_alpha_sign, self._nu[:2], divergence)
  #print(init_nu_loss[:8], init_nu_loss[-8:])
  #print(bellman_residuals[:8])
  #print(self._nu[:3], self._zeta[:3])

  # New zeta targets from the updated nu; zeta moves a fraction 0.1 towards
  # them.
  zetas = tf.matmul(my_td_mat,
                    tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
  zetas = tf.transpose(tf.squeeze(zetas, -1))
  zetas *= -self._algae_alpha_sign
  zetas /= tf.math.abs(self._algae_alpha)
  self._zeta = self._zeta + 0.1 * (zetas - self._zeta)

  zetas2 = tf.matmul(my_td_mat,
                     tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
  zetas2 = tf.transpose(tf.squeeze(zetas2, -1))
  zetas2 *= 1 * self._algae_alpha_sign
  zetas2 /= tf.math.abs(self._algae_alpha)
  self._zeta2 = self._zeta2 + 0.1 * (zetas2 - self._zeta2)

  #self._zeta = (
  #    tf.einsum('ij,ja-> ia', self._td_mat, self._nu) -
  #    tf.transpose(my_bias))
  #self._zeta *= -tf.reshape(self._algae_alpha_sign, [1, self._num_limits])
  #self._zeta /= tf.math.abs(self._algae_alpha)
  return [
      avg_saddle_loss * self._algae_alpha_sign,
      avg_loss * self._algae_alpha_sign, divergence
  ]
def __call__(self, roi_features, class_indices, is_training=None):
  """Mask branch for the Mask-RCNN model.

  Applies four 3x3 convolutions, one stride-2 transposed convolution and a
  final 1x1 convolution producing one mask per class, then keeps, for every
  RoI, only the mask belonging to the class given in `class_indices`.

  Args:
    roi_features: A ROI feature tensor of shape
      [batch_size, num_rois, height_l, width_l, num_filters].
    class_indices: a Tensor of shape [batch_size, num_rois], indicating
      which class the ROI is. Its static shape must be fully defined because
      it is used to build the gather indices.
    is_training: `boolean`, if True if model is in training mode.

  Returns:
    mask_outputs: a tensor with a shape of
      [batch_size, num_masks, mask_height, mask_width], holding for each RoI
      the mask prediction of its selected class.
  """

  def _msra_fill_initializer(kernel_size, fan_out):
    """Returns a random normal initializer equivalent to MSRAFill."""
    # Reference: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/filler_op.h#L445-L463  # pylint: disable=line-too-long
    # For example, kernel size is (3, 3) and fan out is 256, stddev is 0.029.
    # stddev = (2/(3*3*256))^0.5 = 0.029
    stddev = (2 / (kernel_size[0] * kernel_size[1] * fan_out))**0.5
    return tf.keras.initializers.RandomNormal(stddev=stddev)

  with backend.get_graph().as_default():
    with tf.name_scope('mask_head'):
      static_shape = roi_features.get_shape().as_list()
      _, num_rois, height, width, filters = static_shape
      # Fold the RoI axis into the batch axis so 2-D layers can be applied.
      net = tf.reshape(roi_features, [-1, height, width, filters])

      # Four identical 3x3 conv + batch-norm/ReLU stages.
      for layer_index in range(4):
        conv = tf.keras.layers.Conv2D(
            256,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding='same',
            dilation_rate=(1, 1),
            activation=None,
            kernel_initializer=_msra_fill_initializer((3, 3), 256),
            bias_initializer=tf.zeros_initializer(),
            name='mask-conv-l%d' % layer_index)
        net = conv(net)
        net = self._batch_norm_relu()(net, is_training=is_training)

      # Stride-2 transposed convolution.
      upsample = tf.keras.layers.Conv2DTranspose(
          256,
          kernel_size=(2, 2),
          strides=(2, 2),
          padding='valid',
          activation=None,
          kernel_initializer=_msra_fill_initializer((2, 2), 256),
          bias_initializer=tf.zeros_initializer(),
          name='conv5-mask')
      net = upsample(net)
      net = self._batch_norm_relu()(net, is_training=is_training)

      # 1x1 convolution producing one logit map per class.
      logits_layer = tf.keras.layers.Conv2D(
          self._num_classes,
          kernel_size=(1, 1),
          strides=(1, 1),
          padding='valid',
          kernel_initializer=_msra_fill_initializer((1, 1),
                                                    self._num_classes),
          bias_initializer=tf.zeros_initializer(),
          name='mask_fcn_logits')
      mask_outputs = logits_layer(net)
      # Restore the separate batch and RoI axes.
      mask_outputs = tf.reshape(mask_outputs, [
          -1, num_rois, self._mrcnn_resolution, self._mrcnn_resolution,
          self._num_classes
      ])

      with tf.name_scope('masks_post_processing'):
        # TODO(pengchong): Figure out the way not to use the static inferred
        # batch size.
        batch_size, num_masks = class_indices.get_shape().as_list()
        # Move the class axis next to the RoI axis so it can be gathered.
        mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])
        # Constructs (batch, mask, class) index triples for the gather.
        batch_column = tf.expand_dims(tf.range(batch_size), axis=1)
        batch_indices = tf.tile(batch_column, [1, num_masks])
        mask_row = tf.expand_dims(tf.range(num_masks), axis=0)
        mask_indices = tf.tile(mask_row, [batch_size, 1])
        gather_indices = tf.stack(
            [batch_indices, mask_indices, class_indices], axis=2)
        mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
    return mask_outputs
def test_vimco_and_gradient(self): dims = 5 # Dimension num_draws = int(1e3) num_batch_draws = int(3) seed = test_util.test_seed() with tf.GradientTape(persistent=True) as tape: f = lambda logu: tfp.vi.kl_reverse(logu, self_normalized=False) np_f = lambda logu: -logu s = tf.constant(1.) tape.watch(s) p = tfd.MultivariateNormalFullCovariance(covariance_matrix=tridiag( dims, diag_value=1, offdiag_value=0.5)) # Variance is very high when approximating Forward KL, so we make # scale_diag large. This ensures q "covers" p and thus Var_q[p/q] is # smaller. q = tfd.MultivariateNormalDiag(scale_diag=tf.tile([s], [dims])) vimco = tfp.vi.csiszar_vimco(f=f, p_log_prob=p.log_prob, q=q, num_draws=num_draws, num_batch_draws=num_batch_draws, seed=seed) # We want the seed to be the same since we will use computations # with the same underlying sample to show correctness of vimco. if tf.executing_eagerly(): tf1.set_random_seed(seed) x = q.sample(sample_shape=[num_draws, num_batch_draws], seed=seed) x = tf.stop_gradient(x) logu = p.log_prob(x) - q.log_prob(x) f_log_sum_u = f(tfp.stats.log_soomean_exp(logu, axis=0)[::-1][0]) q_log_prob_x = q.log_prob(x) grad_vimco = tape.gradient(vimco, s) grad_mean_f_log_sum_u = tape.gradient(f_log_sum_u, s) / num_batch_draws jacobian_logqx = tape.jacobian(q_log_prob_x, s) [ logu_, jacobian_logqx_, vimco_, grad_vimco_, f_log_sum_u_, grad_mean_f_log_sum_u_, ] = self.evaluate([ logu, jacobian_logqx, vimco, grad_vimco, f_log_sum_u, grad_mean_f_log_sum_u, ]) np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu_) # Test VIMCO loss is correct. self.assertAllClose(np_f(np_log_avg_u).mean(axis=0), vimco_, rtol=1e-4, atol=1e-5) # Test gradient of VIMCO loss is correct. # # To make this computation we'll inject two gradients from TF: # - grad[mean(f(log(sum(p(x)/q(x)))))] # - jacobian[log(q(x))]. # # We now justify why using these (and only these) TF values for # ground-truth does not undermine the completeness of this test. 
# # Regarding `grad_mean_f_log_sum_u_`, note that we validate the # correctness of the zero-th order derivative (for each batch member). # Since `tfp.vi.csiszar_vimco_helper` itself does not manipulate any # gradient information, we can safely rely on TF. self.assertAllClose(np_f(np_log_avg_u), f_log_sum_u_, rtol=1e-4, atol=1e-5) # # Regarding `jacobian_logqx_`, note that testing the gradient of # `q.log_prob` is outside the scope of this unit-test thus we may safely # use TF to find it. # The `mean` is across batches and the `sum` is across iid samples. np_grad_vimco = (grad_mean_f_log_sum_u_ + np.mean(np.sum( jacobian_logqx_ * (np_f(np_log_avg_u) - np_f(np_log_sooavg_u)), axis=0), axis=0)) self.assertAllClose(np_grad_vimco, grad_vimco_, rtol=0.03, atol=1e-3)
def exact_kl(s):
  """Returns `KL[q || p]` for a diagonal normal `q` with common scale `s`.

  `p` is a full-covariance normal whose covariance is the tridiagonal matrix
  built by `tridiag` (1 on the diagonal, 0.5 off the diagonal). The dimension
  `d` is taken from the enclosing scope.
  """
  target_covariance = tridiag(d, diag_value=1, offdiag_value=0.5)
  target = tfd.MultivariateNormalFullCovariance(
      covariance_matrix=target_covariance)
  surrogate_scale = tf.tile([s], [d])
  surrogate = tfd.MultivariateNormalDiag(scale_diag=surrogate_scale)
  return tfd.kl_divergence(surrogate, target)
def crop_mask_in_target_box(masks,
                            boxes,
                            target_boxes,
                            output_size,
                            sample_offset=0):
  """Crop masks in target boxes.

  Each mask is zero-padded by 2 pixels on every side, and the target box is
  then expressed in the coordinate frame of that padded mask before being
  cropped and resized with `selective_crop_and_resize`.

  Args:
    masks: A tensor with a shape of [batch_size, num_masks, height, width].
    boxes: a float tensor representing box coordinates that tightly enclose
      masks with a shape of [batch_size, num_masks, 4] in un-normalized
      coordinates. A box is represented by [ymin, xmin, ymax, xmax].
    target_boxes: a float tensor representing target box coordinates for
      masks with a shape of [batch_size, num_masks, 4] in un-normalized
      coordinates. A box is represented by [ymin, xmin, ymax, xmax].
    output_size: A scalar to indicate the output crop size. It currently only
      supports to output a square shape outputs.
    sample_offset: a float number in [0, 1] indicates the subpixel sample
      offset from grid point.

  Returns:
    A 4-D tensor representing feature crop of shape
    [batch_size, num_masks, output_size, output_size].
  """
  with tf.name_scope('crop_mask_in_target_box'):
    batch_size, num_masks, height, width = masks.get_shape().as_list()
    masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
    # Pad zeros on the boundary of masks.
    masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
    masks = tf.reshape(masks,
                       [batch_size, num_masks, height + 4, width + 4, 1])

    # Projects target box locations and sizes to corresponding cropped
    # mask coordinates.
    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
        value=boxes, num_or_size_splits=4, axis=2)
    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
        value=target_boxes, num_or_size_splits=4, axis=2)
    # Vertical quantities are scaled by the mask height and horizontal ones by
    # the mask width. (Previously x used `height` and h used `width`, which is
    # only correct for square masks.) The `+ 2` accounts for the zero padding.
    y_transform = (bb_y_min - gt_y_min) * height / (
        gt_y_max - gt_y_min + _EPSILON) + 2
    x_transform = (bb_x_min - gt_x_min) * width / (
        gt_x_max - gt_x_min + _EPSILON) + 2
    h_transform = (bb_y_max - bb_y_min) * height / (
        gt_y_max - gt_y_min + _EPSILON)
    w_transform = (bb_x_max - bb_x_min) * width / (
        gt_x_max - gt_x_min + _EPSILON)

    # Largest valid (y, x) index inside the padded mask, per box.
    boundaries = tf.concat([
        tf.cast(tf.ones_like(y_transform) * ((height + 4) - 1),
                dtype=tf.float32),
        tf.cast(tf.ones_like(x_transform) * ((width + 4) - 1),
                dtype=tf.float32)
    ], axis=-1)

    # Reshape tensors to have the right shape for selective_crop_and_resize.
    transformed_boxes = tf.concat(
        [y_transform, x_transform, h_transform, w_transform], -1)
    # Every mask acts as its own "level", so mask i is cropped from slice i.
    levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
                     [batch_size, 1])

    cropped_masks = selective_crop_and_resize(masks,
                                              transformed_boxes,
                                              levels,
                                              boundaries,
                                              output_size,
                                              sample_offset=sample_offset)
    cropped_masks = tf.squeeze(cropped_masks, axis=-1)

  return cropped_masks
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5):
  """Crop and resize boxes on a set of feature maps.

  Given multiple features maps indexed by different levels, and a set of boxes
  where each box is mapped to a certain level, it selectively crops and resizes
  boxes from the corresponding feature maps to generate the box features.

  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
  figure 3 for reference). Specifically, for each feature map, we select an
  (output_size, output_size) set of pixels corresponding to the box location,
  and then use bilinear interpolation to select the feature value for each
  pixel.

  For performance, we perform the gather and interpolation on all layers as a
  single operation. That is, the multi-level features are first stacked and
  gathered into [2*output_size, 2*output_size] feature points. Then bilinear
  interpolation is performed on the gathered feature points to generate
  [output_size, output_size] RoIAlign feature map.

  Here is the step-by-step algorithm:
    1. The multi-level features are gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       Tensor. The Tensor contains four neighboring feature points for each
       vertex in the output grid.
    2. Compute the interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axes
       can be seen as stacking 2x2 interpolation kernels for all vertices in
       the output grid.
    3. Element-wise multiply the gathered features and interpolation kernel.
       Then apply 2x2 average pooling to reduce spatial dimension to
       output_size.

  Args:
    features: a 5-D tensor of shape
      [batch_size, num_levels, max_height, max_width, num_filters] where
      cropping and resizing are based. The static shape is read with
      `get_shape().as_list()` and used in reshapes.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    box_levels: a tensor representing the 0-based corresponding feature level
      index of each box. It is reshaped to [batch_size, num_boxes, 1, 1]
      internally, so either [batch_size, num_boxes, 1] or
      [batch_size, num_boxes] fits. Presumably an integer tensor, since it is
      added to int32 gather indices -- confirm against callers.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample
      offset from grid point.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      representing the cropped features.
  """
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = features.get_shape().as_list()
  _, num_boxes, _ = boxes.get_shape().as_list()

  # Compute the grid position w.r.t. the corresponding feature map.
  box_grid_x = []
  box_grid_y = []
  for i in range(output_size):
    box_grid_x.append(boxes[:, :, 1] +
                      (i + sample_offset) * boxes[:, :, 3] / output_size)
    box_grid_y.append(boxes[:, :, 0] +
                      (i + sample_offset) * boxes[:, :, 2] / output_size)
  box_grid_x = tf.stack(box_grid_x, axis=2)
  box_grid_y = tf.stack(box_grid_y, axis=2)

  # Compute indices for gather operation.
  # (x0, y0) is the lower neighbor, clamped to be non-negative; the upper
  # neighbor is (x0 + 1, y0 + 1). Both are clipped to `boundaries`.
  box_grid_y0 = tf.floor(box_grid_y)
  box_grid_x0 = tf.floor(box_grid_x)
  box_grid_x0 = tf.maximum(0., box_grid_x0)
  box_grid_y0 = tf.maximum(0., box_grid_y0)
  box_gridx0x1 = tf.stack([
      tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1)),
      tf.minimum(box_grid_x0 + 1, tf.expand_dims(boundaries[:, :, 1], -1))
  ], axis=3)
  box_gridy0y1 = tf.stack([
      tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1)),
      tf.minimum(box_grid_y0 + 1, tf.expand_dims(boundaries[:, :, 0], -1))
  ], axis=3)

  # Interleave the two neighbors per grid point: (x0_0, x1_0, x0_1, x1_1, ...).
  x_indices = tf.cast(tf.reshape(box_gridx0x1,
                                 [batch_size, num_boxes, output_size * 2]),
                      dtype=tf.int32)
  y_indices = tf.cast(tf.reshape(box_gridy0y1,
                                 [batch_size, num_boxes, output_size * 2]),
                      dtype=tf.int32)

  # Strides of the flattened [batch, level, height, width] feature layout.
  height_dim_offset = max_feature_width
  level_dim_offset = max_feature_height * height_dim_offset
  batch_dim_offset = num_levels * level_dim_offset
  # Flat index = batch * batch_stride + level * level_stride
  #              + y * row_stride + x, broadcast over the 2*output_size grid.
  indices = tf.reshape(
      tf.tile(
          tf.reshape(
              tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
          [1, num_boxes, output_size * 2, output_size * 2]) + tf.tile(
              tf.reshape(box_levels * level_dim_offset,
                         [batch_size, num_boxes, 1, 1]),
              [1, 1, output_size * 2, output_size * 2]) + tf.tile(
                  tf.reshape(y_indices * height_dim_offset,
                             [batch_size, num_boxes, output_size * 2, 1]),
                  [1, 1, 1, output_size * 2]) + tf.tile(
                      tf.reshape(x_indices,
                                 [batch_size, num_boxes, 1, output_size * 2]),
                      [1, 1, output_size * 2, 1]), [-1])

  features = tf.reshape(features, [-1, num_filters])
  features_per_box = tf.reshape(
      tf.gather(features, indices),
      [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])

  # The RoIAlign feature f can be computed by bilinear interpolation of four
  # neighboring feature points f0, f1, f2, and f3.
  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                       [f10, f11]]
  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
  # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
  ly = box_grid_y - box_grid_y0
  lx = box_grid_x - box_grid_x0
  hy = 1.0 - ly
  hx = 1.0 - lx
  kernel_x = tf.reshape(tf.stack([hx, lx], axis=3),
                        [batch_size, num_boxes, 1, output_size * 2])
  kernel_y = tf.reshape(tf.stack([hy, ly], axis=3),
                        [batch_size, num_boxes, output_size * 2, 1])
  # Uses implicit broadcast to generate the interpolation kernel. The
  # multiplier `4` is for avg pooling.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Interpolates the gathered features with computed interpolation kernels.
  features_per_box *= tf.cast(tf.expand_dims(interpolation_kernel, axis=4),
                              dtype=features_per_box.dtype)
  features_per_box = tf.reshape(features_per_box, [
      batch_size * num_boxes, output_size * 2, output_size * 2, num_filters
  ])
  # Averaging each 2x2 cell (pre-multiplied by 4 above) sums the four
  # weighted neighbors of every output grid point.
  features_per_box = tf.nn.avg_pool2d(input=features_per_box,
                                      ksize=[1, 2, 2, 1],
                                      strides=[1, 2, 2, 1],
                                      padding='VALID')
  features_per_box = tf.reshape(
      features_per_box,
      [batch_size, num_boxes, output_size, output_size, num_filters])

  return features_per_box
def _parse_train_data(self, data):
  """Parses data for training.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    image: image tensor that is preprocessed to have normalized value and
      dimension [output_size[0], output_size[1], 3]
    labels: a dictionary of tensors used for training. The following
      describes {key: value} pairs in the dictionary.
      image_info: a 2D `Tensor` that encodes the information of the image
        and the applied preprocessing. Rows 0 and 1 are
        [original_height, original_width] and [scaled_height, scaled_width];
        row 2 is read below as the image scale and row 3 as the crop offset
        (presumably in (y, x) order -- confirm against
        `input_utils.resize_and_crop_image`).
      anchor_boxes: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, 4] representing anchor boxes at each level.
      rpn_score_targets: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, anchors_per_location]. The height_l and
        width_l represent the dimension of class logits at l-th level.
      rpn_box_targets: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, anchors_per_location * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
      gt_boxes: Groundtruth bounding box annotations. The box is represented
        in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
        image that is fed to the network. The tensor is padded with -1 to
        the fixed dimension [self._max_num_instances, 4].
      gt_classes: Groundtruth classes annotations. The tensor is padded
        with -1 to the fixed dimension [self._max_num_instances].
      gt_masks: groundtruth masks cropped by the bounding box and
        resized to a fixed size determined by mask_crop_size. Only present
        when masks are included.
  """
  with_masks = self._include_mask

  classes = data['groundtruth_classes']
  boxes = data['groundtruth_boxes']
  if with_masks:
    masks = data['groundtruth_instance_masks']

  is_crowds = data['groundtruth_is_crowd']
  # Drop annotations flagged as crowd, when configured to do so.
  if self._skip_crowd_during_training and self._is_training:
    num_groundtruths = tf.shape(classes)[0]
    with tf.control_dependencies([num_groundtruths, is_crowds]):
      # An empty `is_crowds` means "keep everything".
      non_crowd_indices = tf.cond(
          tf.greater(tf.size(is_crowds), 0),
          lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
          lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
    classes = tf.gather(classes, non_crowd_indices)
    boxes = tf.gather(boxes, non_crowd_indices)
    if with_masks:
      masks = tf.gather(masks, non_crowd_indices)

  # Original image and its spatial size.
  image = data['image']
  image_shape = tf.shape(image)[0:2]

  # Mean/std pixel normalization.
  image = input_utils.normalize_image(image)

  # Optional random horizontal flip (applied to boxes and masks as well).
  if self._aug_rand_hflip:
    if with_masks:
      image, boxes, masks = input_utils.random_horizontal_flip(
          image, boxes, masks)
    else:
      image, boxes = input_utils.random_horizontal_flip(image, boxes)

  # Boxes go from normalized to pixel coordinates of the original image.
  boxes = box_utils.denormalize_boxes(boxes, image_shape)

  # Resize and crop the image.
  padded_size = input_utils.compute_padded_size(self._output_size,
                                                2**self._max_level)
  image, image_info = input_utils.resize_and_crop_image(
      image,
      self._output_size,
      padded_size=padded_size,
      aug_scale_min=self._aug_scale_min,
      aug_scale_max=self._aug_scale_max)
  image_height, image_width, _ = image.get_shape().as_list()

  # Apply the same resize/crop to the boxes, so that they are expressed
  # w.r.t. the scaled image.
  image_scale = image_info[2, :]
  offset = image_info[3, :]
  boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                            (image_height, image_width),
                                            offset)

  # Discard ground truth boxes that are all zeros.
  non_empty_indices = box_utils.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, non_empty_indices)
  classes = tf.gather(classes, non_empty_indices)
  if with_masks:
    masks = tf.gather(masks, non_empty_indices)
    # Undo the crop offset and normalize by row 1 of `image_info` so the
    # boxes can be used as crop windows by `tf.image.crop_and_resize`.
    tiled_offset = tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    cropped_boxes = boxes + tf.cast(tiled_offset, dtype=tf.float32)
    cropped_boxes = box_utils.normalize_boxes(cropped_boxes,
                                              image_info[1, :])
    num_masks = tf.shape(masks)[0]
    masks = tf.image.crop_and_resize(
        tf.expand_dims(masks, axis=-1),
        cropped_boxes,
        box_indices=tf.range(num_masks, dtype=tf.int32),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear')
    masks = tf.squeeze(masks, axis=-1)

  # Assign anchor targets.
  # Note that after the target assignment, box targets are absolute pixel
  # offsets w.r.t. the scaled image.
  input_anchor = anchor.Anchor(self._min_level, self._max_level,
                               self._num_scales, self._aspect_ratios,
                               self._anchor_size,
                               (image_height, image_width))
  anchor_labeler = anchor.RpnAnchorLabeler(input_anchor,
                                           self._rpn_match_threshold,
                                           self._rpn_unmatched_threshold,
                                           self._rpn_batch_size_per_im,
                                           self._rpn_fg_fraction)
  class_column = tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32)
  rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
      boxes, class_column)

  # Cast the input image to bfloat16 when requested.
  if self._use_bfloat16:
    image = tf.cast(image, dtype=tf.bfloat16)

  # Pack labels for model_fn outputs; per-instance tensors are padded with
  # -1 up to `self._max_num_instances`.
  max_instances = self._max_num_instances
  labels = {
      'anchor_boxes': input_anchor.multilevel_boxes,
      'image_info': image_info,
      'rpn_score_targets': rpn_score_targets,
      'rpn_box_targets': rpn_box_targets,
      'gt_boxes': input_utils.pad_to_fixed_size(boxes, max_instances, -1),
      'gt_classes': input_utils.pad_to_fixed_size(classes, max_instances,
                                                  -1),
  }
  if with_masks:
    labels['gt_masks'] = input_utils.pad_to_fixed_size(
        masks, max_instances, -1)

  return image, labels
def _make_pairs(x):
  """Repeats every row of the 2-D tensor `x` twice, back to back.

  The result has rows `x[0], x[0], x[1], x[1], ...` and the same last
  dimension as `x`.
  """
  row_width = x.shape[-1]
  with_pair_axis = x[:, tf.newaxis, :]
  duplicated = tf.tile(with_pair_axis, [1, 2, 1])
  return tf.reshape(duplicated, [-1, row_width])
def evaluate_multiclass(self, predictions, weights):
  """Evaluates the multiclass hinge loss on the given predictions.

  Given a rank-2 `Tensor` of predictions with shape (n, k), where n is the
  number of examples and k is the number of classes, and a rank-2 `Tensor` of
  weights with shape (m, k), where m is broadcastable to n, this method will
  return a `Tensor` of shape (n,) where the ith element is:

  ```python
  hinge_loss[i] = weights[i, 0] + sum_{j=0}^{num_classes - 2} (
    (weights[i, j+1] - weights[i, j]) * max_{l=j+1}^{num_classes-1}
    max{0, margin + predictions[i, l] - mean_{k=0}^j predictions[i, k]}
  )
  ```

  where we've assumed (without loss of generality) that the weights and
  predictions are ordered in such a way that weights[i, j] <= weights[i, j+1].
  In the implementation, of course, we cannot simply assume this, and actually
  perform a sort.

  This is admittedly a somewhat strange-seeming formulation, and it's
  complicated and expensive to implement. The reason it was chosen is that it
  satisfies the following properties:

  1. It's shift invariant: adding a constant to every weight will shift the
     loss by the same constant.
  2. It's scale invariant: multiplying every weight by a constant will scale
     the loss by the same constant.
  3. When there are only two classes, it's equivalent to the binary hinge
     loss implemented in evaluate_binary_classification().
  4. When the weights represent a misclassification rate (i.e.
     weights[i, 0] = 0 and weights[i, j] = 1 for j > 0, assuming the weights
     are sorted), it's equivalent to the usual multiclass hinge
     misclassification loss.
  5. It's convex in the predictions, and upper bounds the multiclass 0-1 loss
     when margin >= 1.

  Args:
    predictions: a `Tensor` of shape (n, k), where n is the number of examples
      and k is the number of classes.
    weights: a `Tensor` of shape (m, k), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    hinge losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or "weights"
      is not a `Tensor`.
    ValueError: if "predictions" and "weights" have different numbers of
      columns (i.e. if the number of classes is inconsistent).
  """
  num_classes = helpers.get_num_columns_of_2d_tensor(
      predictions, name="multiclass predictions")
  weights_num_classes = helpers.get_num_columns_of_2d_tensor(
      weights, name="weights")
  if weights_num_classes != num_classes:
    raise ValueError(
        "weights must have the same number of columns as "
        "predictions ({} vs. {}): did you specify num_classes "
        "correctly when you created your context?".format(
            weights_num_classes, num_classes))
  dtype = predictions.dtype.base_dtype
  if not dtype.is_floating:
    raise TypeError("multiclass predictions must be floating-point")
  zero = tf.zeros(1, dtype=dtype)

  weights_rows = tf.shape(weights)[0]
  predictions_rows = tf.shape(predictions)[0]

  # We start out by finding a permutation for each row that will cause the
  # weights to be nondecreasing.
  weights_permutation = tf.argsort(weights, axis=1)
  # This won't work if predictions_rows isn't divisible by weights_rows
  # (tf.stack() below will fail), but we require weights to be broadcastable
  # to predictions (usually, weights_rows will either be 1, or equal to
  # predictions_rows).
  #
  # Floor division is required here: true division of the two int32 row
  # counts would yield a floating-point tensor, which tf.tile() does not
  # accept as `multiples`.
  predictions_permutation = tf.tile(
      weights_permutation, [predictions_rows // weights_rows, 1])

  # First we create a Tensor of shape [weights_rows, num_classes, 2], for
  # which:
  #   weights_indices[i, j, 0] = i
  #   weights_indices[i, j, 1] = weights_permutation[i, j]
  # Next, we use gather_nd to re-organize the weights such that:
  #   new_weights[i, j] = old_weights[i, weights_permutation[i, j]]
  weights_iota = tf.range(weights_rows)
  weights_iota = tf.expand_dims(weights_iota, axis=-1)
  weights_iota = tf.tile(weights_iota, [1, num_classes])
  weights_indices = tf.stack([weights_iota, weights_permutation], axis=2)
  weights = tf.gather_nd(tf.cast(weights, dtype=dtype), weights_indices)

  # Next we create a Tensor of shape [predictions_rows, num_classes, 2], for
  # which:
  #   predictions_indices[i, j, 0] = i
  #   predictions_indices[i, j, 1] = predictions_permutation[i, j]
  # Next, we use gather_nd to re-organize the predictions such that:
  #   new_predictions[i, j] =
  #       old_predictions[i, predictions_permutation[i, j]]
  predictions_iota = tf.range(predictions_rows)
  predictions_iota = tf.expand_dims(predictions_iota, axis=-1)
  predictions_iota = tf.tile(predictions_iota, [1, num_classes])
  predictions_indices = tf.stack(
      [predictions_iota, predictions_permutation], axis=2)
  predictions = tf.gather_nd(predictions, predictions_indices)

  # At this point, every row of weights and predictions has been sorted in
  # such a way that the weights are nondecreasing. We wish to calculate the
  # following:
  #   result[i] = weights[i, 0] + \sum_{j=0}^{num_classes - 2} (
  #     (weights[i, j+1] - weights[i, j]) * max_{l=j+1}^{num_classes-1}
  #     max{0, margin + predictions[i, l] - mean_{k=0}^j predictions[i, k]}
  #   )
  # Notice that the innermost max is a hinge.
  result = weights[:, 0]
  # `num_classes` is a static Python int, so the builtin range() suffices and
  # avoids depending on a Python-2-only `xrange`.
  for ii in range(num_classes - 1):
    scale = weights[:, ii + 1] - weights[:, ii]
    # The "included" predictions are those in the above max over l, and the
    # "excluded" predictions are those in the above mean over k.
    included = predictions[:, (ii + 1):num_classes]
    included = tf.reduce_max(included, axis=1)
    excluded = predictions[:, 0:(ii + 1)]
    excluded = tf.reduce_mean(excluded, axis=1)
    result += scale * tf.maximum(zero, self._margin + included - excluded)

  return result
def simpson(func, lower, upper, num_points=1001, dtype=None, name=None):
  """Evaluates definite integral using composite Simpson's 1/3 rule.

  Integrates `func` using composite Simpson's 1/3 rule [1].

  Evaluates function at points of evenly spaced grid of `num_points` points,
  then uses obtained values to interpolate `func` with quadratic polynomials
  and integrates these polynomials.

  ## References
  [1] Weisstein, Eric W. "Simpson's Rule." From MathWorld - A Wolfram Web
      Resource. http://mathworld.wolfram.com/SimpsonsRule.html

  ## Example
  ```python
    f = lambda x: x*x
    a = tf.constant(0.0)
    b = tf.constant(3.0)
    simpson(f, a, b, num_points=1001) # 9.0
  ```

  Args:
    func: Python callable representing a function to be integrated. It must be
      a callable of a single `Tensor` parameter and return a `Tensor` of the
      same shape and dtype as its input. It will be called with a `Tensor` of
      shape `lower.shape + [n]` (where n is integer number of points) and of
      the same `dtype` as `lower`.
    lower: `Tensor` or Python float representing the lower limits of
      integration. `func` will be integrated between each pair of points
      defined by `lower` and `upper`.
    upper: `Tensor` of the same shape and dtype as `lower` or Python float
      representing the upper limits of integration.
    num_points: Scalar int32 `Tensor`. Number of points at which function
      `func` will be evaluated. Must be odd and at least 3.
      Default value: 1001.
    dtype: Optional `tf.Dtype`. If supplied, the dtype for the `lower` and
      `upper`. Result will have the same dtype.
      Default value: None which maps to dtype of `lower`.
    name: Python str. The name to give to the ops created by this function.
      Default value: None which maps to 'integrate_simpson_composite'.

  Returns:
    `Tensor` of shape `func_batch_shape + limits_batch_shape`, containing
      value of the definite integral.
  """
  with tf.compat.v1.name_scope(
      name, default_name='integrate_simpson_composite',
      values=[lower, upper]):
    lower = tf.convert_to_tensor(lower, dtype=dtype, name='lower')
    dtype = lower.dtype
    upper = tf.convert_to_tensor(upper, dtype=dtype, name='upper')
    num_points = tf.convert_to_tensor(
        num_points, dtype=tf.int32, name='num_points')

    # Simpson's rule needs an odd number of grid points, at least 3.
    point_count_checks = [
        tf.debugging.assert_greater_equal(num_points, 3),
        tf.debugging.assert_equal(num_points % 2, 1),
    ]

    with tf.compat.v1.control_dependencies(point_count_checks):
      num_intervals = tf.cast(num_points, dtype=dtype) - 1
      dx = (upper - lower) / num_intervals
      # A trailing axis holds the grid so batched limits broadcast against it.
      point_ids = tf.cast(tf.range(num_points), dtype=dtype)
      grid = tf.expand_dims(lower, -1) + tf.expand_dims(dx, -1) * point_ids
      # Weights follow the pattern 1, 4, 2, 4, 2, ..., 4, 1.
      interior_pairs = (num_points - 3) // 2
      weights = tf.concat([
          tf.constant([1.0], dtype=dtype),
          tf.tile(tf.constant([4.0, 2.0], dtype=dtype), [interior_pairs]),
          tf.constant([4.0, 1.0], dtype=dtype),
      ], axis=0)

    values = func(grid)
    return tf.reduce_sum(values * weights, axis=-1) * dx / 3
def update_confusion_matrix_variables(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    top_k=None,
    class_id=None,
    sample_weight=None,
    multi_label=False,
    label_weights=None,
    thresholds_distributed_evenly=False,
):
  """Returns op to update the given confusion matrix variables.

  For every pair of values in y_true and y_pred:

  true_positive: y_true == True and y_pred > thresholds
  false_negatives: y_true == True and y_pred <= thresholds
  true_negatives: y_true == False and y_pred <= thresholds
  false_positive: y_true == False and y_pred > thresholds

  The results will be weighted and added together. When multiple thresholds
  are provided, we will repeat the same for every threshold.

  For estimation of these metrics over a stream of data, the function creates
  an `update_op` operation that updates the given variables.

  If `sample_weight` is `None`, weights default to 1.
  Use weights of 0 to mask values.

  Args:
    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
      and corresponding variables to update as values. If `None`, the function
      returns `None` without doing anything.
    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
    y_pred: A floating point `Tensor` of arbitrary shape and whose values are
      in the range `[0, 1]`.
    thresholds: A float value, float tensor, python list, or tuple of float
      thresholds in `[0, 1]`, or NEG_INF (used when top_k is set).
    top_k: Optional int, indicates that the positive labels should be limited
      to the top k predictions.
    class_id: Optional int, limits the prediction and labels to the class
      specified by this argument.
    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank
      as `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions
      must be either `1`, or the same as the corresponding `y_true`
      dimension).
    multi_label: Optional boolean indicating whether multidimensional
      prediction/labels should be treated as multilabel responses, or
      flattened into a single label. When True, the values of
      `variables_to_update` must have a second dimension equal to the number
      of labels in y_true and y_pred, and those tensors must not be
      RaggedTensors.
    label_weights: (optional) tensor of non-negative weights for multilabel
      data. The weights are applied when calculating TP, FP, FN, and TN
      without explicit multilabel handling (i.e. when the data is to be
      flattened). Must be `None` when `multi_label` is True.
    thresholds_distributed_evenly: Boolean, whether the thresholds are evenly
      distributed within the list. An optimized method will be used if this is
      the case. See _update_confusion_matrix_variables_optimized() for more
      details.

  Returns:
    Update op.

  Raises:
    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or
      if `variables_to_update` contains invalid keys, or if `label_weights`
      is given together with `multi_label=True`.
  """
  if multi_label and label_weights is not None:
    raise ValueError(
        "`label_weights` for multilabel data should be handled "
        "outside of `update_confusion_matrix_variables` when "
        "`multi_label` is True.")
  if variables_to_update is None:
    return
  # At least one of the supplied keys must name a confusion matrix entry.
  if not any(key for key in variables_to_update
             if key in list(ConfusionMatrix)):
    raise ValueError(
        "Please provide at least one valid confusion matrix "
        "variable to update. Valid variable key options are: "
        f'"{list(ConfusionMatrix)}". Received: "{variables_to_update.keys()}"'
    )

  # All computation is done in the dtype of the first variable.
  variable_dtype = list(variables_to_update.values())[0].dtype

  y_true = tf.cast(y_true, dtype=variable_dtype)
  y_pred = tf.cast(y_pred, dtype=variable_dtype)

  if thresholds_distributed_evenly:
    # Check whether the thresholds has any leading or tailing epsilon added
    # for floating point imprecision. The leading and tailing threshold will be
    # handled bit differently as the corner case.
    # At this point, thresholds should be a list/array with more than 2 items,
    # and ranged between [0, 1]. See is_evenly_distributed_thresholds() for more
    # details.
    thresholds_with_epsilon = thresholds[0] < 0.0 or thresholds[-1] > 1.0

  thresholds = tf.convert_to_tensor(thresholds, dtype=variable_dtype)
  num_thresholds = thresholds.shape.as_list()[0]

  if multi_label:
    # True when `thresholds` is rank 1, i.e. one set of thresholds is shared
    # by every label.
    one_thresh = tf.equal(
        tf.cast(1, dtype=tf.int32),
        tf.rank(thresholds),
        name="one_set_of_thresholds_cond",
    )
  else:
    [y_pred, y_true
    ], _ = ragged_assert_compatible_and_get_flat_values([y_pred, y_true],
                                                        sample_weight)
    one_thresh = tf.cast(True, dtype=tf.bool)

  invalid_keys = [
      key for key in variables_to_update if key not in list(ConfusionMatrix)
  ]
  if invalid_keys:
    raise ValueError(
        f'Invalid keys: "{invalid_keys}". '
        f'Valid variable key options are: "{list(ConfusionMatrix)}"')

  if sample_weight is None:
    y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(
        y_pred, y_true)
  else:
    sample_weight = tf.cast(sample_weight, dtype=variable_dtype)
    (
        y_pred,
        y_true,
        sample_weight,
    ) = losses_utils.squeeze_or_expand_dimensions(
        y_pred, y_true, sample_weight=sample_weight)
  y_pred.shape.assert_is_compatible_with(y_true.shape)

  if top_k is not None:
    y_pred = _filter_top_k(y_pred, top_k)
  if class_id is not None:
    # Keep only the requested class along the last axis.
    y_true = y_true[..., class_id]
    y_pred = y_pred[..., class_id]

  if thresholds_distributed_evenly:
    # Delegate to the optimized implementation; nothing below runs.
    return _update_confusion_matrix_variables_optimized(
        variables_to_update,
        y_true,
        y_pred,
        thresholds,
        multi_label=multi_label,
        sample_weights=sample_weight,
        label_weights=label_weights,
        thresholds_with_epsilon=thresholds_with_epsilon,
    )

  pred_shape = tf.shape(y_pred)
  num_predictions = pred_shape[0]
  if y_pred.shape.ndims == 1:
    num_labels = 1
  else:
    num_labels = tf.math.reduce_prod(pred_shape[1:], axis=0)
  # With a single set of thresholds they are tiled across all labels;
  # otherwise no tiling along the label axis is applied.
  thresh_label_tile = tf.where(one_thresh, num_labels,
                               tf.ones([], dtype=tf.int32))

  # Reshape predictions and labels, adding a dim for thresholding.
  if multi_label:
    predictions_extra_dim = tf.expand_dims(y_pred, 0)
    labels_extra_dim = tf.expand_dims(tf.cast(y_true, dtype=tf.bool), 0)
  else:
    # Flatten predictions and labels when not multilabel.
    predictions_extra_dim = tf.reshape(y_pred, [1, -1])
    labels_extra_dim = tf.reshape(tf.cast(y_true, dtype=tf.bool), [1, -1])

  # Tile the thresholds for every prediction.
  if multi_label:
    thresh_pretile_shape = [num_thresholds, 1, -1]
    thresh_tiles = [1, num_predictions, thresh_label_tile]
    data_tiles = [num_thresholds, 1, 1]
  else:
    thresh_pretile_shape = [num_thresholds, -1]
    thresh_tiles = [1, num_predictions * num_labels]
    data_tiles = [num_thresholds, 1]

  thresh_tiled = tf.tile(tf.reshape(thresholds, thresh_pretile_shape),
                         tf.stack(thresh_tiles))

  # Tile the predictions for every threshold.
  preds_tiled = tf.tile(predictions_extra_dim, data_tiles)

  # Compare predictions and threshold.
  pred_is_pos = tf.greater(preds_tiled, thresh_tiled)

  # Tile labels by number of thresholds
  label_is_pos = tf.tile(labels_extra_dim, data_tiles)

  if sample_weight is not None:
    sample_weight = tf.__internal__.ops.broadcast_weights(
        tf.cast(sample_weight, dtype=variable_dtype), y_pred)
    weights_tiled = tf.tile(tf.reshape(sample_weight, thresh_tiles),
                            data_tiles)
  else:
    weights_tiled = None

  if label_weights is not None and not multi_label:
    label_weights = tf.expand_dims(label_weights, 0)
    label_weights = tf.__internal__.ops.broadcast_weights(
        label_weights, y_pred)
    label_weights_tiled = tf.tile(tf.reshape(label_weights, thresh_tiles),
                                  data_tiles)
    # Combine with the sample weights multiplicatively when both are given.
    if weights_tiled is None:
      weights_tiled = label_weights_tiled
    else:
      weights_tiled = tf.multiply(weights_tiled, label_weights_tiled)

  update_ops = []

  def weighted_assign_add(label, pred, weights, var):
    # Adds the (optionally weighted) count of positions where both `label`
    # and `pred` are true, summed over axis 1, to `var`.
    label_and_pred = tf.cast(tf.logical_and(label, pred), dtype=var.dtype)
    if weights is not None:
      label_and_pred *= tf.cast(weights, dtype=var.dtype)
    return var.assign_add(tf.reduce_sum(label_and_pred, 1))

  loop_vars = {
      ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
  }
  update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
  update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
  update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update

  # The negated tensors are only built when an entry that needs them was
  # requested.
  if update_fn or update_tn:
    pred_is_neg = tf.logical_not(pred_is_pos)
    loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)

  if update_fp or update_tn:
    label_is_neg = tf.logical_not(label_is_pos)
    loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
    if update_tn:
      loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (
          label_is_neg,
          pred_is_neg,
      )

  # Only entries actually present in `variables_to_update` get an update op.
  for matrix_cond, (label, pred) in loop_vars.items():

    if matrix_cond in variables_to_update:
      update_ops.append(
          weighted_assign_add(label, pred, weights_tiled,
                              variables_to_update[matrix_cond]))

  return tf.group(update_ops)
def step_fn(inputs):
  """Per-replica evaluation step.

  Tiles the input images once per ensemble member, runs the model on them
  together with the lambdas, and accumulates the test metrics. For the
  'clean' dataset the per-member, Gibbs, ensemble and diversity entries of
  `metrics` are updated; for any other dataset name only the ensemble
  entries of `corrupt_metrics` are updated.

  Args:
    inputs: dict holding the images under 'features' and the labels under
      'labels'.
  """
  is_clean = dataset_name == 'clean'
  labels = inputs['labels']
  # Only the images are tiled across the ensemble; labels are left untiled.
  images = tf.tile(inputs['features'], [ensemble_size, 1, 1, 1])

  # Lambdas: the samples, preceded by the log-uniform mean when
  # num_eval_samples >= 0.
  lambda_samples = log_uniform_sample(n_samples, lambda_parameters)
  if num_eval_samples >= 0:
    lambda_mean = tf.expand_dims(log_uniform_mean(lambda_parameters), 1)
    lambdas = tf.concat((lambda_mean, lambda_samples), 1)
  else:
    lambdas = lambda_samples

  # lambdas has shape (ens size, samples, dim of lambdas); repeat every entry
  # once per example and flatten the two leading axes.
  flat_lambdas = tf.reshape(
      tf.repeat(lambdas, per_core_batch_size, axis=1),
      (ensemble_size * per_core_batch_size, -1))

  # Evaluate on the test set.
  logits = model([images, flat_lambdas], training=False)
  all_probs = tf.nn.softmax(logits)
  per_probs = tf.split(all_probs, num_or_size_splits=ensemble_size, axis=0)

  # Per-member performance and Gibbs performance (average per-member perf).
  if is_clean:
    for i in range(FLAGS.ensemble_size):
      # Record the first sample of lambdas per batch-ensemble member.
      member_probs = per_probs[i * (ensemble_size // FLAGS.ensemble_size)]
      member_nll = tf.keras.losses.sparse_categorical_crossentropy(
          labels, member_probs)
      metrics['test/nll_member_{}'.format(i)].update_state(member_nll)
      metrics['test/accuracy_member_{}'.format(i)].update_state(
          labels, member_probs)

    tiled_labels = tf.tile(labels, [ensemble_size])
    gibbs_nll = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            tiled_labels, logits, from_logits=True))
    metrics['test/gibbs_nll'].update_state(gibbs_nll)
    metrics['test/gibbs_accuracy'].update_state(tiled_labels, all_probs)

  # Ensemble performance.
  negative_log_likelihood = ensemble_crossentropy(
      labels, logits, ensemble_size)
  ensemble_probs = tf.reduce_mean(per_probs, axis=0)

  if is_clean:
    metrics['test/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['test/accuracy'].update_state(labels, ensemble_probs)
    metrics['test/ece'].add_batch(ensemble_probs, label=labels)

    # Diversity between the members' predictive distributions.
    stacked_probs = tf.stack(per_probs, axis=0)
    diversity = rm.metrics.AveragePairwiseDiversity()
    diversity.add_batch(stacked_probs, num_models=ensemble_size)
    for key, value in diversity.result().items():
      metrics['test/' + key].update_state(value)
  else:
    corrupt_metrics['test/nll_{}'.format(dataset_name)].update_state(
        negative_log_likelihood)
    corrupt_metrics['test/accuracy_{}'.format(dataset_name)].update_state(
        labels, ensemble_probs)
    corrupt_metrics['test/ece_{}'.format(dataset_name)].add_batch(
        ensemble_probs, label=labels)
def _replicate(n, tensor):
  """Replicate the input tensor n times along a new (major) dimension.

  Args:
    n: scalar integer (Python int or integer `Tensor`) number of copies.
    tensor: the value to replicate.

  Returns:
    `tensor` with a new leading axis of size `n`.
  """
  # TODO(axch) Does this already exist somewhere?  Should it get contributed?
  n = tf.convert_to_tensor(n, name='n')
  # Build the trailing ones from the dynamic rank rather than from
  # `tensor.shape`: a partially-unknown static shape cannot be converted to a
  # Tensor. Using the dtype of `n` keeps the concat valid for int64 counts.
  trailing_ones = tf.ones([tf.rank(tensor)], dtype=n.dtype)
  multiples = tf.concat([[n], trailing_ones], axis=0)
  return tf.tile(tf.expand_dims(tensor, axis=0), multiples)
def get_true_shapes(input_tensor):
  """Returns the non-batch shape of `input_tensor`, once per batch entry.

  Args:
    input_tensor: tensor whose first dimension is the batch dimension.

  Returns:
    An integer tensor with one row per batch entry; every row holds the
    dynamic shape of `input_tensor` without its leading (batch) dimension.
  """
  dynamic_shape = tf.shape(input_tensor)
  batch_size, per_image_shape = dynamic_shape[0], dynamic_shape[1:]
  # Add a leading axis to the per-image shape and repeat it batch_size times.
  return tf.tile(tf.expand_dims(per_image_shape, axis=0), [batch_size, 1])
def __init__(self,
             num_timesteps,
             period,
             frequency_multipliers,
             drift_scale,
             initial_state_prior,
             observation_noise_scale=0.,
             initial_step=0,
             validate_args=False,
             allow_nan_stats=True,
             name=None):
  """Construct a smooth seasonal state space model.

  Args:
    num_timesteps: Scalar `int` `Tensor`; how many timesteps this
      distribution models.
    period: Positive scalar `float` `Tensor`; the number of timesteps the
      longest cyclic effect needs in order to repeat.
    frequency_multipliers: One-dimensional `float` `Tensor` of the
      frequencies (cyclic components) in the model, expressed as multipliers
      of the base/fundamental frequency `2. * pi / period`. A component is
      identified by how often it repeats per period and contributes two
      latent dimensions. `frequency_multipliers = [1, 2, ...,
      floor(period / 2)]` yields a smooth seasonal model able to represent
      any periodic function; dropping some of the higher frequencies is
      often desirable to enforce smoothness (and to lower the computational
      cost).
    drift_scale: Scalar `float` `Tensor` (extra dimensions are treated as
      batch dimensions); standard deviation of the latent state transitions.
    initial_state_prior: `tfd.MultivariateNormal` instance giving the prior
      over latent states. Its event shape must be `[num_features]`.
    observation_noise_scale: Scalar `float` `Tensor` (extra dimensions are
      treated as batch dimensions); standard deviation of the observation
      noise.
      Default value: `0.`.
    initial_step: Scalar `int` `Tensor`; the starting timestep.
      Default value: `0`.
    validate_args: Python `bool`; whether inputs are validated with asserts.
      When `False`, correct behavior is not guaranteed for invalid inputs.
      Default value: `False`.
    allow_nan_stats: Python `bool`. When `False`, an exception is raised if
      a statistic (e.g. mean/mode/etc...) is undefined for any batch member;
      when `True`, batch members whose valid parameters lead to undefined
      statistics return NaN for that statistic.
      Default value: `True`.
    name: Python `str` prefixed to the names of ops created by this class.
      Default value: 'SmoothSeasonalStateSpaceModel'.
  """
  with tf.name_scope(name or 'SmoothSeasonalStateSpaceModel') as name:
    # observation_noise_scale does not take part in choosing the common
    # dtype; it is converted to that dtype below.
    dtype = dtype_util.common_dtype(
        [period, frequency_multipliers, drift_scale, initial_state_prior])

    def _to_tensor(value, tensor_name):
      # Convert a parameter to a Tensor of the common dtype.
      return tf.convert_to_tensor(value=value, name=tensor_name, dtype=dtype)

    period = _to_tensor(period, 'period')
    frequency_multipliers = _to_tensor(
        frequency_multipliers, 'frequency_multipliers')
    drift_scale = _to_tensor(drift_scale, 'drift_scale')
    observation_noise_scale = _to_tensor(
        observation_noise_scale, 'observation_noise_scale')

    num_frequencies = static_num_frequencies(frequency_multipliers)

    # The pattern [1, 0] repeated once per frequency: the first of the two
    # latent dimensions of every frequency is observed.
    observation_matrix = tf.tile(
        tf.constant([[1., 0.]], dtype=dtype),
        multiples=[1, num_frequencies])

    transition_matrix = build_smooth_seasonal_transition_matrix(
        period=period,
        frequency_multipliers=frequency_multipliers,
        dtype=dtype)

    self._drift_scale = drift_scale
    self._observation_noise_scale = observation_noise_scale
    self._period = period
    self._frequency_multipliers = frequency_multipliers

    # The same drift scale is applied to all 2 * num_frequencies latent
    # dimensions.
    latent_scale_diag = (
        drift_scale[..., tf.newaxis] *
        tf.ones([2 * num_frequencies], dtype=dtype))
    transition_noise = tfd.MultivariateNormalDiag(
        scale_diag=latent_scale_diag, name='transition_noise')
    observation_noise = tfd.MultivariateNormalDiag(
        scale_diag=observation_noise_scale[..., tf.newaxis],
        name='observation_noise')

    super(SmoothSeasonalStateSpaceModel, self).__init__(
        num_timesteps=num_timesteps,
        transition_matrix=transition_matrix,
        transition_noise=transition_noise,
        observation_matrix=observation_matrix,
        observation_noise=observation_noise,
        initial_state_prior=initial_state_prior,
        initial_step=initial_step,
        allow_nan_stats=allow_nan_stats,
        validate_args=validate_args,
        name=name)
def soft_multivariate_quantiles(x, quantiles, quantile_width=None, **kwargs):
  """Computes soft multivariate quantiles via optimal transport.

  The (centered) multivariate values in x are transported onto 2^d + 1
  weighted target points: the origin plus the hypercube vertices {-1, 1}^d.
  Target weights are adjusted so that those values in x that are transported
  to the origin correspond to those concentrating around the quantile of
  interest.

  Args:
    x: Tensor<float> of shape [batch, N, d]. The shape must be statically
      known: it is used to build Python-level constants and reshape targets.
    quantiles: Tensor<float> of shape [r, d], r targeted quantiles of
      dimension d.
    quantile_width: (float) mass given to the bucket supposed to attract
      points whose value concentrate around the desired quantile value.
      Bigger width means that we allow the soft quantile to be a mixture of
      more points further away from the quantile. If None, the width is set
      to 2 / N where N is the number of values considered (the size of the
      second dimension of x).
    **kwargs: see sinkhorn.autodiff_sinkhorn for possible extra parameters.

  Returns:
    A Tensor<float> of shape [batch, r, d] of multivariate quantiles per
    batch.
  """
  quantiles = tf.constant(quantiles, tf.float32)
  batch_size = x.shape[0]
  # Keep the integer point count for building shapes; the float32 copy `n`
  # is only used in arithmetic.
  num_points = x.shape[1]
  n = tf.cast(num_points, tf.float32)
  d = x.shape[2]
  if quantile_width is None:
    quantile_width = 2 / n
  num_quantiles = tf.shape(quantiles)[0]
  hypercube_vertices = tf.constant(
      list(itertools.product([-1, 1], repeat=d)), tf.float32)

  # Weights attached to the vertices for each quantile, num_quantiles x 2^d:
  # every coordinate contributes q where the vertex coordinate is -1 and
  # (1 - q) where it is +1.
  weights = quantiles[:, tf.newaxis, :]**(
      0.5 * (1 - hypercube_vertices))[tf.newaxis, Ellipsis]
  weights *= (1 - quantiles)[:, tf.newaxis, :]**(
      0.5 * (1 + hypercube_vertices))[tf.newaxis, Ellipsis]
  weights = (1 - quantile_width) * tf.reduce_prod(weights, axis=2)

  # Add the weight of the quantile itself (in position 0).
  weights = tf.concat(
      (quantile_width * tf.ones((num_quantiles, 1)), weights), axis=1)

  # Augment and format as batch_size x (2^d + 1) x num_quantiles.
  weights = tf.reshape(
      tf.tile(tf.transpose(weights), [batch_size, 1]),
      [batch_size, 2**d + 1, num_quantiles])

  # Target locations: the point at 0 that will absorb the quantile, followed
  # by the hypercube vertices, replicated for every batch entry.
  y = tf.concat(
      (tf.zeros((1, d), dtype=tf.float32), hypercube_vertices), axis=0)
  y = tf.reshape(tf.tile(y, [batch_size, 1]), [batch_size, 2**d + 1, d])

  # Center x.
  x_mean = tf.reduce_mean(x, axis=1)
  x = x - x_mean[:, tf.newaxis, :]

  # Uniform source weights, 1 / N for every point.
  source_weights = tf.ones(
      [batch_size, num_points, num_quantiles], dtype=tf.float32) / n
  transports = sinkhorn.autodiff_sinkhorn(
      x, y, source_weights, weights, **kwargs)

  # Recover the convex combinations resulting from transporting to the
  # central point, in all batches and quantile variations.
  transports = 1 / quantile_width * tf.reshape(
      transports[:, :, 0, :], [batch_size, num_points, -1])

  # Apply these convex combinations to the data points and recenter.
  all_soft_quantiles = tf.reduce_sum(
      transports[:, :, :, tf.newaxis] * x[:, :, tf.newaxis, :],
      axis=1) + x_mean[:, tf.newaxis, :]

  # Reshape the quantiles after having applied the convex combinations.
  return tf.reshape(all_soft_quantiles, [batch_size, num_quantiles, d])
def train_step(inputs):
  """Per-replica training step for efficientnet learning.

  Tiles the batch once per ensemble member, computes the label-smoothed
  cross-entropy plus an L2 penalty, applies the gradients (optionally with a
  separate learning-rate multiplier for the fast weights) and updates the
  training metrics.

  Args:
    inputs: tuple `(images, labels)`. The labels are consumed by
      `categorical_crossentropy` and `argmax`, i.e. they are expected to be
      one-hot.

  Returns:
    A dict with the per-replica scaled 'loss/negative_log_likelihood' and
    'loss/total_loss'.
  """
  images, labels = inputs
  # Every ensemble member sees its own copy of the batch.
  images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
  labels = tf.tile(labels, [FLAGS.ensemble_size, 1])

  num_replicas = tf.cast(strategy.num_replicas_in_sync, tf.float32)
  l2_coeff = tf.cast(FLAGS.l2, tf.float32)

  with tf.GradientTape() as tape:
    logits = tf.cast(model(images, training=True), tf.float32)
    per_example_nll = tf.keras.losses.categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
        label_smoothing=FLAGS.label_smoothing)
    negative_log_likelihood = tf.reduce_mean(per_example_nll)

    # Apply l2 on the slow weights and bias terms. This excludes BN
    # parameters and fast weight approximate posterior/prior parameters,
    # but pay caution to their naming scheme.
    regularized_weights = [
        tf.reshape(var, (-1,))
        for var in model.trainable_variables
        if 'kernel' in var.name or 'bias' in var.name
    ]
    l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
        tf.concat(regularized_weights, axis=0))
    # NOTE(review): l2_loss already contains a factor FLAGS.l2 and is
    # multiplied by l2_coeff (= FLAGS.l2) again here, so the effective
    # coefficient is FLAGS.l2 ** 2. Confirm this is intended.
    loss = negative_log_likelihood + l2_coeff * l2_loss
    scaled_loss = loss / num_replicas

  grads = tape.gradient(scaled_loss, model.trainable_weights)

  if FLAGS.fast_weight_lr_multiplier == 1.0:
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
  else:
    # Separate learning rate implementation: apply a different learning rate
    # on the fast weights. This excludes BN and slow weights, but pay caution
    # to the naming scheme.
    def _is_fast_weight(var):
      return not ('batch_norm' in var.name or 'kernel' in var.name)

    grads_and_vars = [
        (grad * FLAGS.fast_weight_lr_multiplier
         if _is_fast_weight(var) else grad, var)
        for grad, var in zip(grads, model.trainable_variables)
    ]
    optimizer.apply_gradients(grads_and_vars)

  # The ECE metric is fed class indices (as float32) instead of one-hot
  # labels.
  sparse_labels = tf.cast(
      tf.math.argmax(labels, axis=-1, output_type=tf.int32), tf.float32)
  probs = tf.nn.softmax(logits)

  metrics['train/loss'].update_state(loss)
  metrics['train/negative_log_likelihood'].update_state(
      negative_log_likelihood)
  metrics['train/accuracy'].update_state(labels, logits)
  metrics['train/ece'].update_state(sparse_labels, probs)

  return {
      'loss/negative_log_likelihood': negative_log_likelihood / num_replicas,
      'loss/total_loss': scaled_loss,
  }
def _replicate(n, tensor):
  """Replicate the input tensor n times along a new (major) dimension.

  Args:
    n: scalar integer (Python int or integer `Tensor`) number of copies.
    tensor: the value to replicate.

  Returns:
    `tensor` with a new leading axis of size `n`.
  """
  # TODO(axch) Does this already exist somewhere?  Should it get contributed?
  # Convert first so that a plain Python int (which has no `.dtype`) is
  # accepted as well as a Tensor.
  n = tf.convert_to_tensor(n, name='n')
  # One multiple per existing axis, in the dtype of `n` so the concat is
  # valid.
  trailing_ones = tf.ones([tf.rank(tensor)], dtype=n.dtype)
  multiples = tf.concat([[n], trailing_ones], axis=0)
  return tf.tile(tensor[tf.newaxis], multiples)
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
                              min_mask_level, mask_crop_size):
  """Crop the FPN features at the appropriate levels for each detection.

  The features are flattened to a 2-D [positions, channels] tensor and the
  crop is taken with a single gather over flat position indices.

  Args:
    features: a float tensor of shape [batch_size, num_levels,
      max_feature_size, max_feature_size, num_downsample_channels].
    level_boxes: a float Tensor of the level boxes to crop from, of shape
      [batch_size, num_instances, 4]. It is cast to int32; entry 0 of the
      last axis is used as the starting y index and entry 1 as the starting
      x index.
    detection_prior_levels: an int Tensor of instance assigned level of shape
      [batch_size, num_instances].
    min_mask_level: minimum FPN level to crop mask feature from.
    mask_crop_size: an int of mask crop size.

  Returns:
    crop_features: a float Tensor of shape [batch_size * num_instances,
      mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
      instance feature crop.
  """
  (batch_size, num_levels, max_feature_size, _,
   num_downsample_channels) = features.get_shape().as_list()
  _, num_of_instances, _ = level_boxes.get_shape().as_list()
  level_boxes = tf.cast(level_boxes, tf.int32)
  assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]

  y_starts = level_boxes[:, :, 0]
  x_starts = level_boxes[:, :, 1]

  # Generate the full indices (not just the starting index): one entry per
  # crop position, shape [batch_size, num_instances, mask_crop_size].
  x_indices = tf.stack(
      [x_starts + step for step in range(mask_crop_size)], axis=2)
  y_indices = tf.stack(
      [y_starts + step for step in range(mask_crop_size)], axis=2)

  # Strides of the (batch, level, y, x) axes in the flattened feature tensor.
  row_stride = max_feature_size
  level_stride = max_feature_size * row_stride
  batch_stride = num_levels * level_stride

  # TODO(weicheng) change this to gather_nd for better readability.
  # Each term is shaped so that their sum broadcasts to
  # [batch_size, num_instances, mask_crop_size, mask_crop_size].
  batch_offsets = tf.reshape(
      tf.range(batch_size) * batch_stride, [batch_size, 1, 1, 1])
  level_offsets = tf.reshape(
      (detection_prior_levels - min_mask_level) * level_stride,
      [batch_size, num_of_instances, 1, 1])
  row_offsets = tf.reshape(
      y_indices * row_stride,
      [batch_size, num_of_instances, mask_crop_size, 1])
  col_offsets = tf.reshape(
      x_indices, [batch_size, num_of_instances, 1, mask_crop_size])
  flat_indices = tf.reshape(
      batch_offsets + level_offsets + row_offsets + col_offsets, [-1])

  flat_features = tf.reshape(features, [-1, num_downsample_channels])
  return tf.reshape(
      tf.gather(flat_features, flat_indices),
      [
          batch_size * num_of_instances, mask_crop_size, mask_crop_size,
          num_downsample_channels
      ])