  def train(self, sentences):
    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
    tokens_sparse = tf.sparse.SparseTensor(
        indices=token_ids, values=token_values, dense_shape=token_dense_shape)
    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")

    sparse_lookup_ids = tf.sparse.SparseTensor(
        indices=tokens_sparse.indices,
        values=self._words_to_indices(tokens_sparse.values),
        dense_shape=tokens_sparse.dense_shape)
    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)

    # Targets are the next word for each word of the sentence.
    tokens_ids_seq = lookup_ids[:, 0:-1]
    tokens_ids_target = lookup_ids[:, 1:]

    tokens_prefix = tokens[:, 0:-1]

    # Mask determining which positions we care about for a loss: all positions
    # that have a valid non-terminal token.
    mask = tf.logical_and(
        tf.logical_not(tf.equal(tokens_prefix, "")),
        tf.logical_not(tf.equal(tokens_prefix, "<E>")))

    input_mask = tf.cast(mask, tf.int32)

    with tf.GradientTape() as t:
      sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
                                                   tokens_ids_seq)

      lstm_initial_state = self._lstm_cell.get_initial_state(
          sentence_embeddings)

      lstm_output = self._rnn_layer(
          inputs=sentence_embeddings, initial_state=lstm_initial_state)

      # Flatten the LSTM outputs from [batch, time, units] into a single
      # [batch * time, units] matrix so they can be fed through the logit layer.
      lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])

      logits = self._logit_layer(lstm_output)

      targets = tf.reshape(tokens_ids_target, [-1])
      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=logits)

      # Final loss is the weighted mean of the per-token losses; masked positions
      # get zero weight.
      final_loss = tf.math.divide(
          tf.reduce_sum(tf.multiply(losses, weights)),
          tf.reduce_sum(weights),
          name="final_loss")

    watched = t.watched_variables()
    gradients = t.gradient(final_loss, watched)

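    # Apply a plain gradient-descent update (implicit learning rate of 1.0).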
    for w, g in zip(watched, gradients):
      w.assign_sub(g)

    return final_loss

  def decode_greedy(self, sequence_length, first_word):
    initial_state = self._lstm_cell.get_initial_state(
        dtype=tf.float32, batch_size=1)

    sequence = [first_word]
    current_word = first_word
    current_id = tf.expand_dims(self._words_to_indices(current_word), 0)
    current_state = initial_state

    for _ in range(sequence_length):
      token_embeddings = tf.nn.embedding_lookup(self._embeddings, current_id)
      lstm_outputs, current_state = self._lstm_cell(token_embeddings,
                                                    current_state)
      lstm_outputs = tf.reshape(lstm_outputs, [-1, self._lstm_cell.output_size])
      logits = self._logit_layer(lstm_outputs)
      softmax = tf.nn.softmax(logits)

      next_ids = tf.math.argmax(softmax, axis=1)
      next_words = self._indices_to_words(next_ids)[0]

      current_id = next_ids
      current_word = next_words
      sequence.append(current_word)

    return sequence

  def _tokenize(self, sentences):
    # Perform a minimalistic text preprocessing by removing punctuation and
    # splitting on spaces.
    normalized_sentences = tf.strings.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    normalized_sentences = tf.reshape(normalized_sentences, [-1])
    sparse_tokens = tf.string_split(normalized_sentences, " ")

    # Deal with a corner case: there is one empty sentence.
    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
    # Deal with a corner case: all sentences are empty.
    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
    sparse_token_ids = self._table.lookup(sparse_tokens.values)

    return (sparse_tokens.indices, sparse_token_ids, sparse_tokens.dense_shape)
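
# A self-contained sketch (not part of the model above) of the masked mean
# cross-entropy that train() computes, on toy data: a vocabulary of 5 tokens and
# two positions, the second of which is masked out as padding.
import tensorflow as tf

example_logits = tf.random.normal([2, 5])
example_targets = tf.constant([3, 0])
example_weights = tf.constant([1.0, 0.0])
example_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=example_targets, logits=example_logits)
example_loss = (tf.reduce_sum(example_losses * example_weights) /
                tf.reduce_sum(example_weights))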
Example #4
def softquantiles(x,
                  quantiles,
                  quantile_width=None,
                  axis=-1,
                  may_squeeze=True,
                  **kwargs):
  """Computes soft quantiles via optimal transport.

  This operator takes advantage of the fact that an exhaustive softsort is not
  required to recover a single quantile. Instead, one can transport all
  input values in x onto only 3 weighted values. Target weights are adjusted so
  that those values in x that are transported to the middle value in the target
  vector y correspond to those concentrating around the quantile of interest.

  This idea generalizes to more quantiles, interleaving small weights on the
  quantile indices and bigger weights in between, corresponding to the gap from
  one desired quantile to the next one.

  Args:
   x: Tensor<float> of any shape.
   quantiles: list<float> the quantiles to be returned. It can also be a single
     float.
   quantile_width: (float) mass given to the bucket supposed to attract points
     whose value concentrate around the desired quantile value. Bigger width
     means that we allow the soft quantile to be a mixture of more points
     further away from the quantile. If None, the width is set at 1/n where n is
     the number of values considered (the size along the 'axis').
   axis: (int) the axis along which to compute the quantile.
   may_squeeze: (bool) should we squeeze the output tensor in case of a single
     quantile.
   **kwargs: see SoftQuantilizer for possible extra parameters.

  Returns:
    A Tensor<float> similar to the input tensor, but the axis dimension is
    replaced by the number of quantiles specified in the quantiles list.
    Hence, if only one quantile is requested (quantiles is a float), only one
    value is returned along that axis. When several quantiles are requested, the
    tensor will have that many values in that axis.

  Raises:
    tf.errors.InvalidArgumentError when the quantiles and quantile width are not
    correct, namely quantiles are either not in sorted order or the
    quantile_width is too large.
  """
  if isinstance(quantiles, float):
    quantiles = [quantiles]
  quantiles = tf.constant(quantiles, tf.float32)

  # Preprocesses submitted quantiles to check that they satisfy elementary
  # constraints.
  valid_quantiles = tf.boolean_mask(
      quantiles, tf.logical_and(quantiles > 0.0, quantiles < 1.0))
  num_quantiles = tf.shape(valid_quantiles)[0]

  # Includes values on both ends of [0,1].
  extended_quantiles = tf.concat([[0.0], valid_quantiles, [1.0]], axis=0)

  # Builds filler_weights in between the target quantiles.
  filler_weights = extended_quantiles[1:] - extended_quantiles[:-1]
  if quantile_width is None:
    quantile_width = tf.reduce_min(
        tf.concat(
            [filler_weights, [1.0 / tf.cast(tf.shape(x)[axis], dtype=x.dtype)]],
            axis=0))

  # Takes into account quantile_width in the definition of weights
  shift = -tf.ones(tf.shape(filler_weights), dtype=x.dtype)
  shift = shift + 0.5 * (
      tf.one_hot(0, num_quantiles + 1) +
      tf.one_hot(num_quantiles, num_quantiles + 1))
  filler_weights = filler_weights + quantile_width * shift

  assert_op = tf.Assert(tf.reduce_all(filler_weights >= 0.0), [filler_weights])
  with tf.control_dependencies([assert_op]):
    # Adds one more value to have tensors of the same shape to interleave them.
    quantile_weights = tf.ones(num_quantiles + 1) * quantile_width

    # Interleaves the filler_weights with the quantile weights.
    weights = tf.reshape(
        tf.stack([filler_weights, quantile_weights], axis=1), (-1,))[:-1]

    # Sends only the positive weights to the softsort operator.
    positive_weights = tf.boolean_mask(weights, weights > 0.0)
    all_quantiles = softsort(
        x,
        direction='ASCENDING',
        axis=axis,
        target_weights=positive_weights,
        **kwargs)

    # Recovers the indices corresponding to the desired quantiles.
    odds = tf.math.floormod(tf.range(weights.shape[0], dtype=tf.float32), 2)
    positives = tf.cast(weights > 0.0, tf.float32)
    indices = tf.cast(tf.math.cumsum(positives) * odds, dtype=tf.int32)
    indices = tf.boolean_mask(indices, indices > 0) - 1
    result = tf.gather(all_quantiles, indices, axis=axis)

    # In the specific case where we want a single quantile, squeezes the
    # quantile dimension.
    can_squeeze = tf.equal(tf.shape(result)[axis], 1)
    if tf.math.logical_and(can_squeeze, may_squeeze):
      result = tf.squeeze(result, axis=axis)
    return result
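
# A self-contained check (illustrative only) of the target-weight construction
# above for a single quantile q = 0.5 over n = 10 values with the default
# quantile_width = 1/n = 0.1: the interleaved weights come out as
# [q - width/2, width, 1 - q - width/2] and sum to one.
import tensorflow as tf

q, width = 0.5, 0.1
filler_weights = tf.constant([q, 1.0 - q]) - 0.5 * width  # both end fillers shrink
quantile_weights = tf.ones(2) * width
target_weights = tf.reshape(
    tf.stack([filler_weights, quantile_weights], axis=1), (-1,))[:-1]
# target_weights ~= [0.45, 0.1, 0.45]; these are what softsort receives above.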
Example #5
  def _build_tables(self, prior):
    """Computes integer-valued probability tables used by the range coder.

    These tables must not be re-generated independently on the sending and
    receiving side, since small numerical discrepancies between both sides can
    occur in this process. If the tables differ slightly, this in turn would
    very likely cause catastrophic error propagation during range decoding. For
    a more in-depth discussion of this, see:

    > "Integer Networks for Data Compression with Latent-Variable Models"<br />
    > J. Ballé, N. Johnston, D. Minnen<br />
    > https://openreview.net/forum?id=S1zz2i0cY7

    The tables are stored in `tf.Variable`s as attributes of this object. The
    recommended way is to train the model, instantiate an entropy model with
    `compression=True`, and then distribute the model to a sender and a
    receiver.

    Arguments:
      prior: The `tfp.distributions.Distribution` object (see initializer).
    """
    offset = helpers.quantization_offset(prior)
    lower_tail = helpers.lower_tail(prior, self.tail_mass)
    upper_tail = helpers.upper_tail(prior, self.tail_mass)

    # Largest distance observed between lower tail and median, and between
    # median and upper tail.
    minima = offset - lower_tail
    minima = tf.cast(tf.math.ceil(minima), tf.int32)
    minima = tf.math.maximum(minima, 0)
    maxima = upper_tail - offset
    maxima = tf.cast(tf.math.ceil(maxima), tf.int32)
    maxima = tf.math.maximum(maxima, 0)

    # PMF starting positions and lengths.
    pmf_start = offset - tf.cast(minima, self.dtype)
    pmf_length = maxima + minima + 1

    # Sample the densities in the computed ranges, possibly computing more
    # samples than necessary at the upper end.
    max_length = tf.math.reduce_max(pmf_length)
    if tf.executing_eagerly() and max_length > 2048:
      logging.warning(
          "Very wide PMF with %d elements may lead to out of memory issues. "
          "Consider priors with smaller dispersion or increasing `tail_mass` "
          "parameter.", int(max_length))
    samples = tf.range(tf.cast(max_length, self.dtype), dtype=self.dtype)
    samples = tf.reshape(samples, [-1] + len(self.prior_shape) * [1])
    samples += pmf_start
    pmf = prior.prob(samples)

    # Collapse batch dimensions of distribution.
    pmf = tf.reshape(pmf, [max_length, -1])
    pmf = tf.transpose(pmf)

    pmf_length = tf.broadcast_to(pmf_length, self.prior_shape_tensor)
    pmf_length = tf.reshape(pmf_length, [-1])
    cdf_length = pmf_length + 2
    cdf_offset = tf.broadcast_to(-minima, self.prior_shape_tensor)
    cdf_offset = tf.reshape(cdf_offset, [-1])

    # Prevent tensors from bouncing back and forth between host and GPU.
    with tf.device("/cpu:0"):
      def loop_body(args):
        prob, length = args
        prob = prob[:length]
        overflow = tf.math.maximum(1 - tf.reduce_sum(prob, keepdims=True), 0.)
        prob = tf.concat([prob, overflow], axis=0)
        cdf = range_coding_ops.pmf_to_quantized_cdf(
            prob, precision=self.range_coder_precision)
        return tf.pad(
            cdf, [[0, max_length - length]], mode="CONSTANT", constant_values=0)

      # TODO(jonycgn,ssjhv): Consider switching to Python control flow.
      cdf = tf.map_fn(
          loop_body, (pmf, pmf_length), dtype=tf.int32, name="pmf_to_cdf")

    if self.no_variables:
      self._cdf = cdf
      self._cdf_offset = cdf_offset
      self._cdf_length = cdf_length
    else:
      self._cdf = tf.Variable(cdf, trainable=False, name="cdf")
      self._cdf_offset = tf.Variable(
          cdf_offset, trainable=False, name="cdf_offset")
      self._cdf_length = tf.Variable(
          cdf_length, trainable=False, name="cdf_length")
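
# A rough, self-contained sketch of what "PMF -> quantized CDF" means. The real
# implementation above relies on range_coding_ops.pmf_to_quantized_cdf, which
# additionally guards against zero-probability symbols and rounding artifacts;
# this is only an illustration of the idea.
import tensorflow as tf

toy_pmf = tf.constant([0.1, 0.6, 0.3])
precision = 16
toy_cdf = tf.concat([[0.0], tf.cumsum(toy_pmf)], axis=0)
toy_quantized_cdf = tf.cast(tf.round(toy_cdf * (1 << precision)), tf.int32)
# toy_quantized_cdf is approximately [0, 6554, 45875, 65536].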
  def top_k_from_dist(self, user, embeddings, k):
    c = tf.math.softplus(self.c)
    user_emb_distance = tf.reshape(
        hyp_utils.hyp_distance_all_pairs(
            tf.reshape(user, [1, -1]), embeddings, c), [-1])
    return tf.math.top_k(-user_emb_distance, k=k)[1]
Example #7
  def _bin_positions(self, x):
    x = tf.reshape(x, [-1, self._nbins])
    return tf.math.softmax(x, axis=-1) * (2 - self._nbins * 1e-2) + 1e-2
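
  # Note on the constants above: softmax makes the raw bin widths sum to 1, so
  # after the affine rescaling every bin width is at least 1e-2 and the widths
  # sum to 2 (presumably the length of the interval being partitioned, e.g. a
  # [-1, 1] spline range; the enclosing class is not shown here).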
def convolution_batch(x,
                      kernel,
                      rank,
                      strides,
                      padding,
                      data_format=None,
                      dilations=None,
                      name=None):
    """Like `tf.nn.conv2d` except applies batch of kernels to batch of `x`."""
    if rank != 2:
        raise NotImplementedError(
            'Argument `rank` currently only supports `2`; '
            'saw "{}".'.format(rank))
    if data_format is not None and data_format.upper() != 'NHWBC':
        raise ValueError(
            'Argument `data_format` currently only supports "NHWBC"; '
            'saw "{}".'.format(data_format))
    with tf.name_scope(name or 'conv2d_nhwbc'):
        # Prepare arguments.
        [
            rank,
            _,  # strides
            padding,
            dilations,
            data_format,
        ] = prepare_conv_args(rank, strides, padding, dilations)
        strides = prepare_tuple_argument(strides, rank + 2, arg_name='strides')

        dtype = dtype_util.common_dtype([x, kernel], dtype_hint=tf.float32)
        x = tf.convert_to_tensor(x, dtype=dtype, name='x')
        kernel = tf.convert_to_tensor(kernel, dtype=dtype, name='kernel')

        # Step 1: Transpose and double flatten kernel.
        # kernel.shape = B + F + [c, c']. Eg: [b, fh, fw, c, c']
        kernel_shape = prefer_static.shape(kernel)
        kernel_batch_shape, kernel_event_shape = prefer_static.split(
            kernel_shape, num_or_size_splits=[-1, rank + 2])
        kernel_batch_size = prefer_static.reduce_prod(kernel_batch_shape)
        kernel_ndims = prefer_static.rank(kernel)
        kernel_batch_ndims = kernel_ndims - rank - 2
        perm = prefer_static.concat([
            prefer_static.range(kernel_batch_ndims, kernel_batch_ndims + rank),
            prefer_static.range(0, kernel_batch_ndims),
            prefer_static.range(kernel_batch_ndims + rank, kernel_ndims),
        ],
                                    axis=0)  # Eg, [1, 2, 0, 3, 4]
        kernel = tf.transpose(kernel, perm=perm)  # F + B + [c, c']
        kernel = tf.reshape(kernel,
                            shape=prefer_static.concat([
                                kernel_event_shape[:rank],
                                [
                                    kernel_batch_size * kernel_event_shape[-2],
                                    kernel_event_shape[-1]
                                ],
                            ],
                                                       axis=0))  # F + [bc, c']

        # Step 2: Double flatten x.
        # x.shape = N + D + B + [c]
        x_shape = prefer_static.shape(x)
        [
            x_sample_shape,
            x_rank_shape,
            x_batch_shape,
            x_channel_shape,
        ] = prefer_static.split(
            x_shape, num_or_size_splits=[-1, rank, kernel_batch_ndims, 1])
        x = tf.reshape(
            x,  # N + D + B + [c]
            shape=prefer_static.concat([
                [prefer_static.reduce_prod(x_sample_shape)],
                x_rank_shape,
                [
                    prefer_static.reduce_prod(x_batch_shape) *
                    prefer_static.reduce_prod(x_channel_shape)
                ],
            ],
                                       axis=0))  # [n] + D + [bc]

        # Step 3: Apply convolution.
        y = tf.nn.depthwise_conv2d(x,
                                   kernel,
                                   strides=strides,
                                   padding=padding,
                                   data_format='NHWC',
                                   dilations=dilations)
        #  SAME: y.shape = [n, h,      w,      bcc']
        # VALID: y.shape = [n, h-fh+1, w-fw+1, bcc']

        # Step 4: Reshape/reduce for output.
        y_shape = prefer_static.shape(y)
        y = tf.reshape(y,
                       shape=prefer_static.concat(
                           [
                               x_sample_shape,
                               y_shape[1:-1],
                               kernel_batch_shape,
                               kernel_event_shape[-2:],
                           ],
                           axis=0))  # N + D' + B + [c, c']
        y = tf.reduce_sum(y, axis=-2)  # N + D' + B + [c']

        return y
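
# A concrete (illustrative) shape walk-through of the steps above, assuming
#   x.shape      = N + D + B + [c]  = [5, 32, 32, 7, 3]
#   kernel.shape = B + F + [c, c']  = [7, 3, 3, 3, 4]:
# Step 1 reshapes the kernel to F + [b*c, c'] = [3, 3, 21, 4], Step 2 flattens x
# to [n] + D + [b*c] = [5, 32, 32, 21], Step 3's depthwise convolution yields
# [5, h', w', 21 * 4], and Step 4 reshapes to [5, h', w', 7, 3, 4] and sums over
# the `c` axis, giving N + D' + B + [c'] = [5, h', w', 7, 4].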
Example #9
def main(argv):
    del argv  # unused arg
    if not FLAGS.use_gpu:
        raise ValueError('Only GPU is currently supported.')
    if FLAGS.num_cores > 1:
        raise ValueError('Only a single accelerator is currently supported.')
    tf.enable_v2_behavior()
    tf.random.set_seed(FLAGS.seed)
    tf.io.gfile.makedirs(FLAGS.output_dir)

    batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
    steps_per_eval = IMAGENET_VALIDATION_IMAGES // batch_size

    dataset_test = utils.ImageNetInput(is_training=False,
                                       data_dir=FLAGS.data_dir,
                                       batch_size=FLAGS.per_core_batch_size,
                                       use_bfloat16=False).input_fn()
    test_datasets = {'clean': dataset_test}
    corruption_types, max_intensity = utils.load_corrupted_test_info()
    for name in corruption_types:
        for intensity in range(1, max_intensity + 1):
            dataset_name = '{0}_{1}'.format(name, intensity)
            test_datasets[dataset_name] = utils.load_corrupted_test_dataset(
                name=name,
                intensity=intensity,
                batch_size=FLAGS.per_core_batch_size,
                drop_remainder=True,
                use_bfloat16=False)

    model = deterministic_model.resnet50(input_shape=(224, 224, 3),
                                         num_classes=NUM_CLASSES)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())
    # Search for checkpoints from their index file; then remove the index suffix.
    ensemble_filenames = tf.io.gfile.glob(
        os.path.join(FLAGS.checkpoint_dir, '**/*.index'))
    ensemble_filenames = [filename[:-6] for filename in ensemble_filenames]
    ensemble_size = len(ensemble_filenames)
    logging.info('Ensemble size: %s', ensemble_size)
    logging.info('Ensemble number of weights: %s',
                 ensemble_size * model.count_params())
    logging.info('Ensemble filenames: %s', str(ensemble_filenames))
    checkpoint = tf.train.Checkpoint(model=model)

    # Write model predictions to files.
    num_datasets = len(test_datasets)
    for m, ensemble_filename in enumerate(ensemble_filenames):
        checkpoint.restore(ensemble_filename)
        for n, (name, test_dataset) in enumerate(test_datasets.items()):
            filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
            filename = os.path.join(FLAGS.output_dir, filename)
            if not tf.io.gfile.exists(filename):
                logits = []
                test_iterator = iter(test_dataset)
                for _ in range(steps_per_eval):
                    features, _ = next(test_iterator)
                    logits.append(model(features, training=False))

                logits = tf.concat(logits, axis=0)
                with tf.io.gfile.GFile(filename, 'w') as f:
                    np.save(f, logits.numpy())
            percent = (m * num_datasets +
                       (n + 1)) / (ensemble_size * num_datasets)
            message = (
                '{:.1%} completion for prediction: ensemble member {:d}/{:d}. '
                'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size,
                                           n + 1, num_datasets))
            logging.info(message)

    metrics = {
        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
        'test/gibbs_cross_entropy': tf.keras.metrics.Mean(),
        'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/ece':
        ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
    }
    corrupt_metrics = {}
    for name in test_datasets:
        corrupt_metrics['test/nll_{}'.format(name)] = tf.keras.metrics.Mean()
        corrupt_metrics['test/accuracy_{}'.format(name)] = (
            tf.keras.metrics.SparseCategoricalAccuracy())
        corrupt_metrics['test/ece_{}'.format(
            name)] = ed.metrics.ExpectedCalibrationError(
                num_bins=FLAGS.num_bins)

    # Evaluate model predictions.
    for n, (name, test_dataset) in enumerate(test_datasets.items()):
        logits_dataset = []
        for m in range(ensemble_size):
            filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
            filename = os.path.join(FLAGS.output_dir, filename)
            with tf.io.gfile.GFile(filename, 'rb') as f:
                logits_dataset.append(np.load(f))

        logits_dataset = tf.convert_to_tensor(logits_dataset)
        test_iterator = iter(test_dataset)
        for step in range(steps_per_eval):
            _, labels = next(test_iterator)
            logits = logits_dataset[:, (step * batch_size):((step + 1) *
                                                            batch_size)]
            labels = tf.cast(tf.reshape(labels, [-1]), tf.int32)
            negative_log_likelihood = tf.reduce_mean(
                ensemble_negative_log_likelihood(labels, logits))
            per_probs = tf.nn.softmax(logits)
            probs = tf.reduce_mean(per_probs, axis=0)
            if name == 'clean':
                gibbs_ce = tf.reduce_mean(gibbs_cross_entropy(labels, logits))
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/gibbs_cross_entropy'].update_state(gibbs_ce)
                metrics['test/accuracy'].update_state(labels, probs)
                metrics['test/ece'].update_state(labels, probs)
            else:
                corrupt_metrics['test/nll_{}'.format(name)].update_state(
                    negative_log_likelihood)
                corrupt_metrics['test/accuracy_{}'.format(name)].update_state(
                    labels, probs)
                corrupt_metrics['test/ece_{}'.format(name)].update_state(
                    labels, probs)

        message = (
            '{:.1%} completion for evaluation: dataset {:d}/{:d}'.format(
                (n + 1) / num_datasets, n + 1, num_datasets))
        logging.info(message)

    corrupt_results = utils.aggregate_corrupt_metrics(
        corrupt_metrics, corruption_types, max_intensity,
        FLAGS.alexnet_errors_path)
    total_results = {name: metric.result() for name, metric in metrics.items()}
    total_results.update(corrupt_results)
    logging.info('Metrics: %s', total_results)
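
# Hedged sketch of what an `ensemble_negative_log_likelihood(labels, logits)`
# style helper typically computes for logits of shape [ensemble, batch, classes];
# the actual helper used above is defined elsewhere in this codebase, so this is
# only an approximation of its intent.
import tensorflow as tf

def _ensemble_nll_sketch(labels, logits):
  # Per-member cross entropy, shape [ensemble, batch].
  ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=tf.broadcast_to(labels, tf.shape(logits)[:2]), logits=logits)
  ensemble_size = tf.cast(tf.shape(logits)[0], tf.float32)
  # -log((1/M) * sum_m p_m(y)), computed stably via logsumexp.
  return -(tf.reduce_logsumexp(-ce, axis=0) - tf.math.log(ensemble_size))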
Example #10
    def sample_discount_curve_paths(self,
                                    times,
                                    curve_times,
                                    num_samples,
                                    time_step,
                                    num_time_steps=None,
                                    random_type=None,
                                    seed=None,
                                    skip=0,
                                    name=None):
        """Returns a sample of simulated discount curves for the Hull-white model.

    Args:
      times: A real positive `Tensor` of shape `[num_times,]`. The times `t` at
        which the discount curves are to be evaluated.
      curve_times: A real positive `Tensor` of shape `[num_curve_times]`. The
        maturities at which the discount curve is computed at each simulation time.
      num_samples: Positive scalar `int`. The number of paths to draw.
      time_step: Scalar real `Tensor`. Maximal distance between time grid points
        in Euler scheme. Used only when Euler scheme is applied.
        Default value: `None`.
      num_time_steps: An optional Scalar integer `Tensor` - a total number of
        time steps performed by the algorithm. The maximal distance between
        grid points is bounded by
        `times[-1] / (num_time_steps - times.shape[0])`.
        Either this or `time_step` should be supplied.
        Default value: `None`.
      random_type: Enum value of `RandomType`. The type of (quasi)-random
        number generator to use to generate the paths.
        Default value: None which maps to the standard pseudo-random numbers.
      seed: Seed for the random number generator. The seed is
        only relevant if `random_type` is one of
        `[STATELESS, PSEUDO, HALTON_RANDOMIZED, PSEUDO_ANTITHETIC,
          STATELESS_ANTITHETIC]`. For `PSEUDO`, `PSEUDO_ANTITHETIC` and
        `HALTON_RANDOMIZED` the seed should be a Python integer. For
        `STATELESS` and `STATELESS_ANTITHETIC` it must be supplied as an integer
        `Tensor` of shape `[2]`.
        Default value: `None` which means no seed is set.
      skip: `int32` 0-d `Tensor`. The number of initial points of the Sobol or
        Halton sequence to skip. Used only when `random_type` is 'SOBOL',
        'HALTON', or 'HALTON_RANDOMIZED', otherwise ignored.
        Default value: `0`.
      name: Str. The name to give this op.
        Default value: `sample_discount_curve_paths`.

    Returns:
      A tuple containing three `Tensor`s.

      * The first element is a `Tensor` of shape
      `batch_shape + [num_samples, num_curve_times, num_times]` containing
      the simulated zero coupon bond curves `P(t, T)`.
      * The second element is a `Tensor` of shape
      `batch_shape + [num_samples, num_times]` containing the simulated short
      rate paths.
      * The third element is a `Tensor` of shape
      `batch_shape + [num_samples, num_times]` containing the simulated
      discount factor paths.

    ### References:
      [1]: Leif B.G. Andersen and Vladimir V. Piterbarg. Interest Rate Modeling,
      Volume II: Term Structure Models. 2010.
    """
        name = name or self._name + '_sample_discount_curve_paths'
        with tf.name_scope(name):
            times = tf.convert_to_tensor(times, self._dtype)
            num_times = tf.shape(times)[0]
            curve_times = tf.convert_to_tensor(curve_times, self._dtype)
            rate_paths, discount_factor_paths, x_t, y_t = self._sample_paths(
                times, time_step, num_time_steps, num_samples, random_type,
                skip, seed)
            # Reshape x_t to (batch_size, num_samples, 1, num_times, nfactors)
            x_t = tf.expand_dims(x_t, axis=self._batch_rank + 1)
            # Reshape y_t to (batch_size, num_samples, 1, num_times, nfactors**2)
            y_t = tf.expand_dims(y_t, axis=self._batch_rank + 1)

            # Reshape `times` and `curve_times` so that they have the dimensions of
            # ([num_samples, num_curve_times, num_sim_times]).
            num_curve_nodes = tf.shape(curve_times)[0]
            num_sim_steps = tf.shape(times)[0]
            times = tf.reshape(times, (1, 1, num_sim_steps))
            curve_times = tf.reshape(curve_times, (1, num_curve_nodes, 1))
            # Reshape `mean_reversion` to the dimensions of
            # (batch_shape, [num_samples, num_curve_times, num_sim_times]).
            mean_reversion = tf.reshape(
                self._mean_reversion,
                self._batch_shape + [1, 1, 1, self._factors])

            return (self._bond_reconstitution(times, times + curve_times,
                                              mean_reversion, x_t, y_t,
                                              num_samples, num_times),
                    rate_paths, discount_factor_paths)
    def _sample_n(self, n, seed):
        components_seed, mix_seed = samplers.split_seed(
            seed, salt='MixtureSameFamily')
        try:
            seed_stream = SeedStream(seed, salt='MixtureSameFamily')
        except TypeError as e:  # Can happen for Tensor seeds.
            seed_stream = None
            seed_stream_err = e
        try:
            x = self.components_distribution.sample(  # [n, B, k, E]
                n, seed=components_seed)
            if seed_stream is not None:
                seed_stream()  # Advance even if unused.
        except TypeError as e:
            if ('Expected int for argument' not in str(e)
                    and TENSOR_SEED_MSG_PREFIX not in str(e)):
                raise
            if seed_stream is None:
                raise seed_stream_err
            msg = (
                'Falling back to stateful sampling for `components_distribution` '
                '{} of type `{}`. Please update to use `tf.random.stateless_*` '
                'RNGs. This fallback may be removed after 20-Aug-2020. {}')
            warnings.warn(
                msg.format(self.components_distribution.name,
                           type(self.components_distribution), str(e)))
            x = self.components_distribution.sample(  # [n, B, k, E]
                n, seed=seed_stream())

        event_shape = None
        event_ndims = tensorshape_util.rank(self.event_shape)
        if event_ndims is None:
            event_shape = self.components_distribution.event_shape_tensor()
            event_ndims = ps.rank_from_shape(event_shape)
        event_ndims_static = tf.get_static_value(event_ndims)

        num_components = None
        if event_ndims_static is not None:
            num_components = tf.compat.dimension_value(
                x.shape[-1 - event_ndims_static])
        # We could also check if num_components can be computed statically from
        # self.mixture_distribution's logits or probs.
        if num_components is None:
            num_components = tf.shape(x)[-1 - event_ndims]

        # TODO(jvdillon): Consider using tf.gather (by way of index unrolling).
        npdt = dtype_util.as_numpy_dtype(x.dtype)
        try:
            mix_sample = self.mixture_distribution.sample(
                n, seed=mix_seed)  # [n, B] or [n]
        except TypeError as e:
            if ('Expected int for argument' not in str(e)
                    and TENSOR_SEED_MSG_PREFIX not in str(e)):
                raise
            if seed_stream is None:
                raise seed_stream_err
            msg = (
                'Falling back to stateful sampling for `mixture_distribution` '
                '{} of type `{}`. Please update to use `tf.random.stateless_*` '
                'RNGs. This fallback may be removed after 20-Aug-2020. ({})')
            warnings.warn(
                msg.format(self.mixture_distribution.name,
                           type(self.mixture_distribution), str(e)))
            mix_sample = self.mixture_distribution.sample(
                n, seed=seed_stream())  # [n, B] or [n]
        mask = tf.one_hot(
            indices=mix_sample,  # [n, B] or [n]
            depth=num_components,
            on_value=npdt(1),
            off_value=npdt(0))  # [n, B, k] or [n, k]

        # Pad `mask` to [n, B, k, [1]*e] or [n, [1]*b, k, [1]*e] .
        batch_ndims = ps.rank(x) - event_ndims - 1
        mask_batch_ndims = ps.rank(mask) - 1
        pad_ndims = batch_ndims - mask_batch_ndims
        mask_shape = ps.shape(mask)
        target_shape = ps.concat([
            mask_shape[:-1],
            ps.ones([pad_ndims], dtype=tf.int32),
            mask_shape[-1:],
            ps.ones([event_ndims], dtype=tf.int32),
        ],
                                 axis=0)
        mask = tf.reshape(mask, shape=target_shape)

        if dtype_util.is_floating(x.dtype) or dtype_util.is_complex(x.dtype):
            masked = tf.math.multiply_no_nan(x, mask)
        else:
            masked = x * mask
        ret = tf.reduce_sum(masked, axis=-1 - event_ndims)  # [n, B, E]

        if self._reparameterize:
            if event_shape is None:
                event_shape = self.components_distribution.event_shape_tensor()
            ret = self._reparameterize_sample(ret, event_shape=event_shape)

        return ret
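
        # Comment-form sketch of the one-hot selection used above, kept out of the
        # class body: with component samples x of shape [n, k] and sampled component
        # indices mix of shape [n],
        #   mask = tf.one_hot(mix, depth=k)            # [n, k]
        #   picked = tf.reduce_sum(x * mask, axis=-1)  # [n]
        # picks x[i, mix[i]] for every i; the padded/broadcast version above does
        # the same for batched, multi-dimensional events.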
    def _reparameterize_sample(self, x, event_shape):
        """Adds reparameterization (pathwise) gradients to samples of the mixture.

    Implicit reparameterization gradients are
       dx/dphi = -(d transform(x, phi) / dx)^-1 * d transform(x, phi) / dphi,
    where transform(x, phi) is distributional transform that removes all
    parameters from samples x.

    We implement them by replacing x with
      -stop_gradient(d transform(x, phi) / dx)^-1 * transform(x, phi)
    for the backward pass (gradient computation).
    The derivative of this quantity w.r.t. phi is then the implicit
    reparameterization gradient.
    Note that this replaces the gradients w.r.t. both the mixture
    distribution parameters and components distributions parameters.

    Limitations:
      1. Fundamental: components must be fully reparameterized.
      2. Distributional transform is currently only implemented for
        factorized components.
      3. Distributional transform currently only works for known rank of the
        batch tensor.

    Args:
      x: Sample of mixture distribution
      event_shape: The event shape of this distribution

    Returns:
      Tensor with same value as x, but with reparameterization gradients
    """
        # Remove the existing gradients of x wrt parameters of the components.
        x = tf.stop_gradient(x)

        event_size = ps.cast(ps.reduce_prod(event_shape), dtype=tf.int32)
        x_2d_shape = [-1, event_size]  # [S*prod(B), prod(E)]

        # Perform distributional transform of x in [S, B, E] shape,
        # but have Jacobian of size [S*prod(B), prod(E), prod(E)].
        def reshaped_distributional_transform(x_2d):
            return tf.reshape(
                self._distributional_transform(tf.reshape(x_2d, ps.shape(x)),
                                               event_shape), x_2d_shape)

        # transform_2d: [S*prod(B), prod(E)]
        # jacobian: [S*prod(B), prod(E), prod(E)]
        x_2d = tf.reshape(x, x_2d_shape)
        transform_2d, jacobian = value_and_batch_jacobian(
            reshaped_distributional_transform, x_2d)

        # We only provide the first derivative; the second derivative computed by
        # autodiff would be incorrect, so we raise an error if it is requested.
        transform_2d = _prevent_2nd_derivative(transform_2d)

        # Compute [- stop_gradient(jacobian)^-1 * transform] by solving a linear
        # system. The Jacobian is lower triangular because the distributional
        # transform for i-th event dimension does not depend on the next
        # dimensions.
        surrogate_x_2d = -tf.linalg.triangular_solve(
            tf.stop_gradient(jacobian),
            transform_2d[..., tf.newaxis],
            lower=True)  # [S*prod(B), prod(E), 1]
        surrogate_x = tf.reshape(surrogate_x_2d, ps.shape(x))

        # Replace gradients of x with gradients of surrogate_x, but keep the value.
        return x + (surrogate_x - tf.stop_gradient(surrogate_x))
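
# A self-contained sketch of the "keep the value, swap the gradient" trick used
# in the final line above: y takes its value from x but its gradient from the
# surrogate expression.
import tensorflow as tf

v = tf.Variable(3.0)
x_value = tf.constant(2.0)
with tf.GradientTape() as tape:
  surrogate = 1.5 * v
  y = x_value + (surrogate - tf.stop_gradient(surrogate))
# y.numpy() == 2.0, while tape.gradient(y, v).numpy() == 1.5.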
Exemple #14
0
def DenseAR(x,
            h=None,
            hidden_layers=(),
            activation=tf.nn.relu,
            log_scale_clip=None,
            log_scale_clip_pre=None,
            train=False,
            dropout_rate=0.0,
            sigmoid_scale=False,
            log_scale_factor=1.0,
            log_scale_reg=0.0,
            shift_only=False,
            **kwargs):
    input_depth = tf.compat.dimension_value(x.shape.with_rank_at_least(1)[-1])
    if input_depth is None:
        raise NotImplementedError(
            "Rightmost dimension must be known prior to graph execution.")
    input_shape = (np.int32(x.shape.as_list())
                   if x.shape.is_fully_defined() else tf.shape(x))
    for i, units in enumerate(hidden_layers):
        x = MaskedDense(inputs=x,
                        units=units,
                        num_blocks=input_depth,
                        exclusive=True if i == 0 else False,
                        activation=activation,
                        **kwargs)
        if h is not None:
            x += tfkl.Dense(units, use_bias=False, **kwargs)(h)
        if dropout_rate > 0:
            x = tfkl.Dropout(dropout_rate)(x, training=train)

    if shift_only:
        shift = MaskedDense(inputs=x,
                            units=input_depth,
                            num_blocks=input_depth,
                            activation=None,
                            **kwargs)
        return shift, None
    else:
        if log_scale_factor == 1.0 and log_scale_reg == 0.0 and not log_scale_clip_pre:
            x = MaskedDense(inputs=x,
                            units=2 * input_depth,
                            num_blocks=input_depth,
                            activation=None,
                            **kwargs)
            if h is not None:
                x += tfkl.Dense(2 * input_depth, use_bias=False, **kwargs)(h)
            x = tf.reshape(x, shape=tf.concat([input_shape, [2]], axis=0))
            shift, log_scale = tf.unstack(x, num=2, axis=-1)
        else:
            shift = MaskedDense(inputs=x,
                                units=input_depth,
                                num_blocks=input_depth,
                                activation=None,
                                **kwargs)
            if log_scale_reg > 0.0:
                regularizer = lambda w: log_scale_reg * 2.0 * tf.nn.l2_loss(w)
            else:
                regularizer = None
            log_scale = MaskedDense(inputs=x,
                                    units=input_depth,
                                    num_blocks=input_depth,
                                    activation=None,
                                    use_bias=False,
                                    kernel_regularizer=regularizer,
                                    **kwargs)
            log_scale *= log_scale_factor
            if log_scale_clip_pre:
                log_scale = log_scale_clip_pre * tf.nn.tanh(
                    log_scale / log_scale_clip_pre)
            log_scale += tf.get_variable("log_scale_bias", [1, input_depth],
                                         initializer=tf.zeros_initializer())
            if h is not None:
                shift += tfkl.Dense(input_depth, use_bias=False, **kwargs)(h)
                log_scale += tfkl.Dense(input_depth, use_bias=False,
                                        **kwargs)(h)

        if sigmoid_scale:
            log_scale = tf.log_sigmoid(log_scale)

        if log_scale_clip:
            log_scale = log_scale_clip * tf.nn.tanh(log_scale / log_scale_clip)

        return shift, log_scale
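
# Hedged usage sketch (MaskedDense and the keyword arguments come from the
# surrounding module, so this is illustrative only): in an autoregressive affine
# flow, the two outputs would typically be used as
#   shift, log_scale = DenseAR(x, hidden_layers=(64, 64))
#   y = x * tf.exp(log_scale) + shift
# with shift_only=True reducing this to a pure shift transform.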
Example #16
def EffectiveSampleSize(states,
                        filter_beyond_lag=300,
                        filter_threshold=0.05,
                        use_geyer=False,
                        center=True,
                        normalize=True):
    """ESS computation for one single Tensor argument."""
    def _axis_size(x, axis=None):
        """Get number of elements of `x` in `axis`, as type `x.dtype`."""
        if axis is None:
            return tf.cast(tf.size(x), x.dtype)
        return tf.cast(tf.reduce_prod(tf.gather(tf.shape(x), axis)), x.dtype)

    with tf.name_scope("effective_sample_size_single_state"):

        states = tf.convert_to_tensor(states, name="states")
        dt = states.dtype

        # filter_beyond_lag == None ==> auto_corr is the full sequence.
        auto_corr = SanitizedAutoCorrelationMean(states,
                                                 axis=0,
                                                 reduce_axis=1,
                                                 center=center,
                                                 normalize=normalize,
                                                 max_lags=filter_beyond_lag)
        orig_auto_corr = auto_corr
        if use_geyer:

            def _sum_pairs(x):
                if x.shape[0] % 2 != 0:
                    x = tf.concat(
                        [x, tf.zeros(tf.concat([[1], tf.shape(x)[1:]], 0))], 0)
                return tf.reduce_sum(
                    tf.reshape(x, [tf.shape(x)[0] // 2, 2, -1]), 1)

            def _make_pairs(x):
                return tf.reshape(tf.tile(x[:, tf.newaxis, :], [1, 2, 1]),
                                  [-1, x.shape[-1]])

            auto_corr_pairs = _make_pairs(
                _sum_pairs(auto_corr))[:auto_corr.shape[0]]
            mask = auto_corr_pairs < 0.
            mask = tf.cast(mask, dt)
            mask = tf.cumsum(mask, axis=0)
            mask = tf.maximum(1. - mask, 0.)
            auto_corr *= mask
        elif filter_threshold is not None:
            filter_threshold = tf.convert_to_tensor(filter_threshold,
                                                    dtype=dt,
                                                    name="filter_threshold")
            # Get a binary mask to zero out values of auto_corr below the threshold.
            #   mask[i, ...] = 1 if auto_corr[j, ...] > threshold for all j <= i,
            #   mask[i, ...] = 0, otherwise.
            # So, along dimension zero, the mask will look like [1, 1, ..., 0, 0,...]
            # Building step by step,
            #   Assume auto_corr = [1, 0.5, 0.0, 0.3], and filter_threshold = 0.2.
            # Step 1:  mask = [False, False, True, False]
            mask = tf.abs(auto_corr) < filter_threshold
            # Step 2:  mask = [0, 0, 1, 0]
            mask = tf.cast(mask, dtype=dt)
            # Step 3:  mask = [0, 0, 1, 1]
            mask = tf.cumsum(mask, axis=0)
            # Step 4:  mask = [1, 1, 0, 0]
            mask = tf.maximum(1. - mask, 0.)
            auto_corr *= mask

        # With R[k] := auto_corr[k, ...],
        # ESS = N / {1 + 2 * Sum_{k=1}^N (N - k) / N * R[k]}
        #     = N / {-1 + 2 * Sum_{k=0}^N (N - k) / N * R[k]} (since R[0] = 1)
        #     approx N / {-1 + 2 * Sum_{k=0}^M (N - k) / N * R[k]}
        # where M is the filter_beyond_lag truncation point chosen above.

        # Get the factor (N - k) / N, and give it shape [M, 1,...,1], having total
        # ndims the same as auto_corr
        n = _axis_size(states, axis=0)
        k = tf.range(0., _axis_size(auto_corr, axis=0))
        nk_factor = (n - k) / n
        if auto_corr.shape.ndims is not None:
            new_shape = [-1] + [1] * (auto_corr.shape.ndims - 1)
        else:
            new_shape = tf.concat(
                ([-1], tf.ones([tf.rank(auto_corr) - 1], dtype=tf.int32)),
                axis=0)
        nk_factor = tf.reshape(nk_factor, new_shape)

        # return tf.reduce_mean(n / (
        #   -1 + 2 * tf.reduce_sum(nk_factor * auto_corr, axis=0)), 0)
        # return n / (1.0 + 2 *
        #             tf.reduce_sum(nk_factor[1:, ...] * auto_corr[1:, ...],
        #             axis=0))
        # return tf.reduce_mean(n / (-auto_corr[0] + 2 *
        #   tf.reduce_sum(nk_factor * auto_corr, axis=0)), 0)
        # print(auto_corr[0])
        return n / (orig_auto_corr[0] + 2 * tf.reduce_sum(
            nk_factor[1:, Ellipsis] * auto_corr[1:, Ellipsis], axis=0))
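
# Self-contained check of the threshold-masking steps documented above:
import tensorflow as tf

toy_auto_corr = tf.constant([1.0, 0.5, 0.0, 0.3])
toy_mask = tf.cast(tf.abs(toy_auto_corr) < 0.2, tf.float32)  # [0., 0., 1., 0.]
toy_mask = tf.cumsum(toy_mask, axis=0)                       # [0., 0., 1., 1.]
toy_mask = tf.maximum(1.0 - toy_mask, 0.0)                   # [1., 1., 0., 0.]
# Everything at and after the first sub-threshold lag gets zeroed out.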
Example #17
def pairwise_square_distance_tensor(x1,
                                    x2,
                                    feature_ndims,
                                    x1_example_ndims=1,
                                    x2_example_ndims=1):
    """Returns pairwise distance between x1 and x2.

  This method is a generalization of `pairwise_square_distance_matrix`.
  Given `x1` and `x2`, Tensors with shape `[..., N1, ... Nm, D1, ... Dk]` and
  `[..., M1, ... Ml, D1, ... Dk]`, compute the pairwise distance tensor `A` of
  shape `[..., N1, ... Nm, M1, ... Ml]`, where `m` is `x1_example_ndims` and
  `l` is `x2_example_ndims`.

  Args:
    x1: Floating point `Tensor` with shape `B1 + E1 + [D1, ..., Dk]`,
      where `B1` is a (possibly empty) batch shape, and `E1` is a list
      of `x1_example_ndims` values.
    x2: Floating point `Tensor` with shape `B2 + E2 + [D1, ..., Dk]`,
      where `B2` is a (possibly empty) batch shape that broadcasts
      with `B1`, and `E2` is a list of `x2_example_ndims` values.
    feature_ndims: The number of dimensions to consider for the euclidean
      norm. This is `k` from above.
    x1_example_ndims: Integer for number of example dimensions in `x1`. This is
      `len(E1)`.
    x2_example_ndims: Integer for number of example dimensions in `x2`. This is
      `len(E2)`.
  Returns:
    `Tensor` of shape `bc(B1, B2) + E1 + E2` representing the pairwise square
    distance tensor.
  """
    # Collapse all the example dimensions and then expand after.
    x1_shape = tf.shape(x1)
    x1_example_shape = x1_shape[-(feature_ndims +
                                  x1_example_ndims):-feature_ndims]

    x2_shape = tf.shape(x2)
    x2_example_shape = x2_shape[-(feature_ndims +
                                  x2_example_ndims):-feature_ndims]

    x1 = tf.reshape(
        x1,
        tf.concat([
            x1_shape[:-(feature_ndims + x1_example_ndims)], [-1],
            x1_shape[-feature_ndims:]
        ],
                  axis=0))
    x2 = tf.reshape(
        x2,
        tf.concat([
            x2_shape[:-(feature_ndims + x2_example_ndims)], [-1],
            x2_shape[-feature_ndims:]
        ],
                  axis=0))
    pairwise = pairwise_square_distance_matrix(x1,
                                               x2,
                                               feature_ndims=feature_ndims)
    # Now we need to undo the transformation.
    return tf.reshape(
        pairwise,
        tf.concat(
            [tf.shape(pairwise)[:-2], x1_example_shape, x2_example_shape],
            axis=0))
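
# A self-contained sketch of what the simplest case computes (feature_ndims=1 and
# x1_example_ndims = x2_example_ndims = 1), written with plain broadcasting
# instead of the flatten-then-undo reshapes above:
import tensorflow as tf

x1_toy = tf.random.normal([4, 3])   # [N1, D]
x2_toy = tf.random.normal([5, 3])   # [M1, D]
sq_dist = tf.reduce_sum(
    tf.math.squared_difference(x1_toy[:, tf.newaxis, :],
                               x2_toy[tf.newaxis, :, :]),
    axis=-1)                        # [N1, M1]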
Example #18
def soft_multivariate_quantiles(x,
                                quantiles,
                                quantile_width=None,
                                **kwargs):
  """Computes soft multivariate quantiles via optimal transport.

  Transport multivariate input values in x onto 2^d + 1 weighted points,
  {0,1}^d + [0.5, ..., 0.5]. Target weights are adjusted so
  that those values in x that are transported to the middle value in the target
  vector correspond to those concentrating around the quantile of interest.

  Args:
   x: Tensor<float> of shape [batch, N, d]
   quantiles: Tensor<float> of shape [r, d], r targeted quantiles of dimension d
   quantile_width: (float) mass given to the bucket supposed to attract points
     whose value concentrate around the desired quantile value. Bigger width
     means that we allow the soft quantile to be a mixture of more points
     further away from the quantile. If None, the width is set at 1/n where n is
     the number of values considered (the size along the 'axis').
   **kwargs: see sinkhorn.autodiff_sinkhorn for possible extra parameters.

  Returns:
    A Tensor<float> of shape [batch, r, d] of multivariate quantiles per batch.

  """
  quantiles = tf.constant(quantiles, tf.float32)
  batch_size = x.shape[0]
  n = tf.cast(x.shape[1], tf.float32)
  d = x.shape[2]
  if quantile_width is None:
    quantile_width = 2 / n
  num_quantiles = tf.shape(quantiles)[0]
  hypercube_vertices = tf.constant(
      list(itertools.product([-1, 1], repeat=d)), tf.float32)
  # Weights attached to vertices for each quantile; shape [num_quantiles, 2^d].
  weights = quantiles[:, tf.newaxis, :]**(
      0.5 * (1 - hypercube_vertices))[tf.newaxis, Ellipsis]
  weights *= (1 - quantiles)[:, tf.newaxis, :]**(
      0.5 * (1 + hypercube_vertices))[tf.newaxis, Ellipsis]

  weights = (1 - quantile_width) * tf.reduce_prod(weights, axis=2)
  # adding weights for quantile itself (in position 0).
  weights = tf.concat((quantile_width * tf.ones((num_quantiles, 1)), weights),
                      axis=1)
  # Augmenting and formatting as [batch_size, 2^d + 1, num_quantiles].
  weights = tf.reshape(
      tf.tile(tf.transpose(weights), [batch_size, 1]),
      [batch_size, 2**d + 1, num_quantiles])
  # Set target locations by adding the point at 0 that will absorb the quantile,
  # then tile them across the batch.
  y = tf.concat((tf.zeros((1, d), dtype=tf.float32), hypercube_vertices),
                axis=0)
  y = tf.reshape(tf.tile(y, [batch_size, 1]), [batch_size, 2**d + 1, d])
  # center x
  x_mean = tf.reduce_mean(x, axis=1)
  x = x - x_mean[:, tf.newaxis, :]
  transports = sinkhorn.autodiff_sinkhorn(
      x, y,
      tf.ones([batch_size, n, num_quantiles], dtype=tf.float32) / n, weights,
      **kwargs)

  # Recover the convex combinations resulting from transporting to the central
  # point, in all batches and quantile variations.
  transports = 1 / quantile_width * tf.reshape(transports[:, :, 0, :],
                                               [batch_size, n, -1])
  # apply these convex combinations to data points + recenter.
  all_soft_quantiles = tf.reduce_sum(
      transports[:, :, :, tf.newaxis] *
      x[:, :, tf.newaxis, :],
      axis=1) + x_mean[:, tf.newaxis, :]
  # reshape those quantiles after having applied convex combinations.
  return tf.reshape(all_soft_quantiles, [batch_size, num_quantiles, d])
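
# Hedged usage sketch (requires the `sinkhorn` module imported by this file, so
# this is illustrative only): for a batch of 2 point clouds of 50 points in
# dimension 3,
#   q = soft_multivariate_quantiles(tf.random.normal([2, 50, 3]),
#                                   [[0.5, 0.5, 0.5]])
# would return a [2, 1, 3] tensor of soft multivariate 0.5-quantiles.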
Example #20
    def step_fn(inputs):
      """Per-Replica StepFn."""
      images, labels = inputs
      if FLAGS.ensemble_size > 1:
        images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
        labels = tf.tile(labels, [FLAGS.ensemble_size])

      with tf.GradientTape() as tape:
        logits = model(images, training=True)
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)

        probs = tf.nn.softmax(logits)
        if FLAGS.ensemble_size > 1:
          per_probs = tf.reshape(
              probs, tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]], 0))
          diversity_results = ed.metrics.average_pairwise_diversity(
              per_probs, FLAGS.ensemble_size)

        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                            logits,
                                                            from_logits=True))
        filtered_variables = []
        for var in model.trainable_variables:
          # Apply l2 on the BN parameters and bias terms. This
          # excludes only fast weight approximate posterior/prior parameters,
          # but pay caution to their naming scheme.
          if ('kernel' in var.name or
              'batch_norm' in var.name or
              'bias' in var.name):
            filtered_variables.append(tf.reshape(var, (-1,)))

        l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
            tf.concat(filtered_variables, axis=0))
        kl = sum(model.losses) / APPROX_IMAGENET_TRAIN_IMAGES
        kl_scale = tf.cast(global_step + 1, tf.float32)
        kl_scale /= steps_per_epoch * FLAGS.kl_annealing_epochs
        kl_scale = tf.minimum(1., kl_scale)
        kl_loss = kl_scale * kl
        loss = negative_log_likelihood + l2_loss + kl_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)

      # Separate learning rate implementation.
      if FLAGS.fast_weight_lr_multiplier != 1.0:
        grads_and_vars = []
        for grad, var in zip(grads, model.trainable_variables):
          # Apply different learning rate on the fast weights. This excludes BN
          # and slow weights, but pay caution to the naming scheme.
          if ('batch_norm' not in var.name and 'kernel' not in var.name):
            grads_and_vars.append((grad * FLAGS.fast_weight_lr_multiplier,
                                   var))
          else:
            grads_and_vars.append((grad, var))
        optimizer.apply_gradients(grads_and_vars)
      else:
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

      metrics['train/ece'].update_state(labels, probs)
      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/kl'].update_state(kl)
      metrics['train/kl_scale'].update_state(kl_scale)
      metrics['train/accuracy'].update_state(labels, logits)
      if FLAGS.ensemble_size > 1:
        for k, v in diversity_results.items():
          training_diversity['train/' + k].update_state(v)
      global_step.assign_add(1)
    def _distributional_transform(self, x, event_shape):
        """Performs distributional transform of the mixture samples.

    Distributional transform removes the parameters from samples of a
    multivariate distribution by applying conditional CDFs:
      (F(x_1), F(x_2 | x1_), ..., F(x_d | x_1, ..., x_d-1))
    (the indexing is over the 'flattened' event dimensions).
    The result is a sample of product of Uniform[0, 1] distributions.

    We assume that the components are factorized, so the conditional CDFs become
      F(x_i | x_1, ..., x_i-1) = sum_k w_i^k F_k (x_i),
    where w_i^k is the posterior mixture weight: for i > 0
      w_i^k = w_k prob_k(x_1, ..., x_i-1) / sum_k' w_k' prob_k'(x_1, ..., x_i-1)
    and w_0^k = w_k is the mixture probability of the k-th component.

    Args:
      x: Sample of mixture distribution
      event_shape: The event shape of this distribution

    Returns:
      Result of the distributional transform
    """

        if tensorshape_util.rank(x.shape) is None:
            # tf.math.softmax raises an error when applied to inputs of undefined
            # rank.
            raise ValueError(
                'Distributional transform does not support inputs of '
                'undefined rank.')

        # Obtain factorized components distribution and assert that it's
        # a scalar distribution.
        if isinstance(self._components_distribution, independent.Independent):
            univariate_components = self._components_distribution.distribution
        else:
            univariate_components = self._components_distribution

        with tf.control_dependencies([
                assert_util.assert_equal(
                    univariate_components.is_scalar_event(),
                    True,
                    message='`univariate_components` must have scalar event')
        ]):
            event_ndims = ps.rank_from_shape(event_shape)
            x_padded = self._pad_sample_dims(
                x, event_ndims=event_ndims)  # [S, B, 1, E]
            log_prob_x = univariate_components.log_prob(
                x_padded)  # [S, B, k, E]
            cdf_x = univariate_components.cdf(x_padded)  # [S, B, k, E]

            # log prob_k (x_1, ..., x_i-1)
            event_size = ps.cast(ps.reduce_prod(event_shape), dtype=tf.int32)
            cumsum_log_prob_x = tf.reshape(
                tf.math.cumsum(
                    # [S*prod(B)*k, prod(E)]
                    tf.reshape(log_prob_x, [-1, event_size]),
                    exclusive=True,
                    axis=-1),
                ps.shape(log_prob_x))  # [S, B, k, E]

            event_ndims = ps.rank_from_shape(event_shape)
            logits_mix_prob = self.mixture_distribution.logits_parameter()
            logits_mix_prob = tf.reshape(
                logits_mix_prob,  # [k] or [B, k]
                ps.concat([
                    ps.shape(logits_mix_prob),
                    ps.ones([event_ndims], dtype=tf.int32),
                ],
                          axis=0))  # [k, [1]*e] or [B, k, [1]*e]

            # Logits of the posterior weights: log w_k + log prob_k (x_1, ..., x_i-1)
            log_posterior_weights_x = logits_mix_prob + cumsum_log_prob_x

            component_axis = tensorshape_util.rank(x.shape) - event_ndims
            posterior_weights_x = tf.math.softmax(log_posterior_weights_x,
                                                  axis=component_axis)
            return tf.reduce_sum(posterior_weights_x * cdf_x,
                                 axis=component_axis)
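# --- Illustrative sketch (not part of the class above): a minimal, standalone
# example of the distributional transform for a made-up 2-component,
# 2-dimensional factorized Gaussian mixture. It mirrors the computation above:
# exclusive cumulative log-probs give the posterior mixture weights, which then
# mix the per-component CDFs.
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

weights = tf.constant([0.3, 0.7])                # mixture weights w_k
locs = tf.constant([[-1.0, 0.0], [2.0, 1.0]])    # [k, d]
scales = tf.constant([[1.0, 0.5], [0.7, 1.5]])   # [k, d]
components = tfd.Normal(loc=locs, scale=scales)  # factorized per dimension

x = tf.constant([0.3, -0.2])           # one sample, shape [d]
x_k = tf.broadcast_to(x, [2, 2])       # replicate the sample per component

log_prob_x = components.log_prob(x_k)  # log prob_k(x_i), shape [k, d]
cdf_x = components.cdf(x_k)            # F_k(x_i), shape [k, d]

# Exclusive cumulative sum over dimensions: log prob_k(x_1, ..., x_i-1).
cumsum_log_prob = tf.math.cumsum(log_prob_x, axis=-1, exclusive=True)

# Posterior mixture weights w_i^k, normalized over the component axis.
log_posterior = tf.math.log(weights)[:, tf.newaxis] + cumsum_log_prob
posterior_weights = tf.math.softmax(log_posterior, axis=0)

# Distributional transform: sum_k w_i^k F_k(x_i), one value in [0, 1] per dim.
u = tf.reduce_sum(posterior_weights * cdf_x, axis=0)
print(u.numpy())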
  def _head(self, neck_outputs):

    # <tf.float32>[time * batch_size, 1, hidden_dim]
    visual_feature = neck_outputs['visual_feature']
    # <tf.float32>[time * batch_size, num_tokens, hidden_dim]
    text_feature = neck_outputs['text_feature']

    # <tf.float32>[time, batch_size, 1, hidden_dim]
    visual_feature = tf.reshape(
        visual_feature,
        [self._current_num_timesteps, self._current_batch_size] +
        visual_feature.shape[1:].as_list())

    # <tf.float32>[batch_size, time, hidden_dim]
    visual_feature = tf.squeeze(visual_feature, axis=2)
    visual_feature = tf.transpose(visual_feature, [1, 0, 2])

    first_true = utils.get_first_true_column(
        tf.reshape(neck_outputs[constants.DISC_MASK],
                   [self._current_num_timesteps, self._current_batch_size]))

    # <tf.float32>[batch_size, num_tokens, hidden_dim]
    text_feature = tf.cond(
        tf.keras.backend.any(first_true),
        lambda: tf.boolean_mask(text_feature, tf.reshape(first_true, [-1])),
        lambda: tf.reshape(text_feature, [
            self._current_num_timesteps, self._current_batch_size
        ] + text_feature.shape[1:].as_list())[0, :, :, :])
    # visual_feature = tf.nn.l2_normalize(visual_feature, axis=2)
    # text_feature = tf.nn.l2_normalize(text_feature, axis=2)

    # <tf.float32>[batch_size, time, num_tokens]
    alpha_i_j = tf.matmul(visual_feature,
                          tf.transpose(text_feature, perm=[0, 2, 1]))
    # <tf.float32>[batch_size, time, num_tokens]
    ealpha_i_j = tf.exp(alpha_i_j)
    sum_i_j = tf.tile(
        tf.expand_dims(tf.reduce_sum(ealpha_i_j, 2), 2),
        [1, 1, tf.shape(ealpha_i_j)[2]])
    mask = tf.cast(
        tf.transpose(
            tf.reshape(neck_outputs[constants.DISC_MASK],
                       [self._current_num_timesteps, self._current_batch_size]),
            perm=[1, 0]), tf.float32)
    # <tf.float32>[batch, time, num_tokens]
    c_i_j = tf.divide(ealpha_i_j, sum_i_j)
    # <tf.float32>[batch, time]
    score = tf.reduce_sum(c_i_j * alpha_i_j, 2)

    escore = tf.exp(-1 * score) * mask
    sum_escore = tf.tile(
        tf.expand_dims(tf.reduce_sum(escore, 1), 1), [1, tf.shape(escore)[1]])
    score_weight = tf.divide(escore, sum_escore)
    similarities = tf.reduce_sum(mask * score * score_weight, 1)
    similarities = tf.expand_dims(similarities, axis=0)
    # [time_step, batch_size]
    similarities = tf.tile(similarities, [self._current_num_timesteps, 1])

    # Apply an affine transform.
    similarities = similarities * self.affine_a + self.affine_b

    output_a = tf.reshape(tf.convert_to_tensor(self.affine_a), [1, 1])
    output_b = tf.reshape(tf.convert_to_tensor(self.affine_b), [1, 1])

    output_a = tf.tile(output_a,
                       [self._current_num_timesteps, self._current_batch_size])
    output_b = tf.tile(output_b,
                       [self._current_num_timesteps, self._current_batch_size])

    return common.AgentOutput(
        policy_logits=similarities, baseline=(output_a, output_b))
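# --- Illustrative sketch (standalone, made-up shapes): a compact restatement of
# the similarity computed in `_head` above. The exp/tile/divide there is a
# softmax over tokens; the exp(-score) weighting is a masked softmin over time.
import tensorflow as tf

batch, time, tokens, hidden = 2, 3, 4, 8
visual = tf.random.normal([batch, time, hidden])
text = tf.random.normal([batch, tokens, hidden])
mask = tf.ones([batch, time])

alpha = tf.matmul(visual, text, transpose_b=True)  # [batch, time, tokens]
c = tf.nn.softmax(alpha, axis=2)                   # attention over tokens
score = tf.reduce_sum(c * alpha, axis=2)           # [batch, time]

escore = tf.exp(-score) * mask
score_weight = escore / tf.reduce_sum(escore, axis=1, keepdims=True)
similarity = tf.reduce_sum(mask * score * score_weight, axis=1)  # [batch]
print(similarity.numpy())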
# Example 23
    def __call__(self, roi_features, class_indices, is_training=None):
        """Mask branch for the Mask-RCNN model.

        Args:
          roi_features: A ROI feature tensor of shape
            [batch_size, num_rois, height_l, width_l, num_filters].
          class_indices: a Tensor of shape [batch_size, num_rois], indicating
            the class of each ROI.
          is_training: `boolean`, if True the model is in training mode.

        Returns:
          mask_outputs: a tensor with a shape of
            [batch_size, num_masks, mask_height, mask_width], representing the
            class-specific mask predictions gathered for each ROI.
        """
        def _get_stddev_equivalent_to_msra_fill(kernel_size, fan_out):
            """Returns the stddev of random normal initialization as MSRAFill."""
            # Reference: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/filler_op.h#L445-L463  # pylint: disable=line-too-long
            # For example, kernel size is (3, 3) and fan out is 256, stddev is 0.029.
            # stddev = (2/(3*3*256))^0.5 = 0.029
            return (2 / (kernel_size[0] * kernel_size[1] * fan_out))**0.5

        with backend.get_graph().as_default():
            with tf.name_scope('mask_head'):
                _, num_rois, height, width, filters = roi_features.get_shape(
                ).as_list()
                net = tf.reshape(roi_features, [-1, height, width, filters])

                for i in range(4):
                    kernel_size = (3, 3)
                    fan_out = 256
                    init_stddev = _get_stddev_equivalent_to_msra_fill(
                        kernel_size, fan_out)
                    net = tf.keras.layers.Conv2D(
                        fan_out,
                        kernel_size=kernel_size,
                        strides=(1, 1),
                        padding='same',
                        dilation_rate=(1, 1),
                        activation=None,
                        kernel_initializer=tf.keras.initializers.RandomNormal(
                            stddev=init_stddev),
                        bias_initializer=tf.zeros_initializer(),
                        name='mask-conv-l%d' % i)(net)
                    net = self._batch_norm_relu()(net, is_training=is_training)

                kernel_size = (2, 2)
                fan_out = 256
                init_stddev = _get_stddev_equivalent_to_msra_fill(
                    kernel_size, fan_out)
                net = tf.keras.layers.Conv2DTranspose(
                    fan_out,
                    kernel_size=kernel_size,
                    strides=(2, 2),
                    padding='valid',
                    activation=None,
                    kernel_initializer=tf.keras.initializers.RandomNormal(
                        stddev=init_stddev),
                    bias_initializer=tf.zeros_initializer(),
                    name='conv5-mask')(net)
                net = self._batch_norm_relu()(net, is_training=is_training)

                kernel_size = (1, 1)
                fan_out = self._num_classes
                init_stddev = _get_stddev_equivalent_to_msra_fill(
                    kernel_size, fan_out)
                mask_outputs = tf.keras.layers.Conv2D(
                    fan_out,
                    kernel_size=kernel_size,
                    strides=(1, 1),
                    padding='valid',
                    kernel_initializer=tf.keras.initializers.RandomNormal(
                        stddev=init_stddev),
                    bias_initializer=tf.zeros_initializer(),
                    name='mask_fcn_logits')(net)
                mask_outputs = tf.reshape(mask_outputs, [
                    -1, num_rois, self._mask_target_size,
                    self._mask_target_size, self._num_classes
                ])

                with tf.name_scope('masks_post_processing'):
                    # TODO(pengchong): Figure out the way not to use the static inferred
                    # batch size.
                    batch_size, num_masks = class_indices.get_shape().as_list()
                    mask_outputs = tf.transpose(a=mask_outputs,
                                                perm=[0, 1, 4, 2, 3])
                    # Constructs indices for gather.
                    batch_indices = tf.tile(
                        tf.expand_dims(tf.range(batch_size), axis=1),
                        [1, num_masks])
                    mask_indices = tf.tile(
                        tf.expand_dims(tf.range(num_masks), axis=0),
                        [batch_size, 1])
                    gather_indices = tf.stack(
                        [batch_indices, mask_indices, class_indices], axis=2)
                    mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
            return mask_outputs
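# --- Illustrative sketch (standalone, made-up shapes) of the
# `masks_post_processing` gather above: for each ROI, select the mask channel
# that matches its class index.
import tensorflow as tf

batch_size, num_masks, num_classes, h, w = 2, 3, 5, 4, 4
mask_logits = tf.random.normal([batch_size, num_masks, num_classes, h, w])
class_indices = tf.constant([[1, 0, 4], [2, 2, 3]])  # [batch_size, num_masks]

batch_idx = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, num_masks])
mask_idx = tf.tile(tf.expand_dims(tf.range(num_masks), 0), [batch_size, 1])
gather_idx = tf.stack([batch_idx, mask_idx, class_indices], axis=2)

# [batch_size, num_masks, h, w]: one class-specific mask per ROI.
per_class_masks = tf.gather_nd(mask_logits, gather_idx)
print(per_class_masks.shape)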
# Example 24
def _milstein_step(*, dim, i, written_count, current_state, result, drift_fn,
                   volatility_fn, grad_volatility_fn, wiener_mean, num_samples,
                   times, dt, sqrt_dt, keep_mask, random_type, seed,
                   normal_draws, input_gradients, stratonovich_order,
                   aux_normal_draws):
    """Performs one step of Milstein scheme."""
    current_time = times[i + 1]
    written_count = tf.cast(written_count, tf.int32)
    if normal_draws is not None:
        dw = normal_draws[i]
    else:
        dw = random.mv_normal_sample((num_samples, ),
                                     mean=wiener_mean,
                                     random_type=random_type,
                                     seed=seed)
    if aux_normal_draws is not None:
        stratonovich_draws = []
        for j in range(3):
            stratonovich_draws.append(
                tf.reshape(aux_normal_draws[j][i],
                           [num_samples, dim, stratonovich_order]))
    else:
        stratonovich_draws = []
        # Three sets of normal draws for stratonovich integrals.
        for j in range(3):
            stratonovich_draws.append(
                random.mv_normal_sample(
                    (num_samples, ),
                    mean=tf.zeros((dim, stratonovich_order),
                                  dtype=current_state.dtype,
                                  name='stratonovich_draws_{}'.format(j)),
                    random_type=random_type,
                    seed=seed))

    if dim == 1:
        drift = drift_fn(current_time, current_state)
        vol = volatility_fn(current_time, current_state)
        grad_vol = grad_volatility_fn(current_time, current_state,
                                      tf.ones_like(current_state))
        next_state = _milstein_1d(dw=dw,
                                  dt=dt[i],
                                  sqrt_dt=sqrt_dt[i],
                                  current_state=current_state,
                                  drift=drift,
                                  vol=vol,
                                  grad_vol=grad_vol)
    else:
        drift = drift_fn(current_time, current_state)
        vol = volatility_fn(current_time, current_state)
        # This is a list of size equal to the dimension of the state space `dim`.
        # It contains tensors of shape [num_samples, dim, wiener_dim] representing
        # the gradient of the volatility function. In our case, the dimension of the
        # wiener process `wiener_dim` is equal to the state dimension `dim`.
        grad_vol = [
            grad_volatility_fn(current_time, current_state, start)
            for start in input_gradients
        ]
        next_state = _milstein_nd(dim=dim,
                                  num_samples=num_samples,
                                  dw=dw,
                                  dt=dt[i],
                                  sqrt_dt=sqrt_dt[i],
                                  current_state=current_state,
                                  drift=drift,
                                  vol=vol,
                                  grad_vol=grad_vol,
                                  stratonovich_draws=stratonovich_draws,
                                  stratonovich_order=stratonovich_order)

    result = utils.maybe_update_along_axis(tensor=result,
                                           do_update=keep_mask[i + 1],
                                           ind=written_count,
                                           axis=1,
                                           new_tensor=tf.expand_dims(
                                               next_state, axis=1))
    written_count += tf.cast(keep_mask[i + 1], dtype=tf.int32)
    return i + 1, written_count, next_state, result
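# --- Hedged sketch: `_milstein_1d` is not shown here, but for a scalar SDE
# dX = a(t, X) dt + b(t, X) dW the classical Milstein update it is expected to
# implement is
#   X_{n+1} = X_n + a dt + b dW + 0.5 * b * (db/dx) * (dW^2 - dt),
# with dW = sqrt(dt) * z for a standard normal draw z. A standalone version:
def milstein_1d_sketch(current_state, drift, vol, grad_vol, z, dt, sqrt_dt):
  dw = z * sqrt_dt  # scale the standard normal draw to a Brownian increment
  return (current_state
          + drift * dt
          + vol * dw
          + 0.5 * vol * grad_vol * (dw**2 - dt))

# e.g. milstein_1d_sketch(1.0, drift=0.05, vol=0.2, grad_vol=0.0, z=0.3,
#                         dt=0.01, sqrt_dt=0.1) reduces to an Euler step since
#                         grad_vol is zero.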
# Example 25
 def _transpose_around_bijector_fn(self,
                                   bijector_fn,
                                   arg,
                                   src_event_ndims,
                                   dest_event_ndims=None,
                                   fn_reduces_event=False,
                                   **kwargs):
     # This function moves the axes corresponding to `self.sample_shape` to the
     # left of the batch shape, then applies `bijector_fn`, then moves the axes
     # corresponding to `self.sample_shape` back to the event part of the shape.
     #
     # `src_event_ndims` and `dest_event_ndims` indicate the expected event rank
     # (omitting `self.sample_shape`) before and after applying `bijector_fn`.
     #
     # This function arose because forward and inverse ended up being quite
     # similar. It was then only a small generalization to also support {F/I}LDJ.
     batch_ndims = ps.rank_from_shape(self.distribution.batch_shape_tensor,
                                      self.distribution.batch_shape)
     extra_sample_ndims = ps.rank_from_shape(self.sample_shape)
     arg_ndims = ps.rank(arg)
     # (1) Expand arg's dims.
     d = arg_ndims - batch_ndims - extra_sample_ndims - src_event_ndims
     arg = tf.reshape(arg,
                      shape=ps.pad(ps.shape(arg),
                                   paddings=[[ps.maximum(0, -d), 0]],
                                   constant_values=1))
     arg_ndims = ps.rank(arg)
     sample_ndims = ps.maximum(0, d)
     # (2) Transpose arg's dims.
     sample_dims = ps.range(0, sample_ndims)
     batch_dims = ps.range(sample_ndims, sample_ndims + batch_ndims)
     extra_sample_dims = ps.range(
         sample_ndims + batch_ndims,
         sample_ndims + batch_ndims + extra_sample_ndims)
     event_dims = ps.range(sample_ndims + batch_ndims + extra_sample_ndims,
                           arg_ndims)
     perm = ps.concat(
         [sample_dims, extra_sample_dims, batch_dims, event_dims], axis=0)
     arg = tf.transpose(arg, perm=perm)
     # (3) Apply underlying bijector.
     result = bijector_fn(arg, **kwargs)
     # (4) Transpose sample_shape from the sample to the event shape.
     result_ndims = ps.rank(result)
     if fn_reduces_event:
         dest_event_ndims = 0
     d = result_ndims - batch_ndims - extra_sample_ndims - dest_event_ndims
     if fn_reduces_event:
         # In some cases, fn may reduce event too far, i.e. ildj may return a
         # scalar `0.`, which won't work with the transpose we do below.
         result = tf.reshape(result,
                             shape=ps.pad(ps.shape(result),
                                          paddings=[[ps.maximum(0, -d), 0]],
                                          constant_values=1))
         result_ndims = ps.rank(result)
     sample_ndims = ps.maximum(0, d)
     sample_dims = ps.range(0, sample_ndims)
     extra_sample_dims = ps.range(sample_ndims,
                                  sample_ndims + extra_sample_ndims)
     batch_dims = ps.range(sample_ndims + extra_sample_ndims,
                           sample_ndims + extra_sample_ndims + batch_ndims)
     event_dims = ps.range(sample_ndims + extra_sample_ndims + batch_ndims,
                           result_ndims)
     perm = ps.concat(
         [sample_dims, batch_dims, extra_sample_dims, event_dims], axis=0)
     return tf.transpose(result, perm=perm)
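# --- Illustrative sketch (standalone, made-up shapes) of step (2) above:
# permuting the `sample_shape` axes to sit immediately left of the batch axes
# before the underlying bijector is applied.
import tensorflow as tf

x = tf.zeros([7, 3, 5, 2])  # [sample, batch, extra_sample, event]
perm = [0, 2, 1, 3]         # -> [sample, extra_sample, batch, event]
y = tf.transpose(x, perm=perm)
print(y.shape)              # (7, 5, 3, 2)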
# Example 26
 def _slopes(self, x):
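     # Softplus makes every slope strictly positive; the 1e-2 offset keeps the
     # slopes bounded away from zero for numerical stability.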
     x = tf.reshape(x, [-1, self._nbins - 1])
     return tf.math.softplus(x) + 1e-2
# Example 27
def main(unused_args):
    del unused_args

    #
    # General setup.
    #

    ebm_util.init_tf2()

    ebm_util.set_seed(FLAGS.seed)

    output_dir = FLAGS.logdir
    checkpoint_dir = os.path.join(output_dir, 'checkpoint')
    samples_dir = os.path.join(output_dir, 'samples')

    tf.io.gfile.makedirs(samples_dir)
    tf.io.gfile.makedirs(checkpoint_dir)

    log_f = tf.io.gfile.GFile(os.path.join(output_dir, 'log.out'), mode='w')
    logger = ebm_util.setup_logging('main', log_f, console=False)
    logger.info({k: v._value for (k, v) in FLAGS._flags().items()})  # pylint: disable=protected-access

    #
    # Data
    #

    if FLAGS.dataset == 'mnist':
        x_train = ebm_util.mnist_dataset(N_CH)
    elif FLAGS.dataset == 'celeba':
        x_train = ebm_util.celeba_dataset()
    else:
        raise ValueError(f'Unknown dataset {FLAGS.dataset}')
    train_ds = tf.data.Dataset.from_tensor_slices(x_train).shuffle(
        10000).batch(FLAGS.batch_size)

    #
    # Models
    #

    if FLAGS.q_type == 'mean_field_gaussian':
        q = MeanFieldGaussianQ()
    else:
        raise ValueError(f'Unknown Q type {FLAGS.q_type}')
    u = make_u()

    #
    # Optimizers
    #

    def lr_p(step):
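        # Linear decay: the rate reaches one third of its initial value at
        # `train_steps` and would hit zero at 1.5 * train_steps.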
        lr = FLAGS.p_learning_rate * (1. - (step / (1.5 * FLAGS.train_steps)))
        return lr

    def lr_q(step):
        lr = FLAGS.q_learning_rate * (1. - (step / (1.5 * FLAGS.train_steps)))
        return lr

    opt_q = tf.optimizers.Adam(learning_rate=ebm_util.LambdaLr(lr_q))
    opt_p = tf.optimizers.Adam(learning_rate=ebm_util.LambdaLr(lr_p),
                               beta_1=FLAGS.p_adam_beta_1)

    #
    # Checkpointing
    #

    global_step_var = tf.Variable(0, trainable=False)
    checkpoint = tf.train.Checkpoint(opt_p=opt_p,
                                     opt_q=opt_q,
                                     u=u,
                                     q=q,
                                     global_step_var=global_step_var)

    checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint')
    if tf.io.gfile.exists(checkpoint_path + '.index'):
        print(f'Restoring from {checkpoint_path}')
        checkpoint.restore(checkpoint_path)

    #
    # Stats initialization
    #

    stat_i = []
    stat_keys = [
        'E_pos',  # Mean energy of the positive samples.
        'E_neg_q',  # Mean energy of the negative samples (pre-HMC).
        'E_neg_p',  # Mean energy of the negative samples (post-HMC).
        'H',  # Entropy of Q (if known).
        'pd_pos',  # Pairwise differences of the positive samples.
        'pd_neg_q',  # Pairwise differences of the negative samples (pre-HMC).
        'pd_neg_p',  # Pairwise differences of the negative samples (post-HMC).
        'hmc_disp',  # L2 distance between initial and final HMC samples.
        'hmc_p_accept',  # HMC P(accept).
        'hmc_step_size',  # HMC step size.
        'x_neg_p_min',  # Minimum value of the negative samples (post-HMC).
        'x_neg_p_max',  # Maximum value of the negative samples (post-HMC).
        'time',  # Time taken to do the training step.
    ]
    stat = {k: [] for k in stat_keys}

    def array_to_str(a, fmt='{:>8.4f}'):
        return ' '.join([fmt.format(v) for v in a])

    def stats_callback(step, entropy, pd_neg_q):
        del step, entropy, pd_neg_q

    step_size = FLAGS.mcmc_step_size

    train_ds_iter = iter(train_ds)
    x_pos_1 = ebm_util.data_preprocess(next(train_ds_iter))
    x_pos_2 = ebm_util.data_preprocess(next(train_ds_iter))

    global_step = global_step_var.numpy()

    while global_step < (FLAGS.train_steps + 1):
        for x_pos in train_ds:

            # Drop partial batches.
            if x_pos.shape[0] != FLAGS.batch_size:
                continue

            #
            # Update
            #

            start_time = time.time()

            x_pos = ebm_util.data_preprocess(x_pos)
            x_pos = ebm_util.data_discrete_noise(x_pos)

            if FLAGS.p_loss == 'neutra_hmc':
                (x_neg_q, x_neg_p, p_accept, step_size, pos_e, pos_e_updated,
                 neg_e_q, neg_e_p,
                 neg_e_p_updated) = train_p(q, u, x_pos, step_size, opt_p)
            elif FLAGS.p_loss == 'neutra_iid':
                (x_neg_q, x_neg_p, p_accept, step_size, pos_e, pos_e_updated,
                 neg_e_q, neg_e_p,
                 neg_e_p_updated) = train_p_mh(q, u, x_pos, step_size, opt_p)
            else:
                raise ValueError(f'Unknown P loss {FLAGS.p_loss}')

            if FLAGS.q_loss == 'forward_kl':
                train_q_fwd_kl(q, x_neg_p, opt_q)
                entropy = 0.0
                mle_loss = 0.0
            elif FLAGS.q_loss == 'reverse_kl':
                for _ in range(10):
                    _, entropy = train_q_rev_kl(q, u, opt_q)
                mle_loss = 0.0
            elif FLAGS.q_loss == 'reverse_kl_mle':
                for _ in range(FLAGS.q_sub_steps):
                    alpha = FLAGS.q_rkl_weight
                    (_, entropy, _, mle_loss, norm_grads_ebm,
                     norm_grads_mle) = train_q_rev_kl_mle(
                         q, u, x_pos, tf.convert_to_tensor(alpha), opt_q)

            elif FLAGS.q_loss == 'mle':
                mle_loss = train_q_mle(q, x_pos, opt_q)
                entropy = 0.0
            else:
                raise ValueError(f'Unknown Q loss {FLAGS.q_loss}')

            end_time = time.time()

            #
            # Stats
            #

            hmc_disp = tf.reduce_mean(
                tf.norm(tf.reshape(x_neg_q, [64, -1]) -
                        tf.reshape(x_neg_p, [64, -1]),
                        axis=1))

            if global_step % FLAGS.plot_steps == 0:

                # Positives + negatives.
                ebm_util.plot(
                    tf.reshape(ebm_util.data_postprocess(x_neg_q),
                               [FLAGS.batch_size, N_WH, N_WH, N_CH]),
                    os.path.join(samples_dir, f'x_neg_q_{global_step}.png'))
                ebm_util.plot(
                    tf.reshape(ebm_util.data_postprocess(x_neg_p),
                               [FLAGS.batch_size, N_WH, N_WH, N_CH]),
                    os.path.join(samples_dir, f'x_neg_p_{global_step}.png'))
                ebm_util.plot(
                    tf.reshape(ebm_util.data_postprocess(x_pos),
                               [FLAGS.batch_size, N_WH, N_WH, N_CH]),
                    os.path.join(samples_dir, f'x_pos_{global_step}.png'))

                # Samples for various temperatures.
                for t in [0.1, 0.5, 1.0, 2.0, 4.0]:
                    _, x_neg_q_t, _ = q.sample_with_log_prob(FLAGS.batch_size,
                                                             temp=t)
                    ebm_util.plot(
                        tf.reshape(ebm_util.data_postprocess(x_neg_q_t),
                                   [FLAGS.batch_size, N_WH, N_WH, N_CH]),
                        os.path.join(samples_dir,
                                     f'x_neg_t_{t}_{global_step}.png'))

                stats_callback(global_step, entropy,
                               ebm_util.nearby_difference(x_neg_q))

                stat_i.append(global_step)
                stat['E_pos'].append(pos_e_updated)
                stat['E_neg_q'].append(neg_e_q)
                stat['E_neg_p'].append(neg_e_p)
                stat['H'].append(entropy)
                stat['pd_neg_q'].append(ebm_util.nearby_difference(x_neg_q))
                stat['pd_neg_p'].append(ebm_util.nearby_difference(x_neg_p))
                stat['pd_pos'].append(ebm_util.nearby_difference(x_pos))
                stat['hmc_disp'].append(hmc_disp)
                stat['hmc_p_accept'].append(p_accept)
                stat['hmc_step_size'].append(step_size)
                stat['x_neg_p_min'].append(tf.reduce_min(x_neg_p))
                stat['x_neg_p_max'].append(tf.reduce_max(x_neg_p))
                stat['time'].append(end_time - start_time)

                ebm_util.plot_stat(stat_keys, stat, stat_i, output_dir)

                # Doing a linear interpolation in the latent space.
                z_pos_1 = q.forward(x_pos_1)[0]
                z_pos_2 = q.forward(x_pos_2)[0]

                x_alphas = []
                n_steps = 10
                for j in range(0, n_steps + 1):
                    alpha = (j / n_steps)
                    z_alpha = (1. - alpha) * z_pos_1 + (alpha) * z_pos_2
                    x_alpha = q.reverse(z_alpha)[0]
                    x_alphas.append(x_alpha)

                ebm_util.plot_n_by_m(
                    ebm_util.data_postprocess(
                        tf.reshape(tf.stack(x_alphas, axis=1), [
                            (n_steps + 1) * FLAGS.batch_size, N_WH, N_WH, N_CH
                        ])),
                    os.path.join(samples_dir, f'x_alpha_{global_step}.png'),
                    FLAGS.batch_size, n_steps + 1)

                # Doing random perturbations in the latent space.
                for eps in [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 2e0, 2.5e0, 3e0]:
                    z_pos_2_eps = z_pos_2 + eps * tf.random.normal(
                        z_pos_2.shape)
                    x_alpha = q.reverse(z_pos_2_eps)[0]
                    ebm_util.plot(
                        tf.reshape(ebm_util.data_postprocess(x_alpha),
                                   [FLAGS.batch_size, N_WH, N_WH, N_CH]),
                        os.path.join(samples_dir,
                                     f'x_alpha_eps_{eps}_{global_step}.png'))

                # Check the log-probabilities of positive and negative examples
                # under Q.
                z_neg_test, x_neg_test, _ = q.sample_with_log_prob(
                    FLAGS.batch_size, temp=FLAGS.q_temperature)
                z_pos_test = q.forward(x_pos)[0]

                z_neg_test_pd = ebm_util.nearby_difference(z_neg_test)
                z_pos_test_pd = ebm_util.nearby_difference(z_pos_test)

                z_norms_neg = tf.reduce_mean(tf.norm(z_neg_test, axis=1))
                z_norms_pos = tf.reduce_mean(tf.norm(z_pos_test, axis=1))

                log_prob_neg = tf.reduce_mean(q.log_prob(x_neg_test))
                log_prob_pos = tf.reduce_mean(q.log_prob(x_pos))

                logger.info('  '.join([
                    f'i={global_step:6d}',
                    # Pre-update, post-update
                    (f'E_pos=[{pos_e:10.4f} {pos_e_updated:10.4f} ' +
                     f'{pos_e_updated - pos_e:10.4f}]'),
                    # Pre-update pre-HMC, pre-update post-HMC, post-update post-HMC
                    (f'E_neg=[{neg_e_q:10.4f} {neg_e_p:10.4f} ' +
                     f'{neg_e_p_updated:10.4f} {neg_e_p_updated - neg_e_p:10.4f}]'
                     ),
                    f'mle={tf.reduce_mean(mle_loss):8.4f}',
                    f'H={entropy:8.4f}',
                    f'norm_grads_ebm={norm_grads_ebm:8.4f}',
                    f'norm_grads_mle={norm_grads_mle:8.4f}',
                    f'pd(x_pos)={ebm_util.nearby_difference(x_pos):8.4f}',
                    f'pd(x_neg_q)={ebm_util.nearby_difference(x_neg_q):8.4f}',
                    f'pd(x_neg_p)={ebm_util.nearby_difference(x_neg_p):8.4f}',
                    f'hmc_disp={hmc_disp:8.4f}',
                    f'p(accept)={p_accept:8.4f}',
                    f'step_size={step_size:8.4f}',
                    # Min, max.
                    (f'x_neg_q=[{tf.reduce_min(x_neg_q):8.4f} ' +
                     f'{tf.reduce_max(x_neg_q):8.4f}]'),
                    (f'x_neg_p=[{tf.reduce_min(x_neg_p):8.4f} ' +
                     f'{tf.reduce_max(x_neg_p):8.4f}]'),
                    f'z_neg_norm={array_to_str(z_norms_neg)}',
                    f'z_pos_norm={array_to_str(z_norms_pos)}',
                    f'z_neg_test_pd={z_neg_test_pd:>8.2f}',
                    f'z_pos_test_pd={z_pos_test_pd:>8.2f}',
                    f'log_prob_neg={log_prob_neg:12.2f}',
                    f'log_prob_pos={log_prob_pos:12.2f}',
                ]))

            if global_step % FLAGS.save_steps == 0:

                global_step_var.assign(global_step)
                checkpoint.write(os.path.join(checkpoint_dir, 'checkpoint'))

            global_step += 1
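# Example 28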
    def __call__(self, net, is_training=False):
        """Builds Dropblock layer.

        Args:
          net: `Tensor` input tensor.
          is_training: `bool` if True, the model is in training mode.

        Returns:
          A version of input tensor with DropBlock applied.
        """
        if not is_training or self._dropblock_keep_prob is None:
            return net

        logging.info(
            'Applying DropBlock: dropblock_size {}, net.shape {}'.format(
                self._dropblock_size, net.shape))

        if self._data_format == 'channels_last':
            _, height, width, _ = net.get_shape().as_list()
        else:
            _, _, height, width = net.get_shape().as_list()

        total_size = width * height
        dropblock_size = min(self._dropblock_size, min(width, height))
        # seed_drop_rate is the gamma parameter of DropBlock.
        seed_drop_rate = (1.0 - self._dropblock_keep_prob
                          ) * total_size / dropblock_size**2 / (
                              (width - self._dropblock_size + 1) *
                              (height - self._dropblock_size + 1))

        # Forces the block to be inside the feature map.
        w_i, h_i = tf.meshgrid(tf.range(width), tf.range(height))
        valid_block = tf.logical_and(
            tf.logical_and(w_i >= int(dropblock_size // 2),
                           w_i < width - (dropblock_size - 1) // 2),
            tf.logical_and(h_i >= int(dropblock_size // 2),
                           h_i < height - (dropblock_size - 1) // 2))

        if self._data_format == 'channels_last':
            valid_block = tf.reshape(valid_block, [1, height, width, 1])
        else:
            valid_block = tf.reshape(valid_block, [1, 1, height, width])

        randnoise = tf.random.uniform(net.shape, dtype=tf.float32)
        valid_block = tf.cast(valid_block, dtype=tf.float32)
        seed_keep_rate = tf.cast(1 - seed_drop_rate, dtype=tf.float32)
        block_pattern = (1 - valid_block + seed_keep_rate + randnoise) >= 1
        block_pattern = tf.cast(block_pattern, dtype=tf.float32)

        if self._data_format == 'channels_last':
            ksize = [1, self._dropblock_size, self._dropblock_size, 1]
        else:
            ksize = [1, 1, self._dropblock_size, self._dropblock_size]
        block_pattern = -tf.nn.max_pool2d(
            -block_pattern,
            ksize=ksize,
            strides=[1, 1, 1, 1],
            padding='SAME',
            data_format='NHWC'
            if self._data_format == 'channels_last' else 'NCHW')

        percent_ones = tf.cast(tf.reduce_sum(input_tensor=block_pattern),
                               tf.float32) / tf.cast(
                                   tf.size(input=block_pattern), tf.float32)

        net = net / tf.cast(percent_ones, net.dtype) * tf.cast(
            block_pattern, net.dtype)
        return net
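# --- Standalone arithmetic sketch of the DropBlock "seed_drop_rate" (gamma)
# computed above, with made-up values: keep_prob=0.9 on a 32x32 feature map
# with block size 5.
keep_prob, height, width, block_size = 0.9, 32, 32, 5
gamma = ((1.0 - keep_prob) * height * width / block_size**2
         / ((width - block_size + 1) * (height - block_size + 1)))
print(round(gamma, 5))  # ~0.00522: chance that a position seeds a dropped block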
# Example 29
def sample_and_preprocess(video,
                          labels,
                          seq_label,
                          seq_len,
                          name,
                          num_steps,
                          augment,
                          sample_all=False,
                          sample_all_stride=1,
                          add_shape=False):
    """Samples frames and prepares them for training."""

    if sample_all:
        # When dealing with very long videos we can choose to sub-sample to fit
        # data in memory. But be aware this also evaluates over a subset of frames.
        # Subsampling the validation set videos when reporting performance is not
        # recommended.
        steps = tf.range(0, seq_len, sample_all_stride)
        seq_len = tf.shape(steps)[0]
        chosen_steps = steps
    else:
        stride = CONFIG.DATA.STRIDE
        sampling_strategy = CONFIG.DATA.SAMPLING_STRATEGY

        # TODO(debidatta) : More flexible sampling
        if sampling_strategy == 'stride':
            # Offset can be set between 0 and maximum location from which we can get
            # total coverage of the video without having to pad.
            # This handles sampling over longer sequences.
            offset = tf.random.uniform(
                (),
                0,
                tf.maximum(tf.cast(1, tf.int64), seq_len - stride * num_steps),
                dtype=tf.int64)
            # This handles sampling over shorter sequences by padding the last frame
            # many times. This is not ideal for the way alignment training batches are
            # created.
            steps = tf.minimum(
                seq_len - 1,
                tf.range(offset, offset + num_steps * stride + 1, stride))
            steps = steps[:num_steps]
        elif sampling_strategy == 'offset_uniform':
            # Sample a random offset less than a provided max offset. Among all
            # frames past the chosen offset, randomly sample num_steps frames.
            check1 = tf.debugging.assert_greater_equal(
                seq_len,
                tf.cast(CONFIG.DATA.RANDOM_OFFSET, tf.int64),
                message='Random offset is more than sequence length.')
            check2 = tf.less_equal(
                tf.cast(num_steps, tf.int64),
                seq_len - tf.cast(CONFIG.DATA.RANDOM_OFFSET, tf.int64),
            )

            def _sample_random():
                with tf.control_dependencies([tf.identity(check1.outputs[0])]):
                    offset = CONFIG.DATA.RANDOM_OFFSET
                    steps = tf.random.shuffle(tf.range(offset, seq_len))
                    steps = tf.gather(steps, tf.range(0, num_steps))
                    steps = tf.gather(
                        steps,
                        tf.nn.top_k(steps, k=num_steps).indices[::-1])
                    return steps

            def _sample_all():
                return tf.range(0, num_steps, dtype=tf.int64)

            steps = tf.cond(check2, _sample_random, _sample_all)

        else:
            raise ValueError(
                'Sampling strategy %s is unknown. Supported values are '
                'stride, offset_uniform.' % sampling_strategy)

        if not sample_all and 'tcn' in CONFIG.TRAINING_ALGO:
            pos_window = CONFIG.TCN.POSITIVE_WINDOW
            # pylint: disable=g-long-lambda
            pos_steps = tf.map_fn(
                lambda step: tf.random.uniform(
                    (), minval=step - pos_window, maxval=step, dtype=tf.int64),
                steps)
            # pylint: enable=g-long-lambda
            steps = tf.stack([pos_steps, steps])
            steps = tf.reshape(tf.transpose(steps), (-1, ))

        # Store chosen indices.
        chosen_steps = steps
        # Get multiple context steps depending on config at selected steps.
        steps = tf.reshape(tf.map_fn(get_steps, steps), [-1])
        steps = tf.maximum(tf.cast(0, tf.int64), steps)
        steps = tf.minimum(seq_len - 1, steps)

    shape_all_steps = CONFIG.DATA.NUM_STEPS * num_steps
    if not sample_all and 'tcn' in CONFIG.TRAINING_ALGO:
        shape_all_steps *= 2

    # Select data based on steps.
    video = tf.gather(video, steps)
    # Decode the encoded JPEG images
    video = tf.map_fn(tf.image.decode_jpeg,
                      video,
                      parallel_iterations=FLAGS.num_parallel_calls,
                      dtype=tf.uint8)
    # Take images in range [0, 255] and normalize to [0, 1]
    video = tf.map_fn(normalize_input,
                      video,
                      parallel_iterations=FLAGS.num_parallel_calls,
                      dtype=tf.float32)
    # Perform data-augmentation and return images in range [-1, 1]
    video = preprocess_input(video, augment)
    if add_shape:
        video.set_shape(
            [shape_all_steps, CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, 3])

    if CONFIG.DATA.FRAME_LABELS:
        labels = tf.gather(labels, steps)
        if add_shape:
            labels.set_shape([shape_all_steps])

    return {
        'frames': video,
        'frame_labels': labels,
        'chosen_steps': chosen_steps,
        'seq_lens': seq_len,
        'seq_labels': seq_label,
        'name': name
    }
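# --- Standalone sketch (made-up numbers) of the 'stride' sampling branch above:
# draw a random offset, take `num_steps` indices spaced by `stride`, and clamp
# to the last frame when the video is shorter than the requested span.
import tensorflow as tf

seq_len = tf.constant(20, tf.int64)
stride = tf.constant(3, tf.int64)
num_steps = 8
offset = tf.random.uniform(
    (), 0, tf.maximum(tf.cast(1, tf.int64), seq_len - stride * num_steps),
    dtype=tf.int64)
steps = tf.minimum(seq_len - 1,
                   tf.range(offset, offset + num_steps * stride + 1, stride))
steps = steps[:num_steps]
print(steps.numpy())  # [0 3 6 9 12 15 18 19]; offset is always 0 at these sizes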
# Example 30
    def __call__(self,
                 fpn_features,
                 boxes,
                 outer_boxes,
                 classes,
                 is_training=None):
        """Generate the detection priors from the box detections and FPN features.

        This corresponds to Fig. 4 of the ShapeMask paper at
        https://arxiv.org/pdf/1904.03239.pdf

        Args:
          fpn_features: a dictionary of FPN features.
          boxes: a float tensor of shape [batch_size, num_instances, 4]
            representing the tight gt boxes from dataloader/detection.
          outer_boxes: a float tensor of shape [batch_size, num_instances, 4]
            representing the loose gt boxes from dataloader/detection.
          classes: an int Tensor of shape [batch_size, num_instances]
            of instance classes.
          is_training: training mode or not.

        Returns:
          crop_features: a float Tensor of shape [batch_size * num_instances,
            mask_crop_size, mask_crop_size, num_downsample_channels]. This is
            the instance feature crop.
          detection_priors: A float Tensor of shape [batch_size * num_instances,
            mask_size, mask_size, 1].
        """
        with backend.get_graph().as_default():
            # Load class-specific or class-agnostic shape priors.
            if self._shape_prior_path:
                if self._use_category_for_mask:
                    fid = tf.io.gfile.GFile(self._shape_prior_path, 'rb')
                    # The encoding='bytes' option handles the pickle
                    # incompatibility between Python 2 and Python 3.
                    class_tups = pickle.load(fid, encoding='bytes')
                    max_class_id = class_tups[-1][0] + 1
                    class_masks = np.zeros(
                        (max_class_id, self._num_clusters,
                         self._mask_crop_size, self._mask_crop_size),
                        dtype=np.float32)
                    for cls_id, _, cls_mask in class_tups:
                        assert cls_mask.shape == (self._num_clusters,
                                                  self._mask_crop_size**2)
                        class_masks[cls_id] = cls_mask.reshape(
                            self._num_clusters, self._mask_crop_size,
                            self._mask_crop_size)

                    self.class_priors = tf.convert_to_tensor(value=class_masks,
                                                             dtype=tf.float32)
                else:
                    npy_path = tf.io.gfile.GFile(self._shape_prior_path)
                    class_np_masks = np.load(npy_path)
                    assert class_np_masks.shape == (
                        self._num_clusters, self._mask_crop_size,
                        self._mask_crop_size), 'Invalid priors!!!'
                    self.class_priors = tf.convert_to_tensor(
                        value=class_np_masks, dtype=tf.float32)
            else:
                self.class_priors = tf.zeros([
                    self._num_clusters, self._mask_crop_size,
                    self._mask_crop_size
                ], tf.float32)

            batch_size = boxes.get_shape()[0]
            min_level_shape = fpn_features[
                self._min_mask_level].get_shape().as_list()
            self._max_feature_size = min_level_shape[1]
            detection_prior_levels = self._compute_box_levels(boxes)
            level_outer_boxes = outer_boxes / tf.pow(
                2., tf.expand_dims(detection_prior_levels, -1))
            detection_prior_levels = tf.cast(detection_prior_levels, tf.int32)
            uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
                tf.ones([
                    batch_size, self._num_of_instances, self._mask_crop_size,
                    self._mask_crop_size
                ], tf.float32), boxes, outer_boxes, self._mask_crop_size)

            # Prepare crop features.
            multi_level_features = self._get_multilevel_features(fpn_features)
            crop_features = spatial_transform_ops.single_level_feature_crop(
                multi_level_features, level_outer_boxes,
                detection_prior_levels, self._min_mask_level,
                self._mask_crop_size)

            # Predict and fuse shape priors.
            shape_weights = self._classify_and_fuse_detection_priors(
                uniform_priors, classes, crop_features)
            fused_shape_priors = self._fuse_priors(shape_weights, classes)
            fused_shape_priors = tf.reshape(fused_shape_priors, [
                batch_size, self._num_of_instances, self._mask_crop_size,
                self._mask_crop_size
            ])
            predicted_detection_priors = spatial_transform_ops.crop_mask_in_target_box(
                fused_shape_priors, boxes, outer_boxes, self._mask_crop_size)
            predicted_detection_priors = tf.reshape(
                predicted_detection_priors,
                [-1, self._mask_crop_size, self._mask_crop_size, 1])

            return crop_features, predicted_detection_priors
# Example 31
 def get_batch_nodes(self, indices):
   radius_batch = tf.reshape(self.radius_by_batch, [1, -1, 1])
   return self.get_hyperbolic_points(radius_batch, self.node(indices))
# Example 32
    def __call__(self,
                 crop_features,
                 detection_priors,
                 inst_classes,
                 is_training=None):
        """Generate instance masks from FPN features and detection priors.

        This corresponds to Figs. 5-6 of the ShapeMask paper at
        https://arxiv.org/pdf/1904.03239.pdf

        Args:
          crop_features: a float Tensor of shape [batch_size * num_instances,
            mask_crop_size, mask_crop_size, num_downsample_channels]. This is
            the instance feature crop.
          detection_priors: a float Tensor of shape [batch_size * num_instances,
            mask_crop_size, mask_crop_size, 1]. This is the detection prior for
            the instance.
          inst_classes: an int Tensor of shape [batch_size, num_instances]
            of instance classes.
          is_training: a bool indicating whether the model is in training mode.

        Returns:
          coarse_mask_classes: per-class coarse mask logits from the
            class-prediction convolution.
          class_probs: sigmoid probabilities of the coarse mask logits selected
            for each instance class (all classes when class-agnostic).
          prior_conditioned_features: the crop features fused with the embedded
            detection priors.
        """
        # Embed the anchor map into some feature space for anchor conditioning.
        detection_prior_features = tf.keras.layers.Conv2D(
            self._num_downsample_channels,
            kernel_size=(1, 1),
            bias_initializer=tf.zeros_initializer(),
            kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.,
                                                                  stddev=0.01),
            padding='same',
            name='anchor-conv')(detection_priors)

        prior_conditioned_features = crop_features + detection_prior_features
        coarse_output_features = self.coarsemask_decoder_net(
            prior_conditioned_features, is_training)

        coarse_mask_classes = tf.keras.layers.Conv2D(
            self._mask_num_classes,
            kernel_size=(1, 1),
            # Focal loss bias initialization to have foreground 0.01 probability.
            bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) /
                                                             0.01)),
            kernel_initializer=tf.keras.initializers.RandomNormal(mean=0,
                                                                  stddev=0.01),
            padding='same',
            name='class-predict')(coarse_output_features)

        if self._use_category_for_mask:
            inst_classes = tf.cast(tf.reshape(inst_classes, [-1]), tf.int32)
            coarse_mask_classes_t = tf.transpose(a=coarse_mask_classes,
                                                 perm=(0, 3, 1, 2))
            # pylint: disable=g-long-lambda
            coarse_mask_logits = tf.cond(
                pred=tf.size(input=inst_classes) > 0,
                true_fn=lambda: tf.gather_nd(
                    coarse_mask_classes_t,
                    tf.stack([
                        tf.range(tf.size(input=inst_classes)), inst_classes - 1
                    ],
                             axis=1)),
                false_fn=lambda: coarse_mask_classes_t[:, 0, :, :])
            # pylint: enable=g-long-lambda
            coarse_mask_logits = tf.expand_dims(coarse_mask_logits, -1)
        else:
            coarse_mask_logits = coarse_mask_classes

        coarse_class_probs = tf.nn.sigmoid(coarse_mask_logits)
        class_probs = tf.cast(coarse_class_probs,
                              prior_conditioned_features.dtype)

        return coarse_mask_classes, class_probs, prior_conditioned_features
# Example 33
 def call(self, x):
     x = tf.reshape(x, shape=[-1, N_WH, N_WH, N_CH])
     prior = tf.reduce_sum((x**2), axis=[1, 2, 3])
     energy = tf.squeeze(self.net(x))
     return FLAGS.p_prior_weight * prior + energy / FLAGS.p_temperature