Example 1
def _get_weights(self, hidden_dim=None, model_id=None):
  """Overrides the superclass method to build per-model ensemble embeddings."""
  if hidden_dim is None:
    hidden_dim = self._body_input_depth
  if self.fusion_mode() == "share_embeddings":
    return tf.get_variable(
        "ens_weights_shared", [self._vocab_size, hidden_dim],
        initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5))
  shards = []
  if model_id is None:
    model_ids = range(len(self._model_hparams.ensemble_hidden_sizes))
  else:
    model_ids = [model_id]
  for model_id in model_ids:
    model_hidden_size = self._model_hparams.ensemble_hidden_sizes[model_id]
    var_name = "ens_weights_%d" % model_id
    model_embed_matrix = tf.get_variable(
        var_name, [self._vocab_size, model_hidden_size],
        initializer=tf.random_normal_initializer(0.0, model_hidden_size**-0.5))
    if not self._model_hparams.ensemble_enabled[model_id]:
      # Disabled models still create their variables but contribute zeros.
      model_embed_matrix = model_embed_matrix * 0.0
    if not self._model_hparams.ensemble_trainable[model_id]:
      model_embed_matrix = tf.stop_gradient(model_embed_matrix)
    shards.append(model_embed_matrix)
  if len(shards) == 1:
    return shards[0]
  ret = tf.concat(shards, 1)
  # Convert ret to tensor.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
  return ret
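For orientation, the kind of lookup a concatenated ensemble embedding like this supports might look as follows. This is a standalone sketch, not part of the original class: it assumes TF 1.x (as the snippets on this page do), and `vocab_size` / `ensemble_hidden_sizes` below are made-up illustrative values.

# Standalone sketch (TF 1.x): one embedding table per ensemble member,
# concatenated along the hidden dimension, then an ordinary id -> vector lookup.
import tensorflow as tf

vocab_size = 100
ensemble_hidden_sizes = [32, 64]  # hypothetical per-model hidden sizes

shards = []
for model_id, hidden in enumerate(ensemble_hidden_sizes):
  shards.append(
      tf.get_variable(
          "ens_weights_%d" % model_id, [vocab_size, hidden],
          initializer=tf.random_normal_initializer(0.0, hidden**-0.5)))
embedding_matrix = tf.concat(shards, 1)      # [vocab_size, sum(hidden_sizes)]

ids = tf.constant([[3, 7, 7]])               # [batch, length]
embedded = tf.gather(embedding_matrix, ids)  # [batch, length, 96]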
Example 2
def _get_weights(self, hidden_dim=None):
    """Create or get concatenated embedding or softmax variable.

    Args:
      hidden_dim: dim of the variable. Defaults to self._body_input_depth.

    Returns:
      the concatenated embedding or softmax variable as a single Tensor.
    """
    if hidden_dim is None:
        hidden_dim = self._body_input_depth
    num_shards = self._model_hparams.symbol_modality_num_shards
    shards = []
    for i in range(num_shards):
        shard_size = (self._vocab_size // num_shards) + (
            1 if i < self._vocab_size % num_shards else 0)
        var_name = "weights_%d" % i
        shards.append(
            tf.get_variable(var_name, [shard_size, hidden_dim],
                            initializer=tf.random_normal_initializer(
                                0.0, hidden_dim**-0.5)))
    if num_shards == 1:
        ret = shards[0]
    else:
        ret = tf.concat(shards, 0)
    # Convert ret to tensor.
    if not tf.contrib.eager.in_eager_mode():
        ret = common_layers.convert_gradient_to_tensor(ret)
    return ret
Example 3
  def _get_weights(self, hidden_dim=None):
    """Create or get concatenated embedding or softmax variable.

    Args:
      hidden_dim: dim of the variable. Defaults to self._body_input_depth

    Returns:
      the concatenated embedding or softmax variable as a single Tensor.
    """
    if hidden_dim is None:
      hidden_dim = self._body_input_depth
    num_shards = self._model_hparams.symbol_modality_num_shards
    shards = []
    for i in range(num_shards):
      shard_size = (self._vocab_size // num_shards) + (
          1 if i < self._vocab_size % num_shards else 0)
      var_name = "weights_%d" % i
      shards.append(
          tf.get_variable(
              var_name, [shard_size, hidden_dim],
              initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5)))
    if num_shards == 1:
      ret = shards[0]
    else:
      ret = tf.concat(shards, 0)
    # Convert ret to tensor.
    if not tf.contrib.eager.in_eager_mode():
      ret = common_layers.convert_gradient_to_tensor(ret)
    return ret
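The shard-size expression above splits the vocabulary as evenly as possible across `symbol_modality_num_shards` shards, giving the first `vocab_size % num_shards` shards one extra row. A quick standalone check with illustrative numbers:

# Quick check of the shard-size arithmetic: the shard sizes differ by at
# most one row and always sum back to vocab_size. Values are illustrative.
vocab_size = 10
num_shards = 3

shard_sizes = [
    (vocab_size // num_shards) + (1 if i < vocab_size % num_shards else 0)
    for i in range(num_shards)
]
print(shard_sizes)        # [4, 3, 3]
assert sum(shard_sizes) == vocab_size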
Example 4
def combine(self, expert_out, multiply_by_gates=True):
  """Sum together the expert output, weighted by the gates.

  The slice corresponding to a particular batch element `b` is computed
  as the sum over all experts `i` of the expert output, weighted by the
  corresponding gate values.  If `multiply_by_gates` is set to False, the
  gate values are ignored.

  Args:
    expert_out: a list of `num_experts` `Tensor`s, each with shape
      `[expert_batch_size_i, <extra_output_dims>]`.
    multiply_by_gates: a boolean

  Returns:
    a `Tensor` with shape `[batch_size, <extra_output_dims>]`.
  """
  # See comments on convert_gradient_to_tensor.
  stitched = common_layers.convert_gradient_to_tensor(
      tf.concat(expert_out, 0))
  if multiply_by_gates:
    stitched *= tf.expand_dims(self._nonzero_gates, 1)
  combined = tf.unsorted_segment_sum(stitched, self._batch_index,
                                     tf.shape(self._gates)[0])
  return combined
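The recombination step relies on `tf.unsorted_segment_sum`, which sums every row of the stitched expert output back onto the batch position it came from. A tiny standalone illustration (TF 1.x session API, made-up numbers, independent of the class above):

# Each row of expert_rows is routed back to an original example via
# batch_index; rows with the same index are summed together.
import tensorflow as tf

expert_rows = tf.constant([[1.0], [2.0], [4.0]])  # stitched expert outputs
batch_index = tf.constant([0, 1, 0])              # row -> original example
combined = tf.unsorted_segment_sum(expert_rows, batch_index, 2)

with tf.Session() as sess:
  print(sess.run(combined))  # [[5.], [2.]] -- rows 0 and 2 summed into example 0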
Example 5
def _get_weights(model_hparams, vocab_size, hidden_dim=None):
  """Copied from tensor2tensor/layers/modalities.py but uses total vocab."""
  if hidden_dim is None:
    hidden_dim = model_hparams.hidden_size
  num_shards = model_hparams.symbol_modality_num_shards
  shards = []
  for i in range(num_shards):
    shard_size = (sum(vocab_size) // num_shards) + (
        1 if i < sum(vocab_size) % num_shards else 0)
    var_name = 'weights_%d' % i
    shards.append(
        tf.get_variable(
            var_name, [shard_size, hidden_dim],
            initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5)))
  if num_shards == 1:
    ret = shards[0]
  else:
    ret = tf.concat(shards, 0)
  # Convert ret to tensor.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
  return ret
Example 6
def _get_weights(self, hidden_dim=None):
    """Copied from tensor2tensor/layers/modalities.py but uses total vocab."""
    if hidden_dim is None:
        hidden_dim = self._body_input_depth
    num_shards = self._model_hparams.symbol_modality_num_shards
    shards = []
    for i in range(num_shards):
        shard_size = (sum(self._vocab_size) // num_shards) + (
            1 if i < sum(self._vocab_size) % num_shards else 0)
        var_name = 'weights_%d' % i
        shards.append(
            tf.get_variable(var_name, [shard_size, hidden_dim],
                            initializer=tf.random_normal_initializer(
                                0.0, hidden_dim**-0.5)))
    if num_shards == 1:
        ret = shards[0]
    else:
        ret = tf.concat(shards, 0)
    # Convert ret to tensor.
    if not tf.contrib.eager.in_eager_mode():
        ret = common_layers.convert_gradient_to_tensor(ret)
    return ret
Example 7
def _get_weights(model_hparams, vocab_size, hidden_dim=None):
    """Create or get concatenated embedding or softmax variable.

  Args:
    model_hparams: tf.HParams, model hyperparmeters.
    vocab_size: int, vocabulary size.
    hidden_dim: dim of the variable. Defaults to model_hparams.hidden_size

  Returns:
     a list of num_shards Tensors.
  """
    if hidden_dim is None:
        hidden_dim = model_hparams.hidden_size
    num_shards = model_hparams.symbol_modality_num_shards
    shards = []

    sparsity_technique = model_hparams.get("sparsity_technique")
    aux_params_shards = []
    for i in range(num_shards):
        shard_size = (vocab_size //
                      num_shards) + (1 if i < vocab_size % num_shards else 0)
        var_name = "weights_%d" % i

        weight_init_stddev = hidden_dim**-0.5
        if (model_hparams.get("load_masks_from")
                and model_hparams.get("initial_sparsity")):
            # If we are loading constant masks for scratch-e or scratch-b
            # experiments, we optionally rescale the variance of the weight
            # initialization.
            initial_sparsity = model_hparams.get("initial_sparsity")
            weight_init_stddev = (hidden_dim * (1 - initial_sparsity))**-0.5
            tf.logging.info(
                "Using sparse initialization with sparsity {} for symbol ".
                format(initial_sparsity))

        shards.append(
            tf.get_variable(var_name, [shard_size, hidden_dim],
                            initializer=tf.random_normal_initializer(
                                0.0, weight_init_stddev)))
        if sparsity_technique == "variational_dropout":
            aux_params_shards.append(
                tf.get_variable(
                    var_name + "_aux", [shard_size, hidden_dim],
                    initializer=tf.constant_initializer(value=-10.0)))
        elif sparsity_technique == "l0_regularization":
            initializer = tf.random_normal_initializer(mean=2.197, stddev=0.01)
            aux_params_shards.append(
                tf.get_variable(var_name + "_aux", [shard_size, hidden_dim],
                                initializer=initializer))

    if num_shards == 1:
        ret = shards[0]
    else:
        ret = tf.concat(shards, 0)

    if not aux_params_shards:
        # Convert ret to tensor.
        if not tf.contrib.eager.in_eager_mode():
            ret = common_layers.convert_gradient_to_tensor(ret)
        return ret

    # Handle the auxiliary parameters
    if num_shards == 1:
        aux_ret = aux_params_shards[0]
    else:
        aux_ret = tf.concat(aux_params_shards, 0)

    global COLLECTED_VARIABLES
    if not COLLECTED_VARIABLES:
        if sparsity_technique == "variational_dropout":
            tf.add_to_collection(common_sparse.VARIATIONAL_DROPOUT_PARAMETERS,
                                 (ret, aux_ret))
        elif sparsity_technique == "l0_regularization":
            tf.add_to_collection(common_sparse.L0_REGULARIZATION_PARAMETERS,
                                 (ret, aux_ret))
        COLLECTED_VARIABLES = True

    # Convert aux ret to tensor.
    if not tf.contrib.eager.in_eager_mode():
        ret = common_layers.convert_gradient_to_tensor(ret)
        aux_ret = common_layers.convert_gradient_to_tensor(aux_ret)
    return (ret, aux_ret)
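The rescaled initializer used for the constant-mask (scratch-e / scratch-b) case is `(hidden_dim * (1 - initial_sparsity))**-0.5`, i.e. the usual `hidden_dim**-0.5` adjusted for the fraction of weights expected to survive the mask. A rough numeric check with illustrative values:

# With 90% initial sparsity, only 10% of each row is expected to remain,
# so the initialization stddev is scaled up relative to the dense case.
hidden_dim = 512
initial_sparsity = 0.9

dense_stddev = hidden_dim ** -0.5                              # ~0.0442
sparse_stddev = (hidden_dim * (1 - initial_sparsity)) ** -0.5  # ~0.1398
print(dense_stddev, sparse_stddev)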