def _get_weights(self, hidden_dim=None, model_id=None):
  """Overrides the superclass function.

  Builds one embedding matrix per ensemble member and concatenates them
  along the hidden dimension, unless embeddings are shared.
  """
  if hidden_dim is None:
    hidden_dim = self._body_input_depth
  if self.fusion_mode() == "share_embeddings":
    return tf.get_variable(
        "ens_weights_shared", [self._vocab_size, hidden_dim],
        initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5))
  shards = []
  if model_id is None:
    model_ids = range(len(self._model_hparams.ensemble_hidden_sizes))
  else:
    model_ids = [model_id]
  for model_id in model_ids:
    model_hidden_size = self._model_hparams.ensemble_hidden_sizes[model_id]
    var_name = "ens_weights_%d" % model_id
    model_embed_matrix = tf.get_variable(
        var_name, [self._vocab_size, model_hidden_size],
        initializer=tf.random_normal_initializer(0.0,
                                                 model_hidden_size**-0.5))
    if not self._model_hparams.ensemble_enabled[model_id]:
      # Disabled: the variable is still created, but its contribution is
      # zeroed out.
      model_embed_matrix = model_embed_matrix * 0.0
    if not self._model_hparams.ensemble_trainable[model_id]:
      model_embed_matrix = tf.stop_gradient(model_embed_matrix)
    shards.append(model_embed_matrix)
  if len(shards) == 1:
    return shards[0]
  ret = tf.concat(shards, 1)
  # Convert ret to tensor.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
  return ret
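# To make the concatenation above concrete: a minimal NumPy sketch of the
# same shape logic. The sizes and flags below are hypothetical stand-ins for
# ensemble_hidden_sizes / ensemble_enabled, and tf.get_variable is replaced
# by a plain random draw.
import numpy as np

vocab_size = 8
ensemble_hidden_sizes = [4, 6]
ensemble_enabled = [True, False]

shards = []
for model_id, hidden_size in enumerate(ensemble_hidden_sizes):
  matrix = np.random.normal(0.0, hidden_size**-0.5,
                            size=(vocab_size, hidden_size))
  if not ensemble_enabled[model_id]:
    matrix = matrix * 0.0  # disabled member still exists but contributes zeros
  shards.append(matrix)

ret = np.concatenate(shards, axis=1)
assert ret.shape == (vocab_size, sum(ensemble_hidden_sizes))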
def _get_weights(self, hidden_dim=None):
  """Create or get concatenated embedding or softmax variable.

  Args:
    hidden_dim: dim of the variable. Defaults to self._body_input_depth

  Returns:
    a Tensor with shape [self._vocab_size, hidden_dim], built by
    concatenating symbol_modality_num_shards variable shards along axis 0.
  """
  if hidden_dim is None:
    hidden_dim = self._body_input_depth
  num_shards = self._model_hparams.symbol_modality_num_shards
  shards = []
  for i in range(num_shards):
    shard_size = (self._vocab_size // num_shards) + (
        1 if i < self._vocab_size % num_shards else 0)
    var_name = "weights_%d" % i
    shards.append(
        tf.get_variable(
            var_name, [shard_size, hidden_dim],
            initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5)))
  if num_shards == 1:
    ret = shards[0]
  else:
    ret = tf.concat(shards, 0)
  # Convert ret to tensor.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
  return ret
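# The sharding rule shared by the _get_weights variants above and below
# splits the vocabulary rows as evenly as possible: the first
# (vocab_size % num_shards) shards each receive one extra row, so the shard
# sizes always sum to the full vocabulary. A self-contained sketch with
# hypothetical numbers; the multi-vocabulary variants below apply the same
# formula to sum(vocab_size).
def shard_sizes(vocab_size, num_shards):
  return [(vocab_size // num_shards) +
          (1 if i < vocab_size % num_shards else 0)
          for i in range(num_shards)]

sizes = shard_sizes(10, 3)
assert sizes == [4, 3, 3]
assert sum(sizes) == 10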
def combine(self, expert_out, multiply_by_gates=True):
  """Sum together the expert output, weighted by the gates.

  The slice corresponding to a particular batch element `b` is computed
  as the sum over all experts `i` of the expert output, weighted by the
  corresponding gate values. If `multiply_by_gates` is set to False, the
  gate values are ignored.

  Args:
    expert_out: a list of `num_experts` `Tensor`s, each with shape
      `[expert_batch_size_i, <extra_output_dims>]`.
    multiply_by_gates: a boolean

  Returns:
    a `Tensor` with shape `[batch_size, <extra_output_dims>]`.
  """
  # see comments on convert_gradient_to_tensor
  stitched = common_layers.convert_gradient_to_tensor(
      tf.concat(expert_out, 0))
  if multiply_by_gates:
    stitched *= tf.expand_dims(self._nonzero_gates, 1)
  combined = tf.unsorted_segment_sum(stitched, self._batch_index,
                                     tf.shape(self._gates)[0])
  return combined
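# Sketch of what combine() computes, with NumPy's np.add.at standing in for
# tf.unsorted_segment_sum. The gates, batch_index and expert outputs are
# hypothetical toy values, not taken from the original class.
import numpy as np

batch_size = 3
stitched = np.array([[1.0, 1.0],   # expert 0 output for batch element 0
                     [2.0, 2.0],   # expert 0 output for batch element 2
                     [3.0, 3.0]])  # expert 1 output for batch element 0
nonzero_gates = np.array([0.5, 1.0, 0.5])
batch_index = np.array([0, 2, 0])  # destination batch row per output row

stitched = stitched * nonzero_gates[:, None]
combined = np.zeros((batch_size, 2))
np.add.at(combined, batch_index, stitched)  # scatter-add back to batch rows
# Element 0 gets 0.5*[1,1] + 0.5*[3,3] = [2,2]; element 1 received no expert
# output and stays zero; element 2 gets 1.0*[2,2].
assert np.allclose(combined, [[2.0, 2.0], [0.0, 0.0], [2.0, 2.0]])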
def _get_weights(model_hparams, vocab_size, hidden_dim=None):
  """Copied from tensor2tensor/layers/modalities.py but uses total vocab."""
  if hidden_dim is None:
    hidden_dim = model_hparams.hidden_size
  num_shards = model_hparams.symbol_modality_num_shards
  shards = []
  for i in range(num_shards):
    # vocab_size is a list of per-vocabulary sizes; shard the combined total.
    shard_size = (sum(vocab_size) // num_shards) + (
        1 if i < sum(vocab_size) % num_shards else 0)
    var_name = 'weights_%d' % i
    shards.append(
        tf.get_variable(
            var_name, [shard_size, hidden_dim],
            initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5)))
  if num_shards == 1:
    ret = shards[0]
  else:
    ret = tf.concat(shards, 0)
  # Convert ret to tensor.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
  return ret
def _get_weights(self, hidden_dim=None):
  """Copied from tensor2tensor/layers/modalities.py but uses total vocab."""
  if hidden_dim is None:
    hidden_dim = self._body_input_depth
  num_shards = self._model_hparams.symbol_modality_num_shards
  shards = []
  for i in range(num_shards):
    shard_size = (sum(self._vocab_size) // num_shards) + (
        1 if i < sum(self._vocab_size) % num_shards else 0)
    var_name = 'weights_%d' % i
    shards.append(
        tf.get_variable(
            var_name, [shard_size, hidden_dim],
            initializer=tf.random_normal_initializer(0.0, hidden_dim**-0.5)))
  if num_shards == 1:
    ret = shards[0]
  else:
    ret = tf.concat(shards, 0)
  # Convert ret to tensor.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
  return ret
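# The two multi-vocabulary variants above shard sum(vocab_size) rows, i.e.
# all sub-vocabularies share one concatenated table. One plausible lookup
# scheme (an assumption for illustration, not taken from the source): a token
# from sub-vocabulary k is addressed at its local id plus that vocabulary's
# row offset.
vocab_size = [5, 7, 3]  # hypothetical per-vocabulary sizes
offsets = [sum(vocab_size[:k]) for k in range(len(vocab_size))]
assert offsets == [0, 5, 12]
row = offsets[1] + 4    # local id 4 in sub-vocabulary 1 -> row 9
assert row == 9 and row < sum(vocab_size)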
def _get_weights(model_hparams, vocab_size, hidden_dim=None):
  """Create or get concatenated embedding or softmax variable.

  Args:
    model_hparams: tf.HParams, model hyperparameters.
    vocab_size: int, vocabulary size.
    hidden_dim: dim of the variable. Defaults to model_hparams.hidden_size

  Returns:
    a Tensor with shape [vocab_size, hidden_dim], or a (weights, aux_params)
    tuple when the sparsity technique requires auxiliary parameters.
  """
  if hidden_dim is None:
    hidden_dim = model_hparams.hidden_size
  num_shards = model_hparams.symbol_modality_num_shards
  shards = []
  sparsity_technique = model_hparams.get("sparsity_technique")
  aux_params_shards = []
  for i in range(num_shards):
    shard_size = (vocab_size // num_shards) + (
        1 if i < vocab_size % num_shards else 0)
    var_name = "weights_%d" % i
    weight_init_stddev = hidden_dim**-0.5
    if (model_hparams.get("load_masks_from") and
        model_hparams.get("initial_sparsity")):
      # If we are loading constant masks for scratch-e or scratch-b
      # experiments, we optionally rescale the variance of the weight
      # initialization.
      initial_sparsity = model_hparams.get("initial_sparsity")
      weight_init_stddev = (hidden_dim * (1 - initial_sparsity))**-0.5
      tf.logging.info(
          "Using sparse initialization with sparsity {} for symbol ".format(
              initial_sparsity))
    shards.append(
        tf.get_variable(
            var_name, [shard_size, hidden_dim],
            initializer=tf.random_normal_initializer(0.0,
                                                     weight_init_stddev)))
    if sparsity_technique == "variational_dropout":
      aux_params_shards.append(
          tf.get_variable(
              var_name + "_aux", [shard_size, hidden_dim],
              initializer=tf.constant_initializer(value=-10.0)))
    elif sparsity_technique == "l0_regularization":
      initializer = tf.random_normal_initializer(mean=2.197, stddev=0.01)
      aux_params_shards.append(
          tf.get_variable(
              var_name + "_aux", [shard_size, hidden_dim],
              initializer=initializer))

  if num_shards == 1:
    ret = shards[0]
  else:
    ret = tf.concat(shards, 0)

  if not aux_params_shards:
    # Convert ret to tensor.
    if not tf.contrib.eager.in_eager_mode():
      ret = common_layers.convert_gradient_to_tensor(ret)
    return ret

  # Handle the auxiliary parameters.
  if num_shards == 1:
    aux_ret = aux_params_shards[0]
  else:
    aux_ret = tf.concat(aux_params_shards, 0)

  global COLLECTED_VARIABLES
  if not COLLECTED_VARIABLES:
    if sparsity_technique == "variational_dropout":
      tf.add_to_collection(common_sparse.VARIATIONAL_DROPOUT_PARAMETERS,
                           (ret, aux_ret))
    elif sparsity_technique == "l0_regularization":
      tf.add_to_collection(common_sparse.L0_REGULARIZATION_PARAMETERS,
                           (ret, aux_ret))
    COLLECTED_VARIABLES = True

  # Convert ret and aux_ret to tensors.
  if not tf.contrib.eager.in_eager_mode():
    ret = common_layers.convert_gradient_to_tensor(ret)
    aux_ret = common_layers.convert_gradient_to_tensor(aux_ret)
  return (ret, aux_ret)
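# Numeric sketch of the rescaled initialization used above for scratch-e /
# scratch-b runs. The intuition (an interpretation, not stated in the source):
# with initial sparsity s, only hidden_dim * (1 - s) weights per row are
# expected to survive the constant mask, so the stddev is computed over that
# effective fan-in rather than hidden_dim. Numbers below are hypothetical.
hidden_dim = 512
initial_sparsity = 0.9

dense_stddev = hidden_dim**-0.5                              # ~0.0442
sparse_stddev = (hidden_dim * (1 - initial_sparsity))**-0.5  # ~0.1398
assert sparse_stddev > dense_stddev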