def __init__(self, embedding=None, vocab_size=None, hparams=None):
    ModuleBase.__init__(self, hparams)

    self._vocab_size = vocab_size
    self._embedding = None
    self.sampling_method = self._hparams.sampling_method

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        if self._hparams.position_embedder.name == 'sinusoids':
            self.position_embedder = \
                position_embedders.SinusoidsSegmentalPositionEmbedder(
                    self._hparams.position_embedder.hparams)

        if self._hparams.use_embedding:
            if embedding is None and vocab_size is None:
                raise ValueError("If 'embedding' is not provided, "
                                 "'vocab_size' must be specified.")
            if isinstance(embedding, (tf.Tensor, tf.Variable)):
                self._embedding = embedding
            else:
                self._embedding = embedder_utils.get_embedding(
                    self._hparams.embedding, embedding, vocab_size,
                    variable_scope=self.variable_scope)
            self._embed_dim = shape_list(self._embedding)[-1]
            if self._hparams.zero_pad:
                self._embedding = tf.concat(
                    (tf.zeros(shape=[1, self._embed_dim]),
                     self._embedding[1:, :]), 0)
            if self._vocab_size is None:
                self._vocab_size = \
                    self._embedding.get_shape().as_list()[0]

        self.output_layer = \
            self.build_output_layer(shape_list(self._embedding)[-1])

def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        self.multihead_attention_list = []
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            with tf.variable_scope("layer_{}".format(i)):
                with tf.variable_scope('attention'):
                    mh_attn = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attention_list.append(mh_attn)
                    if self._hparams.dim != mh_attn.hparams.output_dim:
                        raise ValueError(
                            'The "dim" in the hparams of '
                            '"multihead_attention" should be equal to the '
                            '"dim" of TransformerEncoder.')

                pw_net = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                if self._hparams.dim != final_dim:
                    raise ValueError(
                        'The output dimension of '
                        '"poswise_feedforward" should be equal '
                        'to the "dim" of TransformerEncoder.')
                self.poswise_networks.append(pw_net)

def __init__(self, input_size: int, hparams=None):
    super().__init__(hparams=hparams)

    use_bias = self._hparams.use_bias

    self.Q_dense = nn.Linear(input_size, self._hparams.num_units,
                             bias=use_bias)
    self.K_dense = nn.Linear(input_size, self._hparams.num_units,
                             bias=use_bias)
    self.V_dense = nn.Linear(input_size, self._hparams.num_units,
                             bias=use_bias)
    self.O_dense = nn.Linear(self._hparams.num_units,
                             self._hparams.output_dim, bias=use_bias)

    if self._hparams.initializer:
        # TODO(haoransh): we may define kernel_initializer and bias
        #  initializer separately.
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight':
                initialize(param)

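# Illustrative sketch of the weight-only initialization pattern used above,
# written against plain PyTorch. The module, the layer sizes, and the choice
# of xavier_uniform_ are assumptions made only for this example.
def _example_weight_only_init():
    import torch.nn as nn

    # A hypothetical two-layer module used only to show the pattern.
    module = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 4))

    # Initialize weight matrices only, leaving biases untouched, mirroring
    # the loop over named_parameters() in the constructor above.
    for name, param in module.named_parameters():
        if name.split('.')[-1] == 'weight':
            nn.init.xavier_uniform_(param)
    return module
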
def reset_parameters(self):
    initialize = layers.get_initializer(self._hparams.initializer)
    if initialize is not None:
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight' and \
                    'layer_norm' not in name:
                initialize(param)

def __init__(self, vocab_size=None, output_layer=None, hparams=None):
    ModuleBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Make the output layer
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._hparams.output_layer_bias,
            self.variable_scope)

        # Make attention and poswise networks
        self.multihead_attentions = {'self_att': [], 'encdec_att': []}
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            layer_name = 'layer_{}'.format(i)
            with tf.variable_scope(layer_name):
                with tf.variable_scope("self_attention"):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attentions['self_att'].append(
                        multihead_attention)
                    if self._hparams.dim != \
                            multihead_attention.hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder.')

                with tf.variable_scope('encdec_attention'):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attentions['encdec_att'].append(
                        multihead_attention)
                    if self._hparams.dim != \
                            multihead_attention.hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder.')

                pw_net = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                if self._hparams.dim != final_dim:
                    raise ValueError(
                        'The output dimension of '
                        '"poswise_feedforward" should be equal '
                        'to the "dim" of TransformerDecoder.')
                self.poswise_networks.append(pw_net)

    # Built in _build()
    self.context = None
    self.context_sequence_length = None
    self.embedding = None
    self._helper = None
    self._cache = None
    self.max_decoding_length = None

def __init__(self, embedding, hparams=None):
    ModuleBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        if self._hparams.position_embedder_type == 'sinusoids':
            self.position_embedder = SinusoidsPositionEmbedder(
                self._hparams.position_embedder_hparams)
        else:
            self.position_embedder = PositionEmbedder(
                position_size=self._hparams.position_size,
                hparams=self._hparams.position_embedder_hparams)

        self._embedding = embedding
        self._vocab_size = self._embedding.get_shape().as_list()[0]

        self.output_layer = \
            self._build_output_layer(shape_list(self._embedding)[-1])

        self.multihead_attentions = {'self_att': [], 'encdec_att': []}
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            layer_name = 'layer_{}'.format(i)
            with tf.variable_scope(layer_name):
                with tf.variable_scope("self_attention"):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attentions['self_att'].append(
                        multihead_attention)
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                            multihead_attention._hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder.')

                with tf.variable_scope('encdec_attention'):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attentions['encdec_att'].append(
                        multihead_attention)
                    if self._hparams.dim != \
                            multihead_attention._hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder.')

                poswise_network = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                if self._hparams.dim != \
                        poswise_network._hparams.layers[-1]['kwargs']['units']:
                    raise ValueError('The output dimension of '
                                     'FeedForwardNetwork should be equal '
                                     'to the dim of TransformerDecoder.')
                self.poswise_networks.append(poswise_network)

def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    super().__init__(hparams=hparams)

    # Create the underlying encoder
    encoder_hparams = dict_fetch(hparams, XLNetEncoder.default_hparams())
    self._encoder = XLNetEncoder(
        pretrained_model_name=pretrained_model_name,
        cache_dir=cache_dir,
        hparams=encoder_hparams)

    # TODO: The logic here is very similar to that in XLNetClassifier.
    #  We need to reduce the code redundancy.
    if self._hparams.use_projection:
        if self._hparams.regr_strategy == 'all_time':
            self.projection = nn.Linear(
                self._encoder.output_size * self._hparams.max_seq_length,
                self._encoder.output_size * self._hparams.max_seq_length)
        else:
            self.projection = nn.Linear(self._encoder.output_size,
                                        self._encoder.output_size)
    self.dropout = nn.Dropout(self._hparams.dropout)

    logit_kwargs = self._hparams.logit_layer_kwargs
    if logit_kwargs is None:
        logit_kwargs = {}
    elif not isinstance(logit_kwargs, HParams):
        raise ValueError("hparams['logit_layer_kwargs'] must be a dict.")
    else:
        logit_kwargs = logit_kwargs.todict()

    if self._hparams.regr_strategy == 'all_time':
        self.hidden_to_logits = nn.Linear(
            self._encoder.output_size * self._hparams.max_seq_length,
            1, **logit_kwargs)
    else:
        self.hidden_to_logits = nn.Linear(
            self._encoder.output_size, 1, **logit_kwargs)

    if self._hparams.initializer:
        initialize = get_initializer(self._hparams.initializer)
        assert initialize is not None
        if self._hparams.use_projection:
            initialize(self.projection.weight)
            initialize(self.projection.bias)
        initialize(self.hidden_to_logits.weight)
        if self.hidden_to_logits.bias is not None:
            initialize(self.hidden_to_logits.bias)
    else:
        if self._hparams.use_projection:
            self.projection.apply(init_weights)
        self.hidden_to_logits.apply(init_weights)

def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))
        self.position_embedder = SinusoidsPositionEmbedder(
            self._hparams.position_embedder_hparams)

def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    super().__init__(hparams=hparams)

    # Create the underlying encoder
    encoder_hparams = dict_fetch(hparams, GPT2Encoder.default_hparams())
    self._encoder = GPT2Encoder(
        pretrained_model_name=pretrained_model_name,
        cache_dir=cache_dir,
        hparams=encoder_hparams)

    # Create a dropout layer
    self._dropout_layer = nn.Dropout(self._hparams.dropout)

    # Create an additional classification layer if needed
    self.num_classes = self._hparams.num_classes
    if self.num_classes <= 0:
        self._logits_layer = None
    else:
        logit_kwargs = self._hparams.logit_layer_kwargs
        if logit_kwargs is None:
            logit_kwargs = {}
        elif not isinstance(logit_kwargs, HParams):
            raise ValueError("hparams['logit_layer_kwargs'] "
                             "must be a dict.")
        else:
            logit_kwargs = logit_kwargs.todict()

        if self._hparams.clas_strategy == 'all_time':
            self._logits_layer = nn.Linear(
                self._encoder.output_size * self._hparams.max_seq_length,
                self.num_classes, **logit_kwargs)
        else:
            self._logits_layer = nn.Linear(
                self._encoder.output_size, self.num_classes,
                **logit_kwargs)

    if self._hparams.initializer:
        initialize = get_initializer(self._hparams.initializer)
        assert initialize is not None
        if self._logits_layer is not None:
            initialize(self._logits_layer.weight)
            if self._logits_layer.bias is not None:
                initialize(self._logits_layer.bias)

    self.is_binary = (self.num_classes == 1) or \
                     (self.num_classes <= 0 and self._hparams.dim == 1)

def get_embedding(hparams=None,
                  init_value=None,
                  num_embeds=None,
                  variable_scope='Embedding'):
    """Creates an embedding variable if it does not exist.

    Args:
        hparams (dict or HParams, optional): Embedding hyperparameters.
            Missing hyperparameters are set to default values. See
            :func:`~texar.modules.default_embedding_hparams` for all
            hyperparameters and default values. If :attr:`init_value` is
            given, :attr:`hparams["initializer"]` and :attr:`hparams["dim"]`
            are ignored.
        init_value (Tensor or numpy array, optional): Initial values of the
            embedding variable. If not given, the embedding is initialized
            as specified in :attr:`hparams["initializer"]`.
        num_embeds (int, optional): The number of embedding items (e.g.,
            vocabulary size). Required if :attr:`init_value` is not
            provided.
        variable_scope (str or VariableScope, optional): Variable scope of
            the embedding variable.

    Returns:
        Variable or Tensor: A 2D `Variable` or `Tensor` of the same shape
        as :attr:`init_value`, or of shape
        :attr:`[num_embeds, hparams["dim"]]`.
    """
    with tf.variable_scope(variable_scope):
        if hparams is None or isinstance(hparams, dict):
            hparams = HParams(hparams, default_embedding_hparams())
        regularizer = layers.get_regularizer(hparams["regularizer"])
        if init_value is None:
            initializer = layers.get_initializer(hparams["initializer"])
            dim = hparams["dim"]
            if not isinstance(hparams["dim"], (list, tuple)):
                dim = [dim]
            embedding = tf.get_variable(name='w',
                                        shape=[num_embeds] + dim,
                                        initializer=initializer,
                                        regularizer=regularizer,
                                        trainable=hparams["trainable"])
        else:
            init_value = tf.cast(init_value, tf.float32)
            embedding = tf.get_variable(name='w',
                                        initializer=init_value,
                                        regularizer=regularizer,
                                        trainable=hparams["trainable"])
        return embedding

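# Illustrative usage sketch of get_embedding() above, covering both call
# modes. The dimensions, the variable-scope names, and the random
# "pretrained" matrix are made up for this example; TF1 graph mode is
# assumed.
def _example_get_embedding_tf():
    import numpy as np

    # Mode 1: let hparams["initializer"] create a [num_embeds, dim] table.
    emb_random = get_embedding(hparams={'dim': 64},
                               num_embeds=10000,
                               variable_scope='RandomEmbedding')

    # Mode 2: start from existing vectors; hparams["initializer"] and
    # hparams["dim"] are ignored in this case.
    pretrained = np.random.randn(10000, 64).astype(np.float32)
    emb_pretrained = get_embedding(init_value=pretrained,
                                   variable_scope='PretrainedEmbedding')
    return emb_random, emb_pretrained
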
def __init__(self,
             vocab_size=None,
             output_layer=None,
             tau=None,
             hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Make the output layer
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._hparams.output_layer_bias,
            self.variable_scope)

        # Make attention and poswise networks
        self.graph_multihead_attention_list = []
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            with tf.variable_scope("layer_{}".format(i)):
                with tf.variable_scope('attention'):
                    mh_attn = GraphMultiheadAttentionEncoder(
                        self._hparams.graph_multihead_attention)
                    self.graph_multihead_attention_list.append(mh_attn)
                    if self._hparams.dim != mh_attn.hparams.output_dim:
                        raise ValueError(
                            'The "dim" in the hparams of '
                            '"multihead_attention" should be equal to the '
                            '"dim" of '
                            'CrossGraphTransformerFixedLengthDecoder.')

                pw_net = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                if self._hparams.dim != final_dim:
                    raise ValueError(
                        'The output dimension of '
                        '"poswise_feedforward" should be equal to the '
                        '"dim" of CrossGraphTransformerFixedLengthDecoder.')
                self.poswise_networks.append(pw_net)

    self._helper = None
    self._tau = tau

def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        if self._hparams.position_embedder_type == 'sinusoids':
            self.position_embedder = SinusoidsPositionEmbedder(
                self._hparams.position_embedder_hparams)
        else:
            self.position_embedder = PositionEmbedder(
                position_size=self._hparams.position_size,
                hparams=self._hparams.position_embedder_hparams)

        # pylint: disable=protected-access
        if self._hparams.dim != self.position_embedder._hparams.dim:
            raise ValueError('"dim" in TransformerEncoder hparams must be '
                             'equal to "dim" in its '
                             'position_embedder_hparams.')

        self.multihead_attention_list = []
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            with tf.variable_scope("layer_{}".format(i)):
                with tf.variable_scope('attention'):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attention_list.append(
                        multihead_attention)
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                            multihead_attention._hparams.output_dim:
                        raise ValueError(
                            'The "dim" in the hparams of '
                            'multihead_attention should be equal '
                            'to the "dim" of TransformerEncoder.')

                poswise_network = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                # pylint: disable=protected-access
                if self._hparams.dim != \
                        poswise_network._hparams.layers[-1]['kwargs']['units']:
                    raise ValueError('The "units" in the "kwargs" of '
                                     'FeedForwardNetwork should be equal '
                                     'to the "dim" of TransformerEncoder.')
                self.poswise_networks.append(poswise_network)

def __init__(self, embedding, hparams=None):
    ModuleBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        self.position_embedder = SinusoidsPositionEmbedder(
            self._hparams.position_embedder_hparams)

        self._embedding = embedding
        self._vocab_size = self._embedding.get_shape().as_list()[0]

        self.output_layer = \
            self._build_output_layer(shape_list(self._embedding)[-1])

def get_embedding(num_embeds: Optional[int] = None,
                  init_value: Optional[torch.Tensor] = None,
                  hparams=None):
    r"""Creates an embedding variable if it does not exist.

    Args:
        num_embeds (int, optional): The number of embedding items (e.g.,
            vocabulary size). Required if :attr:`init_value` is not
            provided.
        init_value (Tensor or numpy array, optional): Initial values of the
            embedding variable. If not given, the embedding is initialized
            as specified in :attr:`hparams["initializer"]`.
        hparams (dict or HParams, optional): Embedding hyperparameters.
            Missing hyperparameters are set to default values. See
            :func:`~texar.modules.default_embedding_hparams` for all
            hyperparameters and default values. If :attr:`init_value` is
            given, :attr:`hparams["initializer"]` and :attr:`hparams["dim"]`
            are ignored.

    Returns:
        A 2D :tensor:`Tensor` of the same shape as :attr:`init_value`, or
        of shape ``[num_embeds, hparams["dim"]]``.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(hparams, default_embedding_hparams())
    if init_value is None:
        initializer = layers.get_initializer(
            getattr(hparams, "initializer", None))
        # TODO Shibiao: add regularizer
        dim = hparams["dim"]
        if not isinstance(hparams["dim"], (list, tuple)):
            dim = [dim]
        embedding = torch.empty(size=[num_embeds] + dim)
        # `initializer` is already resolved by `layers.get_initializer`.
        if initializer:
            embedding = initializer(embedding)
        else:
            embedding = torch.nn.init.xavier_uniform_(embedding)
    else:
        if torch.is_tensor(init_value):
            embedding = init_value  # Do not copy the tensor.
        else:
            embedding = torch.tensor(init_value, dtype=torch.float)
    return embedding

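# Illustrative usage sketch of the PyTorch get_embedding() above. The
# dimensions and the random "pretrained" matrix are made up for this
# example.
def _example_get_embedding_torch():
    import numpy as np
    import torch

    # Mode 1: create a randomly initialized [num_embeds, dim] table.
    emb_random = get_embedding(num_embeds=10000, hparams={'dim': 64})

    # Mode 2: wrap existing vectors; hparams["initializer"] and
    # hparams["dim"] are ignored in this case.
    pretrained = np.random.randn(10000, 64).astype(np.float32)
    emb_pretrained = get_embedding(init_value=torch.from_numpy(pretrained))

    # The result is a plain tensor; wrap it (e.g., in nn.Parameter) if it
    # should be trainable.
    return torch.nn.Parameter(emb_random), emb_pretrained
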
def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    super().__init__(pretrained_model_name=pretrained_model_name,
                     cache_dir=cache_dir,
                     hparams=hparams)

    if self.pretrained_model_dir:
        self._hparams = HParams(self.pretrained_model_hparams,
                                self._hparams.todict())

    # Word embedding
    self.word_embedder = WordEmbedder(
        vocab_size=self._hparams.vocab_size,
        hparams=self._hparams.embed)

    # Segment embedding for each type of tokens
    self.segment_embedder = WordEmbedder(
        vocab_size=self._hparams.type_vocab_size,
        hparams=self._hparams.segment_embed)

    # Position embedding
    self.position_embedder = PositionEmbedder(
        position_size=self._hparams.position_size,
        hparams=self._hparams.position_embed)

    # The BERT encoder (a TransformerEncoder)
    self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

    self.pooler = nn.Sequential(
        nn.Linear(self._hparams.hidden_size, self._hparams.hidden_size),
        nn.Tanh())

    if self.pretrained_model_dir:
        bert_utils.init_bert_checkpoint(self, self.pretrained_model_dir)
    elif self._hparams.initializer:
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight' and \
                    'layer_norm' not in name:
                initialize(param)

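# Illustrative usage sketch for an encoder built by the constructor above.
# The texar.torch import path, the pretrained-model name, and the forward
# signature (inputs, sequence_length) are assumptions based on the
# Texar-PyTorch layout; treat this as a sketch, not the definitive API.
def _example_bert_encoder_usage():
    import torch
    from texar.torch.modules import BERTEncoder  # path assumed

    encoder = BERTEncoder(pretrained_model_name='bert-base-uncased')

    token_ids = torch.randint(0, 30522, (2, 16))  # [batch, time]
    lengths = torch.tensor([16, 12])              # valid length per example

    # Returns per-token hidden states and the pooled representation
    # produced by self.pooler.
    outputs, pooled = encoder(inputs=token_ids, sequence_length=lengths)
    return outputs, pooled
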
def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))
        self.Q_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=False, name='q')
        self.K_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=False, name='k')
        self.V_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=False, name='v')
        self.O_dense = tf.layers.Dense(self._hparams.output_dim,
                                       use_bias=False, name='o')

def __init__(self,
             pretrained_model_name=None,
             cache_dir=None,
             hparams=None):
    EncoderBase.__init__(self, hparams)
    BertBase.__init__(self, pretrained_model_name, cache_dir, hparams)

    # TODO: move this logic into BertBase.
    if self.pretrained_model:
        self._hparams = HParams(self.pretrained_model_hparams,
                                self._hparams.todict())

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Word embedding
        self.word_embedder = WordEmbedder(
            vocab_size=self._hparams.vocab_size,
            hparams=self._hparams.embed)

        # Segment embedding for each type of tokens
        self.segment_embedder = WordEmbedder(
            vocab_size=self._hparams.type_vocab_size,
            hparams=self._hparams.segment_embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The BERT encoder (a TransformerEncoder)
        self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

        with tf.variable_scope("pooler"):
            kwargs_i = {"units": self._hparams.hidden_size,
                        "activation": tf.tanh}
            layer_hparams = {"type": "Dense", "kwargs": kwargs_i}
            self.pooler = layers.get_layer(hparams=layer_hparams)

def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    use_bias = self._hparams.use_bias

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))
        self.Q_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=use_bias, name='query')
        self.K_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=use_bias, name='key')
        self.V_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=use_bias, name='value')
        self.O_dense = tf.layers.Dense(self._hparams.output_dim,
                                       use_bias=use_bias, name='output')

def __init__(self, embedding, vocab_size=None, hparams=None):
    EncoderBase.__init__(self, hparams)

    self._vocab_size = vocab_size
    self._embedding = None
    self.enc = None

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        if self._hparams.position_embedder.name == 'sinusoids':
            self.position_embedder = \
                position_embedders.SinusoidsPositionEmbedder(
                    self._hparams.position_embedder.hparams)

        if self._hparams.use_embedding:
            if isinstance(embedding, tf.Variable):
                self._embedding = embedding
            embed_dim = self._embedding.get_shape().as_list()[-1]
            if self._hparams.zero_pad:
                # TODO(zhiting): vocab has zero pad
                if not self._hparams.bos_pad:
                    self._embedding = tf.concat(
                        (tf.zeros(shape=[1, embed_dim]),
                         self._embedding[1:, :]), 0)
                else:
                    self._embedding = tf.concat(
                        (tf.zeros(shape=[2, embed_dim]),
                         self._embedding[2:, :]), 0)
            if self._vocab_size is None:
                self._vocab_size = \
                    self._embedding.get_shape().as_list()[0]

    with tf.variable_scope(self.variable_scope):
        if self._hparams.target_space_id is not None:
            space_embedding = tf.get_variable(
                'target_space_embedding', [32, embed_dim])
            self.target_symbol_embedding = tf.gather(
                space_embedding, self._hparams.target_space_id)
        else:
            self.target_symbol_embedding = None

    self.stack_output = None

def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    super().__init__(pretrained_model_name=pretrained_model_name,
                     cache_dir=cache_dir,
                     hparams=hparams)

    if self.pretrained_model_dir:
        self._hparams = HParams(self.pretrained_model_hparams,
                                self._hparams.todict())

    # Word embedding
    self.word_embedder = WordEmbedder(
        vocab_size=self._hparams.vocab_size,
        hparams=self._hparams.embed)

    # Position embedding
    self.position_embedder = PositionEmbedder(
        position_size=self._hparams.position_size,
        hparams=self._hparams.position_embed)

    # The GPT2 decoder (a TransformerDecoder)
    self.decoder = TransformerDecoder(
        vocab_size=self._hparams.vocab_size,
        output_layer=self.word_embedder.embedding,
        hparams=self._hparams.decoder)

    if self.pretrained_model_dir:
        gpt2_utils.init_gpt2_checkpoint(self, self.pretrained_model_dir)
    elif self._hparams.initializer:
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight' and \
                    'layer_norm' not in name:
                initialize(param)

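# Illustrative usage sketch for a decoder built by the constructor above.
# The texar.torch import path, the pretrained-model name, the keyword
# arguments, and the output fields are assumptions patterned on the Texar
# decoding API; treat this as a sketch, not the definitive interface.
def _example_gpt2_decoder_usage():
    import torch
    from texar.torch.modules import GPT2Decoder  # path assumed

    decoder = GPT2Decoder(pretrained_model_name='gpt2-small')

    # Teacher-forced pass over a batch of token ids ([batch, time]); the
    # module embeds the ids internally and returns logits over the
    # vocabulary at each position.
    token_ids = torch.randint(0, 50257, (2, 10))
    outputs = decoder(inputs=token_ids, decoding_strategy='train_greedy')
    return outputs.logits  # assumed shape: [batch, time, vocab_size]
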
def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        self.position_embedder = SinusoidsPositionEmbedder(
            self._hparams.position_embedder_hparams)

        self.multihead_attention_list = []
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            with tf.variable_scope("layer_{}".format(i)):
                with tf.variable_scope('self_attention'):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attention_list.append(
                        multihead_attention)
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                            multihead_attention._hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerEncoder.')

                poswise_network = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                # pylint: disable=protected-access
                if self._hparams.dim != \
                        poswise_network._hparams.layers[-1]['kwargs']['units']:
                    raise ValueError('The output dimension of '
                                     'FeedForwardNetwork should be equal '
                                     'to the dim of TransformerEncoder.')
                self.poswise_networks.append(poswise_network)

def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)
    self._input_size = self._hparams.dim

    self.self_attns = nn.ModuleList()
    if not self._hparams.use_bert_config:
        self.self_attn_layer_norm = nn.ModuleList()
    self.poswise_networks = nn.ModuleList()
    self.poswise_layer_norm = nn.ModuleList()
    self.output_layer_norm = nn.ModuleList()

    if self._hparams.use_bert_config:
        # The TensorFlow BERT implementation uses eps=1e-12 for LayerNorm.
        eps = 1e-12
    else:
        # Use eps=1e-6 otherwise (PyTorch's own LayerNorm default is 1e-5).
        eps = 1e-6

    for _ in range(self._hparams.num_blocks):
        mh_attn = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        self.self_attns.append(mh_attn)
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))
        if self._hparams.dim != mh_attn.hparams.output_dim:
            raise ValueError(
                'The "dim" in the hparams of '
                '"multihead_attention" should be equal to the '
                '"dim" of TransformerEncoder.')

        pw_net = FeedForwardNetwork(
            hparams=self._hparams['poswise_feedforward'])
        final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
        if self._hparams.dim != final_dim:
            raise ValueError('The output dimension of '
                             '"poswise_feedforward" should be equal '
                             'to the "dim" of TransformerEncoder.')
        self.poswise_networks.append(pw_net)
        self.poswise_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))
        if self._hparams.use_bert_config:
            self.output_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

    self.embed_dropout = nn.Dropout(p=self._hparams.embedding_dropout)
    self.residual_dropout = nn.Dropout(p=self._hparams.residual_dropout)

    if self._hparams.use_bert_config:
        self.input_normalizer = nn.LayerNorm(self._input_size, eps=eps)
    else:
        self.final_layer_normalizer = nn.LayerNorm(self._input_size,
                                                   eps=eps)

    if self._hparams.initializer:
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight' and \
                    'layer_norm' not in name:
                initialize(param)

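# Illustrative usage sketch for the encoder above. The texar.torch import
# path and the forward signature (inputs, sequence_length) are assumptions;
# inputs are assumed to be already-embedded vectors of shape
# [batch, time, dim].
def _example_transformer_encoder_usage():
    import torch
    from texar.torch.modules import TransformerEncoder  # path assumed

    encoder = TransformerEncoder(hparams={'dim': 512, 'num_blocks': 6})

    inputs = torch.randn(2, 16, 512)   # embedded (and position-encoded)
    lengths = torch.tensor([16, 12])   # used to mask out padding
    outputs = encoder(inputs=inputs, sequence_length=lengths)
    return outputs  # assumed shape: [2, 16, 512]
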
def __init__(self,
             vocab_size: Optional[int] = None,
             output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
             hparams: Optional[HParams] = None):
    super().__init__(0, vocab_size,  # dummy value for input_size
                     input_time_major=False,
                     output_time_major=False,
                     hparams=hparams)
    self._input_size = self._hparams.dim

    self._output_layer, self._vocab_size = _make_output_layer(
        output_layer, vocab_size, self._input_size,
        self._hparams.output_layer_bias)

    self.self_attns = nn.ModuleList()
    self.self_attn_layer_norm = nn.ModuleList()
    self.enc_dec_attns = nn.ModuleList()
    self.end_dec_attn_layer_norm = nn.ModuleList()
    self.poswise_networks = nn.ModuleList()
    self.poswise_layer_norm = nn.ModuleList()

    if self._hparams.use_gpt_config:
        eps = 1e-5
    else:
        eps = 1e-12

    for _ in range(self._hparams.num_blocks):
        attn_module = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadEncoder should be equal "
                             "to the dim of TransformerDecoder.")
        self.self_attns.append(attn_module)
        self.self_attn_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))

        attn_module = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadEncoder should be equal "
                             "to the dim of TransformerDecoder.")
        self.enc_dec_attns.append(attn_module)
        self.end_dec_attn_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))

        poswise_network = FeedForwardNetwork(
            hparams=self._hparams.poswise_feedforward)
        if (poswise_network.hparams.layers[-1]['kwargs']['out_features']
                != self._hparams.dim):
            raise ValueError("The output dimension of "
                             "FeedForwardNetwork should be equal "
                             "to the dim of TransformerDecoder.")
        self.poswise_networks.append(poswise_network)
        self.poswise_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))

    self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)
    self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
    self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

    if self._hparams.initializer:
        # TODO: This might differ from what TensorFlow does.
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split(".")[-1] == "weight" and \
                    "layer_norm" not in name:
                initialize(param)

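# Illustrative usage sketch for the decoder above in teacher-forced
# ("train_greedy") mode. The texar.torch import path, the keyword names,
# the assumption that `inputs` are pre-embedded target vectors, and the
# output fields all depend on the Texar version and are assumptions here.
def _example_transformer_decoder_usage():
    import torch
    from texar.torch.modules import TransformerDecoder  # path assumed

    decoder = TransformerDecoder(vocab_size=10000, hparams={'dim': 512})

    memory = torch.randn(2, 16, 512)         # encoder outputs
    memory_len = torch.tensor([16, 12])
    target_embeds = torch.randn(2, 10, 512)  # embedded target tokens
    target_len = torch.tensor([10, 8])

    outputs = decoder(memory=memory,
                      memory_sequence_length=memory_len,
                      inputs=target_embeds,
                      sequence_length=target_len,
                      decoding_strategy='train_greedy')
    return outputs.logits  # assumed shape: [batch, time, vocab_size]
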