def __init__(self, hparams=None):
    ModuleBase.__init__(self, hparams)

    hidden_dim = self._hparams.hidden_dim
    ffn_inner_dim = self._hparams.ffn_inner_dim
    dropout = self._hparams.dropout
    activation = self._hparams.activation
    if activation == 'gelu':
        activation = layers.gelu

    with tf.variable_scope(self.variable_scope):
        tf.get_variable_scope().set_initializer(
            layers.get_initializer(self._hparams.initializer))

        l1_hparams = {
            "type": "Dense",
            "kwargs": {
                "units": ffn_inner_dim,
                "activation": activation
            }
        }
        self.linear1 = layers.get_layer(hparams=l1_hparams)

        dropout_hparams = {
            "type": "Dropout",
            "kwargs": {
                "rate": dropout
            }
        }
        self.dropout = layers.get_layer(hparams=dropout_hparams)

        l2_hparams = {
            "type": "Dense",
            "kwargs": {
                "units": hidden_dim
            }
        }
        self.linear2 = layers.get_layer(hparams=l2_hparams)
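# Usage sketch (illustrative, not part of the module). This assumes the
# constructor above belongs to the `PositionWiseFF` module used by the
# XLNet encoder further down, and that the module is callable on a
# [batch, time, hidden_dim] tensor via Texar's `_build` convention.
import tensorflow as tf

ff_hparams = {
    "hidden_dim": 768,      # output dimension of linear2
    "ffn_inner_dim": 3072,  # inner dimension of linear1
    "dropout": 0.1,
    "activation": "gelu",
}
ff = PositionWiseFF(hparams=ff_hparams)  # class name assumed
inputs = tf.random.normal([8, 16, 768])
outputs = ff(inputs)                     # same shape as `inputs`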
def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        self.multihead_attention_list = []
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            with tf.variable_scope("layer_{}".format(i)):
                with tf.variable_scope('attention'):
                    mh_attn = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attention_list.append(mh_attn)
                    if self._hparams.dim != mh_attn.hparams.output_dim:
                        raise ValueError(
                            'The "output_dim" in the hparams of '
                            '"multihead_attention" should be equal to '
                            'the "dim" of TransformerEncoder.')

                pw_net = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                if self._hparams.dim != final_dim:
                    raise ValueError(
                        'The output dimension of "poswise_feedforward" '
                        'should be equal to the "dim" of '
                        'TransformerEncoder.')
                self.poswise_networks.append(pw_net)
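# Usage sketch (illustrative): Texar encoder modules are callable, and
# `TransformerEncoder` consumes already-embedded inputs together with
# their lengths. The hparam values below are examples, not verified
# defaults.
import tensorflow as tf

encoder = TransformerEncoder(hparams={"dim": 512, "num_blocks": 6})
inputs = tf.random.normal([32, 50, 512])            # [batch, time, dim]
lengths = tf.fill([32], 50)
outputs = encoder(inputs, sequence_length=lengths)  # [32, 50, 512]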
def __init__(self, vocab_size=None, output_layer=None, hparams=None):
    ModuleBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Make the output layer
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._hparams.output_layer_bias,
            self.variable_scope)

        # Make attention and poswise networks
        self.multihead_attentions = {'self_att': [], 'encdec_att': []}
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            layer_name = 'layer_{}'.format(i)
            with tf.variable_scope(layer_name):
                with tf.variable_scope("self_attention"):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attentions['self_att'].append(
                        multihead_attention)
                if self._hparams.dim != \
                        multihead_attention.hparams.output_dim:
                    raise ValueError(
                        'The output dimension of '
                        'MultiheadAttentionEncoder should be equal '
                        'to the "dim" of TransformerDecoder.')

                with tf.variable_scope('encdec_attention'):
                    multihead_attention = MultiheadAttentionEncoder(
                        self._hparams.multihead_attention)
                    self.multihead_attentions['encdec_att'].append(
                        multihead_attention)
                if self._hparams.dim != \
                        multihead_attention.hparams.output_dim:
                    raise ValueError(
                        'The output dimension of '
                        'MultiheadAttentionEncoder should be equal '
                        'to the "dim" of TransformerDecoder.')

                pw_net = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                if self._hparams.dim != final_dim:
                    raise ValueError(
                        'The output dimension of "poswise_feedforward" '
                        'should be equal to the "dim" of '
                        'TransformerDecoder.')
                self.poswise_networks.append(pw_net)

        # Attributes set in _build()
        self.context = None
        self.context_sequence_length = None
        self.embedding = None
        self._helper = None
        self._cache = None
        self.max_decoding_length = None
def __init__(self, r_r_bias, r_w_bias, r_s_bias=None, segment_embed=None,
             hparams=None):
    ModuleBase.__init__(self, hparams=hparams)

    self.num_heads = self._hparams.num_heads
    self.head_dim = self._hparams.head_dim
    hidden_dim = self._hparams.hidden_dim

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # The official implementation creates these head variables
        # directly. If we created dense layers instead, there would be a
        # dimension mismatch when loading the pre-trained tensors.
        # TODO(avinash): Can we reshape tensors while loading the ckpt?
        self.q_head = tf.get_variable(
            'q/kernel', [hidden_dim, self.num_heads, self.head_dim])
        self.k_head = tf.get_variable(
            'k/kernel', [hidden_dim, self.num_heads, self.head_dim])
        self.v_head = tf.get_variable(
            'v/kernel', [hidden_dim, self.num_heads, self.head_dim])
        self.k_head_r = tf.get_variable(
            'r/kernel', [hidden_dim, self.num_heads, self.head_dim])

        self.dropout = layers.get_layer(hparams={
            "type": "Dropout",
            "kwargs": {
                "rate": self._hparams.dropout
            }
        })
        self.dropout_attn = layers.get_layer(hparams={
            "type": "Dropout",
            "kwargs": {
                "rate": self._hparams.attention_dropout
            }
        })

        self.output_projection = tf.get_variable(
            'o/kernel', [hidden_dim, self.num_heads, self.head_dim])

        # Relative-attention biases, supplied by the caller (shared across
        # layers unless the encoder unties them per layer).
        self.r_r_bias = r_r_bias
        self.r_w_bias = r_w_bias

        if self._hparams.use_segments:
            self.segment_embed = segment_embed
            self.r_s_bias = r_s_bias

        self.scale = 1 / (self.head_dim ** 0.5)
def __init__(self, pretrained_model_name=None, cache_dir=None,
             hparams=None):
    super(XLNetClassifier, self).__init__(hparams=hparams)

    with tf.variable_scope(self.variable_scope):
        tf.get_variable_scope().set_initializer(
            get_initializer(self._hparams.initializer))

        # Creates the underlying encoder
        encoder_hparams = dict_fetch(hparams,
                                     XLNetEncoder.default_hparams())
        if encoder_hparams is not None:
            encoder_hparams['name'] = "encoder"
        self._encoder = XLNetEncoder(
            pretrained_model_name=pretrained_model_name,
            cache_dir=cache_dir,
            hparams=encoder_hparams)

        if self._hparams.use_projection:
            self.projection = get_layer(hparams={
                "type": "Dense",
                "kwargs": {
                    "units": self._encoder.output_size
                }
            })

        # Creates a dropout layer
        drop_kwargs = {"rate": self._hparams.dropout}
        layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs}
        self._dropout_layer = get_layer(hparams=layer_hparams)

        # Creates an additional classification layer if needed
        self._num_classes = self._hparams.num_classes
        if self._num_classes <= 0:
            self._logit_layer = None
        else:
            logit_kwargs = self._hparams.logit_layer_kwargs
            if logit_kwargs is None:
                logit_kwargs = {}
            elif not isinstance(logit_kwargs, HParams):
                raise ValueError(
                    "hparams['logit_layer_kwargs'] must be a dict.")
            else:
                logit_kwargs = logit_kwargs.todict()
            logit_kwargs.update({"units": self._num_classes})
            if 'name' not in logit_kwargs:
                logit_kwargs['name'] = "logit_layer"

            layer_hparams = {"type": "Dense", "kwargs": logit_kwargs}
            self._logit_layer = get_layer(hparams=layer_hparams)
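# Usage sketch (illustrative): constructing the classifier with a few of
# the hparams read above. The pretrained-model name and the
# (logits, preds) return signature are assumptions, not verified here.
import tensorflow as tf

clf = XLNetClassifier(
    pretrained_model_name="xlnet-base-cased",  # assumed checkpoint name
    hparams={"num_classes": 2, "dropout": 0.1, "use_projection": True})

token_ids = tf.placeholder(tf.int32, shape=[None, 128])
logits, preds = clf(token_ids)  # assumed _build signature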
def get_embedding(hparams=None,
                  init_value=None,
                  num_embeds=None,
                  variable_scope='Embedding'):
    r"""Creates an embedding variable if it does not exist.

    Args:
        hparams (dict or HParams, optional): Embedding hyperparameters.
            Missing hyperparameters are set to default values. See
            :func:`~texar.tf.modules.default_embedding_hparams` for all
            hyperparameters and default values. If :attr:`init_value` is
            given, :attr:`hparams["initializer"]` and
            :attr:`hparams["dim"]` are ignored.
        init_value (Tensor or numpy array, optional): Initial values of
            the embedding variable. If not given, the embedding is
            initialized as specified in :attr:`hparams["initializer"]`.
        num_embeds (int, optional): The number of embedding items (e.g.,
            vocabulary size). Required if :attr:`init_value` is not
            provided.
        variable_scope (str or VariableScope, optional): Variable scope
            of the embedding variable.

    Returns:
        Variable or Tensor: A 2D `Variable` or `Tensor` of the same shape
        as :attr:`init_value`, or of the shape
        ``[num_embeds, hparams["dim"]]``.
    """
    with tf.variable_scope(variable_scope):
        if hparams is None or isinstance(hparams, dict):
            hparams = HParams(hparams, default_embedding_hparams())
        regularizer = layers.get_regularizer(hparams["regularizer"])
        if init_value is None:
            initializer = layers.get_initializer(hparams["initializer"])
            # `dim` may be a single int or a list of ints.
            dim = hparams["dim"]
            if not isinstance(hparams["dim"], (list, tuple)):
                dim = [dim]
            embedding = tf.get_variable(name='w',
                                        shape=[num_embeds] + dim,
                                        initializer=initializer,
                                        regularizer=regularizer,
                                        trainable=hparams["trainable"])
        else:
            init_value = tf.cast(init_value, tf.float32)
            embedding = tf.get_variable(name='w',
                                        initializer=init_value,
                                        regularizer=regularizer,
                                        trainable=hparams["trainable"])
        return embedding
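# Usage sketch (illustrative) covering both initialization paths of
# `get_embedding`: from hparams, and from a given `init_value` (in which
# case `hparams["initializer"]` and `hparams["dim"]` are ignored, per the
# docstring). Distinct variable scopes avoid a variable-reuse clash.
import numpy as np

emb = get_embedding(hparams={"dim": 100}, num_embeds=5000,
                    variable_scope='emb_from_hparams')

init = np.random.randn(5000, 100)
emb2 = get_embedding(init_value=init, variable_scope='emb_from_value')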
def __init__(self, vocab_size=None, output_layer=None, tau=None,
             hparams=None):
    EncoderBase.__init__(self, hparams)

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Make the output layer
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._hparams.output_layer_bias,
            self.variable_scope)

        # Make attention and poswise networks
        self.graph_multihead_attention_list = []
        self.poswise_networks = []
        for i in range(self._hparams.num_blocks):
            with tf.variable_scope("layer_{}".format(i)):
                with tf.variable_scope('attention'):
                    mh_attn = GraphMultiheadAttentionEncoder(
                        self._hparams.graph_multihead_attention)
                    self.graph_multihead_attention_list.append(mh_attn)
                    if self._hparams.dim != mh_attn.hparams.output_dim:
                        raise ValueError(
                            'The "output_dim" in the hparams of '
                            '"graph_multihead_attention" should be equal '
                            'to the "dim" of '
                            'CrossGraphTransformerFixedLengthDecoder.')

                pw_net = FeedForwardNetwork(
                    hparams=self._hparams['poswise_feedforward'])
                final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                if self._hparams.dim != final_dim:
                    raise ValueError(
                        'The output dimension of "poswise_feedforward" '
                        'should be equal to the "dim" of '
                        'CrossGraphTransformerFixedLengthDecoder.')
                self.poswise_networks.append(pw_net)

        self._helper = None
        self._tau = tau
def __init__(self, hparams=None):
    EncoderBase.__init__(self, hparams)

    use_bias = self._hparams.use_bias

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        self.Q_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=use_bias,
                                       name='query')
        self.K_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=use_bias,
                                       name='key')
        self.V_dense = tf.layers.Dense(self._hparams.num_units,
                                       use_bias=use_bias,
                                       name='value')
        self.O_dense = tf.layers.Dense(self._hparams.output_dim,
                                       use_bias=use_bias,
                                       name='output')
def __init__(self, pretrained_model_name=None, cache_dir=None,
             hparams=None):
    PretrainedBase.__init__(self, pretrained_model_name, cache_dir,
                            hparams)

    if self.pretrained_model_dir:
        self._hparams = HParams(self.pretrained_model_hparams,
                                self._hparams.todict())

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Word embedding
        self.word_embedder = WordEmbedder(
            vocab_size=self._hparams.vocab_size,
            hparams=self._hparams.embed)

        # Segment embedding for each type of token
        self.segment_embedder = WordEmbedder(
            vocab_size=self._hparams.type_vocab_size,
            hparams=self._hparams.segment_embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The BERT encoder (a TransformerEncoder)
        self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

        with tf.variable_scope("pooler"):
            kwargs_i = {
                "units": self._hparams.hidden_size,
                "activation": tf.tanh
            }
            layer_hparams = {"type": "Dense", "kwargs": kwargs_i}
            self.pooler = layers.get_layer(hparams=layer_hparams)
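# Usage sketch (illustrative): this constructor appears to be Texar's
# `BertEncoder`; the class name, checkpoint name, and the
# (outputs, pooled_output) return signature are assumptions.
import tensorflow as tf

encoder = BertEncoder(pretrained_model_name="bert-base-uncased")
input_ids = tf.placeholder(tf.int32, shape=[None, 128])
outputs, pooled_output = encoder(input_ids)  # assumed _build signature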
def reset_parameters(self):
    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                get_initializer(self._hparams.initializer))
def __init__(self, pretrained_model_name=None, cache_dir=None,
             hparams=None):
    PretrainedBase.__init__(self, pretrained_model_name, cache_dir,
                            hparams)

    if self.pretrained_model_dir:
        self._hparams = HParams(self.pretrained_model_hparams,
                                self._hparams.todict())

    num_layers = self._hparams.num_layers
    use_segments = self._hparams.use_segments
    untie_r = self._hparams.untie_r

    with tf.variable_scope(self.variable_scope):
        if self._hparams.initializer:
            tf.get_variable_scope().set_initializer(
                layers.get_initializer(self._hparams.initializer))

        # Relative-attention biases: one set per layer if untied,
        # otherwise a single set shared across all layers.
        if untie_r:
            self.r_w_bias = tf.get_variable(
                'r_w_bias',
                [num_layers, self._hparams.num_heads,
                 self._hparams.head_dim],
                dtype=tf.float32)
            self.r_r_bias = tf.get_variable(
                'r_r_bias',
                [num_layers, self._hparams.num_heads,
                 self._hparams.head_dim],
                dtype=tf.float32)
        else:
            self.r_w_bias = tf.get_variable(
                'r_w_bias',
                [self._hparams.num_heads, self._hparams.head_dim],
                dtype=tf.float32)
            self.r_r_bias = tf.get_variable(
                'r_r_bias',
                [self._hparams.num_heads, self._hparams.head_dim],
                dtype=tf.float32)

        if use_segments:
            self.segment_embed = tf.get_variable(
                'seg_embed',
                [num_layers, 2, self._hparams.num_heads,
                 self._hparams.head_dim],
                dtype=tf.float32)
            if untie_r:
                self.r_s_bias = tf.get_variable(
                    'r_s_bias',
                    [num_layers, self._hparams.num_heads,
                     self._hparams.head_dim],
                    dtype=tf.float32)
            else:
                self.r_s_bias = tf.get_variable(
                    'r_s_bias',
                    [self._hparams.num_heads, self._hparams.head_dim],
                    dtype=tf.float32)
        else:
            self.segment_embed = None
            self.r_s_bias = None

        # Word embedding
        self.word_embedder = WordEmbedder(
            vocab_size=self._hparams.vocab_size,
            hparams={"dim": self._hparams.hidden_dim})

        # Position embedding
        self.pos_embed = RelativePositionalEncoding(hparams={
            "dim": self._hparams.hidden_dim,
            "max_seq_len": self._hparams.max_seq_len
        })

        self.attn_layers = []
        self.ff_layers = []
        rel_attn_hparams = dict_fetch(
            self._hparams, RelativeMutiheadAttention.default_hparams())
        rel_attn_hparams["name"] = "rel_attn"

        ff_hparams = dict_fetch(self._hparams,
                                PositionWiseFF.default_hparams())
        ff_hparams["name"] = "ff"

        for i in range(num_layers):
            with tf.variable_scope("layer_{}".format(i)):
                # Pass per-layer biases if untied, shared ones otherwise.
                if untie_r:
                    if use_segments:
                        self.attn_layers.append(RelativeMutiheadAttention(
                            self.r_r_bias[i], self.r_w_bias[i],
                            self.r_s_bias[i], self.segment_embed[i],
                            hparams=rel_attn_hparams))
                    else:
                        self.attn_layers.append(RelativeMutiheadAttention(
                            self.r_r_bias[i], self.r_w_bias[i],
                            hparams=rel_attn_hparams))
                else:
                    if use_segments:
                        self.attn_layers.append(RelativeMutiheadAttention(
                            self.r_r_bias, self.r_w_bias, self.r_s_bias,
                            self.segment_embed[i],
                            hparams=rel_attn_hparams))
                    else:
                        self.attn_layers.append(RelativeMutiheadAttention(
                            self.r_r_bias, self.r_w_bias,
                            hparams=rel_attn_hparams))
                self.ff_layers.append(PositionWiseFF(hparams=ff_hparams))

        dropout_hparams = {
            "type": "Dropout",
            "kwargs": {
                "rate": self._hparams.dropout
            }
        }
        self.dropout = layers.get_layer(hparams=dropout_hparams)

        self.mask_embed = tf.get_variable(
            'mask_emb', [1, 1, self._hparams.hidden_dim],
            dtype=tf.float32)