def get_model(n_vocab,
              n_ctx=1024,
              n_embd=768,
              n_head=12,
              n_layer=12,
              fixed_input_shape=False):  # needed for TPU training
    """Get basic GPT-2 model.

    :param n_vocab: Number of vocabulary tokens.
    :param n_ctx: The length of each input.
    :param n_embd: The dimension of embeddings.
    :param n_head: Number of heads in transformer.
    :param n_layer: Number of transformer blocks.
    :param fixed_input_shape: Whether the length of input is fixed.
                              (Needed for TPU training)
    :return: The model.
    """
    if fixed_input_shape:
        input_layer_shape = (n_ctx,)
    else:
        input_layer_shape = (None,)
    input_layer = keras.layers.Input(
        shape=input_layer_shape,
        name='Input',
    )

    embed_token, embeddings = EmbeddingRet(
        input_dim=n_vocab,
        output_dim=n_embd,
        mask_zero=False,
        name='Embed-Token',
    )(input_layer)
    embed_token_pos = PositionEmbedding(
        input_dim=n_ctx,
        output_dim=n_embd,
        mode=PositionEmbedding.MODE_ADD,
        name='Embed-Token-Pos',
    )(embed_token)

    last_layer = embed_token_pos
    for i in range(n_layer):
        last_layer = _get_encoder_component(
            name='Encode-%d' % i,
            input_layer=last_layer,
            head_num=n_head,
            hidden_dim=n_embd * 4,
            attention_activation=None,
            feed_forward_activation=gelu,
        )

    norm_layer = LayerNormalization(name='Norm')(last_layer)

    output_layer = EmbeddingSim(
        use_bias=False,
        name='Output',
    )([norm_layer, embeddings])

    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.sparse_categorical_crossentropy,
    )
    return model
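# Hypothetical usage sketch for the `get_model` above (not from the original
# source): it assumes `keras`, `EmbeddingRet`, `PositionEmbedding`,
# `LayerNormalization`, `_get_encoder_component`, and `gelu` are importable in
# the same module. It only illustrates the expected input/output shapes.
import numpy as np

gpt2 = get_model(n_vocab=1000, n_ctx=64, n_embd=128, n_head=4, n_layer=2)
tokens = np.random.randint(0, 1000, size=(2, 64))   # (batch, sequence)
probs = gpt2.predict(tokens)                         # (batch, sequence, n_vocab)
assert probs.shape == (2, 64, 1000)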
def test_no_mask(self):
    input_layer = keras.layers.Input(shape=(None,), name='Input')
    embed, embed_weights = EmbeddingRet(
        input_dim=20,
        output_dim=100,
        name='Embedding',
    )(input_layer)
    output_layer = EmbeddingSim(
        name='Embed-Sim',
    )([embed, embed_weights])
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='mse')
    model_path = os.path.join(
        tempfile.gettempdir(),
        'test_embed_sim_%f.h5' % np.random.random(),
    )
    model.save(model_path)
    model = keras.models.load_model(
        model_path,
        custom_objects=get_custom_objects(),
    )
    model.summary(line_length=100)
    batch_inputs = np.random.randint(low=0, high=19, size=(32, 100))
    batch_outputs = model.predict(batch_inputs)
    batch_outputs = np.argmax(batch_outputs, axis=-1)
    self.assertEqual(batch_inputs.tolist(), batch_outputs.tolist())
def get_model(token_num,
              embed_dim,
              encoder_num,
              decoder_num,
              head_num,
              hidden_dim,
              attention_activation=None,
              feed_forward_activation='relu',
              dropout_rate=0.0,
              use_same_embed=True,
              embed_weights=None,
              embed_trainable=None,
              trainable=True):
    """Get full model without compilation.

    :param token_num: Number of distinct tokens.
    :param embed_dim: Dimension of token embedding.
    :param encoder_num: Number of encoder components.
    :param decoder_num: Number of decoder components.
    :param head_num: Number of heads in multi-head self-attention.
    :param hidden_dim: Hidden dimension of feed forward layer.
    :param attention_activation: Activation for multi-head self-attention.
    :param feed_forward_activation: Activation for feed-forward layer.
    :param dropout_rate: Dropout rate.
    :param use_same_embed: Whether to use the same token embedding layer.
                           `token_num`, `embed_weights` and `embed_trainable`
                           should be lists of two elements if it is False.
    :param embed_weights: Initial weights of token embedding.
    :param embed_trainable: Whether the token embedding is trainable. It will
                            automatically be set to False if embedding weights
                            are provided and the given value is None.
    :param trainable: Whether the layers are trainable.
    :return: Keras model.
    """
    if not isinstance(token_num, list):
        token_num = [token_num, token_num]
    encoder_token_num, decoder_token_num = token_num

    if not isinstance(embed_weights, list):
        embed_weights = [embed_weights, embed_weights]
    encoder_embed_weights, decoder_embed_weights = embed_weights
    if encoder_embed_weights is not None:
        encoder_embed_weights = [encoder_embed_weights]
    if decoder_embed_weights is not None:
        decoder_embed_weights = [decoder_embed_weights]

    if not isinstance(embed_trainable, list):
        embed_trainable = [embed_trainable, embed_trainable]
    encoder_embed_trainable, decoder_embed_trainable = embed_trainable
    if encoder_embed_trainable is None:
        encoder_embed_trainable = encoder_embed_weights is None
    if decoder_embed_trainable is None:
        decoder_embed_trainable = decoder_embed_weights is None

    if use_same_embed:
        encoder_embed_layer = decoder_embed_layer = EmbeddingRet(
            input_dim=encoder_token_num,
            output_dim=embed_dim,
            mask_zero=True,
            weights=encoder_embed_weights,
            trainable=encoder_embed_trainable,
            name='Token-Embedding',
        )
    else:
        encoder_embed_layer = EmbeddingRet(
            input_dim=encoder_token_num,
            output_dim=embed_dim,
            mask_zero=True,
            weights=encoder_embed_weights,
            trainable=encoder_embed_trainable,
            name='Encoder-Token-Embedding',
        )
        decoder_embed_layer = EmbeddingRet(
            input_dim=decoder_token_num,
            output_dim=embed_dim,
            mask_zero=True,
            weights=decoder_embed_weights,
            trainable=decoder_embed_trainable,
            name='Decoder-Token-Embedding',
        )

    encoder_input = keras.layers.Input(shape=(None,), name='Encoder-Input')
    encoder_embed = TrigPosEmbedding(
        mode=TrigPosEmbedding.MODE_ADD,
        name='Encoder-Embedding',
    )(encoder_embed_layer(encoder_input)[0])
    encoded_layer = get_encoders(
        encoder_num=encoder_num,
        input_layer=encoder_embed,
        head_num=head_num,
        hidden_dim=hidden_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        trainable=trainable,
    )

    decoder_input = keras.layers.Input(shape=(None,), name='Decoder-Input')
    decoder_embed, decoder_embed_weights = decoder_embed_layer(decoder_input)
    decoder_embed = TrigPosEmbedding(
        mode=TrigPosEmbedding.MODE_ADD,
        name='Decoder-Embedding',
    )(decoder_embed)
    decoded_layer = get_decoders(
        decoder_num=decoder_num,
        input_layer=decoder_embed,
        encoded_layer=encoded_layer,
        head_num=head_num,
        hidden_dim=hidden_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        trainable=trainable,
    )

    dense_layer = EmbeddingSim(
        trainable=trainable,
        name='Output',
    )([decoded_layer, decoder_embed_weights])
    return keras.models.Model(inputs=[encoder_input, decoder_input], outputs=dense_layer)
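# Hypothetical usage sketch (not from the original source) for the seq2seq
# `get_model` above, assuming the keras-transformer style helpers
# (`EmbeddingRet`, `EmbeddingSim`, `TrigPosEmbedding`, `get_encoders`,
# `get_decoders`) are available. Token index 0 is reserved for padding because
# the embeddings use `mask_zero=True`.
import numpy as np

seq2seq = get_model(
    token_num=200,
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.1,
)
seq2seq.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

encoder_tokens = np.random.randint(1, 200, size=(8, 12))   # source sequences
decoder_tokens = np.random.randint(1, 200, size=(8, 12))   # shifted target sequences
targets = np.expand_dims(decoder_tokens, axis=-1)          # (batch, len, 1) labels
seq2seq.fit([encoder_tokens, decoder_tokens], targets, epochs=1, verbose=0)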
def build_xlnet(units,
                training,
                num_token,
                num_block,
                num_head,
                hidden_dim,
                batch_size,
                memory_len,
                target_len,
                permute=None,
                mask_index=Tokenizer.SYM_PAD,
                dropout=0.0,
                attention_dropout=0.0,
                attention_type=ATTENTION_TYPE_BI,
                clamp_len=None,
                shared_biases=True):
    """Build XLNet.

    :param units: Hidden dimensions throughout the model.
    :param training: Whether in training mode.
    :param num_token: Number of distinct tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param permute: Whether to enable permutation.
    :param mask_index: The index of padding.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param attention_type: 'uni' or 'bi'.
    :param clamp_len: The maximum value of relative position.
    :param shared_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    if permute is None:
        permute = training
    token_input = keras.layers.Input(
        shape=(target_len,),
        name='Input-Token',
    )
    seg_input = keras.layers.Input(
        shape=(target_len,),
        name='Input-Segment',
    )
    memory_length_input = keras.layers.Input(
        shape=(1,),
        name='Input-Memory-Length',
    )
    inputs = [token_input, seg_input, memory_length_input]
    if training:
        query_input = keras.layers.Input(
            shape=(target_len,),
            name='Input-Mask',
        )
        inputs.append(query_input)
    else:
        query_input = None

    token_embed, embed_weights = EmbeddingRet(
        input_dim=num_token,
        output_dim=units,
        mask_zero=mask_index == 0,
        name='Embed-Token',
    )(token_input)
    if mask_index is not None and mask_index != 0:
        masking = CreateMask(
            mask_value=mask_index,
            name='Masking',
        )(token_input)
        token_embed = RestoreMask(name='Embed-Token-Masked')([token_embed, masking])
    if training:
        mask_embed = MaskEmbedding(
            units=units,
            name='Embed-Mask',
        )([token_embed, query_input])
    else:
        mask_embed = None
    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(
            rate=dropout,
            name='Embed-Token-Dropout',
        )(token_embed)
        if training:
            mask_embed = keras.layers.Dropout(
                rate=dropout,
                name='Embed-Mask-Dropout',
            )(mask_embed)

    memories = [Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])]

    pos_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        directional=attention_type == 'uni',
        name='Embed-Pos',
    )([token_embed, memories[0]])

    content_mask, query_mask = PermutationMask(
        enabled=permute,
        directional=attention_type == 'uni',
        name='Permutation',
    )([token_embed, memories[0]])

    context_bias, relative_bias, segment_bias = None, None, None
    if shared_biases:
        context_bias, relative_bias = RelativeBias(
            units,
            name='Relative-Bias',
        )(memories[0])
        segment_bias = SegmentBias(
            units,
            name='Segment-Bias',
        )(memories[0])

    content_output, query_output = token_embed, None
    if training:
        query_output = mask_embed

    for i in range(num_block):
        if not shared_biases:
            context_bias, relative_bias = RelativeBias(
                units,
                name='Relative-Bias-{}'.format(i + 1),
            )(memories[i])
            segment_bias = SegmentBias(
                units,
                name='Segment-Bias-{}'.format(i + 1),
            )(memories[i])

        segment_mat, segment_embed = RelativeSegmentEmbedding(
            units=units,
            name='Embed-Segment-{}'.format(i + 1),
        )([seg_input, memories[i]])

        attention = Attention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )
        if 0.0 < dropout < 1.0:
            attention_dropout_layer = keras.layers.Dropout(
                rate=dropout,
                name='Attention-Dropout-{}'.format(i + 1),
            )
        else:
            attention_dropout_layer = None
        attention_add = keras.layers.Add(name='Attention-Residual-{}'.format(i + 1))
        attention_layer_norm = LayerNormalization(name='Attention-Normal-{}'.format(i + 1))

        feed_forward = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            activation=gelu,
            name='FeedForward-{}'.format(i + 1),
        )
        if 0.0 < dropout < 1.0:
            feed_forward_dropout = keras.layers.Dropout(
                rate=dropout,
                name='FeedForward-Dropout-{}'.format(i + 1),
            )
        else:
            feed_forward_dropout = None
        feed_forward_add = keras.layers.Add(name='FeedForward-Residual-{}'.format(i + 1))
        feed_forward_layer_norm = LayerNormalization(name='FeedForward-Normal-{}'.format(i + 1))

        content = content_output

        def _build_output(query, mask):
            attention_input = query
            _output = attention([
                query, content, memories[i],
                segment_mat, segment_embed, pos_embed,
                context_bias, relative_bias, segment_bias,
                mask,
            ])
            if attention_dropout_layer is not None:
                _output = attention_dropout_layer(_output)
            _output = attention_add([attention_input, _output])
            _output = attention_layer_norm(_output)

            feed_forward_input = _output
            _output = feed_forward(_output)
            if feed_forward_dropout is not None:
                _output = feed_forward_dropout(_output)
            _output = feed_forward_add([feed_forward_input, _output])
            _output = feed_forward_layer_norm(_output)
            return _output

        content_output = _build_output(content_output, content_mask)
        if training:
            query_output = _build_output(query_output, query_mask)

        if i < num_block - 1:
            memories.append(Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([content_output, memory_length_input]))

    if training:
        output = EmbeddingSim(name='Softmax')([query_output, embed_weights])
    else:
        output = content_output
    model = keras.models.Model(
        inputs=inputs,
        outputs=output,
    )
    return model
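# Hypothetical usage sketch (not from the original source) for `build_xlnet`
# above in inference mode; the custom layers (`EmbeddingRet`, `Memory`,
# `PositionalEmbedding`, ...) and the `ATTENTION_TYPE_BI` / `Tokenizer`
# constants are assumed to come from the same keras-xlnet style package.
import numpy as np

xlnet = build_xlnet(
    units=64,
    training=False,          # no query stream, so the model outputs hidden states
    num_token=1000,
    num_block=2,
    num_head=4,
    hidden_dim=128,
    batch_size=4,
    memory_len=16,
    target_len=8,
)
tokens = np.random.randint(1, 1000, size=(4, 8))
segments = np.zeros((4, 8))
memory_length = np.zeros((4, 1))
hidden = xlnet.predict_on_batch([tokens, segments, memory_length])
print(hidden.shape)          # expected: (4, 8, 64)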
def get_model(token_num,
              embed_dim,
              encoder_num,
              decoder_num,
              head_num,
              hidden_dim,
              num_classes,
              add_new_node,
              attention_activation=None,
              feed_forward_activation='relu',
              dropout_rate=0.0,
              use_same_embed=True,
              embed_weights=None,
              embed_trainable=None,
              trainable=True,
              use_adapter=False,
              adapter_units=None,
              adapter_activation='relu'):
    """Get the encoder-decoder model with a softmax classification head."""
    if not isinstance(token_num, list):
        token_num = [token_num, token_num]
    encoder_token_num, decoder_token_num = token_num

    if not isinstance(embed_weights, list):
        embed_weights = [embed_weights, embed_weights]
    encoder_embed_weights, decoder_embed_weights = embed_weights
    if encoder_embed_weights is not None:
        encoder_embed_weights = [encoder_embed_weights]
    if decoder_embed_weights is not None:
        decoder_embed_weights = [decoder_embed_weights]

    if not isinstance(embed_trainable, list):
        embed_trainable = [embed_trainable, embed_trainable]
    encoder_embed_trainable, decoder_embed_trainable = embed_trainable
    if encoder_embed_trainable is None:
        encoder_embed_trainable = encoder_embed_weights is None
    if decoder_embed_trainable is None:
        decoder_embed_trainable = decoder_embed_weights is None

    if use_same_embed:
        encoder_embed_layer = decoder_embed_layer = EmbeddingRet(
            input_dim=encoder_token_num,
            output_dim=embed_dim,
            mask_zero=True,
            weights=encoder_embed_weights,
            trainable=encoder_embed_trainable,
            name='Token-Embedding',
        )
    else:
        encoder_embed_layer = EmbeddingRet(
            input_dim=encoder_token_num,
            output_dim=embed_dim,
            mask_zero=True,
            weights=encoder_embed_weights,
            trainable=encoder_embed_trainable,
            name='Encoder-Token-Embedding',
        )
        decoder_embed_layer = EmbeddingRet(
            input_dim=decoder_token_num,
            output_dim=embed_dim,
            mask_zero=True,
            weights=decoder_embed_weights,
            trainable=decoder_embed_trainable,
            name='Decoder-Token-Embedding',
        )

    encoder_input = keras.layers.Input(shape=(None,), name='Encoder-Input')
    encoder_embed = TrigPosEmbedding(
        mode=TrigPosEmbedding.MODE_ADD,
        name='Encoder-Embedding',
    )(encoder_embed_layer(encoder_input)[0])
    encoded_layer = get_encoders(
        encoder_num=encoder_num,
        input_layer=encoder_embed,
        head_num=head_num,
        hidden_dim=hidden_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        trainable=trainable,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=adapter_activation,
    )

    decoder_input = keras.layers.Input(shape=(None,), name='Decoder-Input')
    decoder_embed, decoder_embed_weights = decoder_embed_layer(decoder_input)
    decoder_embed = TrigPosEmbedding(
        mode=TrigPosEmbedding.MODE_ADD,
        name='Decoder-Embedding',
    )(decoder_embed)
    decoded_layer = get_decoders(
        decoder_num=decoder_num,
        input_layer=decoder_embed,
        encoded_layer=encoded_layer,
        head_num=head_num,
        hidden_dim=hidden_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        trainable=trainable,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=adapter_activation,
    )

    # Token-level output tied to the decoder embedding; kept for reference,
    # the returned model uses the classification head below.
    dense_layer = EmbeddingSim(
        trainable=trainable,
        name='normal_end',
    )([decoded_layer, decoder_embed_weights])

    # Classification head; reserve one extra unit when a new node is added.
    if add_new_node:
        print("add new node")
        dense = Dense(units=num_classes + 1, activation='softmax')(decoded_layer)
    else:
        dense = Dense(units=num_classes, activation='softmax')(decoded_layer)

    # return keras.models.Model(inputs=[encoder_input], outputs=dense)
    return keras.models.Model(inputs=[encoder_input, decoder_input], outputs=dense)
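# Hypothetical usage sketch (not from the original source) for the
# classification variant above: the returned model still takes both encoder and
# decoder token sequences, but its output comes from the softmax `Dense` head
# applied to every decoder step (`num_classes + 1` units when `add_new_node`
# is True).
import numpy as np

clf_model = get_model(
    token_num=200,
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    num_classes=5,
    add_new_node=False,
)
clf_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
encoder_tokens = np.random.randint(1, 200, size=(8, 12))
decoder_tokens = np.random.randint(1, 200, size=(8, 12))
class_probs = clf_model.predict([encoder_tokens, decoder_tokens])
print(class_probs.shape)    # expected: (8, 12, 5), one prediction per decoder step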
def get_model(n_vocab,
              n_ctx=1024,
              n_embd=768,
              n_head=12,
              n_layer=12,
              batch_size=None,
              fixed_input_shape=False):
    """Get basic GPT-2 model.

    :param n_vocab: Number of vocabulary tokens.
    :param n_ctx: The length of each input.
    :param n_embd: The dimension of embeddings.
    :param n_head: Number of heads in transformer.
    :param n_layer: Number of transformer blocks.
    :param batch_size: Batch size of the model.
    :param fixed_input_shape: Whether the length of input is fixed.
                              (Needed for TPU training)
    :return: The model.
    """
    if fixed_input_shape:
        input_layer_shape = (batch_size, n_ctx)
    else:
        input_layer_shape = (batch_size, None)

    lm_input_layer = tf.keras.layers.Input(
        batch_shape=input_layer_shape,
        name='LMInput',
    )
    mc_input_layer = tf.keras.layers.Input(
        batch_shape=(batch_size,),
        name='MCInput',
    )

    embed_token, embeddings = EmbeddingRet(
        input_dim=n_vocab,
        output_dim=n_embd,
        mask_zero=False,
        name='Embed-Token',
    )(lm_input_layer)
    embed_token_pos = PositionEmbedding(
        input_dim=n_ctx,
        output_dim=n_embd,
        mode=PositionEmbedding.MODE_ADD,
        name='Embed-Token-Pos',
    )(embed_token)

    last_layer = embed_token_pos
    for i in range(n_layer):
        last_layer = _get_encoder_component(
            name='Encode-%d' % i,
            input_layer=last_layer,
            head_num=n_head,
            hidden_dim=n_embd * 4,
            attention_activation=None,
            feed_forward_activation=gelu,
        )

    norm_layer = LayerNormalization(name='Norm')(last_layer)

    # Language-modeling head: token probabilities tied to the embedding weights.
    lm_head = EmbeddingSim(
        use_bias=False,
        name='LMOutput',
    )([norm_layer, embeddings])

    # Multiple-choice head: summarize the sequence, then project to one score.
    mc_sequence_summary = SequenceSummary(name='MCSequenceSummary')(
        [norm_layer, mc_input_layer])
    mc_linear = Dense(units=1, input_shape=(n_embd,), name='MCDense')(mc_sequence_summary)
    mc_head = Dropout(rate=0.1, name='MCOutput')(mc_linear)

    losses = {
        'LMOutput': lm_loss_function,
        'MCOutput': mc_loss_function,
    }
    loss_weights = {'LMOutput': 2.0, 'MCOutput': 1.0}
    metrics = {'LMOutput': get_metrics(), 'MCOutput': get_metrics(is_mc=True)}

    model = tf.keras.models.Model(
        inputs=[lm_input_layer, mc_input_layer],
        outputs=[lm_head, mc_head],
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(clipnorm=1.),
        loss=losses,
        loss_weights=loss_weights,
        # metrics=metrics
    )
    return model
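# Hypothetical usage sketch (not from the original source) for the double-head
# `get_model` above; it assumes `SequenceSummary`, `lm_loss_function`,
# `mc_loss_function`, and `get_metrics` exist in the same module, and that
# `MCInput` carries the index of the token whose hidden state summarizes each
# sequence for the multiple-choice head.
import numpy as np

dual = get_model(n_vocab=1000, n_ctx=32, n_embd=64, n_head=4, n_layer=2)
tokens = np.random.randint(0, 1000, size=(2, 32))   # (batch, sequence) for the LM head
cls_positions = np.full((2,), 31)                   # last-token index per example
lm_probs, mc_scores = dual.predict([tokens, cls_positions])
print(lm_probs.shape, mc_scores.shape)              # e.g. (2, 32, 1000) and (2, 1)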