def test_mask_loss(self):
    def _loss(y_true, _):
        return K.sum(y_true, axis=-1)
    inputs = [keras.layers.Input((5,)), keras.layers.Input((5,))]
    embed = keras.layers.Embedding(input_dim=2, output_dim=3, mask_zero=True)(inputs[0])
    masked = Masked()([embed, inputs[1]])
    model = keras.models.Model(inputs, masked)
    model.compile(
        optimizer='sgd',
        loss=_loss,
    )
    token_input = np.array([
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
    ])
    mask_input = np.array([
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
    ])
    outputs = np.arange(30).reshape((2, 5, 3))
    if TF_KERAS:
        expected = 6.0
    else:
        expected = 30.0
    self.assertAlmostEqual(expected, model.evaluate([token_input, mask_input], outputs), places=3)
def test_mask_loss(self):
    def _loss(y_true, _):
        return K.sum(y_true, axis=-1)
    inputs = [keras.layers.Input((5,)), keras.layers.Input((5,))]
    embed = keras.layers.Embedding(input_dim=2, output_dim=3, mask_zero=True)(inputs[0])
    masked = Masked()([embed, inputs[1]])
    model = keras.models.Model(inputs, masked)
    model.compile(
        optimizer='sgd',
        loss=_loss,
    )
    token_input = np.array([
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
    ])
    mask_input = np.array([
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
    ])
    outputs = np.arange(30, dtype=K.floatx()).reshape((2, 5, 3))
    actual = model.evaluate([token_input, mask_input], outputs)
    self.assertTrue(np.abs(actual - 6.0) < 1e-6 or np.abs(actual - 30.0) < 1e-6, actual)
def test_mask_result(self):
    input_layer = keras.layers.Input(
        shape=(None,),
        name='Input',
    )
    embed_layer = keras.layers.Embedding(
        input_dim=12,
        output_dim=9,
        mask_zero=True,
        name='Embedding',
    )(input_layer)
    transformer_layer = get_encoders(
        encoder_num=1,
        input_layer=embed_layer,
        head_num=1,
        hidden_dim=12,
        attention_activation=None,
        feed_forward_activation=gelu,
        dropout_rate=0.1,
    )
    dense_layer = keras.layers.Dense(
        units=12,
        activation='softmax',
        name='Dense',
    )(transformer_layer)
    mask_layer = keras.layers.Input(
        shape=(None,),
        name='Mask',
    )
    masked_layer, mask_result = Masked(
        return_masked=True,
        name='Masked',
    )([dense_layer, mask_layer])
    print([masked_layer, mask_result])
    model = keras.models.Model(
        inputs=[input_layer, mask_layer],
        outputs=[masked_layer, mask_result],
    )
    model.compile(
        optimizer='adam',
        loss='mse',
    )
    model.summary(line_length=150)
    predicts = model.predict([
        np.asarray([
            [1, 2, 3, 4, 5, 6, 7, 8, 0, 0],
            [1, 2, 3, 4, 0, 0, 0, 0, 0, 0],
        ]),
        np.asarray([
            [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
            [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
        ]),
    ])
    expect = np.asarray([
        [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
    ])
    self.assertTrue(np.allclose(expect, predicts[1]))
def test_sample(self):
    inputs = get_inputs(seq_len=512)
    embed_layer, _ = get_embedding(inputs, token_num=12, embed_dim=768, pos_num=512)
    masked_layer = Masked(name='Masked')([embed_layer, inputs[-1]])
    model = keras.models.Model(inputs=inputs, outputs=masked_layer)
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()
    model.predict([
        np.asarray([[1] + [0] * 511]),
        np.asarray([[0] * 512]),
        np.asarray([[1] + [0] * 511]),
    ])
    self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
def test_fit(self):
    input_layer = keras.layers.Input(
        shape=(15,),
        name='Input',
    )
    embed_layer = keras.layers.Embedding(
        input_dim=12,
        output_dim=24,
        mask_zero=True,
        name='Embedding',
    )(input_layer)
    rnn_layer = keras.layers.Bidirectional(
        keras.layers.LSTM(units=100, return_sequences=True),
        name='Bi-LSTM',
    )(embed_layer)
    dense_layer = keras.layers.Dense(
        units=12,
        activation='softmax',
        name='Dense',
    )(rnn_layer)
    mask_layer = keras.layers.Input(
        shape=(None,),
        name='Mask',
    )
    masked_layer = Masked(name='Masked')([dense_layer, mask_layer])
    model = keras.models.Model(
        inputs=[input_layer, mask_layer],
        outputs=masked_layer,
    )
    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-4),
        loss=keras.losses.sparse_categorical_crossentropy,
        metrics=[keras.metrics.sparse_categorical_crossentropy],
    )
    model.summary(line_length=150)

    def _generator(batch_size=32):
        while True:
            inputs, masked, outputs = [], [], []
            for _ in range(batch_size):
                inputs.append([])
                masked.append([])
                outputs.append([])
                has_mask = False
                for i in range(1, 11):
                    inputs[-1].append(i)
                    outputs[-1].append([i])
                    if random.random() < 0.3:
                        has_mask = True
                        inputs[-1][-1] = 11
                        masked[-1].append(1)
                    else:
                        masked[-1].append(0)
                if not has_mask:
                    masked[-1][0] = 1
                inputs[-1] += [0] * (15 - len(inputs[-1]))
                masked[-1] += [0] * (15 - len(masked[-1]))
                outputs[-1] += [[0]] * (15 - len(outputs[-1]))
            yield [np.asarray(inputs), np.asarray(masked)], np.asarray(outputs)

    model.fit_generator(
        generator=_generator(),
        steps_per_epoch=1000,
        epochs=10,
        validation_data=_generator(),
        validation_steps=100,
        callbacks=[
            keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
        ],
    )
    for inputs, outputs in _generator(batch_size=32):
        predicts = model.predict(inputs)
        actual = np.argmax(predicts, axis=-1)
        for i in range(32):
            for j in range(15):
                if inputs[1][i][j]:
                    self.assertEqual(j + 1, actual[i][j])
        break
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.

    See: https://arxiv.org/pdf/1909.11942.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len,), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len,), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')([embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build shared transformer
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(
        units=feed_forward_dim,
        activation=feed_forward_activation,
        name='Feed-Forward',
    )
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)
        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [transformed_layers[index] for index in output_layers]
            output = keras.layers.Concatenate(name='Output')(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
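# Usage sketch (illustrative only, not part of the original module): assuming
# `build_albert` above is importable together with its layer dependencies, the helper
# below builds a small ALBERT pre-training model with MLM and SOP heads and prints its
# summary. All hyper-parameter values here are placeholders chosen for the example.
def _example_build_albert_for_pretraining():
    model = build_albert(
        token_num=30000,        # vocabulary size (assumed for this example)
        seq_len=128,
        pos_num=128,
        embed_dim=128,          # factorized embedding size
        hidden_dim=768,
        transformer_num=12,     # number of times the shared encoder block is applied
        head_num=12,
        feed_forward_dim=3072,
        dropout_rate=0.1,
        training=True,          # return a model with MLM and SOP outputs
    )
    model.summary(line_length=150)
    return model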
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              use_task_embed=False,
              task_num=10,
              use_adapter=False,
              adapter_units=None):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `training` is `False`.
    :param use_task_embed: Whether to add task embeddings to existing embeddings.
    :param task_num: The number of tasks.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in the feed-forward adapter.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training
    if adapter_units is None:
        adapter_units = max(1, embed_dim // 100)

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    x, s, m = inputs
    x = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Token-Reshape')(x)
    s = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Segment-Reshape')(s)
    m = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Mention-Reshape')(m)
    embed_layer, embed_weights = get_embedding(
        [x, s, m],
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if use_task_embed:
        task_input = keras.layers.Input(
            shape=(1,),
            name='Input-Task',
        )
        embed_layer = TaskEmbedding(
            input_dim=task_num,
            output_dim=embed_dim,
            mask_zero=False,
            name='Embedding-Task',
        )([embed_layer, task_input])
        inputs = inputs[:2] + [task_input, inputs[-1]]
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=gelu,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')([mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1
            layer = model.get_layer(name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed
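# Usage sketch (illustrative only, not part of the original module): assuming `get_model`
# above is importable, the helper below builds the feature-extraction variant
# (`training=False`), which returns the input layers and the feature output so a plain
# Keras model can be wrapped around them. The argument values are placeholders for the
# example, not recommended settings.
def _example_get_bert_features():
    inputs, output_layer = get_model(
        token_num=30000,        # vocabulary size (assumed for this example)
        seq_len=512,
        pos_num=512,
        embed_dim=768,
        transformer_num=12,
        head_num=12,
        feed_forward_dim=3072,
        training=False,         # return (inputs, feature layer) instead of a training model
        trainable=False,
        output_layer_num=2,     # concatenate the outputs of the last two encoder layers
    )
    model = keras.models.Model(inputs=inputs, outputs=output_layer)
    model.summary(line_length=150)
    return model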