Example #1
 def load_model(self):
     # Drop any graph state left over from a previous model.
     tf.keras.backend.clear_session()
     logging.info("Loading RuBERT model...")
     paths = get_checkpoint_paths("model_bert")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint,
         seq_len=50)
     # Max-pool over the unmasked positions to get one vector per sentence.
     outputs = MaskedGlobalMaxPool1D(name="Pooling")(inputs.output)
     vocab = load_vocabulary(paths.vocab)
     return tf.keras.Model(inputs=inputs.inputs,
                           outputs=outputs), vocab, Tokenizer(vocab)
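The snippet omits its imports. Assuming the helpers come from the keras-bert package (an assumption, since the source does not show the header), the imports would look roughly like this:

import logging
import tensorflow as tf
from keras_bert import (get_checkpoint_paths, load_trained_model_from_checkpoint,
                        load_vocabulary, Tokenizer)
from keras_bert.layers import MaskedGlobalMaxPool1D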
Example #2
 def __init__(self, docs, vec):
     self.texts = np.array(docs)
     self.vec = vec
     paths = get_checkpoint_paths(".")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint,
         seq_len=50)
     outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
     self.model = Model(inputs=inputs.inputs, outputs=outputs)
     self.vocab = load_vocabulary(paths.vocab)
     self.tokenizer = Tokenizer(self.vocab)
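Given an instance built this way, pooled sentence embeddings come from encoding the texts and running a single predict. A minimal sketch (the method name embed is hypothetical, not from the source):

 def embed(self, texts):
     # Encode each text into (token ids, segment ids) at the fixed length 50.
     encoded = [self.tokenizer.encode(t, max_len=50) for t in texts]
     indices = np.array([e[0] for e in encoded])
     segments = np.array([e[1] for e in encoded])
     # One pooled vector per input text.
     return self.model.predict([indices, segments])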
Example #3
def build_bert(model, poolings=None, output_layer_num=1):
    """Build a model that maps token inputs to pooled BERT embeddings.

    :param model: Path to the checkpoint or a built model without MLM and NSP.
    :param poolings: Pooling methods. Word embeddings will be returned if it is None.
                     Otherwise concatenated pooled embeddings will be returned.
    :param output_layer_num: The number of layers whose outputs will be concatenated as a single output.
                             Only available when `model` is a path to a checkpoint.
    :return: The model inputs and the pooled output tensor.
    """
    # NOTE: the `model` argument is immediately overridden with the downloaded
    # multi-cased base checkpoint path, so the parameter is effectively ignored.
    model = get_pretrained(PretrainedList.multi_cased_base)
    if isinstance(model, (str, type(u''))):
        paths = get_checkpoint_paths(model)
        model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=output_layer_num,
        )

    outputs = []

    if poolings is not None:
        if isinstance(poolings, (str, type(u''))):
            poolings = [poolings]
        for pooling in poolings:
            if pooling == POOL_NSP:
                outputs.append(
                    Extract(index=0, name='Pool-NSP')(model.outputs[0]))
            elif pooling == POOL_MAX:
                outputs.append(
                    MaskedGlobalMaxPool1D(name='Pool-Max')(model.outputs[0]))
            elif pooling == POOL_AVE:
                outputs.append(
                    keras.layers.GlobalAvgPool1D(name='Pool-Ave')(
                        model.outputs[0]))
            else:
                raise ValueError('Unknown pooling method: {}'.format(pooling))
        if len(outputs) == 1:
            outputs = outputs[0]
        else:
            outputs = keras.layers.Concatenate(name='Concatenate')(outputs)
        # `bert_output_sum` is a project-specific helper, not shown in this snippet.
        outputs = Lambda(bert_output_sum)(outputs)
    else:
        # No pooling requested: fall back to the raw word embeddings,
        # as the docstring promises.
        outputs = model.outputs[0]
    return model.inputs, outputs
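Wiring the returned tensors into a usable model might look like the following sketch (assuming keras, the keras_bert pooling constants POOL_NSP/POOL_MAX, and the project's bert_output_sum helper are all in scope; this call is not part of the original source):

inputs, outputs = build_bert('.', poolings=[POOL_NSP, POOL_MAX])
pooled_model = keras.models.Model(inputs=inputs, outputs=outputs)
pooled_model.summary()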
Example #4
 def test_masked_global_max_pool_1d_predict(self):
     embed = np.random.standard_normal((11, 13))
     input_layer = keras.layers.Input(shape=(None, ))
     embed_layer = keras.layers.Embedding(
         input_dim=11,
         output_dim=13,
         mask_zero=True,
         weights=[embed],
     )(input_layer)
     pool_layer = MaskedGlobalMaxPool1D()(embed_layer)
     model = keras.models.Model(inputs=input_layer, outputs=pool_layer)
     model.compile(optimizer='adam', loss='mse')
     # Token id 0 is masked (mask_zero=True) and ignored by the pooling.
     x = np.array([[1, 2, 0, 0], [2, 3, 4, 0]])
     y = model.predict(x)
     self.assertTrue(np.allclose(np.max(embed[1:3], axis=0), y[0]))
     self.assertTrue(np.allclose(np.max(embed[2:5], axis=0), y[1]))
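The test snippets in Examples #4 through #7 also omit their imports. Assuming they exercise the layers shipped with keras-bert, a plausible header (an assumption, not shown in the source) is:

import numpy as np
import keras  # or `from tensorflow import keras`, depending on the installed backend
from keras_bert.layers import MaskedConv1D, MaskedGlobalMaxPool1D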
Example #5
 def test_masked_global_max_pool_1d_fit(self):
     input_layer = keras.layers.Input(shape=(None, ))
     embed_layer = keras.layers.Embedding(
         input_dim=11,
         output_dim=13,
         mask_zero=False,  # no mask here, so the layer acts like a plain global max pool
     )(input_layer)
     pool_layer = MaskedGlobalMaxPool1D()(embed_layer)
     dense_layer = keras.layers.Dense(units=2,
                                      activation='softmax')(pool_layer)
     model = keras.models.Model(inputs=input_layer, outputs=dense_layer)
     model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
     model.summary()
     x = np.random.randint(0, 11, (32, 7))
     y = np.random.randint(0, 2, (32, ))
     model.fit(x, y)
Example #6
 def test_masked_global_max_pool_1d_predict(self):
     input_layer = keras.layers.Input(shape=(None, ))
     embed_layer = keras.layers.Embedding(input_dim=5,
                                          output_dim=6,
                                          mask_zero=True,
                                          name='Embed')(input_layer)
     pool_layer = MaskedGlobalMaxPool1D()(embed_layer)
     model = keras.models.Model(inputs=input_layer, outputs=pool_layer)
     model.compile(optimizer='adam', loss='mse')
     x = np.array([[1, 2, 0, 0], [2, 3, 4, 0]])
     y = model.predict(x)
     embed = model.get_layer('Embed').get_weights()[0]
     expected = np.max(embed[1:3], axis=0)
     self.assertTrue(np.allclose(expected, y[0]), (expected, y[0]))
     expected = np.max(embed[2:5], axis=0)
     self.assertTrue(np.allclose(expected, y[1]), (expected, y[1]))
Example #7
 def test_masked_conv_1d_fit(self):
     input_layer = keras.layers.Input(shape=(None,))
     embed_layer = keras.layers.Embedding(
         input_dim=11,
         output_dim=13,
         mask_zero=True,
     )(input_layer)
     # MaskedConv1D zeroes masked steps and passes the mask on, so pooling still skips padding.
     conv_layer = MaskedConv1D(filters=7, kernel_size=3, padding='same')(embed_layer)
     pool_layer = MaskedGlobalMaxPool1D()(conv_layer)
     dense_layer = keras.layers.Dense(units=2, activation='softmax')(pool_layer)
     model = keras.models.Model(inputs=input_layer, outputs=dense_layer)
     model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
     model.summary()
     x = np.array(np.random.randint(0, 11, (32, 7)).tolist() * 100)
     y = np.array(np.random.randint(0, 2, (32,)).tolist() * 100)
     model.fit(x, y, epochs=10)
     y_hat = model.predict(x).argmax(axis=-1)
     self.assertEqual(y.tolist(), y_hat.tolist())
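A useful property to note: because MaskedConv1D zeroes out masked positions before convolving and forwards the mask, appending extra padding should leave the pooled vector unchanged. A quick check, sketched under the same assumed imports as the tests above:

input_layer = keras.layers.Input(shape=(None,))
embed_layer = keras.layers.Embedding(input_dim=11, output_dim=13, mask_zero=True)(input_layer)
conv_layer = MaskedConv1D(filters=7, kernel_size=3, padding='same')(embed_layer)
pool_layer = MaskedGlobalMaxPool1D()(conv_layer)
model = keras.models.Model(inputs=input_layer, outputs=pool_layer)
# The padded batch differs only in trailing zeros, which are masked.
short = model.predict(np.array([[1, 2, 3]]))
padded = model.predict(np.array([[1, 2, 3, 0, 0]]))
assert np.allclose(short, padded, atol=1e-5)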
Example #8
def link_model():
    # NOTE: relies on several project-specific names defined elsewhere:
    # embedding_matrix_entity, StateMix, seq_and_vec, and metrics_f1.

    input_en = Input(shape=(13, ), name='kb_en')
    input_begin = Input(shape=(13, ), name='begin')
    input_end = Input(shape=(13, ), name='end')
    bert_path = 'bert_model/'
    config_path = bert_path + 'bert_config.json'
    checkpoint_path = bert_path + 'bert_model.ckpt'
    bert_model = load_trained_model_from_checkpoint(config_path,
                                                    checkpoint_path,
                                                    trainable=True,
                                                    seq_len=52)
    entity_embedding = Embedding(input_dim=312452,
                                 output_dim=768,
                                 weights=[embedding_matrix_entity],
                                 trainable=True,
                                 name='entity_embedding')
    men_sen = bert_model.output
    mask_sen = Lambda(lambda x: K.cast(K.greater(x, 0), 'float32'))(
        bert_model.input[0])
    men_sen = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [men_sen, mask_sen])
    men_sen = SpatialDropout1D(0.15)(men_sen)
    [forward, backward] = Bidirectional(CuDNNGRU(128, return_sequences=True),
                                        merge_mode=None)(men_sen, mask=None)
    gru = concatenate([forward, backward], axis=-1)
    max_x = MaskedGlobalMaxPool1D()(gru)
    x = StateMix()([input_begin, input_end, forward, backward])
    t_dim = K.int_shape(x)[-1]
    x = Lambda(seq_and_vec, output_shape=(13, t_dim * 2))([x, max_x])
    mask = Lambda(lambda x: K.cast(K.greater(x, -1), 'float32'))(input_begin)
    kb_en = entity_embedding(input_en)
    x = concatenate([kb_en, x], axis=-1)
    x = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([x, mask])
    x = Dropout(0.1)(x)
    x = Conv1D(128, 1, activation='relu', padding='same')(x)
    # x = Dense(units=128, activation='relu')(x)
    x = TimeDistributed(Dropout(0.1))(x)
    x = Dense(units=1, activation='sigmoid')(x)
    model = Model(bert_model.inputs + [input_en, input_begin, input_end], x)
    model.compile(optimizer=adam(),
                  loss=binary_crossentropy,
                  metrics=[metrics_f1])
    return model
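The helpers this snippet leaves undefined (seq_and_vec, StateMix, metrics_f1, embedding_matrix_entity) come from the surrounding project. In similar information-extraction codebases, seq_and_vec tiles a pooled vector along the time axis and concatenates it with the sequence; a hedged sketch of that convention (an assumption, not taken from this source):

def seq_and_vec(x):
    # x = [seq of shape (batch, time, d1), vec of shape (batch, d2)]
    seq, vec = x
    vec = K.expand_dims(vec, 1)              # (batch, 1, d2)
    vec = K.zeros_like(seq[:, :, :1]) + vec  # broadcast to (batch, time, d2)
    return K.concatenate([seq, vec], 2)      # (batch, time, d1 + d2)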
Example #9
if len(sys.argv) != 2:
    print('python load_model.py UNZIPPED_MODEL_PATH')
    sys.exit(-1)

print(
    'This demo shows how to load the pre-trained model and extract the sentence embedding with pooling.'
)

model_path = sys.argv[1]
config_path = os.path.join(model_path, 'bert_config.json')
checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
dict_path = os.path.join(model_path, 'vocab.txt')

model = load_trained_model_from_checkpoint(config_path,
                                           checkpoint_path,
                                           seq_len=10)
pool_layer = MaskedGlobalMaxPool1D(name='Pooling')(model.output)
model = keras.models.Model(inputs=model.inputs, outputs=pool_layer)
model.summary(line_length=120)

token_dict = load_vocabulary(dict_path)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
print('Pooled:', predicts.tolist()[:5])
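The pooled vector can be used directly as a sentence embedding. For instance, comparing two texts by cosine similarity builds naturally on the script above (this extension is a sketch, not part of the original demo):

def embed(text):
    indices, segments = tokenizer.encode(first=text, max_len=10)
    return model.predict([np.array([indices]), np.array([segments])])[0]

a, b = embed('语言模型'), embed('模型')
cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print('Cosine similarity:', cosine)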