def test_extract_embeddings_invalid_pooling(self):
    """An unrecognized pooling name must be rejected with ValueError."""
    sentence_pairs = [
        ('all work and no play', 'makes jack a dull boy'),
        ('makes jack a dull boy', 'all work and no play'),
    ]
    with self.assertRaises(ValueError):
        extract_embeddings(self.model_path, sentence_pairs, poolings=['invalid'])
def test_extract_embeddings_default(self):
    """Default extraction yields one per-token feature matrix per input text."""
    texts = ['all work and no play', 'makes jack a dull boy~']
    embeddings = extract_embeddings(self.model_path, texts)
    self.assertEqual(2, len(embeddings))
    # 5 words plus special tokens -> 7 rows; the trailing '~' presumably
    # tokenizes to one extra token, giving the second text 8 rows.
    self.assertEqual((7, 4), embeddings[0].shape)
    self.assertEqual((8, 4), embeddings[1].shape)
def test_extract_embeddings_pair(self):
    """Sentence pairs are fed jointly, producing one token matrix per pair.

    Each pair here contains the same ten words, so both outputs should
    have the same 13-token shape ([CLS] + 5 + [SEP] + 5 + [SEP]).
    """
    embeddings = extract_embeddings(
        self.model_path,
        [
            ('all work and no play', 'makes jack a dull boy'),
            ('makes jack a dull boy', 'all work and no play'),
        ],
    )
    self.assertEqual(2, len(embeddings))
    self.assertEqual((13, 4), embeddings[0].shape)
    # Previously unchecked: the second pair has identical word content,
    # so its embedding must share the same shape (matches the style of
    # the default/variable-length tests, which verify every output).
    self.assertEqual((13, 4), embeddings[1].shape)
def test_extract_embeddings_single_pooling(self):
    """With a single pooling strategy the token axis is collapsed to a vector."""
    pairs = [
        ('all work and no play', 'makes jack a dull boy'),
        ('makes jack a dull boy', 'all work and no play'),
    ]
    embeddings = extract_embeddings(self.model_path, pairs, poolings=POOL_NSP)
    self.assertEqual(2, len(embeddings))
    # Pooled output is a flat vector of the hidden size.
    self.assertEqual((4, ), embeddings[0].shape)
def test_extract_embeddings_multi_pooling(self):
    """Multiple poolings over two layers concatenate into one flat vector."""
    pairs = [
        ('all work and no play', 'makes jack a dull boy'),
        ('makes jack a dull boy', 'all work and no play'),
    ]
    embeddings = extract_embeddings(
        self.model_path,
        pairs,
        poolings=[POOL_NSP, POOL_MAX, POOL_AVE],
        output_layer_num=2,
    )
    self.assertEqual(2, len(embeddings))
    # 3 poolings x 2 layers x hidden size 4 = 24 features per input.
    self.assertEqual((24, ), embeddings[0].shape)
def test_extract_embeddings_variable_lengths(self):
    """Extract from an in-memory model built with seq_len=None.

    With a variable sequence length the two pairs keep their own token
    counts instead of being padded to a common shape.
    """
    vocab_tokens = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]',
        'all', 'work', 'and', 'no', 'play',
        'makes', 'jack', 'a', 'dull', 'boy', '~',
    ]
    vocab = dict(zip(vocab_tokens, range(len(vocab_tokens))))
    inputs, outputs = get_model(
        token_num=len(vocab_tokens),
        pos_num=20,
        seq_len=None,
        embed_dim=13,
        transformer_num=1,
        feed_forward_dim=17,
        head_num=1,
        training=False,
    )
    model = keras.models.Model(inputs, outputs)
    embeddings = extract_embeddings(
        model,
        [
            ('all work and no play', 'makes jack'),
            ('a dull boy', 'all work and no play and no play'),
        ],
        vocabs=vocab,
        batch_size=2,
    )
    self.assertEqual(2, len(embeddings))
    # 7 + 3 tokens -> 10 rows; 3 + 11 tokens -> 14 rows (incl. specials).
    self.assertEqual((10, 13), embeddings[0].shape)
    self.assertEqual((14, 13), embeddings[1].shape)
def test_extract_embeddings_from_file(self):
    """Texts may be a lazy iterable, here streamed from the vocab file."""
    vocab_path = os.path.join(self.model_path, 'vocab.txt')
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        # Generator keeps the read lazy, so it must run inside the
        # `with` block while the file handle is still open.
        stripped_lines = (line.strip() for line in reader)
        embeddings = extract_embeddings(self.model_path, stripped_lines)
    self.assertEqual(15, len(embeddings))
from bertTAT.bert import extract_embeddings from bertTAT.bert import load_trained_model_from_checkpoint import numpy as np import os, codecs # 1. 提取预训练模型文件的路径 now_path = os.path.dirname(__file__) pretrained_path = now_path + "/../pretrained_model/chinese_L-12_H-768_A-12" # 1. 如果不需要微调,只想提取词/句子的特征,如提取每个句子对应的全部词的特征 texts = ["世上无难事", '只要肯攀登!'] embeddings = extract_embeddings(pretrained_path, texts) print("embedding:", np.array(embeddings[0]).shape) # 2. 输入是成对的句子,想使用最后4层特征,且提取NSP位位置输出和max-pooling的结果 # 输出结果中不再包含词的特征,NSP和max-pooling的输出会拼接在一起,每个numpy数组的大小为(768 x 4 x 2,) from bertTAT.bert import extract_embeddings, POOL_NSP, POOL_MAX texts = [('公司加班很严重', '但也要保持学习!'), ('算法学习', '永不止步。')] embeddings = extract_embeddings(pretrained_path, texts, output_layer_num=4, poolings=[POOL_NSP, POOL_MAX]) print("句子对:", np.array(embeddings).shape) # 3. 可以使用adapter来对预训练模型进行微调,下面的代码只让adapter和layer normalization成为可训练的层 layer_num = 12 config_path = os.path.join(pretrained_path, 'bert_config.json') model_path = os.path.join(pretrained_path, 'bert_model.ckpt') model = load_trained_model_from_checkpoint(