def test_padding(self):
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = '\u535A\u63A8'
    # single
    indices, segments = tokenizer.encode(first=text, max_len=100)
    expected = [2, 1, 1, 3] + [0] * 96
    self.assertEqual(expected, indices)
    expected = [0] * 100
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    self.assertEqual(['[UNK]', '[UNK]'], decoded)
    indices, segments = tokenizer.encode(first=text, max_len=3)
    self.assertEqual([2, 1, 3], indices)
    self.assertEqual([0, 0, 0], segments)
    # paired
    indices, segments = tokenizer.encode(first=text, second=text, max_len=100)
    expected = [2, 1, 1, 3, 1, 1, 3] + [0] * 93
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 1, 1, 1] + [0] * 93
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    self.assertEqual((['[UNK]', '[UNK]'], ['[UNK]', '[UNK]']), decoded)
    indices, segments = tokenizer.encode(first=text, second=text, max_len=4)
    self.assertEqual([2, 1, 3, 3], indices)
    self.assertEqual([0, 0, 0, 1], segments)
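# A minimal standalone sketch of the padding semantics asserted above (assumes
# keras_bert is installed; the toy vocabulary is the same one the test uses):
from keras_bert import Tokenizer

if __name__ == '__main__':
    toy_dict = {t: i for i, t in enumerate(['[PAD]', '[UNK]', '[CLS]', '[SEP]'])}
    ids, segs = Tokenizer(toy_dict).encode(first='\u535A\u63A8', max_len=8)
    print(ids)   # [2, 1, 1, 3, 0, 0, 0, 0]: [CLS], two [UNK]s, [SEP], then [PAD]s
    print(segs)  # [0, 0, 0, 0, 0, 0, 0, 0]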
class twitterProcessor():
    def __init__(self, vocab_path, data_dir, SEQ_LEN):
        self.vocab_path = vocab_path
        self.data_dir = data_dir
        self.seq_len = SEQ_LEN

    def get_train_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)
        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
        x_train, y_train = self.create_examples(reader, "train")
        return x_train, y_train

    def create_examples(self, lines, set_type):
        indices, labels = [], []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            # Each line looks like: "<label> +++$+++ <text>".
            split_line = line.strip().split('+++$+++')
            ids, segments = self.tokenizer.encode(split_line[1],
                                                  max_len=self.seq_len)
            sentiment = split_line[0]
            indices.append(ids)
            labels.append(sentiment)
        # Segment ids are all zero for single-sentence inputs.
        return [indices, np.zeros_like(indices)], np.array(labels)

    def get_test_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)
        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
        x_test = self.create_test_examples(reader, "test")
        return x_test

    def create_test_examples(self, lines, set_type):
        indices = []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            ids, segments = self.tokenizer.encode(line.strip(),
                                                  max_len=self.seq_len)
            indices.append(ids)
        return [indices, np.zeros_like(indices)]
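# A hedged usage sketch for twitterProcessor; the file names are hypothetical and
# the "label +++$+++ text" line layout is inferred from create_examples above:
#
#   processor = twitterProcessor('vocab.txt', 'training_label.txt', SEQ_LEN=128)
#   x_train, y_train = processor.get_train_examples('training_label.txt')
#   x_test = processor.get_test_examples('testing_data.txt')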
def work1(self, text1):
    out = []
    if type(text1) == str:
        text1 = [text1]
    for i in text1:
        resu = i.replace('|', '').replace(' ', '') \
            .replace('“', '“').replace('”', '”') \
            .replace('‘', '‘').replace('’', '’') \
            .replace('〔', '(').replace('〕', ')').replace('/', '') \
            .replace('·', '·').replace('•', '·') \
            .replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")
        resu = re.split(r'\s+', resu)
        dr = re.compile(r'<[^>]+>', re.S)  # strip HTML tags
        dd = dr.sub('', '。'.join(resu))
        line = re.sub(self.restr, '', dd)
        # Map ASCII punctuation to its full-width Chinese counterpart.
        eng = [",", "!", "?", ":", ";", "(", ")", "[", "]", "$", "。。"]
        chi = [",", "!", "?", ":", ";", "(", ")", "【", "】", "¥", '。']
        for i, j in zip(eng, chi):
            line = line.replace(i, j)
        out.append(line[:28])
    token_dict = {}
    dict_path = "../chinese_L-12_H-768_A-12/vocab.txt"
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    x1, x2 = [], []
    for text in out:
        indices, segments = tokenizer.encode(first=text, max_len=512)
        x1.append(indices)
        x2.append(segments)
    return x1, x2
def load_task2_testX(dict_path, data_dir):
    if not os.path.exists(os.path.join(data_dir, 'task2_testX.npy')) \
            or not os.path.exists(os.path.join(data_dir, 'task2_test_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                         dtype=str)
        abstract = df.values[:, 2]
        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        # The longest abstract would need 638 tokens, but BERT-base only
        # supports sequences up to 512.
        seq_len = 512
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=seq_len)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.asarray(input_data)
        seg = np.asarray(input_seg)
        np.save(os.path.join(data_dir, 'task2_testX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_test_seg.npy'), seg)
    else:
        X = np.load(os.path.join(data_dir, 'task2_testX.npy'))
        seg = np.load(os.path.join(data_dir, 'task2_test_seg.npy'))
    return X, seg
def test_uncased(self):
    tokens = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]',
        'want', '##want', '##ed', 'wa', 'un', 'runn', '##ing', ',',
        '\u535A', '\u63A8',
    ]
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = u"UNwant\u00E9d, running \nah\u535A\u63A8zzz\u00AD"
    tokens = tokenizer.tokenize(text)
    expected = [
        '[CLS]', 'un', '##want', '##ed', ',', 'runn', '##ing',
        'a', '##h', '\u535A', '\u63A8', 'z', '##z', '##z', '[SEP]',
    ]
    self.assertEqual(expected, tokens)
    indices, segments = tokenizer.encode(text)
    expected = [2, 8, 5, 6, 11, 9, 10, 1, 1, 12, 13, 1, 1, 1, 3]
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    expected = [
        'un', '##want', '##ed', ',', 'runn', '##ing',
        '[UNK]', '[UNK]', '\u535A', '\u63A8', '[UNK]', '[UNK]', '[UNK]',
    ]
    self.assertEqual(expected, decoded)
def _text_process(self, text):
    Tokener = Tokenizer(self.vocab_dict)
    encoder = [Tokener.encode(first=doc[0], second=doc[1],
                              max_len=self.max_seq_len) for doc in text]
    input_ids = [i[0] for i in encoder]
    input_type = [i[1] for i in encoder]
    # Mask is 0 wherever the token id is 0 ([PAD]), 1 elsewhere.
    input_mask = [[0 if l == 0 else 1 for l in i] for i in input_ids]
    return (input_ids, input_mask, input_type)
def load_bert_data(raw_file, train=True):
    config = Config()
    dict_path = './corpus/vocab.txt'
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tags2id, id2tags = load_tags('tags.txt')
    # Build the tokenizer once instead of once per input line.
    tokenizer = Tokenizer(token_dict)
    x_ids = list()
    x_segments = list()
    x_label = list()
    with codecs.open(raw_file, encoding='utf-8') as f:
        for line in f:
            x = json.loads(line)
            input_sent = x['title']
            x_sent_id, x_sent_segment = tokenizer.encode(
                input_sent, max_len=config.max_len_word)
            x_ids.append(x_sent_id)
            x_segments.append(x_sent_segment)
            if train:
                y = load_label(x, tags2id)
                x_label.append(y)
    x_label = np.asarray(x_label)
    return x_ids, x_segments, x_label, id2tags, None
def PreProcessInputData(self, text):
    tokenizer = Tokenizer(self.vocab)
    word_labels = []
    seq_types = []
    for sequence in text:
        code = tokenizer.encode(first=sequence, max_len=self.max_seq_length)
        word_labels.append(code[0])
        seq_types.append(code[1])
    return word_labels, seq_types
def test_empty(self):
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = u''
    self.assertEqual(['[CLS]', '[SEP]'], tokenizer.tokenize(text))
    indices, segments = tokenizer.encode(text)
    self.assertEqual([2, 3], indices)
    self.assertEqual([0, 0], segments)
def article_preprocess(self):
    tokenizer = Tokenizer(self.token_dict)
    # Split the article into non-empty sentences on the Chinese full stop.
    self.text_split = [ele for ele in self.text.split('。') if len(ele) > 0]
    self.sent_num = len(self.text_split)
    tok = [tokenizer.encode(sent)[0] for sent in self.text_split]
    tok_pad = pad_sequences(tok, maxlen=self.seqence_len)
    self.data_in = [
        tok_pad,
        np.zeros(shape=(self.sent_num, self.seqence_len))
    ]
def encode_input_x(self, sentences):
    """Serialize the input X with BERT's Tokenizer: token ids plus segment ids.

    `sentences` is a list of strings.
    """
    tokenizer = Tokenizer(self.vocab)
    sent_token_ids = []
    sent_segment_ids = []
    for sequence in sentences:
        token_ids, segment_ids = tokenizer.encode(
            first=sequence, max_len=self.seq_maxlen)  # the input is a single sentence
        sent_token_ids.append(token_ids)
        sent_segment_ids.append(segment_ids)
    return [sent_token_ids, sent_segment_ids]
def load_data(texts):
    tokenizer = Tokenizer(token_dict)
    indices = []
    indices_mask = []
    for text in tqdm(texts):
        # encode() returns (token ids, segment ids); despite its name,
        # `masked_ids` holds the segment ids.
        ids, masked_ids = tokenizer.encode(text[0], text[1], max_len=SEQ_LEN)
        indices.append(ids)
        indices_mask.append(masked_ids)
    indices = np.array(indices)
    indices_mask = np.array(indices_mask)
    return [indices, indices_mask]
def get_infer_input(input_file, out_file):
    id_type = pd.read_pickle('../data/id_type.pkl')
    type_index = pd.read_pickle('../data/type_index.pkl')
    entity_id = pd.read_pickle('../data/entity_id.pkl')
    id_text = pd.read_pickle('../data/id_text.pkl')
    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    out_file = open(out_file, 'w')
    file_index = 0
    with open(input_file) as f:
        for line in f:
            if file_index % 100 == 0:
                print(file_index)
            file_index += 1
            temDict = json.loads(line)
            text = temDict['text']
            mention_data = temDict['mention_data']
            for men in mention_data:
                mention = men['mention']
                offset = int(men['offset'])
                begin = int(offset) + 1  # +1 for the leading [CLS]
                end = begin + len(mention)
                link_id = get_link_entity_test(mention, entity_id)
                men['link_id'] = link_id
                link_data = {
                    'ids': [],
                    'seg': [],
                    'begin': [],
                    'end': [],
                    'en_type': []
                }
                for id in link_id:
                    kb_text = id_text[id]
                    kb_type = type_index[id_type[id][0]]
                    indice, segment = tokenizer.encode(first=text,
                                                       second=kb_text,
                                                       max_len=256)
                    link_data['ids'].append(indice)
                    link_data['seg'].append(segment)
                    link_data['begin'].append([begin])
                    link_data['end'].append([end])
                    link_data['en_type'].append([kb_type])
                men['link_data'] = link_data
            out_file.write(json.dumps(temDict, ensure_ascii=False))
            out_file.write('\n')
    out_file.close()
def bert_sen_token(token_dict, traininstance, maxlen):
    tokenizer = Tokenizer(token_dict)
    train_indices = []
    train_segments = []
    train_text = []
    for text in traininstance:
        tokens = tokenizer.tokenize(text)
        indices, segments = tokenizer.encode(first=text, max_len=maxlen)
        train_indices.append(indices)
        train_segments.append(segments)
        train_text.append(tokens)
    return train_indices, train_segments, train_text
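# A self-contained sketch of bert_sen_token on a toy vocabulary (in real usage
# token_dict would be built from a BERT vocab.txt):
if __name__ == '__main__':
    toy_dict = {t: i for i, t in enumerate(
        ['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'a', 'sample'])}
    idx, seg, txt = bert_sen_token(toy_dict, ['a sample'], maxlen=8)
    print(txt[0])  # ['[CLS]', 'a', 'sample', '[SEP]']
    print(idx[0])  # [2, 4, 5, 3, 0, 0, 0, 0]
    print(seg[0])  # [0, 0, 0, 0, 0, 0, 0, 0]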
class batchGen:
    def __init__(self, label, bs=16, token_dict=None):
        self.batch_size = bs
        self.random = random
        self.ans = label
        self.maxlen_doc = 512
        self.tokenizer = Tokenizer(token_dict)
        self.iter_index = np.arange(len(self.ans))

    def __len__(self):
        return len(self.ans)

    def flow(self):
        '''Yield one batch of data at a time.'''
        n = len(self.ans)
        i = 0
        while True:
            batch_doc = []
            batch_doc2 = []
            batch_labels = []
            for b in range(self.batch_size):
                if i == 0:
                    # Shuffle the dataset at the start of every epoch.
                    np.random.shuffle(self.iter_index)
                index = self.iter_index[i]  # choose a sample
                doc, query, label = self.GetData(index)
                while doc is None:
                    i = (i + 1) % n
                    index = self.iter_index[i]  # choose a sample
                    doc, query, label = self.GetData(index)
                x1, x2 = self.tokenizer.encode(first=query, second=doc,
                                               max_len=self.maxlen_doc)
                batch_doc.append(x1)
                batch_doc2.append(x2)
                batch_labels.append(label)
                i = (i + 1) % n
            batch_doc = np.array(batch_doc, dtype=np.float32)
            batch_doc2 = np.array(batch_doc2, dtype=np.float32)
            batch_labels = np.array(batch_labels, dtype=np.float32)
            yield [batch_doc, batch_doc2], batch_labels

    def GetData(self, index):
        [query_fn, doc_fn], label = self.ans[index]
        doc = open('./doc/' + doc_fn).read()
        query = open('./train/query/' + query_fn).read()
        return doc, query, int(label)
def _text_process(self, text):
    Tokener = Tokenizer(self.vocab_dict)
    encoder = [
        Tokener.encode(first=doc[0], max_len=self.max_seq_len)
        for doc in text
    ]
    input_ids = [i[0] for i in encoder]
    input_type = [i[1] for i in encoder]
    input_mask = [[0 if l == 0 else 1 for l in i] for i in input_ids]
    input_pos = [[0] + [self._pos2id.get(t, 0) for t in doc[1]] + [0]
                 for doc in text]
    input_pos = pad_sequences(input_pos, self.max_seq_len,
                              padding="post", truncating="post")
    return (input_ids, input_mask, input_type, input_pos)
def test_cased(self):
    tokens = [
        '[UNK]', u'[CLS]', '[SEP]',
        'want', '##want', u'##\u00E9d', 'wa', 'UN', 'runn', '##ing', ',',
    ]
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict, cased=True)
    text = u"UNwant\u00E9d, running"
    tokens = tokenizer.tokenize(text)
    expected = ['[CLS]', 'UN', '##want', u'##\u00E9d', ',',
                'runn', '##ing', '[SEP]']
    self.assertEqual(expected, tokens)
    indices, segments = tokenizer.encode(text)
    expected = [1, 7, 4, 5, 10, 8, 9, 2]
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(expected, segments)
def load_task2_trainXY(dict_path, data_dir):
    if not os.path.exists(os.path.join(data_dir, 'task2_trainX.npy')) \
            or not os.path.exists(os.path.join(data_dir, 'task2_trainY.npy')) \
            or not os.path.exists(os.path.join(data_dir, 'task2_train_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_trainset.csv'),
                         dtype=str)
        cate = df.values[:, -1]
        # generating Y
        Y = np.zeros((cate.shape[0], 4))
        name = {
            'THEORETICAL': 0,
            'ENGINEERING': 1,
            'EMPIRICAL': 2,
            'OTHERS': 3
        }
        for i in range(cate.shape[0]):
            for c in cate[i].split(' '):
                Y[i, name[c]] += 1
        # generating X
        abstract = df.values[:, 2]
        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=512)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.array(input_data)
        seg = np.array(input_seg)
        np.save(os.path.join(data_dir, 'task2_trainX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_trainY.npy'), Y)
        np.save(os.path.join(data_dir, 'task2_train_seg.npy'), seg)
    else:
        X = np.load(os.path.join(data_dir, 'task2_trainX.npy'))
        Y = np.load(os.path.join(data_dir, 'task2_trainY.npy'))
        seg = np.load(os.path.join(data_dir, 'task2_train_seg.npy'))
    return X, Y, seg
def get_encode(text_list, token_dict):
    """Encode raw texts into padded BERT inputs.

    :param text_list: list of raw text strings
    :param token_dict: token-to-id vocabulary built from vocab.txt
    :return: [X1, X2], the padded token-id and segment-id arrays
    """
    X1 = []
    X2 = []
    tokenizer = Tokenizer(token_dict)
    for line in text_list:
        x1, x2 = tokenizer.encode(first=line)
        X1.append(x1)
        X2.append(x2)
    X1 = sequence.pad_sequences(X1, maxlen=maxlen, padding='post',
                                truncating='post')
    X2 = sequence.pad_sequences(X2, maxlen=maxlen, padding='post',
                                truncating='post')
    return [X1, X2]
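# A runnable sketch, assuming the module-level names get_encode depends on are
# in scope (`maxlen` and `sequence` from keras.preprocessing); the value of
# maxlen here is illustrative only:
if __name__ == '__main__':
    maxlen = 32
    toy_dict = {t: i for i, t in enumerate(['[PAD]', '[UNK]', '[CLS]', '[SEP]'])}
    X1, X2 = get_encode(['hello', 'world'], toy_dict)
    print(X1.shape, X2.shape)  # (2, 32) (2, 32)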
class FineTuneBert:
    def __init__(self, gpu_name, gpu_num, seq_max_len, batch_size):
        print('--' * 10 + ' Load BERT model start ' + '--' * 10)
        gpu_option(gpu_name, gpu_num)
        self.seq_max_len = seq_max_len  # must match the value used in training
        self.batch_size = batch_size
        model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
        vocab_path = os.path.join(model_path, 'vocab.txt')
        # load Tokenizer
        token_dict = load_vocabulary(vocab_path)
        self.tokenizer = Tokenizer(token_dict)
        MODEL_SAVE_PATH = 'models/BERT/fine_tune_model/bert_fine_tune.hdf5'
        model = load_model(MODEL_SAVE_PATH,
                           custom_objects=get_custom_objects(),
                           compile=False)
        if gpu_num >= 2:
            self.par_model = multi_gpu_model(model, gpus=gpu_num)
        else:
            self.par_model = model
        print('--' * 10 + ' Load BERT model end ' + '--' * 10)

    def data_generator(self, data):
        steps = len(data) // self.batch_size
        if len(data) % self.batch_size != 0:
            steps += 1
        X1, X2 = [], []
        for i in range(len(data)):
            d = data[i]
            text1 = d[0]
            text2 = d[1]
            x1, x2 = self.tokenizer.encode(first=text1, second=text2,
                                           max_len=self.seq_max_len)  # 512
            X1.append(x1)
            X2.append(x2)
            if len(X1) == self.batch_size or i == (len(data) - 1):
                yield np.array(X1), np.array(X2)
                X1, X2 = [], []

    def classify(self, texts):
        pred = []
        my_iter = self.data_generator(texts)
        for indices, segments in my_iter:
            p = self.par_model.predict([indices, segments])
            pred += sum(p.tolist(), [])
        return pred
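# Hedged usage sketch; the model and vocab paths are hard-coded in __init__, so
# only the runtime parameters are shown (values illustrative):
#
#   ft = FineTuneBert(gpu_name='0', gpu_num=1, seq_max_len=512, batch_size=32)
#   scores = ft.classify([('first sentence', 'second sentence')])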
def extract(max_len=512):
    """Extract a BERT vector for every knowledge-base entry.

    :param max_len: maximum text length
    :return: dict mapping kb_id to the vector of its description text
    """
    model = get_model(max_len)
    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    id_text = pd.read_pickle('data/id_text.pkl')
    id_embedding = {}
    for id in id_text:
        if int(id) % 10000 == 0:
            print(id)
        text = id_text[id]
        # use the max_len argument instead of a hard-coded 512
        indices, segments = tokenizer.encode(first=text, max_len=max_len)
        predicts = model.predict([[indices], [segments]], verbose=2)
        id_embedding[id] = predicts[0]
    pd.to_pickle(id_embedding, 'data/id_embedding.pkl')
def get_single_infer_input(ner_result):
    id_type = pd.read_pickle('../data/id_type.pkl')
    type_index = pd.read_pickle('../data/type_index.pkl')
    entity_id = pd.read_pickle('../data/entity_id.pkl')
    id_text = pd.read_pickle('../data/id_text.pkl')
    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    temDict = json.loads(ner_result)
    text = temDict['text']
    mention_data = temDict['mention_data']
    for men in mention_data:
        mention = men['mention']
        offset = int(men['offset'])
        begin = int(offset) + 1
        end = begin + len(mention)
        link_id = get_link_entity_test(mention, entity_id)
        men['link_id'] = link_id
        link_data = {
            'ids': [],
            'seg': [],
            'begin': [],
            'end': [],
            'en_type': []
        }
        for id in link_id:
            kb_text = id_text[id]
            kb_type = type_index[id_type[id][0]]
            indice, segment = tokenizer.encode(first=text, second=kb_text,
                                               max_len=256)
            link_data['ids'].append(indice)
            link_data['seg'].append(segment)
            link_data['begin'].append([begin])
            link_data['end'].append([end])
            link_data['en_type'].append([kb_type])
        men['link_data'] = link_data
    return json.dumps(temDict)
def getBERTScore(self, queries_df):
    tweets = queries_df['cleanTweet']
    token_dict = {}
    with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    indices = []
    for index, line in enumerate(tweets):
        ids, segments = tokenizer.encode(line.strip(), max_len=128)
        indices.append(ids)
    x_test = [indices, np.zeros_like(indices)]
    predictions = self.model.predict(x_test)
    return predictions
class KerasBERT:
    def __init__(self, batch_size, gpu_num, gpu_name):
        gpu_option(gpu_name, gpu_num)
        self.batch_size = batch_size
        print("##### load KerasBERT start #####")
        # Paths
        model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
        config_path = os.path.join(model_path, 'bert_config.json')
        checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
        vocab_path = os.path.join(model_path, 'vocab.txt')
        token_dict = load_vocabulary(vocab_path)
        model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
        if gpu_num >= 2:
            self.par_model = multi_gpu_model(model, gpus=gpu_num)
        else:
            self.par_model = model
        self.tokenizer = Tokenizer(token_dict)
        print("##### load KerasBERT end #####")

    def bert_encode(self, texts):
        predicts = []

        def create_array():
            data = []
            for text in texts:
                indices, segments = self.tokenizer.encode(first=text,
                                                          max_len=512)
                data.append([indices, segments])
            return data

        array = create_array()
        my_iter = data_iter(array, batch_size=self.batch_size)
        for w1, w2 in my_iter:
            m_indices = np.array(w1)
            m_segments = np.array(w2)
            predict = self.par_model.predict([m_indices, m_segments])
            # Take the encoding of the first token ([CLS]) for each sentence.
            batch_predict = predict[:, 0].tolist()
            predicts += batch_predict
        return predicts
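# Hedged usage sketch (loading requires the pretrained checkpoint files on
# disk); bert_encode returns one 1024-dimensional [CLS] vector per input text
# for this uncased_L-24_H-1024_A-16 model:
#
#   kb = KerasBERT(batch_size=32, gpu_num=1, gpu_name='0')
#   vectors = kb.bert_encode(['some text', 'some other text'])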
def prepare_data(data, is_test):
    token_dict = {}
    with codecs.open(DICT_PATH, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    indices = []
    segments = []
    results = []
    i = 0
    for row in data:
        text1 = clean_text(row[0])
        text2 = clean_text(row[1])
        # In the training set the max length is 201; the max token count of the
        # test set is unknown. Pretrained BERT has a width of 512, so that's
        # what we use.
        row_indices, row_segments = tokenizer.encode(first=text1,
                                                     second=text2,
                                                     max_len=512)
        indices.append(row_indices)
        segments.append(row_segments)
        if not is_test:
            results.append(row[2])
        if i % 100 == 0:  # `is 0` relied on int interning; use == for comparison
            print("i=", i)
        i += 1
    print("Num rows processed: ", i)
    if is_test:
        return np.array(indices), np.array(segments)
    return np.array(indices), np.array(segments), np.array(results,
                                                           dtype="float32")
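# Usage sketch (DICT_PATH and clean_text are module-level dependencies; rows
# are (text1, text2) for the test set and (text1, text2, label) for training,
# as the indexing above implies):
#
#   train_idx, train_seg, train_y = prepare_data(train_rows, is_test=False)
#   test_idx, test_seg = prepare_data(test_rows, is_test=True)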
print('Tokens:', tokens)
indices = np.array([[token_dict[token] for token in tokens] +
                    [0] * (512 - len(tokens))])
segments = np.array([[0] * len(tokens) + [0] * (512 - len(tokens))])
masks = np.array([[0, 1, 1] + [0] * (512 - 3)])

predicts = model.predict([indices, segments, masks])[0].argmax(axis=-1).tolist()
print('Fill with: ', list(map(lambda x: token_dict_inv[x], predicts[0][1:3])))

sentence_1 = '数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科。'
sentence_2 = '从某种角度看屬於形式科學的一種。'
print('Tokens:', tokenizer.tokenize(first=sentence_1, second=sentence_2))
indices, segments = tokenizer.encode(first=sentence_1, second=sentence_2,
                                     max_len=512)
masks = np.array([[0] * 512])
predicts = model.predict([np.array([indices]), np.array([segments]), masks])[1]
print('%s is random next: ' % sentence_2,
      bool(np.argmax(predicts, axis=-1)[0]))

sentence_2 = '任何一个希尔伯特空间都有一族标准正交基。'
print('Tokens:', tokenizer.tokenize(first=sentence_1, second=sentence_2))
indices, segments = tokenizer.encode(first=sentence_1, second=sentence_2,
                                     max_len=512)
predicts = model.predict([np.array([indices]), np.array([segments]), masks])[1]
print('%s is random next: ' % sentence_2,
      bool(np.argmax(predicts, axis=-1)[0]))
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_model

# config_path, checkpoint_path and token_dict are assumed to be defined
# earlier in the script.
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path,
                                                seq_len=None)
for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None, ))
x2_in = Input(shape=(None, ))
x = bert_model([x1_in, x2_in])

# Tokenization
from keras_bert import Tokenizer

tokenizer = Tokenizer(token_dict)
# text = '语言模型 chinese is great'
# text = '商品名称及规格型号'
# text = '境外收货人\nDERCOCHILEREPUESTOSS.A.'
# text = '合同协议号\n2019CICSA473-A'
text = '运抵国(地区)\n智利'
tokens = tokenizer.tokenize(text)  # e.g. ['[CLS]', '语', '言', '模', '型', '[SEP]']
print('tokens', tokens)
indices, segments = tokenizer.encode(first=text, max_len=512)
print(indices[:10])
class Embeddings(object):
    def __init__(self, name, path='./embedding-registry.json', lang='en',
                 extension='vec', use_ELMo=False, use_BERT=False,
                 use_cache=True, load=True):
        self.name = name
        self.embed_size = 0
        self.static_embed_size = 0
        self.vocab_size = 0
        self.model = {}
        self.registry = self._load_embedding_registry(path)
        self.lang = lang
        self.extension = extension
        self.embedding_lmdb_path = None
        if self.registry is not None:
            self.embedding_lmdb_path = self.registry["embedding-lmdb-path"]
        self.env = None
        if load:
            self.make_embeddings_simple(name)
        self.static_embed_size = self.embed_size
        self.bilm = None
        self.use_cache = use_cache

        # below init for using ELMo embeddings
        self.use_ELMo = use_ELMo
        if use_ELMo:
            self.make_ELMo()
            self.embed_size = ELMo_embed_size + self.embed_size
            description = self.get_description('elmo-' + self.lang)
            self.env_ELMo = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_ELMo_cache = os.path.join(
                    description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_ELMo_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                          map_size=map_size)

        # below init for using BERT embeddings (extracted features only, not fine tuning),
        # similar to ELMo for this usage
        self.use_BERT = use_BERT
        if use_BERT:
            # to avoid issues with the tf graph and threads, the class maintains
            # its own graph and session
            #self.session = tf.Session()
            self.graph = tf.get_default_graph()
            #self.session.run(tf.global_variables_initializer())
            self.make_BERT()
            self.embed_size = BERT_embed_size + self.embed_size
            description = self.get_description('bert-base-' + self.lang)
            self.env_BERT = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_BERT_cache = os.path.join(
                    description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_BERT_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_BERT = lmdb.open(self.embedding_BERT_cache,
                                          map_size=map_size)

    def __getattr__(self, name):
        return getattr(self.model, name)

    def _load_embedding_registry(self, path='./embedding-registry.json'):
        """
        Load the description of available embeddings. Each description provides
        a name, a file path (used only if necessary) and an embeddings type
        (to take into account small variations of format).
        """
        registry_json = open(path).read()
        return json.loads(registry_json)

    def make_embeddings_simple_in_memory(self, name="fasttext-crawl"):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self.get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = fastText.load_model(embeddings_path)
                nbWords = len(self.model.get_words())
                self.embed_size = self.model.get_dimension()
            else:
                with open(embeddings_path, encoding='utf8') as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            begin = False
                            # we parse the header
                            nb_words, embed_size = _fetch_header_if_available(line)
                            if nb_words > 0 and embed_size > 0:
                                nbWords = nb_words
                                self.embed_size = embed_size
                                continue
                        word = line[0]
                        vector = np.array(
                            [float(val) for val in line[1:len(line)]],
                            dtype='float32')
                        #else:
                        #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
                print('embeddings loaded for', nbWords, "words and",
                      self.embed_size, "dimensions")

    def make_embeddings_lmdb(self, name="fasttext-crawl"):
        print('\nCompiling embeddings... (this is done only one time per embeddings at first usage)')
        description = self.get_description(name)
        if description is None:
            print('\nNo description found in embeddings registry for embeddings', name)
            return
        if description is not None:
            # the following method will possibly download the embedding file
            # if it is not available locally
            embeddings_path = self.get_embedding_path(description)
            if embeddings_path is None:
                print('\nCould not locate a usable resource for embeddings', name)
                return
            self.load_embeddings_from_file(embeddings_path)
        # cleaning possible downloaded embeddings
        self.clean_downloads()

    def load_embeddings_from_file(self, embeddings_path):
        begin = True
        nbWords = 0
        txn = self.env.begin(write=True)
        # batch_size = 1024
        i = 0
        nb_lines = 0

        # read the number of lines first
        embedding_file = open_embedding_file(embeddings_path)
        if embedding_file is None:
            print("Error: could not open embeddings file", embeddings_path)
            return
        for line in embedding_file:
            nb_lines += 1
        embedding_file.close()

        embedding_file = open_embedding_file(embeddings_path)
        #with open(embeddings_path, encoding='utf8') as f:
        for line in tqdm(embedding_file, total=nb_lines):
            line = line.decode()
            line = line.split(' ')
            if begin:
                begin = False
                nb_words, embed_size = _fetch_header_if_available(line)
                if nb_words > 0 and embed_size > 0:
                    nbWords = nb_words
                    self.embed_size = embed_size
                    continue
            word = line[0]
            try:
                if line[len(line) - 1] == '\n':
                    vector = np.array(
                        [float(val) for val in line[1:len(line) - 1]],
                        dtype='float32')
                else:
                    vector = np.array(
                        [float(val) for val in line[1:len(line)]],
                        dtype='float32')
                #vector = np.array([float(val) for val in line[1:len(line)]], dtype='float32')
            except:
                print(len(line))
                print(line[1:len(line)])
            #else:
            #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
            if self.embed_size == 0:
                self.embed_size = len(vector)

            if len(word.encode(encoding='UTF-8')) < self.env.max_key_size():
                txn.put(word.encode(encoding='UTF-8'), _serialize_pickle(vector))
                #txn.put(word.encode(encoding='UTF-8'), _serialize_byteio(vector))
                i += 1

            # commit batch
            #if i % batch_size == 0:
            #    txn.commit()
            #    txn = self.env.begin(write=True)

        embedding_file.close()
        #if i % batch_size != 0:
        txn.commit()
        if nbWords == 0:
            nbWords = i
        self.vocab_size = nbWords
        print('embeddings loaded for', nbWords, "words and", self.embed_size,
              "dimensions")

    def clean_downloads(self):
        # cleaning possible downloaded embeddings
        for filename in os.listdir(self.registry['embedding-download-path']):
            file_path = os.path.join(self.registry['embedding-download-path'],
                                     filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

    def make_embeddings_simple(self, name="fasttext-crawl"):
        description = self.get_description(name)
        if description is not None:
            self.extension = description["format"]

        if self.extension == "bin":
            if fasttext_support == True:
                print("embeddings are of .bin format, so they will be loaded in memory...")
                self.make_embeddings_simple_in_memory(name)
            else:
                if not (sys.platform == 'linux' or sys.platform == 'darwin'):
                    raise ValueError('FastText .bin format not supported for your platform')
                else:
                    raise ValueError('Go to the documentation to get more information on how to install FastText .bin support')
        elif self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None":
            print("embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory...")
            self.make_embeddings_simple_in_memory(name)
        else:
            # if the path to the lmdb database files does not exist, we create it
            if not os.path.isdir(self.embedding_lmdb_path):
                # conservative check (likely very useless)
                if not os.path.exists(self.embedding_lmdb_path):
                    os.makedirs(self.embedding_lmdb_path)

            # check if the lmdb database exists
            envFilePath = os.path.join(self.embedding_lmdb_path, name)
            load_db = True
            if os.path.isdir(envFilePath):
                description = self.get_description(name)
                if description is not None:
                    self.lang = description["lang"]

                # open the database in read mode
                self.env = lmdb.open(envFilePath, readonly=True,
                                     max_readers=2048, max_spare_txns=4)
                if self.env:
                    # we need to set self.embed_size and self.vocab_size
                    with self.env.begin() as txn:
                        stats = txn.stat()
                        size = stats['entries']
                        self.vocab_size = size

                    with self.env.begin() as txn:
                        cursor = txn.cursor()
                        for key, value in cursor:
                            vector = _deserialize_pickle(value)
                            self.embed_size = vector.shape[0]
                            break
                        cursor.close()

                    if self.vocab_size > 100 and self.embed_size > 10:
                        # lmdb database exists and looks valid
                        load_db = False

                        # no idea why, but we need to close and reopen the environment to avoid
                        # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
                        # when opening a new transaction!
                        self.env.close()
                        self.env = lmdb.open(envFilePath, readonly=True,
                                             max_readers=2048, max_spare_txns=2)

            if load_db:
                # create and load the database in write mode
                self.env = lmdb.open(envFilePath, map_size=map_size)
                self.make_embeddings_lmdb(name)

    def make_ELMo(self):
        # Location of the pretrained BiLM for the specified language
        # TBD check if ELMo language resources are present
        description = self.get_description('elmo-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            vocab_file = description["path-vocab"]
            options_file = description["path-config"]
            weight_file = description["path_weights"]

            print('init ELMo')

            # Create a Batcher to map text to character ids
            self.batcher = Batcher(vocab_file, 50)

            # Build the biLM graph.
            self.bilm = BidirectionalLanguageModel(self.lang, options_file,
                                                   weight_file)

            # Input placeholders to the biLM.
            self.character_ids = tf.placeholder('int32',
                                                shape=(None, None, 50))
            with tf.variable_scope(self.lang, reuse=tf.AUTO_REUSE):
                # the reuse=True scope reuses weights from the whole context
                self.embeddings_op = self.bilm(self.character_ids)
                self.elmo_input = weight_layers('input', self.embeddings_op,
                                                l2_coef=0.0)

    def make_BERT(self):
        # Location of the BERT model
        description = self.get_description('bert-base-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            config_file = description["path-config"]
            weight_file = description["path-weights"]
            vocab_file = description["path-vocab"]

            print('init BERT')

            # load the pretrained model
            with self.graph.as_default():
                # there are different typical pooling strategies for getting BERT features:
                # - concatenation of the 4 last layers (the one from the original BERT paper,
                #   BERT_embed_size is then 3072)
                # - last layer (BERT_embed_size is 768)
                # - average of the 4 last layers (BERT_embed_size is 768)
                # - sum of the 4 last layers (BERT_embed_size is 768)
                self.bert_model = load_trained_model_from_checkpoint(
                    config_file, weight_file, output_layer_num=4)
                self.bert_model.summary(line_length=120)
                self.bert_model._make_predict_function()

            # init the tokenizer
            token_dict = {}
            with codecs.open(vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    token_dict[token] = len(token_dict)
            print('token_dict size:', len(token_dict))
            self.bert_tokenizer = Tokenizer(token_dict, cased=True)

    def get_sentence_vector_only_ELMo(self, token_list):
        """
        Return the ELMo embeddings only for a full sentence
        """
        if not self.use_ELMo:
            print("Warning: ELMo embeddings requested but embeddings object wrongly initialised")
            return

        # Create batches of data
        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]
        # check the lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is not None:
            return elmo_result

        with tf.Session() as sess:
            # weird, for this cpu is faster than gpu (1080Ti !)
            with tf.device("/cpu:0"):
                # It is necessary to initialize variables once before running inference
                sess.run(tf.global_variables_initializer())

                # Compute ELMo representations (2 times as a heavy warm-up)
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                # cache computation
                self.cache_ELMo_lmdb_vector(token_list, elmo_result)
        return elmo_result

    def get_sentence_vector_with_ELMo(self, token_list):
        """
        Return a concatenation of standard embeddings (e.g. Glove) and ELMo
        embeddings for a full sentence
        """
        if not self.use_ELMo:
            print("Warning: ELMo embeddings requested but embeddings object wrongly initialised")
            return

        #print("\ntoken_list:", token_list)
        local_token_ids = self.batcher.batch_sentences(token_list)
        #print("local_token_ids:", local_token_ids)
        max_size_sentence = local_token_ids[0].shape[0]

        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is None:
            with tf.Session() as sess:
                # weird, for this cpu is faster than gpu (1080Ti !)
                with tf.device("/cpu:0"):
                    # It is necessary to initialize variables once before running inference
                    sess.run(tf.global_variables_initializer())

                    # Compute ELMo representations (2 times as a heavy warm-up)
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    # cache computation
                    self.cache_ELMo_lmdb_vector(token_list, elmo_result)

        concatenated_result = np.zeros(
            (len(token_list), max_size_sentence - 2, self.embed_size),
            dtype=np.float32)
        #concatenated_result = np.random.rand(elmo_result.shape[0], max_size_sentence-2, self.embed_size)
        for i in range(0, len(token_list)):
            for j in range(0, len(token_list[i])):
                #if is_int(token_list[i][j]) or is_float(token_list[i][j]):
                #    dummy_result = np.zeros((elmo_result.shape[2]), dtype=np.float32)
                #    concatenated_result[i][j] = np.concatenate((dummy_result, self.get_word_vector(token_list[i][j])), )
                #else:
                concatenated_result[i][j] = np.concatenate(
                    (elmo_result[i][j],
                     self.get_word_vector(token_list[i][j]).astype('float32')), )
                #concatenated_result[i][j] = np.concatenate((self.get_word_vector(token_list[i][j]), elmo_result[i][j]), )
        return concatenated_result

    def get_sentence_vector_only_BERT(self, token_list):
        """
        Return the BERT extracted embeddings only for a full sentence
        """
        if not self.use_BERT:
            print("Warning: BERT embeddings requested but embeddings object wrongly initialised")
            return

        #print("local_token_ids:", local_token_ids)
        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        # retokenize with the BERT tokenizer
        max_size = BERT_sentence_size
        max_size_sentence = 0
        new_token_list = []
        bert_results = np.zeros((len(token_list), max_size, BERT_embed_size),
                                dtype=np.float32)
        for i, sentence in enumerate(token_list):
            local_text = " ".join(sentence)
            local_tokens = self.bert_tokenizer.tokenize(local_text)
            bert_result = self.get_BERT_lmdb_vector(sentence)
            if bert_result is None:
                indices, segments = self.bert_tokenizer.encode(
                    local_text, max_len=max_size)
                with self.graph.as_default():
                    bert_result = self.bert_model.predict(
                        [np.array([indices]), np.array([segments])])[0]
                # cache computation
                if bert_result is not None:
                    self.cache_BERT_lmdb_vector(sentence, bert_result)

            # Realign the BERT tokenization with the provided tokenization. Normally the
            # BERT segmenter always over-segments compared to the DeLFT segmenter.
            # There are two obvious possibilities to combine subtoken embeddings into
            # token embeddings: either take the embedding of the last subtoken, or use
            # the average vector of the subtokens.
            new_bert_result = np.zeros((max_size, BERT_embed_size),
                                       dtype=np.float32)
            token_tensor = []
            tid = 0
            buffer = ''
            #print(sentence)
            #print(local_tokens)
            for j, t in enumerate(local_tokens):
                if j >= max_size:
                    break
                if t == '[CLS]' or t == '[SEP]':
                    continue
                else:
                    if t.startswith('##'):
                        t = t[2:]
                    buffer += t
                    #print(buffer)
                    token_tensor.append(bert_result[j])
                    if buffer == sentence[tid]:
                        # average vector of the subtokens
                        new_bert_result[tid] = np.stack(token_tensor).mean(axis=0)
                        # or last subtoken vector
                        #new_bert_result[tid] = token_tensor[-1]
                        token_tensor = []
                        buffer = ''
                        tid += 1
            bert_result = new_bert_result

            if bert_result is not None:
                bert_results[i] = bert_result

        # we need to squeeze the vector to max_size_token_list
        squeezed_bert_results = np.zeros(
            (len(token_list), max_size_token_list, BERT_embed_size),
            dtype=np.float32)
        for i, sentence in enumerate(token_list):
            squeezed_bert_results[i] = bert_results[i][:max_size_token_list]

        return squeezed_bert_results

    def get_sentence_vector_with_BERT(self, token_list):
        """
        Return a concatenation of standard embeddings (e.g. Glove) and BERT
        extracted embeddings for a full sentence
        """
        if not self.use_BERT:
            print("Warning: BERT embeddings requested but embeddings object wrongly initialised")
            return

        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        squeezed_bert_results = self.get_sentence_vector_only_BERT(token_list)

        concatenated_squeezed_result = np.zeros(
            (len(token_list), max_size_token_list, self.embed_size),
            dtype=np.float32)
        for i, sentence in enumerate(token_list):
            for j in range(0, len(token_list[i])):
                concatenated_squeezed_result[i][j] = np.concatenate(
                    (squeezed_bert_results[i][j],
                     self.get_word_vector(token_list[i][j]).astype('float32')), )

        return concatenated_squeezed_result

    def get_description(self, name):
        for emb in self.registry["embeddings"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["embeddings-contextualized"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["transformers"]:
            if emb["name"] == name:
                return emb
        return None

    def get_word_vector(self, word):
        """
        Get static embeddings (e.g. glove) for a given token
        """
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.env is None or self.extension == 'bin':
            # db not available or embeddings in bin format, the embeddings
            # should be available in memory (normally!)
            return self.get_word_vector_in_memory(word)
        try:
            with self.env.begin() as txn:
                vector = txn.get(word.encode(encoding='UTF-8'))
                if vector:
                    word_vector = _deserialize_pickle(vector)
                    vector = None
                else:
                    word_vector = np.zeros((self.static_embed_size, ),
                                           dtype=np.float32)
                    # alternatively, initialize with random negative values
                    #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
                    # alternatively use fasttext OOV ngram possibilities (if ngram available)
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening a new transaction!
            self.env.close()
            envFilePath = os.path.join(self.embedding_lmdb_path, self.name)
            self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048,
                                 max_spare_txns=2, lock=False)
            return self.get_word_vector(word)
        return word_vector

    def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
        Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros(
                (len(token_list), max_size_sentence - 2, ELMo_embed_size),
                dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    txn = self.env_ELMo.begin()
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence - 2:
                            # squeeze the extra padding space
                            ELMo_vector[i] = local_embeddings[:max_size_sentence - 2, ]
                        elif local_embeddings.shape[0] == max_size_sentence - 2:
                            # bingo~!
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros(
                                (max_size_sentence - (local_embeddings.shape[0] + 2),
                                 ELMo_embed_size),
                                dtype='float32')
                            ELMo_vector[i] = np.concatenate((local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening a new transaction!
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, readonly=True,
                                      max_readers=2048, max_spare_txns=2,
                                      lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector

    def get_BERT_lmdb_vector(self, sentence):
        """
        Try to get the BERT extracted embeddings for a sequence cached in LMDB
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        try:
            BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size),
                                   dtype='float32')
            with self.env_BERT.begin() as txn:
                txn = self.env_BERT.begin()
                # get a hash for the token_list
                the_hash = list_digest(sentence)
                vector = txn.get(the_hash.encode(encoding='UTF-8'))
                if vector:
                    # adapt expected shape/padding
                    BERT_vector = _deserialize_pickle(vector)
                    '''
                    if local_embeddings.shape[0] > max_size_sentence:
                        # squeeze the extra padding space
                        BERT_vector = local_embeddings[:max_size_sentence,]
                    elif local_embeddings.shape[0] == max_size_sentence:
                        # bingo~!
                        BERT_vector = local_embeddings
                    else:
                        # fill the missing space with padding
                        filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32')
                        BERT_vector = np.concatenate((local_embeddings, filler))
                    '''
                    vector = None
                else:
                    return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening a new transaction!
            self.env_BERT.close()
            self.env_BERT = lmdb.open(self.embedding_BERT_cache, readonly=True,
                                      max_readers=2048, max_spare_txns=2,
                                      lock=False)
            return self.get_BERT_lmdb_vector(sentence)
        return BERT_vector

    def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
        Cache in LMDB the ELMo embeddings for a given sequence
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'),
                    _serialize_pickle(ELMo_vector[i]))
        txn.commit()

    def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
        """
        Cache in LMDB the BERT embeddings for a given sequence
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        txn = self.env_BERT.begin(write=True)
        #for i in range(0, len(sentence)):
        # get a hash for the token_list
        the_hash = list_digest(sentence)
        txn.put(the_hash.encode(encoding='UTF-8'),
                _serialize_pickle(BERT_vector))
        txn.commit()

    def clean_ELMo_cache(self):
        """
        Delete the ELMo embeddings cache; this takes place normally after the
        completion of a training
        """
        if self.env_ELMo is None:
            # db cache not available, nothing to clean
            return
        else:
            self.env_ELMo.close()
            self.env_ELMo = None
            for file in os.listdir(self.embedding_ELMo_cache):
                file_path = os.path.join(self.embedding_ELMo_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_ELMo_cache)

    def clean_BERT_cache(self):
        """
        Delete the BERT embeddings cache; this takes place normally after the
        completion of a training
        """
        # if the cache subdirectory does not exist, we create it
        if not os.path.exists(self.embedding_BERT_cache):
            os.makedirs(self.embedding_BERT_cache)
            return
        if self.env_BERT is None:
            # db cache not available, nothing to clean
            return
        else:
            self.env_BERT.close()
            self.env_BERT = None
            for file in os.listdir(self.embedding_BERT_cache):
                file_path = os.path.join(self.embedding_BERT_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_BERT_cache)

    def get_word_vector_in_memory(self, word):
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.extension == 'bin':
            return self.model.get_word_vector(word)
        if word in self.model:
            return self.model[word]
        else:
            # for an unknown word, we use a vector filled with 0.0
            return np.zeros((self.static_embed_size, ), dtype=np.float32)
            # alternatively, initialize with random negative values
            #return np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
            # alternatively use fasttext OOV ngram possibilities (if ngram available)

    def get_embedding_path(self, description):
        embeddings_path = None
        if "path" in description:
            embeddings_path = description["path"]
        self.lang = description["lang"]
        if embeddings_path is None or not os.path.isfile(embeddings_path):
            print("error: embedding path for", description['name'],
                  "is not valid", embeddings_path)
            if "url" in description and len(description["url"]) > 0:
                url = description["url"]
                download_path = self.registry['embedding-download-path']
                # if the download path does not exist, we create it
                if not os.path.isdir(download_path):
                    try:
                        os.mkdir(download_path)
                    except OSError:
                        print("Creation of the download directory",
                              download_path, "failed")

                print("Downloading resource file for", description['name'], "...")
                embeddings_path = download_file(url, download_path)
                if embeddings_path is not None and os.path.isfile(embeddings_path):
                    print("Download successful:", embeddings_path)
            else:
                print("no download url available for this embeddings resource, "
                      "please review the embedding registry for",
                      description['name'])
        return embeddings_path
config_path, checkpoint_path, dict_path = tuple(sys.argv[1:])

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
model.summary(line_length=120)

token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first='语言模型', max_len=512)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])
"""Official outputs:
{
    "linex_index": 0,
    "features": [
        {
            "token": "[CLS]",
            "layers": [
                {
                    "index": -1,
                    "values": [-0.63251, 0.203023, 0.079366, -0.032843, 0.566809, ...]
                }
"""
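# The script takes its paths from sys.argv, so a typical invocation would be
# (file names illustrative):
#
#   python demo.py bert_config.json bert_model.ckpt vocab.txt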
class Punc_DataLoader():
    def __init__(self, config, training=True):
        self.train = training
        self.init_all(config)
        self.vocab_featurizer = TextFeaturizer(config['punc_vocab'])
        self.bd_featurizer = TextFeaturizer(config['punc_biaodian'])
        self.bd = self.bd_featurizer.vocab_array
        self.batch = config['running_config']['batch_size']
        self.epochs = 1

    def init_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint,
                                                   trainable=False,
                                                   seq_len=None)
        return model

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('saved train list does not match the newly loaded '
                             'train list; data loader falls back to its initial state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using initial state')
        except:
            logging.info('loading state failed, using initial state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        return (tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        return len(self.train_texts) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_texts) // self.batch

    def init_all(self, config):
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(
            config['train_list'] if self.train else config['eval_list'],
            training=self.train)

    def get_sentence(self, data_path, training):
        from tqdm import tqdm

        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()

        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)

        if training:
            num = len(txts)
            train = txts[:int(num * 0.99)]
            test = txts[int(num * 0.99):]
            self.train_list, self.test_list = train, test
            self.train_offset = 0
            self.test_offset = 0
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, txts):
        x = []
        for txt in txts:
            x_ = [self.vocab_featurizer.startid()]
            for i in txt:
                x_.append(self.vocab_featurizer.token_to_index[i])
            x_.append(self.vocab_featurizer.endid())
            x.append(np.array(x_))
        return x

    def bert_decode(self, x, x2=None):
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                if len(index) > 0:
                    # Replace the selected positions with [MASK] (id 103).
                    for n in index:
                        t[int(n)] = 103
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs

    def pad(self, x, mode=1):
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            for i in range(len(x)):
                pading = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], pading))
        elif mode == 3:
            for i in range(len(x)):
                pading = np.zeros([length - len(x[i]), x[i].shape[1]])
                x[i] = np.vstack((x[i], pading))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.
        return features

    def get_target(self, text):
        bd = self.bd
        zh = []
        bd_ = [[0]]
        for n in text:
            if n in bd:
                bd_[-1].append(bd.index(n))
            else:
                zh.append(n)
                bd_.append([0])
        zh_txt = ''.join(zh)
        bd_txt = bd_ + [[0]]
        return zh_txt, bd_txt

    def process_punc(self, puncs):
        x = []
        for punc in puncs:
            x_ = []
            for i in range(len(punc)):
                if len(punc[i]) == 1:
                    x_ += [1]
                else:
                    x_ += punc[i][-1:]
            x.append(np.array(x_, 'int32'))
        return x

    def check_valid(self, txt, vocab_list):
        if len(txt) == 0:
            return False
        for n in txt:
            if n in vocab_list:
                pass
            else:
                return n
        return True

    def generate(self, train):
        trainx = []
        trainy = []
        for i in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            line = line.strip()
            if len(line) < 30:
                extra = random.sample(self.train_list, 1)[0]
                extra = extra.strip()
                line += extra
            if self.check_valid(line, self.vocab_featurizer.vocab_array + self.bd) is not True:
                continue
            try:
                x, y = self.get_target(line)
            except:
                continue
            trainx.append(x)
            trainy.append(y)
            if len(trainx) == self.batch:
                break
        inp_tokens = self.preprocess(trainx)
        e_bert_t, e_bert_s = self.bert_decode(trainx)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        trainy = self.process_punc(trainy)
        inp_tokens = self.pad(inp_tokens)
        trainy = self.pad(trainy)
        e_features = self.pad(e_features, 2)
        inp_tokens = np.array(inp_tokens)
        trainy = np.array(trainy)
        e_features = np.array(e_features, dtype='float32')
        return inp_tokens, trainy, e_features

    def generator(self, train=True):
        while 1:
            x, y, features = self.generate(train)
            if x.shape[1] != y.shape[1] and y.shape[1] != features.shape[1]:
                logging.info('bad batch, skip')
                continue
            yield x, y, features