def test_extract_embeddings_invalid_pooling(self):
    with self.assertRaises(ValueError):
        extract_embeddings(
            self.model_path,
            [
                ('all work and no play', 'makes jack a dull boy'),
                ('makes jack a dull boy', 'all work and no play'),
            ],
            poolings=['invalid'],
        )
def main(args):
    # pipeline flow
    print('=================>loading source data')
    df_tags = pd.read_hdf('../data/split.hdf', 'part' + str(args.split_part))
    df_tags = df_tags.head(10)
    print('part shape is ', df_tags.shape)
    # print('=================>loading source data done!')
    text = df_tags.label_content.values
    all_text = []
    all_len = []
    for i in text:
        all_text.append(i)
        all_len.append(len(i))
    all_text_one = list(chain.from_iterable(all_text))
    print('=================>generating embeddings!')
    embeddings = extract_embeddings(args.checkpoint,
                                    all_text_one,
                                    output_layer_num=1,
                                    poolings=[POOL_NSP, POOL_MAX])
    print('=================>generating embeddings done!')
    # regroup the flat embedding list back into one list per row
    final_emb = []
    before = 0
    for i in range(df_tags.shape[0]):
        final_emb.append(embeddings[before:before + all_len[i]])
        before += all_len[i]
    df_tags['embeddings'] = final_emb
    print('dumping data shape ', df_tags.shape)
    pickle.dump(
        df_tags,
        open(
            './output/multi_gpu_ljj_range_' + str(args.start) + '_' +
            str(args.end) + '.pickle', 'wb'))
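# The slicing above relies on a simple invariant: flattening the row-wise text
# lists and re-slicing the results by the recorded lengths restores the original
# grouping. A minimal sketch of that invariant on toy data (the nested lists
# stand in for df_tags.label_content):
rows = [['a b', 'c d'], ['e f']]
lengths = [len(r) for r in rows]      # [2, 1]
flat = [s for r in rows for s in r]   # one extract_embeddings call covers all texts
regrouped, before = [], 0
for n in lengths:
    regrouped.append(flat[before:before + n])
    before += n
assert regrouped == rows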
def test_extract_embeddings_default(self):
    embeddings = extract_embeddings(
        self.model_path,
        ['all work and no play', 'makes jack a dull boy~'])
    self.assertEqual(2, len(embeddings))
    self.assertEqual((7, 4), embeddings[0].shape)
    self.assertEqual((8, 4), embeddings[1].shape)
def test_extract_embeddings_variable_lengths(self):
    tokens = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]',
        'all', 'work', 'and', 'no', 'play',
        'makes', 'jack', 'a', 'dull', 'boy', '~',
    ]
    token_dict = {token: i for i, token in enumerate(tokens)}
    inputs, outputs = get_model(
        token_num=len(tokens),
        pos_num=20,
        seq_len=None,
        embed_dim=13,
        transformer_num=1,
        feed_forward_dim=17,
        head_num=1,
        training=False,
    )
    model = keras.models.Model(inputs, outputs)
    embeddings = extract_embeddings(
        model,
        [
            ('all work and no play', 'makes jack'),
            ('a dull boy', 'all work and no play and no play'),
        ],
        vocabs=token_dict,
        batch_size=2,
    )
    self.assertEqual(2, len(embeddings))
    self.assertEqual((10, 13), embeddings[0].shape)
    self.assertEqual((14, 13), embeddings[1].shape)
def run(self):
    # set environment so each worker sees only its own GPU
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(self._gpuid)
    # load the model once per worker
    paths = get_checkpoint_paths(self._bert_checkpoint)
    model = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        output_layer_num=1,
    )
    vocabs = load_vocabulary(paths.vocab)
    print('model init done', self._gpuid)
    while True:
        xfile = self._queue.get()
        if xfile is None:
            # put the sentinel back so sibling workers also stop
            self._queue.put(None)
            break
        embeddings = extract_embeddings(model=model,
                                        vocabs=vocabs,
                                        texts=xfile[1],
                                        output_layer_num=1,
                                        poolings=[POOL_NSP, POOL_MAX])
        print('worker running', self._gpuid, len(self.return_list))
        self.return_list.append({
            'worker': self._gpuid,
            'id': xfile[0],
            'content': xfile[1],
            'embeddings': embeddings
        })
        print('worker predict done at gpu:', self._gpuid)
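# For context, a minimal driver sketch for the worker above. The class name
# EmbeddingWorker, its constructor signature, and the multiprocessing setup are
# assumptions; only the None-sentinel protocol is taken from the run() loop.
import multiprocessing as mp

def run_workers(bert_checkpoint, jobs, gpu_ids):
    manager = mp.Manager()
    return_list = manager.list()   # shared across workers
    queue = mp.Queue()
    for job in jobs:               # each job is (id, texts)
        queue.put(job)
    queue.put(None)                # sentinel; each worker re-enqueues it on exit
    workers = [EmbeddingWorker(gpuid, bert_checkpoint, queue, return_list)
               for gpuid in gpu_ids]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    return list(return_list)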
def extract(self, sentences, granularity):
    feats = extract_embeddings(self.model_path, sentences)
    if granularity == 'token':
        # drop the [CLS] and [SEP] vectors, keep per-token embeddings
        feats = np.array([feat[1:-1] for feat in feats])
    elif granularity == 'cls':
        # keep only the [CLS] vector as a sentence embedding
        feats = np.array([feat[0] for feat in feats])
    return feats
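# A hedged usage sketch for extract(); the wrapper class name and a base-size
# (768-d) checkpoint are assumptions:
# extractor = FeatureExtractor(model_path='uncased_L-12_H-768_A-12')  # hypothetical
# cls_feats = extractor.extract(['all work and no play'], granularity='cls')
# token_feats = extractor.extract(['all work and no play'], granularity='token')
# cls_feats[0] is a single 768-d vector; token_feats[0] has one 768-d row per
# word piece, with the [CLS]/[SEP] markers already stripped.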
def test_extract_embeddings_pair(self):
    embeddings = extract_embeddings(
        self.model_path,
        [
            ('all work and no play', 'makes jack a dull boy'),
            ('makes jack a dull boy', 'all work and no play'),
        ],
    )
    self.assertEqual(2, len(embeddings))
    self.assertEqual((13, 4), embeddings[0].shape)
def process_data(data, batch_size, model_dir, output_dir):
    # embed each batch and save it as a numpy array;
    # batching avoids running out of memory on large inputs
    total = int(len(data) / batch_size) + 1
    for idx, batch in tqdm(enumerate(get_batches(data, batch_size)),
                           total=total):
        embedded_data = extract_embeddings(model_dir, batch)
        batch_array = np.array(embedded_data)
        output = os.path.join(output_dir, 'batch_{}'.format(idx))
        np.save(output, batch_array)
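# get_batches is not defined in this snippet; a minimal sketch of what it is
# assumed to do, consistent with how process_data() iterates over it:
def get_batches(data, batch_size):
    # yield consecutive slices of `data`, batch_size items at a time
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]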
def test_extract_embeddings_single_pooling(self):
    embeddings = extract_embeddings(
        self.model_path,
        [
            ('all work and no play', 'makes jack a dull boy'),
            ('makes jack a dull boy', 'all work and no play'),
        ],
        poolings=POOL_NSP,
    )
    self.assertEqual(2, len(embeddings))
    self.assertEqual((4,), embeddings[0].shape)
def test_extract_embeddings_multi_pooling(self):
    embeddings = extract_embeddings(
        self.model_path,
        [
            ('all work and no play', 'makes jack a dull boy'),
            ('makes jack a dull boy', 'all work and no play'),
        ],
        poolings=[POOL_NSP, POOL_MAX, POOL_AVE],
    )
    self.assertEqual(2, len(embeddings))
    # the pooled vectors are concatenated: 3 poolings * 4 units = 12
    self.assertEqual((12,), embeddings[0].shape)
def predict_cancer(file_path):
    tb._SYMBOLIC_SCOPE.value = True
    x = np.array([open(file_path, 'r').read()])
    embedding = np.array(extract_embeddings(BIO_BERT_PATH, x))
    # average the token embeddings into one vector per document
    embedding = np.mean(embedding, axis=1)
    # model = load_model(TRAINED_MODEL_PATH_1)
    # y_pred_1 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_2)
    # y_pred_2 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_3)
    # y_pred_3 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_4)
    # y_pred_4 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_5)
    # y_pred_5 = model.predict(embedding)
    #
    # y_pred = (y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4 + y_pred_5) / 3
    model = load_model(TRAINED_MODEL_PATH)
    y_pred = model.predict(embedding)
    cancer_prob = int(y_pred[0][0] * 100)
    return cancer_prob
# The built-in _tokenize of Tokenizer strips spaces automatically, and some
# characters get merged in the output, so the tokenized list no longer matches
# the length of the original string; that makes sequence-labeling tasks very
# awkward.
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # map spaces to the untrained [unused1] token
            else:
                R.append('[UNK]')  # all remaining characters become [UNK]
        return R


if __name__ == "__main__":
    # build the vocabulary dict
    token_dict = {}
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = OurTokenizer(token_dict)
    tem = tokenizer.tokenize(u'现在的年轻人')
    # print(tem)  # output: ['[CLS]', u'现', u'在', u'的', u'年', u'轻', u'人', '[SEP]']
    # directly obtain a [1, 8, 768] word-embedding matrix
    embeddings = np.array(extract_embeddings(model_path, ['现在的年轻人']))
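    # A small sketch of why the override matters: with a space in the input,
    # the character-level tokenizer keeps string and token list aligned
    # (expected output written under the assumption that spaces map to
    # [unused1] as above):
    text_with_space = u'现在 的年轻人'
    print(tokenizer.tokenize(text_with_space))
    # ['[CLS]', u'现', u'在', '[unused1]', u'的', u'年', u'轻', u'人', '[SEP]']
    # len(text_with_space) == 7, matching the 7 tokens between [CLS] and [SEP]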
def encode(self, texts):
    embeddings = extract_embeddings(self.model, texts, vocabs=self.vocabs)
    # max-pool each sequence of token vectors into one sentence vector
    # result = [np.max(x, axis=0) for x in embeddings]
    result = [np.max(x, axis=0).tolist() for x in embeddings]
    return result
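# A hedged usage sketch (the Encoder class name and constructor are
# assumptions; only encode()'s behavior is taken from the code above):
# encoder = Encoder(model, vocabs)
# vectors = encoder.encode(['all work and no play', 'makes jack a dull boy'])
# len(vectors) == 2; each entry is a plain Python list of hidden_dim floats.
# The .tolist() call is what makes the result JSON-serializable, which suits
# returning vectors from a web service.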
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)],
)

inputs, output_layer = get_model(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
    training=False,      # when `training` is False, the return value is (inputs, outputs)
    trainable=False,     # whether the model is trainable; defaults to the value of `training`
    output_layer_num=4,  # the outputs of the last few layers are concatenated as the
                         # final output; only effective when `training` is False
)

plot_model(model, to_file="model.png", show_shapes=True)

from keras_bert import extract_embeddings, POOL_NSP, POOL_MAX

# model_path = 'xxx/yyy/uncased_L-12_H-768_A-12'
texts = [
    ('all work and no play', 'makes jack a dull boy'),
    ('makes jack a dull boy', 'all work and no play'),
]

embeddings = extract_embeddings(model_path, texts,
                                output_layer_num=4,
                                poolings=[POOL_NSP, POOL_MAX])
# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-02-12 12:20
from keras_bert import extract_embeddings

model_path = 'chinese_L-12_H-768_A-12'
texts = ['今晚(2月11日),钟南山院士接受总台央视记者独家专访,通过央视回应了近日媒体报道“钟南山的最新论文发现新冠肺炎潜伏期最长可达24天”的问题。',
         '英国苏格兰政府首席大臣、苏格兰民族党党魁妮古拉·斯特金11日在伦敦说,苏格兰人应有权重新选择是否独立。',
         '教育部门明确提出“延期开学”是假期的延续,各校均不得以任何形式集体组织上新课,也不得举行任何形式的线下教学活动和集体活动。'
         ]

embeddings = extract_embeddings(model_path, texts)
print(type(embeddings))
print(embeddings)
for _ in embeddings:
    print(_[0].shape)
# X_train = np.stack(X_train, axis=0)
# X_test = np.stack(X_test, axis=0)
# print(X_test)
# texts = [get_word(idx) for idx in range(len(token_dict))]
# print(vocab)
try:
    with open('bert_embedding_sent.pkl', 'rb') as f:
        print('loading existing bert embedding...')
        bert_embedding = pickle.load(f)
    print('loaded')
except:
    print('loading bert embedding')
    bert_embedding = extract_embeddings(
        'uncased_L-12_H-768_A-12',
        [" ".join(sentence) for sentence in train_sentences + test_sentences])
    # bert_embedding = extract_embeddings('uncased_L-12_H-768_A-12', vocab)
    # bert_embedding = extract_embeddings('uncased_L-12_H-768_A-12', ["[PAD]"])
    # maxlen is needed by the padding loop below; it was commented out in the
    # original, which left the following lines broken
    maxlen = max([len(sentence) for sentence in bert_embedding])
    print(maxlen)
    # pad every sentence's embedding matrix with zero vectors up to maxlen
    for i, sentence in enumerate(bert_embedding):
        while len(bert_embedding[i]) < maxlen:
            bert_embedding[i] = np.append(bert_embedding[i], [np.zeros(768)],
                                          axis=0)
    with open('bert_embedding_sent.pkl', 'wb') as f:
        pickle.dump(bert_embedding, f)
    print('loaded and saved as a pickle')
# pad_emd = extract_embeddings('uncased_L-12_H-768_A-12', [" [PAD] "])
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

x_train = np.load('x_train.npy', allow_pickle=True)
y_train = np.load('y_train.npy', allow_pickle=True)
x_test = np.load('x_test.npy', allow_pickle=True)
y_test = np.load('y_test.npy', allow_pickle=True)

# join each token list back into a plain sentence string
for i, d in tqdm(enumerate(x_train)):
    x_train[i] = ' '.join(x_train[i])
for i, d in tqdm(enumerate(x_test)):
    x_test[i] = ' '.join(x_test[i])

x_train = extract_embeddings(pretrained_path, x_train)
x_train = np.array(x_train)
# dtype='float32' is needed here: pad_sequences defaults to int32, which
# would truncate the float embeddings to integers
x_train = keras.preprocessing.sequence.pad_sequences(
    x_train, maxlen=MAX_SEQUENCE_LENGTH, dtype='float32')

model = keras.models.Sequential()
model.add(
    keras.layers.LSTM(units=256,
                      input_shape=(MAX_SEQUENCE_LENGTH, 768),
                      return_sequences=True))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.LSTM(128))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
X_train = np.stack(X_train, axis=0)
X_test = np.stack(X_test, axis=0)
print(X_test)
# texts = [get_word(idx) for idx in range(len(token_dict))]
# print(vocab)
try:
    with open('bert_embedding_sent.pkl', 'rb') as f:
        print('loading existing bert embedding...')
        bert_embedding = pickle.load(f)
    print('loaded')
except:
    print('loading bert embedding')
    bert_embedding = extract_embeddings(
        'uncased_L-12_H-768_A-12',
        [" ".join(sentence) for sentence in tokens])
    # bert_embedding = extract_embeddings('uncased_L-12_H-768_A-12', vocab)
    with open('bert_embedding_sent.pkl', 'wb') as f:
        pickle.dump(bert_embedding, f)
    print('loaded and saved as a pickle')

X_train = bert_embedding[:len(train_sentences)]
X_test = bert_embedding[len(train_sentences):]
print(X_test[:5])
X_train = np.stack(X_train, axis=0)
X_test = np.stack(X_test, axis=0)
print(X_test)

input = Input(shape=(max_length, ))
from keras_bert import extract_embeddings
import tensorflow as tf
import numpy as np
from tempfile import TemporaryFile

flags = tf.compat.v1.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None, "")

outfile = TemporaryFile()

BEFORE_CANCER_PATH = "/home/chennuri/CliNER/bef_canc_mod_treatments/"
BIO_BERT_PATH = '/home/gowtham/biobert_v1.1_pubmed'

# file_name is not defined in the original snippet; reading it from the
# input_file flag is an assumption
file_name = FLAGS.input_file

with open(BEFORE_CANCER_PATH + file_name, 'r') as f:
    data = f.readlines()

output = extract_embeddings(BIO_BERT_PATH, data)
np.save(outfile, output)
def bert_extract(self, texts):
    embeddings = extract_embeddings(self.model_path, texts)
    return np.array(embeddings)
from sklearn.metrics.pairwise import cosine_similarity
from keras_bert import extract_embeddings
import numpy as np

model_path = '/home/zju/buwenfeng/bert/chinese_L-12_H-768_A-12/'
texts = LDA_data[3]
topics = [
    '支持苹果手机se喜欢',
    '外观好看屏幕小合适',
    '配置内存性能很满意',
    '价格便宜市场发布不错',
    '喜欢系统功能设计良好'
]
topics1 = [
    '深圳禁摩',
    '电动车自行车',
    '交警执法',
    '快递外卖',
    '摩托车电摩三轮车',
    '整治非法拘留'
]

text_embeddings = extract_embeddings(model_path, texts)
topic_embeddings = extract_embeddings(model_path, topics)
texteb1 = extract_embeddings(model_path, [LDA_data[2][20]])

for ix, x in enumerate(text_embeddings):
    print(texts[ix] + '\n')
    if ix == 40:
        break
    for it, t in enumerate(topic_embeddings):
        # average all pairwise dot products between the two token matrices
        same_bank = np.average(np.matmul(x, np.transpose(t)))
        print(topics[it] + ' ' + str(same_bank) + '\n')
def test_extract_embeddings_from_file(self):
    with codecs.open(os.path.join(self.model_path, 'vocab.txt'),
                     'r', 'utf8') as reader:
        texts = map(lambda x: x.strip(), reader)
        embeddings = extract_embeddings(self.model_path, texts)
    self.assertEqual(15, len(embeddings))