Example #1
 def test_extract_embeddings_invalid_pooling(self):
     with self.assertRaises(ValueError):
         extract_embeddings(
             self.model_path,
             [
                 ('all work and no play', 'makes jack a dull boy'),
                 ('makes jack a dull boy', 'all work and no play'),
             ],
             poolings=['invalid'],
         )
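For contrast with the 'invalid' value above, the pooling names the library actually exposes are POOL_NSP, POOL_MAX and POOL_AVE (as later examples show). A minimal valid call might look like the hedged sketch below; the checkpoint path is only a placeholder.

from keras_bert import extract_embeddings, POOL_NSP, POOL_AVE

model_path = 'uncased_L-12_H-768_A-12'  # placeholder: path to a pretrained BERT checkpoint
embeddings = extract_embeddings(
    model_path,
    [('all work and no play', 'makes jack a dull boy')],
    poolings=[POOL_NSP, POOL_AVE],  # valid constants, unlike the 'invalid' string above
)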
Example #2
def main(args):
    # pipeline flow
    print('=================>loading source data')
    df_tags = pd.read_hdf('../data/split.hdf', 'part' + str(args.split_part))
    df_tags = df_tags.head(10)
    print('part shape is ', df_tags.shape)
    print('=================>loading source data done!')
    text = df_tags.label_content.values
    all_text = []
    all_len = []
    for i in text:
        all_text.append(i)
        all_len.append(len(i))
    all_text_one = list(chain.from_iterable(all_text))

    print('=================>generating embeddings!')
    embeddings = extract_embeddings(args.checkpoint,
                                    all_text_one,
                                    output_layer_num=1,
                                    poolings=[POOL_NSP, POOL_MAX])
    print('=================>generating embeddings done!')
    final_emb = []
    before = 0
    for i in range(df_tags.shape[0]):
        final_emb.append(embeddings[before:before + all_len[i]])
        before += all_len[i]

    df_tags['embeddings'] = final_emb
    print('dumping data shape ', df_tags.shape)
    pickle.dump(
        df_tags,
        open(
            './output/multi_gpu_ljj_range_' + str(args.start) + '_' +
            str(args.end) + '.pickle', 'wb'))
Example #3
 def test_extract_embeddings_default(self):
     embeddings = extract_embeddings(
         self.model_path,
         ['all work and no play', 'makes jack a dull boy~'])
     self.assertEqual(2, len(embeddings))
     self.assertEqual((7, 4), embeddings[0].shape)
     self.assertEqual((8, 4), embeddings[1].shape)
Example #4
 def test_extract_embeddings_variable_lengths(self):
     tokens = [
         '[PAD]', '[UNK]', '[CLS]', '[SEP]',
         'all', 'work', 'and', 'no', 'play',
         'makes', 'jack', 'a', 'dull', 'boy', '~',
     ]
     token_dict = {token: i for i, token in enumerate(tokens)}
     inputs, outputs = get_model(
         token_num=len(tokens),
         pos_num=20,
         seq_len=None,
         embed_dim=13,
         transformer_num=1,
         feed_forward_dim=17,
         head_num=1,
         training=False,
     )
     model = keras.models.Model(inputs, outputs)
     embeddings = extract_embeddings(
         model,
         [
             ('all work and no play', 'makes jack'),
             ('a dull boy', 'all work and no play and no play'),
         ],
         vocabs=token_dict,
         batch_size=2,
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((10, 13), embeddings[0].shape)
     self.assertEqual((14, 13), embeddings[1].shape)
Example #5
    def run(self):
        # set environment
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self._gpuid)

        # load models
        # each worker only needs to load the model once
        paths = get_checkpoint_paths(self._bert_checkpoint)
        model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=1,
        )
        vocabs = load_vocabulary(paths.vocab)
        print('model init done', self._gpuid)

        while True:
            xfile = self._queue.get()
            if xfile is None:
                self._queue.put(None)
                break
            embeddings = extract_embeddings(model=model,
                                            vocabs=vocabs,
                                            texts=xfile[1],
                                            output_layer_num=1,
                                            poolings=[POOL_NSP, POOL_MAX])
            print('worker running', self._gpuid, len(self.return_list))
            self.return_list.append({
                'worker': self._gpuid,
                'id': xfile[0],
                'content': xfile[1],
                'embeddings': embeddings
            })

        print('worker predict done at gpu:', self._gpuid)
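Example #5 only shows the run() loop, presumably taken from a multiprocessing.Process subclass that is not included here. The self-contained sketch below is a rough illustration of the same queue-and-sentinel pattern, using a plain function instead of the class; none of its names come from the source.

import multiprocessing as mp

def worker(queue, return_list, gpuid):
    while True:
        item = queue.get()
        if item is None:       # sentinel: put it back so the other workers stop too
            queue.put(None)
            break
        idx, texts = item
        return_list.append({'worker': gpuid, 'id': idx, 'content': texts})

if __name__ == '__main__':
    manager = mp.Manager()
    queue, return_list = manager.Queue(), manager.list()
    procs = [mp.Process(target=worker, args=(queue, return_list, g)) for g in range(2)]
    for p in procs:
        p.start()
    for i, chunk in enumerate([['a b c'], ['d e f']]):
        queue.put((i, chunk))
    queue.put(None)            # a single sentinel suffices; each worker re-queues it
    for p in procs:
        p.join()
    print(len(return_list))    # 2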
Example #6
    def extract(self, sentences, granularity):

        feats = extract_embeddings(self.model_path, sentences)
        if granularity == 'token':
            feats = np.array([feat[1:-1] for feat in feats])
        elif granularity == 'cls':
            feats = np.array([feat[0] for feat in feats])

        return feats
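As a small illustration of the two granularity branches (the shape below is an assumption for a base-size BERT): extract_embeddings returns one (seq_len, hidden) array per sentence in which row 0 is [CLS] and the last row is [SEP], so 'token' drops those two rows while 'cls' keeps only the first.

import numpy as np

feat = np.random.rand(9, 768)  # assumed: 7 word-piece tokens plus [CLS] and [SEP]
token_feats = feat[1:-1]       # (7, 768): per-token vectors, special tokens removed
cls_feat = feat[0]             # (768,): the [CLS] vector used for the 'cls' granularity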
Example #7
 def test_extract_embeddings_pair(self):
     embeddings = extract_embeddings(
         self.model_path,
         [
             ('all work and no play', 'makes jack a dull boy'),
             ('makes jack a dull boy', 'all work and no play'),
         ],
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((13, 4), embeddings[0].shape)
Example #8
def process_data(data, batch_size, model_dir, output_dir):
  # get embeddings and save them as numpy arrays
  # process in batches to avoid running out of memory
  total = int(len(data) / batch_size) + 1
  for idx, batch in tqdm(enumerate(get_batches(data, batch_size)), total=total):
    embedded_data = extract_embeddings(model_dir, batch)
    batch_array = np.array(embedded_data)
    output = os.path.join(output_dir, 'batch_{}'.format(idx))
    np.save(output, batch_array)
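get_batches is not defined in this snippet; a minimal sketch of what such a helper could look like, inferred only from how it is called above (the name and behavior are assumptions):

def get_batches(data, batch_size):
    # hypothetical helper: yields successive slices of `data` with at most
    # `batch_size` items each
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]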
Example #9
 def test_extract_embeddings_single_pooling(self):
     embeddings = extract_embeddings(
         self.model_path,
         [
             ('all work and no play', 'makes jack a dull boy'),
             ('makes jack a dull boy', 'all work and no play'),
         ],
         poolings=POOL_NSP,
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((4,), embeddings[0].shape)
Example #10
 def test_extract_embeddings_multi_pooling(self):
     embeddings = extract_embeddings(
         self.model_path,
         [
             ('all work and no play', 'makes jack a dull boy'),
             ('makes jack a dull boy', 'all work and no play'),
         ],
         poolings=[POOL_NSP, POOL_MAX, POOL_AVE],
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((12, ), embeddings[0].shape)
Example #11
def predict_cancer(file_path):
    tb._SYMBOLIC_SCOPE.value = True
    x = np.array([open(file_path, 'r').read()])
    embedding = np.array(extract_embeddings(BIO_BERT_PATH, x))
    embedding = np.mean(embedding, axis=1)

    # model = load_model(TRAINED_MODEL_PATH_1)
    # y_pred_1 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_2)
    # y_pred_2 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_3)
    # y_pred_3 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_4)
    # y_pred_4 = model.predict(embedding)
    # model = load_model(TRAINED_MODEL_PATH_5)
    # y_pred_5 = model.predict(embedding)
    #
    # y_pred = (y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4 + y_pred_5) / 3

    model = load_model(TRAINED_MODEL_PATH)
    y_pred = model.predict(embedding)
    cancer_prob = int(y_pred[0][0] * 100)
    return cancer_prob
Example #12

# The Tokenizer's built-in _tokenize strips spaces automatically, and some characters get glued together in the output, so the tokenized list no longer matches the length of the original string; this makes sequence-labeling tasks very awkward.
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace with the untrained [unused1] token
            else:
                R.append('[UNK]')  # all remaining characters become [UNK]
        return R


if __name__ == "__main__":
    # build the token dictionary
    token_dict = {}
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)

    tokenizer = OurTokenizer(token_dict)
    tem = tokenizer.tokenize(u'现在的年轻人')
    # print(tem)
    # the output is ['[CLS]', u'现', u'在', u'的', u'年', u'轻', u'人', '[SEP]']
    # directly obtain the [1, 8, 768] word-embedding matrix
    embeddings = np.array(extract_embeddings(model_path, ['现在的年轻人']))
Example #13
 def encode(self, texts):
     embeddings = extract_embeddings(self.model, texts, vocabs=self.vocabs)
     # result = [np.max(x, axis=0) for x in embeddings]
     result = [np.max(x, axis=0).tolist() for x in embeddings]
     return result
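For context, a hedged illustration of the pooling step above (the 768-dim hidden size is an assumption): np.max over axis 0 collapses each variable-length (seq_len, hidden) matrix into a single fixed-length sentence vector, which is why the results can be returned as plain lists of equal length.

import numpy as np

fake_embedding = np.random.rand(9, 768)           # e.g. 7 tokens plus [CLS] and [SEP]
sentence_vector = np.max(fake_embedding, axis=0)  # shape: (768,)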
Example #14
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)],
)

inputs, output_layer = get_model(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
    training=False,  # when `training` is False, the returned values are the inputs and outputs
    trainable=False,  # whether the model is trainable; defaults to the same value as `training`
    output_layer_num=4,  # the outputs of the last few layers are concatenated as the final output; only effective when `training` is False
)
model = keras.models.Model(inputs, output_layer)
plot_model(model, to_file="model.png", show_shapes=True)

from keras_bert import extract_embeddings, POOL_NSP, POOL_MAX

# model_path = 'xxx/yyy/uncased_L-12_H-768_A-12'
texts = [
    ('all work and no play', 'makes jack a dull boy'),
    ('makes jack a dull boy', 'all work and no play'),
]

embeddings = extract_embeddings(model_path,
                                texts,
                                output_layer_num=4,
                                poolings=[POOL_NSP, POOL_MAX])
Example #15
# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-02-12 12:20

from keras_bert import extract_embeddings

model_path = 'chinese_L-12_H-768_A-12'
texts = ['今晚(2月11日),钟南山院士接受总台央视记者独家专访,通过央视回应了近日媒体报道“钟南山的最新论文发现新冠肺炎潜伏期最长可达24天”的问题。',
         '英国苏格兰政府首席大臣、苏格兰民族党党魁妮古拉·斯特金11日在伦敦说,苏格兰人应有权重新选择是否独立。',
         '教育部门明确提出“延期开学”是假期的延续,各校均不得以任何形式集体组织上新课,也不得举行任何形式的线下教学活动和集体活动。'
         ]

embeddings = extract_embeddings(model_path, texts)

print(type(embeddings))
print(embeddings)

for _ in embeddings:
    print(_[0].shape)
Example #16
# X_train = np.stack(X_train, axis=0)
# X_test = np.stack(X_test, axis=0)
# print(X_test)

# texts = [get_word(idx) for idx in range(len(token_dict))]
# print(vocab)

try:
    with open('bert_embedding_sent.pkl', 'rb') as f:
        print('loading existing bert embedding...')
        bert_embedding = pickle.load(f)
        print('loaded')
except:
    print('loading bert embedding')
    bert_embedding = extract_embeddings(
        'uncased_L-12_H-768_A-12',
        [" ".join(sentence) for sentence in train_sentences + test_sentences])
    # bert_embedding = extract_embeddings('uncased_L-12_H-768_A-12', vocab)
    # bert_embedding = extract_embeddings('uncased_L-12_H-768_A-12', ["[PAD]"])
    #
    maxlen = max([len(sentence) for sentence in bert_embedding])
    print(maxlen)
    for i, sentence in enumerate(bert_embedding):
        while len(bert_embedding[i]) < maxlen:
            bert_embedding[i] = np.append(bert_embedding[i], [np.zeros(768)],
                                          axis=0)

    with open('bert_embedding_sent.pkl', 'wb') as f:
        pickle.dump(bert_embedding, f)
        print('loaded and saved as a pickle')
    # pad_emd = extract_embeddings('uncased_L-12_H-768_A-12', [" [PAD] "])
Example #17
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

x_train = np.load('x_train.npy', allow_pickle=True)
y_train = np.load('y_train.npy', allow_pickle=True)
x_test = np.load('x_test.npy', allow_pickle=True)
y_test = np.load('y_test.npy', allow_pickle=True)

for i, d in tqdm(enumerate(x_train)):
    sentence = ' '.join(x_train[i])
    x_train[i] = sentence

for i, d in tqdm(enumerate(x_test)):
    sentence = ' '.join(x_test[i])
    x_test[i] = sentence

x_train = extract_embeddings(pretrained_path, x_train)
x_train = np.array(x_train)
x_train = keras.preprocessing.sequence.pad_sequences(
    x_train, maxlen=MAX_SEQUENCE_LENGTH)

model = keras.models.Sequential()
model.add(
    keras.layers.LSTM(units=256,
                      input_shape=(MAX_SEQUENCE_LENGTH, 768),
                      return_sequences=True))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.LSTM(128))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
Example #18
    X_train = np.stack(X_train, axis=0)
    X_test = np.stack(X_test, axis=0)
    print(X_test)

    # texts = [get_word(idx) for idx in range(len(token_dict))]
    # print(vocab)

    try:
        with open('bert_embedding_sent.pkl', 'rb') as f:
            print('loading existing bert embedding...')
            bert_embedding = pickle.load(f)
            print('loaded')
    except:
        print('loading bert embedding')
        bert_embedding = extract_embeddings(
            'uncased_L-12_H-768_A-12',
            [" ".join(sentence) for sentence in tokens])
        # bert_embedding = extract_embeddings('uncased_L-12_H-768_A-12', vocab)
        with open('bert_embedding_sent.pkl', 'wb') as f:
            pickle.dump(bert_embedding, f)
        print('loaded and saved as a pickle')

    X_train = bert_embedding[:len(train_sentences)]
    X_test = bert_embedding[len(train_sentences):]
    print(X_test[:5])
    X_train = np.stack(X_train, axis=0)
    X_test = np.stack(X_test, axis=0)
    print(X_test)

input = Input(shape=(max_length, ))
Example #19
from keras_bert import extract_embeddings
import tensorflow as tf
import numpy as np
from tempfile import TemporaryFile

flags = tf.compat.v1.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None, "")

outfile = TemporaryFile()

BEFORE_CANCER_PATH = "/home/chennuri/CliNER/bef_canc_mod_treatments/"
BIO_BERT_PATH = '/home/gowtham/biobert_v1.1_pubmed'

with open(BEFORE_CANCER_PATH + file_name, 'r') as f:
    data = f.readlines()

output = extract_embeddings(BIO_BERT_PATH, data)
np.save(outfile, output)
Example #20
	def bert_extract(self, texts):
		embeddings = extract_embeddings(self.model_path, texts)
		return np.array(embeddings)
Example #21
from sklearn.metrics.pairwise import cosine_similarity
from keras_bert import extract_embeddings

model_path = '/home/zju/buwenfeng/bert/chinese_L-12_H-768_A-12/'
texts = LDA_data[3]
topics = [
    '支持苹果手机se喜欢',
    '外观好看屏幕小合适',
    '配置内存性能很满意',
    '价格便宜市场发布不错',
    '喜欢系统功能设计良好',
]
topics1 = [
    '深圳禁摩',
    '电动车自行车',
    '交警执法',
    '快递外卖',
    '摩托车电摩三轮车',
    '整治非法拘留',
]
import numpy as np
text_embeddings = extract_embeddings(model_path, texts)
topic_embeddings = extract_embeddings(model_path, topics)
texteb1 = extract_embeddings(model_path, [LDA_data[2][20]])
for ix,x in enumerate(text_embeddings):
    print(texts[ix]+'\n')
    if ix==40:
        break
    for it,t in enumerate(topic_embeddings):
        same_bank = np.average(np.matmul(x,np.transpose(t)))
        print(topics[it] + ' '+str(same_bank)+'\n')
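Note that the script imports sklearn's cosine_similarity but never uses it; the loop instead averages a token-by-token dot-product matrix. A hedged alternative (not part of the original) would be to mean-pool each embedding matrix and apply the imported function directly:

text_vecs = np.array([np.mean(e, axis=0) for e in text_embeddings])
topic_vecs = np.array([np.mean(e, axis=0) for e in topic_embeddings])
scores = cosine_similarity(text_vecs, topic_vecs)  # shape: (len(texts), len(topics))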
Example #22
 def test_extract_embeddings_from_file(self):
     with codecs.open(os.path.join(self.model_path, 'vocab.txt'), 'r', 'utf8') as reader:
         texts = map(lambda x: x.strip(), reader)
         embeddings = extract_embeddings(self.model_path, texts)
     self.assertEqual(15, len(embeddings))