Example 1
    def test_padding(self):
        tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
        token_dict = {token: i for i, token in enumerate(tokens)}
        tokenizer = Tokenizer(token_dict)
        text = '\u535A\u63A8'

        # single
        indices, segments = tokenizer.encode(first=text, max_len=100)
        expected = [2, 1, 1, 3] + [0] * 96
        self.assertEqual(expected, indices)
        expected = [0] * 100
        self.assertEqual(expected, segments)
        decoded = tokenizer.decode(indices)
        self.assertEqual(['[UNK]', '[UNK]'], decoded)
        indices, segments = tokenizer.encode(first=text, max_len=3)
        self.assertEqual([2, 1, 3], indices)
        self.assertEqual([0, 0, 0], segments)

        # paired
        indices, segments = tokenizer.encode(first=text, second=text, max_len=100)
        expected = [2, 1, 1, 3, 1, 1, 3] + [0] * 93
        self.assertEqual(expected, indices)
        expected = [0, 0, 0, 0, 1, 1, 1] + [0] * 93
        self.assertEqual(expected, segments)
        decoded = tokenizer.decode(indices)
        self.assertEqual((['[UNK]', '[UNK]'], ['[UNK]', '[UNK]']), decoded)
        indices, segments = tokenizer.encode(first=text, second=text, max_len=4)
        self.assertEqual([2, 1, 3, 3], indices)
        self.assertEqual([0, 0, 0, 1], segments)
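
The test above pins down encode()'s padding contract: unknown tokens map to [UNK], output is padded with [PAD] (id 0) up to max_len, and truncation keeps [CLS]/[SEP]. A minimal standalone sketch of the same behaviour, assuming only keras_bert is installed:

from keras_bert import Tokenizer

token_dict = {token: i for i, token in enumerate(['[PAD]', '[UNK]', '[CLS]', '[SEP]'])}
tokenizer = Tokenizer(token_dict)

# two CJK characters, neither in the vocab: each becomes [UNK] (id 1),
# and the output is padded with [PAD] (id 0) up to max_len
indices, segments = tokenizer.encode(first='\u535A\u63A8', max_len=8)
print(indices)   # [2, 1, 1, 3, 0, 0, 0, 0]
print(segments)  # [0, 0, 0, 0, 0, 0, 0, 0]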
Example 2
class twitterProcessor:
    def __init__(self, vocab_path, data_dir, SEQ_LEN):
        self.vocab_path = vocab_path
        self.data_dir = data_dir
        self.seq_len = SEQ_LEN

    def get_train_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)

        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
            x_train, y_train = self.create_examples(reader, "train")
        return x_train, y_train

    def create_examples(self, lines, set_type):
        examples = []
        indices, labels = [], []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            split_line = line.strip().split('+++$+++')
            ids, segments = self.tokenizer.encode(split_line[1],
                                                  max_len=self.seq_len)
            sentiment = split_line[0]
            indices.append(ids)
            labels.append(sentiment)
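        # keras-bert models take [token_ids, segment_ids]; segment ids are all zero
        # for single-sentence input, hence the np.zeros_like(indices) below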
        return [indices, np.zeros_like(indices)], np.array(labels)

    def get_test_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)

        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
            x_test = self.create_test_examples(reader, "train")
            return x_test

    def create_test_examples(self, lines, set_type):
        examples = []
        indices = []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            ids, segments = self.tokenizer.encode(line.strip(),
                                                  max_len=self.seq_len)
            indices.append(ids)
        return [indices, np.zeros_like(indices)]
Example 3
    def work1(self, text1):
        out = []
        if isinstance(text1, str):
            text1 = [text1]
        for i in text1:
            # strip separators/markup and normalize punctuation variants
            resu = i.replace('|', '').replace(' ', '') \
                .replace('〔', '(').replace('〕', ')').replace('/', '') \
                .replace('•', '·').replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")
            resu = re.split(r'\s+', resu)
            dr = re.compile(r'<[^>]+>', re.S)
            dd = dr.sub('', '。'.join(resu))
            line = re.sub(self.restr, '', dd)
            eng = [",", "!", "?", ":", ";", "(", ")", "[", "]", "$", "。。"]
            chi = [",", "!", "?", ":", ";", "(", ")", "【", "】", "¥", '。']
            for en, zh in zip(eng, chi):
                line = line.replace(en, zh)
            out.append(line[:28])
        token_dict = {}
        dict_path = "../chinese_L-12_H-768_A-12/vocab.txt"
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)

        tokenizer = Tokenizer(token_dict)
        x1, x2 = [], []
        for text in out:
            indices, segments = tokenizer.encode(first=text, max_len=512)
            x1.append(indices)
            x2.append(segments)
        return x1, x2
Example 4
def load_task2_testX(dict_path, data_dir):
    if not os.path.exists(os.path.join(
            data_dir, 'task2_testX.npy')) or not os.path.exists(
                os.path.join(data_dir, 'task2_test_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                         dtype=str)
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        seq_len = 512  # the maximum needed here would be 638, but BERT-base only supports up to 512
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=seq_len)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.asarray(input_data)
        seg = np.asarray(input_seg)

        np.save(os.path.join(data_dir, 'task2_testX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_test_seg.npy'), seg)
    else:
        X, seg = np.load(os.path.join(data_dir, 'task2_testX.npy')), np.load(
            os.path.join(data_dir, 'task2_test_seg.npy'))
    return X, seg
Example 5
    def test_uncased(self):
        tokens = [
            '[PAD]', '[UNK]', '[CLS]', '[SEP]', 'want', '##want',
            '##ed', 'wa', 'un', 'runn', '##ing', ',',
            '\u535A', '\u63A8',
        ]
        token_dict = {token: i for i, token in enumerate(tokens)}
        tokenizer = Tokenizer(token_dict)
        text = u"UNwant\u00E9d, running  \nah\u535A\u63A8zzz\u00AD"
        tokens = tokenizer.tokenize(text)
        expected = [
            '[CLS]', 'un', '##want', '##ed', ',', 'runn', '##ing',
            'a', '##h', '\u535A', '\u63A8', 'z', '##z', '##z',
            '[SEP]',
        ]
        self.assertEqual(expected, tokens)
        indices, segments = tokenizer.encode(text)
        expected = [2, 8, 5, 6, 11, 9, 10, 1, 1, 12, 13, 1, 1, 1, 3]
        self.assertEqual(expected, indices)
        expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.assertEqual(expected, segments)

        decoded = tokenizer.decode(indices)
        expected = [
            'un', '##want', '##ed', ',', 'runn', '##ing',
            '[UNK]', '[UNK]', '\u535A', '\u63A8', '[UNK]', '[UNK]', '[UNK]',
        ]
        self.assertEqual(expected, decoded)
Example 6
	def _text_process(self, text):
		tokenizer = Tokenizer(self.vocab_dict)
		encoder = [tokenizer.encode(first=doc[0], second=doc[1], max_len=self.max_seq_len) for doc in text]
		input_ids = [i[0] for i in encoder]
		input_type = [i[1] for i in encoder]
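		# token id 0 is [PAD], so the mask is 1 for real tokens and 0 for padding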
		input_mask = [[0 if l == 0 else 1 for l in i] for i in input_ids]
		return (input_ids, input_mask, input_type)
Example 7
def load_bert_data(raw_file, train=True):
    config = Config()
    dict_path = './corpus/vocab.txt'
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tags2id, id2tags = load_tags('tags.txt')
    x_ids = list()
    x_segments = list()
    x_label = list()
    tokenizer = Tokenizer(token_dict)  # build the tokenizer once, not per line
    with codecs.open(raw_file, encoding='utf-8') as f:
        for line in f:
            x = json.loads(line)
            input_sent = x['title']
            x_sent_id, x_sent_segment = tokenizer.encode(
                input_sent, max_len=config.max_len_word)
            x_ids.append(x_sent_id)
            x_segments.append(x_sent_segment)
            if train:
                y = load_label(x, tags2id)
                x_label.append(y)
    x_label = np.asarray(x_label)
    return x_ids, x_segments, x_label, id2tags, None
Example 8
 def PreProcessInputData(self, text):
     tokenizer = Tokenizer(self.vocab)
     word_labels = []
     seq_types = []
     for sequence in text:
         code = tokenizer.encode(first=sequence, max_len=self.max_seq_length)
         word_labels.append(code[0])
         seq_types.append(code[1])
     return word_labels, seq_types
Example 9
 def test_empty(self):
     tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
     token_dict = {token: i for i, token in enumerate(tokens)}
     tokenizer = Tokenizer(token_dict)
     text = u''
     self.assertEqual(['[CLS]', '[SEP]'], tokenizer.tokenize(text))
     indices, segments = tokenizer.encode(text)
     self.assertEqual([2, 3], indices)
     self.assertEqual([0, 0], segments)
Example 10
 def article_preprocess(self):
     tokenizer = Tokenizer(self.token_dict)
     self.text_split = [ele for ele in self.text.split('。') if len(ele) > 0]
     self.sent_num = len(self.text_split)
     tok = [tokenizer.encode(sent)[0] for sent in self.text_split]
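     # note: Keras pad_sequences pads/truncates at the front ('pre') by default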
     tok_pad = pad_sequences(tok, maxlen=self.seqence_len)
     self.data_in = [
         tok_pad,
         np.zeros(shape=(self.sent_num, self.seqence_len))
     ]
Example 11
 def encode_input_x(self, sentences):
     '''Serialize input X with the BERT Tokenizer (token ids + segment ids); sentences is a list of strings.'''
     tokenizer = Tokenizer(self.vocab)
     sent_token_ids = []
     sent_segment_ids = []
     for sequence in sentences:
         token_ids, segment_ids = tokenizer.encode(
             first=sequence, max_len=self.seq_maxlen)  # the input is a single sentence
         sent_token_ids.append(token_ids)
         sent_segment_ids.append(segment_ids)
     return [sent_token_ids, sent_segment_ids]
Example 12
def load_data(texts):
    tokenizer = Tokenizer(token_dict)
    indices = []
    segments = []
    for text in tqdm(texts):
        # encode() returns token indices and segment ids (not a mask)
        ids, segs = tokenizer.encode(text[0], text[1], max_len=SEQ_LEN)
        indices.append(ids)
        segments.append(segs)
    indices = np.array(indices)
    segments = np.array(segments)
    return [indices, segments]
Example 13
def get_infer_input(input_file, out_file):
    id_type = pd.read_pickle('../data/id_type.pkl')
    type_index = pd.read_pickle('../data/type_index.pkl')
    entity_id = pd.read_pickle('../data/entity_id.pkl')

    id_text = pd.read_pickle('../data/id_text.pkl')

    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    out_file = open(out_file, 'w')
    file_index = 0
    with open(input_file) as f:
        for line in f:
            if file_index % 100 == 0:
                print(file_index)
            file_index += 1

            temDict = json.loads(line)
            text = temDict['text']
            mention_data = temDict['mention_data']
            for men in mention_data:
                mention = men['mention']

                offset = int(men['offset'])
                begin = offset + 1  # +1 accounts for the leading [CLS] token
                end = begin + len(mention)

                link_id = get_link_entity_test(mention, entity_id)
                men['link_id'] = link_id
                link_data = {
                    'ids': [],
                    'seg': [],
                    'begin': [],
                    'end': [],
                    'en_type': []
                }
                for id in link_id:

                    kb_text = id_text[id]
                    kb_type = type_index[id_type[id][0]]
                    indice, segment = tokenizer.encode(first=text,
                                                       second=kb_text,
                                                       max_len=256)
                    link_data['ids'].append(indice)
                    link_data['seg'].append(segment)
                    link_data['begin'].append([begin])
                    link_data['end'].append([end])
                    link_data['en_type'].append([kb_type])
                men['link_data'] = link_data

            out_file.write(json.dumps(temDict, ensure_ascii=False))
            out_file.write('\n')
Example 14
def bert_sen_token(token_dict, traininstance, maxlen):
    tokenizer = Tokenizer(token_dict)
    train_indices = []
    train_segments = []
    train_text = []
    for text in traininstance:
        tokens = tokenizer.tokenize(text)
        indices, segments = tokenizer.encode(first=text, max_len=maxlen)
        train_indices.append(indices)
        train_segments.append(segments)
        train_text.append(tokens)

    return train_indices, train_segments, train_text
Example 15
class batchGen:
    def __init__(self, label, bs=16, token_dict=None):
        self.batch_size = bs
        self.random = random
        self.ans = label
        self.maxlen_doc = 512
        self.tokenizer = Tokenizer(token_dict)
        self.iter_index = np.arange(len(self.ans))
    
    def __len__(self):
        return len(self.ans)
    
    def flow(self):
        '''
        Get a batch of data
        '''
        n = len(self.ans)
        i = 0
        while True:
            batch_doc = []
            batch_doc2 = []
            batch_labels = []
            for b in range(self.batch_size):
                if i == 0:  # shuffle the dataset at the start of each epoch
                    np.random.shuffle(self.iter_index)
                index = self.iter_index[i]  # choose a sample
                doc, query, label = self.GetData(index)
                while doc is None:
                    i = (i + 1) % n
                    index = self.iter_index[i]
                    doc, query, label = self.GetData(index)
                x1, x2 = self.tokenizer.encode(first=query, second=doc, max_len=self.maxlen_doc)
                batch_doc.append(x1)
                batch_doc2.append(x2)
                batch_labels.append(label)
                i = (i+1) % n

            batch_doc = np.array(batch_doc, dtype=np.float32)
            batch_doc2 = np.array(batch_doc2, dtype=np.float32)
            batch_labels = np.array(batch_labels, dtype=np.float32)

            yield [batch_doc, batch_doc2], batch_labels


    def GetData(self, index):
        [query_fn, doc_fn], label = self.ans[index]
        doc = open('./doc/' + doc_fn).read()
        query = open('./train/query/' + query_fn).read()
        return doc, query, int(label)
Example 16
 def _text_process(self, text):
     tokenizer = Tokenizer(self.vocab_dict)
     encoder = [
         tokenizer.encode(first=doc[0], max_len=self.max_seq_len)
         for doc in text
     ]
     input_ids = [i[0] for i in encoder]
     input_type = [i[1] for i in encoder]
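     # token id 0 is [PAD], so the mask is 1 for real tokens and 0 for padding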
     input_mask = [[0 if l == 0 else 1 for l in i] for i in input_ids]
     input_pos = [[0] + [self._pos2id.get(t, 0) for t in doc[1]] + [0]
                  for doc in text]
     input_pos = pad_sequences(input_pos,
                               self.max_seq_len,
                               padding="post",
                               truncating="post")
     return (input_ids, input_mask, input_type, input_pos)
Example 17
 def test_cased(self):
     tokens = [
         '[UNK]', u'[CLS]', '[SEP]', 'want', '##want',
         u'##\u00E9d', 'wa', 'UN', 'runn', '##ing', ',',
     ]
     token_dict = {token: i for i, token in enumerate(tokens)}
     tokenizer = Tokenizer(token_dict, cased=True)
     text = u"UNwant\u00E9d, running"
     tokens = tokenizer.tokenize(text)
     expected = ['[CLS]', 'UN', '##want', u'##\u00E9d', ',', 'runn', '##ing', '[SEP]']
     self.assertEqual(expected, tokens)
     indices, segments = tokenizer.encode(text)
     expected = [1, 7, 4, 5, 10, 8, 9, 2]
     self.assertEqual(expected, indices)
     expected = [0, 0, 0, 0, 0, 0, 0, 0]
     self.assertEqual(expected, segments)
Example 18
def load_task2_trainXY(dict_path, data_dir):
    if not os.path.exists(os.path.join(
            data_dir, 'task2_trainX.npy')) or not os.path.exists(
                os.path.join(
                    data_dir, 'task2_trainY.npy')) or not os.path.exists(
                        os.path.join(data_dir, 'task2_train_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_trainset.csv'),
                         dtype=str)
        cate = df.values[:, -1]

        # generating Y
        Y = np.zeros((cate.shape[0], 4))
        name = {
            'THEORETICAL': 0,
            'ENGINEERING': 1,
            'EMPIRICAL': 2,
            'OTHERS': 3
        }
        for i in range(cate.shape[0]):
            for c in cate[i].split(' '):
                Y[i, name[c]] += 1

        # generating X
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=512)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.array(input_data)
        seg = np.array(input_seg)
        np.save(os.path.join(data_dir, 'task2_trainX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_trainY.npy'), Y)
        np.save(os.path.join(data_dir, 'task2_train_seg.npy'), seg)
    else:
        X, Y, seg = np.load(os.path.join(
            data_dir, 'task2_trainX.npy')), np.load(
                os.path.join(data_dir, 'task2_trainY.npy')), np.load(
                    os.path.join(data_dir, 'task2_train_seg.npy'))
    return X, Y, seg
Example 19
def get_encode(text_list, token_dict):
    """
    Encode a list of texts into BERT token-id and segment-id arrays.

    :param text_list: list of input strings
    :param token_dict: token-to-id vocabulary mapping
    :return: [X1, X2], the post-padded token ids and segment ids
    """
    X1 = []
    X2 = []
    tokenizer = Tokenizer(token_dict)
    for line in text_list:
        x1, x2 = tokenizer.encode(first=line)
        X1.append(x1)
        X2.append(x2)
    X1 = sequence.pad_sequences(X1, maxlen=maxlen, padding='post', truncating='post')
    X2 = sequence.pad_sequences(X2, maxlen=maxlen, padding="post", truncating='post')
    return [X1, X2]
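
Unlike the snippets that pass max_len to encode(), this example pads afterwards with Keras' pad_sequences. A short sketch of the difference, assuming the same toy vocabulary as in the tests above:

from keras_bert import Tokenizer
from keras.preprocessing import sequence

token_dict = {token: i for i, token in enumerate(['[PAD]', '[UNK]', '[CLS]', '[SEP]'])}
tokenizer = Tokenizer(token_dict)

x1, _ = tokenizer.encode(first='\u535A\u63A8')  # no max_len: returns unpadded ids [2, 1, 1, 3]
padded = sequence.pad_sequences([x1], maxlen=8, padding='post', truncating='post')
print(padded)  # [[2 1 1 3 0 0 0 0]]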
Example 20
class FineTuneBert:
    def __init__(self, gpu_name, gpu_num, seq_max_len, batch_size):
        print('--' * 10 + ' Load BERT model start ' + '--' * 10)
        gpu_option(gpu_name, gpu_num)
        self.seq_max_len = seq_max_len  # same to train
        self.batch_size = batch_size
        model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
        vocab_path = os.path.join(model_path, 'vocab.txt')
        # load Tokenizer
        token_dict = load_vocabulary(vocab_path)
        self.tokenizer = Tokenizer(token_dict)
        MODEL_SAVE_PATH = 'models/BERT/fine_tune_model/bert_fine_tune.hdf5'
        model = load_model(MODEL_SAVE_PATH,
                           custom_objects=get_custom_objects(),
                           compile=False)
        if gpu_num >= 2:
            self.par_model = multi_gpu_model(model, gpus=gpu_num)
        else:
            self.par_model = model
        print('--' * 10 + ' Load BERT model end ' + '--' * 10)

    def data_generator(self, data):
        steps = len(data) // self.batch_size
        if len(data) % self.batch_size != 0:
            steps += 1
        X1, X2 = [], []
        for i in range(len(data)):
            d = data[i]
            text1 = d[0]
            text2 = d[1]
            x1, x2 = self.tokenizer.encode(first=text1,
                                           second=text2,
                                           max_len=self.seq_max_len)  # 512
            X1.append(x1)
            X2.append(x2)
            if len(X1) == self.batch_size or i == (len(data) - 1):
                yield np.array(X1), np.array(X2)
                X1, X2 = [], []

    def classify(self, texts):
        pred = []
        my_iter = self.data_generator(texts)
        for indices, segments in my_iter:
            p = self.par_model.predict([indices, segments])
            pred += sum(p.tolist(), [])
        return pred
Example 21
def extract(max_len=512):
    '''
    :param max_len: maximum text length
    :return: dict mapping each kb_id to the embedding of its description text
    '''
    model = get_model(max_len)
    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    id_text = pd.read_pickle('data/id_text.pkl')
    id_embedding = {}
    for id in id_text:
        if int(id) % 10000 == 0:
            print(id)
        text = id_text[id]
        indices, segments = tokenizer.encode(first=text, max_len=max_len)
        predicts = model.predict([[indices], [segments]], verbose=2)
        id_embedding[id] = predicts[0]
    pd.to_pickle(id_embedding, 'data/id_embedding.pkl')
Example 22
def get_single_infer_input(ner_result):
    id_type = pd.read_pickle('../data/id_type.pkl')
    type_index = pd.read_pickle('../data/type_index.pkl')
    entity_id = pd.read_pickle('../data/entity_id.pkl')
    id_text = pd.read_pickle('../data/id_text.pkl')

    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)

    temDict = json.loads(ner_result)
    text = temDict['text']
    mention_data = temDict['mention_data']
    for men in mention_data:
        mention = men['mention']
        offset = int(men['offset'])
        begin = offset + 1  # +1 accounts for the leading [CLS] token
        end = begin + len(mention)

        link_id = get_link_entity_test(mention, entity_id)
        men['link_id'] = link_id
        link_data = {
            'ids': [],
            'seg': [],
            'begin': [],
            'end': [],
            'en_type': []
        }
        for id in link_id:

            kb_text = id_text[id]
            kb_type = type_index[id_type[id][0]]
            indice, segment = tokenizer.encode(first=text,
                                               second=kb_text,
                                               max_len=256)
            link_data['ids'].append(indice)
            link_data['seg'].append(segment)
            link_data['begin'].append([begin])
            link_data['end'].append([end])
            link_data['en_type'].append([kb_type])
        men['link_data'] = link_data

    return json.dumps(temDict)
Example 23
    def getBERTScore(self, queries_df):

        tweets = queries_df['cleanTweet']

        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        tokenizer = Tokenizer(token_dict)

        indices = []
        for index, line in enumerate(tweets):
            ids, segments = tokenizer.encode(line.strip(), max_len=128)
            indices.append(ids)

        x_test = [indices, np.zeros_like(indices)]
        predictions = self.model.predict(x_test)

        return predictions
Example 24
class KerasBERT:
    def __init__(self, batch_size, gpu_num, gpu_name):
        gpu_option(gpu_name, gpu_num)
        self.batch_size = batch_size
        print("##### load KerasBERT start #####")
        # Path
        model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
        config_path = os.path.join(model_path, 'bert_config.json')
        checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
        vocab_path = os.path.join(model_path, 'vocab.txt')
        token_dict = load_vocabulary(vocab_path)
        model = load_trained_model_from_checkpoint(config_path,
                                                   checkpoint_path)
        if gpu_num >= 2:
            self.par_model = multi_gpu_model(model, gpus=gpu_num)
        else:
            self.par_model = model
        self.tokenizer = Tokenizer(token_dict)
        print("##### load KerasBERT end #####")

    def bert_encode(self, texts):
        predicts = []

        def create_array():
            data = []
            for text in texts:
                indices, segments = self.tokenizer.encode(first=text,
                                                          max_len=512)
                data.append([indices, segments])
            return data

        array = create_array()
        my_iter = data_iter(array, batch_size=self.batch_size)
        for w1, w2 in my_iter:
            m_indices = np.array(w1)
            m_segments = np.array(w2)
            predict = self.par_model.predict([m_indices, m_segments])
            batch_predict = predict[:, 0].tolist()  # take the first token's ([CLS]) encoding for each sentence
            predicts += batch_predict
        return predicts
Example 25
def prepare_data(data, is_test):
    token_dict = {}
    with codecs.open(DICT_PATH, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    indices = []
    segments = []
    results = []
    i = 0
    for row in data:
        text1 = clean_text(row[0])
        text2 = clean_text(row[1])
        # In the training set, the max length is 201. I don't know yet what the max token count is in the test set.
        # Still, pretrained BERT has width of 512, so that's what we will use.
        row_indices, row_segments = tokenizer.encode(first=text1,
                                                     second=text2,
                                                     max_len=512)
        indices.append(row_indices)
        segments.append(row_segments)
        #print(tokenizer.tokenize(text1))
        #print(tokenizer.tokenize(text2))
        #print(row_indices)
        #print(row_segments)
        if not is_test:
            results.append(row[2])
        if i % 100 == 0:
            print("i=", i)
        i += 1
    print("Num rows processed: ", i)
    if is_test:
        return np.array(indices), np.array(segments)
    else:
        return np.array(indices), np.array(segments), np.array(results,
                                                               dtype="float32")
Example 26
print('Tokens:', tokens)

indices = np.array([[token_dict[token]
                     for token in tokens] + [0] * (512 - len(tokens))])
segments = np.array([[0] * len(tokens) + [0] * (512 - len(tokens))])
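# MLM mask input: 1 marks the positions to predict (here tokens 1 and 2), 0 leaves them untouched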
masks = np.array([[0, 1, 1] + [0] * (512 - 3)])

predicts = model.predict([indices, segments,
                          masks])[0].argmax(axis=-1).tolist()
print('Fill with: ', list(map(lambda x: token_dict_inv[x], predicts[0][1:3])))

sentence_1 = '数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科。'
sentence_2 = '从某种角度看屬於形式科學的一種。'
print('Tokens:', tokenizer.tokenize(first=sentence_1, second=sentence_2))
indices, segments = tokenizer.encode(first=sentence_1,
                                     second=sentence_2,
                                     max_len=512)
masks = np.array([[0] * 512])

predicts = model.predict([np.array([indices]), np.array([segments]), masks])[1]
print('%s is random next: ' % sentence_2,
      bool(np.argmax(predicts, axis=-1)[0]))

sentence_2 = '任何一个希尔伯特空间都有一族标准正交基。'
print('Tokens:', tokenizer.tokenize(first=sentence_1, second=sentence_2))
indices, segments = tokenizer.encode(first=sentence_1,
                                     second=sentence_2,
                                     max_len=512)

predicts = model.predict([np.array([indices]), np.array([segments]), masks])[1]
print('%s is random next: ' % sentence_2,
      bool(np.argmax(predicts, axis=-1)[0]))
Example 27
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_model

bert_model = load_trained_model_from_checkpoint(config_path,
                                                checkpoint_path,
                                                seq_len=None)
for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None, ))
x2_in = Input(shape=(None, ))

x = bert_model([x1_in, x2_in])

# Tokenization
from keras_bert import Tokenizer

tokenizer = Tokenizer(token_dict)
# text = '语言模型 chinese is great'
# text='商品名称及规格型号'
# text='境外收货人\nDERCOCHILEREPUESTOSS.A.'
# text='合同协议号\n2019CICSA473-A'
text = '运抵国(地区)\n智利'
tokens = tokenizer.tokenize(text)
# e.g. tokenize('语言模型') would give ['[CLS]', '语', '言', '模', '型', '[SEP]']
print('tokens', tokens)
indices, segments = tokenizer.encode(first=text, max_len=512)
print(indices[:10])
Example 28
class Embeddings(object):
    def __init__(self,
                 name,
                 path='./embedding-registry.json',
                 lang='en',
                 extension='vec',
                 use_ELMo=False,
                 use_BERT=False,
                 use_cache=True,
                 load=True):
        self.name = name
        self.embed_size = 0
        self.static_embed_size = 0
        self.vocab_size = 0
        self.model = {}
        self.registry = self._load_embedding_registry(path)
        self.lang = lang
        self.extension = extension
        self.embedding_lmdb_path = None
        if self.registry is not None:
            self.embedding_lmdb_path = self.registry["embedding-lmdb-path"]
        self.env = None
        if load:
            self.make_embeddings_simple(name)
        self.static_embed_size = self.embed_size
        self.bilm = None

        self.use_cache = use_cache
        # below init for using ELMo embeddings
        self.use_ELMo = use_ELMo
        if use_ELMo:
            self.make_ELMo()
            self.embed_size = ELMo_embed_size + self.embed_size
            description = self.get_description('elmo-' + self.lang)
            self.env_ELMo = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_ELMo_cache = os.path.join(
                    description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_ELMo_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                          map_size=map_size)

        # below init for using BERT embeddings (extracted features only, not fine tuning),
        # similar to ELMo for this usage
        self.use_BERT = use_BERT
        if use_BERT:
            # to avoid issue with tf graph and thread, we maintain in the class its own graph and session
            #self.session = tf.Session()
            self.graph = tf.get_default_graph()
            #self.session.run(tf.global_variables_initializer())
            self.make_BERT()
            self.embed_size = BERT_embed_size + self.embed_size
            description = self.get_description('bert-base-' + self.lang)
            self.env_BERT = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_BERT_cache = os.path.join(
                    description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_BERT_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_BERT = lmdb.open(self.embedding_BERT_cache,
                                          map_size=map_size)

    def __getattr__(self, name):
        return getattr(self.model, name)

    def _load_embedding_registry(self, path='./embedding-registry.json'):
        """
        Load the description of available embeddings. Each description provides a name,
        a file path (used only if necessary) and an embeddings type (to take into account
        small variations of format).
        """
        registry_json = open(path).read()
        return json.loads(registry_json)

    def make_embeddings_simple_in_memory(self, name="fasttext-crawl"):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self.get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = fastText.load_model(embeddings_path)
                nbWords = len(self.model.get_words())
                self.embed_size = self.model.get_dimension()
            else:
                with open(embeddings_path, encoding='utf8') as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            begin = False
                            nb_words, embed_size = _fetch_header_if_available(
                                line)

                            # we parse the header
                            if nb_words > 0 and embed_size > 0:
                                nbWords = nb_words
                                self.embed_size = embed_size
                                continue

                        word = line[0]
                        vector = np.array(
                            [float(val) for val in line[1:len(line)]],
                            dtype='float32')
                        #else:
                        #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
            print('embeddings loaded for', nbWords, "words and",
                  self.embed_size, "dimensions")

    def make_embeddings_lmdb(self, name="fasttext-crawl"):
        print(
            '\nCompiling embeddings... (this is done only one time per embeddings at first usage)'
        )
        description = self.get_description(name)

        if description is None:
            print(
                '\nNo description found in embeddings registry for embeddings',
                name)
            return

        if description is not None:
            # the following method will possibly download the embedding file if not available locally
            embeddings_path = self.get_embedding_path(description)
            if embeddings_path is None:
                print('\nCould not locate a usable resource for embeddings',
                      name)
                return

            self.load_embeddings_from_file(embeddings_path)

        # cleaning possible downloaded embeddings
        self.clean_downloads()

    def load_embeddings_from_file(self, embeddings_path):
        begin = True
        nbWords = 0
        txn = self.env.begin(write=True)
        # batch_size = 1024
        i = 0
        nb_lines = 0

        # read number of lines first
        embedding_file = open_embedding_file(embeddings_path)
        if embedding_file is None:
            print("Error: could not open embeddings file", embeddings_path)
            return

        for line in embedding_file:
            nb_lines += 1
        embedding_file.close()

        embedding_file = open_embedding_file(embeddings_path)
        #with open(embeddings_path, encoding='utf8') as f:
        for line in tqdm(embedding_file, total=nb_lines):
            line = line.decode()
            line = line.split(' ')
            if begin:
                begin = False
                nb_words, embed_size = _fetch_header_if_available(line)

                if nb_words > 0 and embed_size > 0:
                    nbWords = nb_words
                    self.embed_size = embed_size
                    continue

            word = line[0]
            try:
                if line[len(line) - 1] == '\n':
                    vector = np.array(
                        [float(val) for val in line[1:len(line) - 1]],
                        dtype='float32')
                else:
                    vector = np.array(
                        [float(val) for val in line[1:len(line)]],
                        dtype='float32')

                #vector = np.array([float(val) for val in line[1:len(line)]], dtype='float32')
            except ValueError:
                print(len(line))
                print(line[1:len(line)])
                continue  # skip malformed lines instead of falling through with a stale vector
            #else:
            #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
            if self.embed_size == 0:
                self.embed_size = len(vector)

            if len(word.encode(encoding='UTF-8')) < self.env.max_key_size():
                txn.put(word.encode(encoding='UTF-8'),
                        _serialize_pickle(vector))
                #txn.put(word.encode(encoding='UTF-8'), _serialize_byteio(vector))
                i += 1

            # commit batch
            # if i % batch_size == 0:
            #     txn.commit()
            #     txn = self.env.begin(write=True)

        embedding_file.close()

        #if i % batch_size != 0:
        txn.commit()
        if nbWords == 0:
            nbWords = i
        self.vocab_size = nbWords
        print('embeddings loaded for', nbWords, "words and", self.embed_size,
              "dimensions")

    def clean_downloads(self):
        # cleaning possible downloaded embeddings
        for filename in os.listdir(self.registry['embedding-download-path']):
            file_path = os.path.join(self.registry['embedding-download-path'],
                                     filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

    def make_embeddings_simple(self, name="fasttext-crawl"):
        description = self.get_description(name)
        if description is not None:
            self.extension = description["format"]

        if self.extension == "bin":
            if fasttext_support:
                print(
                    "embeddings are of .bin format, so they will be loaded in memory..."
                )
                self.make_embeddings_simple_in_memory(name)
            else:
                if not (sys.platform == 'linux' or sys.platform == 'darwin'):
                    raise ValueError(
                        'FastText .bin format not supported for your platform')
                else:
                    raise ValueError(
                        'Go to the documentation to get more information on how to install FastText .bin support'
                    )

        elif self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None":
            print(
                "embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory..."
            )
            self.make_embeddings_simple_in_memory(name)
        else:
            # if the path to the lmdb database files does not exist, we create it
            if not os.path.isdir(self.embedding_lmdb_path):
                # conservative check (likely very useless)
                if not os.path.exists(self.embedding_lmdb_path):
                    os.makedirs(self.embedding_lmdb_path)

            # check if the lmdb database exists
            envFilePath = os.path.join(self.embedding_lmdb_path, name)
            load_db = True
            if os.path.isdir(envFilePath):
                description = self.get_description(name)
                if description is not None:
                    self.lang = description["lang"]

                # open the database in read mode
                self.env = lmdb.open(envFilePath,
                                     readonly=True,
                                     max_readers=2048,
                                     max_spare_txns=4)
                if self.env:
                    # we need to set self.embed_size and self.vocab_size
                    with self.env.begin() as txn:
                        stats = txn.stat()
                        size = stats['entries']
                        self.vocab_size = size

                    with self.env.begin() as txn:
                        cursor = txn.cursor()
                        for key, value in cursor:
                            vector = _deserialize_pickle(value)
                            self.embed_size = vector.shape[0]
                            break
                        cursor.close()

                    if self.vocab_size > 100 and self.embed_size > 10:
                        # lmdb database exists and looks valid
                        load_db = False

                        # no idea why, but we need to close and reopen the environment to avoid
                        # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
                        # when opening new transaction !
                        self.env.close()
                        self.env = lmdb.open(envFilePath,
                                             readonly=True,
                                             max_readers=2048,
                                             max_spare_txns=2)

            if load_db:
                # create and load the database in write mode
                self.env = lmdb.open(envFilePath, map_size=map_size)
                self.make_embeddings_lmdb(name)

    def make_ELMo(self):
        # Location of pretrained BiLM for the specified language
        # TBD check if ELMo language resources are present
        description = self.get_description('elmo-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            vocab_file = description["path-vocab"]
            options_file = description["path-config"]
            weight_file = description["path_weights"]

            print('init ELMo')

            # Create a Batcher to map text to character ids
            self.batcher = Batcher(vocab_file, 50)

            # Build the biLM graph.
            self.bilm = BidirectionalLanguageModel(self.lang, options_file,
                                                   weight_file)

            # Input placeholders to the biLM.
            self.character_ids = tf.placeholder('int32',
                                                shape=(None, None, 50))

            with tf.variable_scope(self.lang, reuse=tf.AUTO_REUSE):
                # the reuse=True scope reuses weights from the whole context
                self.embeddings_op = self.bilm(self.character_ids)
                self.elmo_input = weight_layers('input',
                                                self.embeddings_op,
                                                l2_coef=0.0)

    def make_BERT(self):
        # Location of BERT model
        description = self.get_description('bert-base-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            config_file = description["path-config"]
            weight_file = description["path-weights"]
            vocab_file = description["path-vocab"]

            print('init BERT')

            # load the pretrained model
            with self.graph.as_default():
                # there are different typical pooling strategies for getting BERT features:
                # - concatenation of 4 last layers (the one from the original BERT paper, BERT_embed_size is then 3072)
                # - last layer (BERT_embed_size is 768)
                # - average of 4 last layers (BERT_embed_size is 768)
                # - sum of the 4 last layers (BERT_embed_size is 768)
                self.bert_model = load_trained_model_from_checkpoint(
                    config_file, weight_file, output_layer_num=4)
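                # output_layer_num=4 concatenates the last four transformer layers,
                # matching the 3072-dim pooling strategy described above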
                self.bert_model.summary(line_length=120)
                self.bert_model._make_predict_function()

            # init the tokenizer
            token_dict = {}
            with codecs.open(vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    token_dict[token] = len(token_dict)
            print('token_dict size:', len(token_dict))
            self.bert_tokenizer = Tokenizer(token_dict, cased=True)

    def get_sentence_vector_only_ELMo(self, token_list):
        """
            Return the ELMo embeddings only for a full sentence
        """

        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return

        # Create batches of data
        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]
        # check lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is not None:
            return elmo_result

        with tf.Session() as sess:
            # weird, for this cpu is faster than gpu (1080Ti !)
            with tf.device("/cpu:0"):
                # It is necessary to initialize variables once before running inference
                sess.run(tf.global_variables_initializer())

                # Compute ELMo representations (2 times as a heavy warm-up)
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                #cache computation
                self.cache_ELMo_lmdb_vector(token_list, elmo_result)
        return elmo_result

    def get_sentence_vector_with_ELMo(self, token_list):
        """
            Return a concatenation of standard embeddings (e.g. Glove) and ELMo embeddings 
            for a full sentence
        """
        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return

        #print("\ntoken_list:", token_list)
        local_token_ids = self.batcher.batch_sentences(token_list)
        #print("local_token_ids:", local_token_ids)
        max_size_sentence = local_token_ids[0].shape[0]

        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is None:
            with tf.Session() as sess:
                # weird, for this cpu is faster than gpu (1080Ti !)
                with tf.device("/cpu:0"):
                    # It is necessary to initialize variables once before running inference
                    sess.run(tf.global_variables_initializer())

                    # Compute ELMo representations (2 times as a heavy warm-up)
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    #cache computation
                    self.cache_ELMo_lmdb_vector(token_list, elmo_result)

        concatenated_result = np.zeros(
            (len(token_list), max_size_sentence - 2, self.embed_size),
            dtype=np.float32)
        #concatenated_result = np.random.rand(elmo_result.shape[0], max_size_sentence-2, self.embed_size)
        for i in range(0, len(token_list)):
            for j in range(0, len(token_list[i])):
                #if is_int(token_list[i][j]) or is_float(token_list[i][j]):
                #dummy_result = np.zeros((elmo_result.shape[2]), dtype=np.float32)
                #concatenated_result[i][j] = np.concatenate((dummy_result, self.get_word_vector(token_list[i][j])), )
                #else:
                concatenated_result[i][j] = np.concatenate(
                    (elmo_result[i][j], self.get_word_vector(
                        token_list[i][j]).astype('float32')), )
                #concatenated_result[i][j] = np.concatenate((self.get_word_vector(token_list[i][j]), elmo_result[i][j]), )
        return concatenated_result

    def get_sentence_vector_only_BERT(self, token_list):
        """
            Return the BERT extracted embeddings only for a full sentence
        """
        if not self.use_BERT:
            print(
                "Warning: BERT embeddings requested but embeddings object wrongly initialised"
            )
            return

        #print("local_token_ids:", local_token_ids)
        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        # retokenize with BERT tokenizer
        max_size = BERT_sentence_size
        max_size_sentence = 0
        new_token_list = []
        bert_results = np.zeros((len(token_list), max_size, BERT_embed_size),
                                dtype=np.float32)
        for i, sentence in enumerate(token_list):
            local_text = " ".join(sentence)
            local_tokens = self.bert_tokenizer.tokenize(local_text)

            bert_result = self.get_BERT_lmdb_vector(sentence)
            if bert_result is None:
                indices, segments = self.bert_tokenizer.encode(
                    local_text, max_len=max_size)
                with self.graph.as_default():
                    bert_result = self.bert_model.predict(
                        [np.array([indices]),
                         np.array([segments])])[0]
                    #cache computation
                    if bert_result is not None:
                        self.cache_BERT_lmdb_vector(sentence, bert_result)

            # Realign BERT tokenization with the provided tokenization. Normally BERT segmenter always
            # over-segment as compared to DeLFT segmenter.
            # There are two obvious possibilities to combine subtoken embeddings into token embeddings,
            # either take the embeddings of the last subtoken, of use the average vector of the subtokens.
            new_bert_result = np.zeros((max_size, BERT_embed_size),
                                       dtype=np.float32)
            token_tensor = []
            tid = 0
            buffer = ''
            #print(sentence)
            #print(local_tokens)
            for j, t in enumerate(local_tokens):
                if j >= max_size:
                    break
                if t == '[CLS]' or t == '[SEP]':
                    continue
                else:
                    if t.startswith('##'):
                        t = t[2:]
                    buffer += t
                    #print(buffer)
                    token_tensor.append(bert_result[j])
                    if buffer == sentence[tid]:
                        # average vector of the subtokens
                        new_bert_result[tid] = np.stack(token_tensor).mean(
                            axis=0)
                        # or last subtoken vector
                        #new_bert_result[tid] = token_tensor[-1]
                        token_tensor = []
                        buffer = ''
                        tid += 1
            bert_result = new_bert_result

            if bert_result is not None:
                bert_results[i] = bert_result

        # we need to squeeze the vectors to max_size_token_list
        squeezed_bert_results = np.zeros(
            (len(token_list), max_size_token_list, BERT_embed_size),
            dtype=np.float32)
        for i, sentence in enumerate(token_list):
            squeezed_bert_results[i] = bert_results[i][:max_size_token_list]

        return squeezed_bert_results

    def get_sentence_vector_with_BERT(self, token_list):
        """
            Return a concatenation of standard embeddings (e.g. Glove) and BERT extracted embeddings  
            for a full sentence
        """
        if not self.use_BERT:
            print(
                "Warning: BERT embeddings requested but embeddings object wrongly initialised"
            )
            return

        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        squeezed_bert_results = self.get_sentence_vector_only_BERT(token_list)

        concatenated_squeezed_result = np.zeros(
            (len(token_list), max_size_token_list, self.embed_size),
            dtype=np.float32)
        for i, sentence in enumerate(token_list):
            for j in range(0, len(token_list[i])):
                concatenated_squeezed_result[i][j] = np.concatenate(
                    (squeezed_bert_results[i][j],
                     self.get_word_vector(
                         token_list[i][j]).astype('float32')), )

        return concatenated_squeezed_result

    def get_description(self, name):
        for emb in self.registry["embeddings"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["embeddings-contextualized"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["transformers"]:
            if emb["name"] == name:
                return emb
        return None

    def get_word_vector(self, word):
        """
            Get static embeddings (e.g. glove) for a given token
        """
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.env is None or self.extension == 'bin':
            # db not available or embeddings in bin format, the embeddings should be available in memory (normally!)
            return self.get_word_vector_in_memory(word)
        try:
            with self.env.begin() as txn:
                vector = txn.get(word.encode(encoding='UTF-8'))
                if vector:
                    word_vector = _deserialize_pickle(vector)
                    vector = None
                else:
                    word_vector = np.zeros((self.static_embed_size, ),
                                           dtype=np.float32)
                    # alternatively, initialize with random negative values
                    #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
                    # alternatively use fasttext OOV ngram possibilities (if ngram available)
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env.close()
            envFilePath = os.path.join(self.embedding_lmdb_path, self.name)
            self.env = lmdb.open(envFilePath,
                                 readonly=True,
                                 max_readers=2048,
                                 max_spare_txns=2,
                                 lock=False)
            return self.get_word_vector(word)
        return word_vector

    def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
            Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros(
                (len(token_list), max_size_sentence - 2, ELMo_embed_size),
                dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence - 2:
                            # squeeze the extra padding space
                            ELMo_vector[
                                i] = local_embeddings[:max_size_sentence - 2, ]
                        elif local_embeddings.shape[
                                0] == max_size_sentence - 2:
                            # bingo~!
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros((max_size_sentence -
                                               (local_embeddings.shape[0] + 2),
                                               ELMo_embed_size),
                                              dtype='float32')
                            ELMo_vector[i] = np.concatenate(
                                (local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # close and reopen the environment to clear a stale reader slot
            # (mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot),
            # then retry the lookup
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                      readonly=True,
                                      max_readers=2048,
                                      max_spare_txns=2,
                                      lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector

    def get_BERT_lmdb_vector(self, sentence):
        """
            Try to get the BERT extracted embeddings for a sequence cached in LMDB
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        try:
            BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size),
                                   dtype='float32')
            with self.env_BERT.begin() as txn:
                # get a hash for the sentence
                the_hash = list_digest(sentence)
                vector = txn.get(the_hash.encode(encoding='UTF-8'))

                if vector:
                    # adapt expected shape/padding
                    BERT_vector = _deserialize_pickle(vector)
                    '''
                    if local_embeddings.shape[0] > max_size_sentence:
                        # squeeze the extra padding space
                        BERT_vector = local_embeddings[:max_size_sentence,]
                    elif local_embeddings.shape[0] == max_size_sentence:
                        # bingo~!
                        BERT_vector = local_embeddings
                    else:
                        # fill the missing space with padding
                        filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32')
                        BERT_vector = np.concatenate((local_embeddings, filler))
                    '''
                    vector = None
                else:
                    return None

        except lmdb.Error:
            # close and reopen the environment to clear a stale reader slot
            # (mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot),
            # then retry the lookup
            self.env_BERT.close()
            self.env_BERT = lmdb.open(self.embedding_BERT_cache,
                                      readonly=True,
                                      max_readers=2048,
                                      max_spare_txns=2,
                                      lock=False)
            return self.get_BERT_lmdb_vector(sentence)
        return BERT_vector

    def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
            Cache in LMDB the ELMo embeddings for a given sequence 
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(len(token_list)):
            # get a hash for this token sequence
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'),
                    _serialize_pickle(ELMo_vector[i]))
        txn.commit()

    def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
        """
            Cache in LMDB the BERT embeddings for a given sequence 
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        txn = self.env_BERT.begin(write=True)
        # get a hash for the sentence
        the_hash = list_digest(sentence)
        txn.put(the_hash.encode(encoding='UTF-8'),
                _serialize_pickle(BERT_vector))
        txn.commit()

    def clean_ELMo_cache(self):
        """
            Delete ELMo embeddings cache, this takes place normally after the completion of a training
        """
        if self.env_ELMo is None:
            # db cache not available, nothing to clean
            return
        else:
            self.env_ELMo.close()
            self.env_ELMo = None
            for file in os.listdir(self.embedding_ELMo_cache):
                file_path = os.path.join(self.embedding_ELMo_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_ELMo_cache)

    def clean_BERT_cache(self):
        """
            Delete BERT embeddings cache, this takes place normally after the completion of a training
        """
        # if cache subdirectory does not exist, we create it
        if not os.path.exists(self.embedding_BERT_cache):
            os.makedirs(self.embedding_BERT_cache)
            return

        if self.env_BERT is None:
            # db cache not available, nothing to clean
            return
        else:
            self.env_BERT.close()
            self.env_BERT = None
            for file in os.listdir(self.embedding_BERT_cache):
                file_path = os.path.join(self.embedding_BERT_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_BERT_cache)

    def get_word_vector_in_memory(self, word):
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.extension == 'bin':
            return self.model.get_word_vector(word)
        if word in self.model:
            return self.model[word]
        else:
            # for an unknown word, return a zero vector
            return np.zeros((self.static_embed_size, ), dtype=np.float32)
            # alternatively, initialize with random negative values
            #return np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
            # alternatively use fasttext OOV ngram possibilities (if ngram available)

    def get_embedding_path(self, description):
        embeddings_path = None
        if "path" in description:
            embeddings_path = description["path"]
        self.lang = description["lang"]

        if embeddings_path is None or not os.path.isfile(embeddings_path):
            print("error: embedding path for", description['name'],
                  "is not valid", embeddings_path)
            if "url" in description and len(description["url"]) > 0:
                url = description["url"]
                download_path = self.registry['embedding-download-path']
                # if the download path does not exist, we create it
                if not os.path.isdir(download_path):
                    try:
                        os.mkdir(download_path)
                    except OSError:
                        print("Creation of the download directory",
                              download_path, "failed")

                print("Downloading resource file for", description['name'],
                      "...")
                embeddings_path = download_file(url, download_path)
                if embeddings_path is not None and os.path.isfile(embeddings_path):
                    print("Download successful:", embeddings_path)
            else:
                print(
                    "no download URL available for this embedding resource; please review the embedding registry entry for",
                    description['name'])
        return embeddings_path
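
The ELMo and BERT methods above implement a read-through cache: look the batch up in LMDB first, compute embeddings only on a miss, then store the result for next time. A minimal sketch of that pattern, assuming an embeddings instance of the class above and a hypothetical compute_elmo(token_list, max_size) function standing in for the actual ELMo forward pass:

def get_or_compute_elmo(embeddings, token_list, max_size_sentence):
    # try the LMDB cache first; it returns None on a miss or when no cache is configured
    vecs = embeddings.get_ELMo_lmdb_vector(token_list, max_size_sentence)
    if vecs is None:
        vecs = compute_elmo(token_list, max_size_sentence)  # hypothetical model call
        if embeddings.env_ELMo is not None:
            embeddings.cache_ELMo_lmdb_vector(token_list, vecs)
    return vecs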
Example no. 29
import sys
import codecs

import numpy as np
from keras_bert import Tokenizer, load_trained_model_from_checkpoint

config_path, checkpoint_path, dict_path = tuple(sys.argv[1:])

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
model.summary(line_length=120)

token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=512)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])
"""Official outputs:
{
  "linex_index": 0,
  "features": [
    {
      "token": "[CLS]",
      "layers": [
        {
          "index": -1,
          "values": [-0.63251, 0.203023, 0.079366, -0.032843, 0.566809, ...]
        }
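
Each row of predicts corresponds to one position of the padded input, and row 0 is the [CLS] token. A small sketch, under the same setup as above, of pooling that row as a crude sentence-level feature vector:

cls_vector = predicts[0]  # shape: (hidden_size,)
print('CLS vector dims:', cls_vector.shape)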
Example no. 30
import os
import random
import logging

import numpy as np
import tensorflow as tf
from keras_bert import Tokenizer, load_trained_model_from_checkpoint, load_vocabulary
from tensorflow.keras.preprocessing.sequence import pad_sequences
# TextFeaturizer comes from the surrounding project; its import path is not shown in the original


class Punc_DataLoader():
    def __init__(self, config, training=True):
        self.train = training
        self.init_all(config)

        self.vocab_featurizer = TextFeaturizer(config['punc_vocab'])
        self.bd_featurizer = TextFeaturizer(config['punc_biaodian'])
        self.bd = self.bd_featurizer.vocab_array
        self.batch = config['running_config']['batch_size']
        self.epochs = 1

    def init_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config,
                                                   checkpoint,
                                                   trainable=False,
                                                   seq_len=None)
        return model

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'saved train list does not match the newly loaded train list, '
                    'falling back to the initial state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using the initial state')
        except Exception:
            logging.info('loading state failed, using the initial state')

    def save_state(self, outdir):

        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):

        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):

        return (tf.TensorShape([None, None]), tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        # get_sentence stores the training split in self.train_list
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_all(self, config):
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(
            config['train_list'] if self.train else config['eval_list'],
            training=self.train)

    def get_sentence(self, data_path, training):
        from tqdm import tqdm

        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()

        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            num = len(txts)
            train = txts[:int(num * 0.99)]
            test = txts[int(num * 0.99):]
            self.train_list, self.test_list = train, test
            self.train_offset = 0
            self.test_offset = 0
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, txts):
        x = []
        for txt in txts:
            x_ = [self.vocab_featurizer.startid()]
            for i in txt:
                x_.append(self.vocab_featurizer.token_to_index[i])
            x_.append(self.vocab_featurizer.endid())
            x.append(np.array(x_))
        return x

    def bert_decode(self, x, x2=None):
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                if len(index) > 0:
                    for n in index:
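                        # 103 is the [MASK] token id in the standard BERT vocabularies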
                        t[int(n)] = 103
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs

    def pad(self, x, mode=1):
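        # mode 2 pads 2-D feature arrays with rows of -10, mode 3 with rows of zeros;
        # any other mode pads 1-D token id sequences with pad_sequences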
        length = 0

        for i in x:
            length = max(length, len(i))
        if mode == 2:
            for i in range(len(x)):
                padding = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], padding))
        elif mode == 3:
            for i in range(len(x)):
                padding = np.zeros([length - len(x[i]), x[i].shape[1]])
                x[i] = np.vstack((x[i], padding))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):

        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])

        for idx, l in enumerate(length):
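            # mask features at padded positions with -10, matching pad(mode=2)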
            features[idx, l:] = -10.

        return features

    def get_target(self, text):
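        # split the text into plain characters (zh_txt) and, for each position,
        # the indices of the punctuation marks that followed it (bd_txt)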

        bd = self.bd
        zh = []
        bd_ = [[0]]
        for n in text:
            if n in bd:
                bd_[-1].append(bd.index(n))
            else:
                zh.append(n)
                bd_.append([0])
        zh_txt = ''.join(zh)
        bd_txt = bd_ + [[0]]
        return zh_txt, bd_txt

    def process_punc(self, puncs):
        x = []
        for punc in puncs:
            x_ = []
            for i in range(len(punc)):
                if len(punc[i]) == 1:
                    x_ += [1]
                else:
                    x_ += punc[i][-1:]
            x.append(np.array(x_, 'int32'))
        return x

    def check_valid(self, txt, vocab_list):
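        # returns True if every character is in vocab_list, False for empty text,
        # and the first out-of-vocabulary character otherwise (callers test "is not True")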
        if len(txt) == 0:
            return False
        for n in txt:
            if n in vocab_list:
                pass
            else:
                return n
        return True

    def generate(self, train):

        trainx = []
        trainy = []
        for i in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0

            line = line.strip()
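            # extend very short lines with a random extra training sample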
            if len(line) < 30:
                extra = random.sample(self.train_list, 1)[0]
                extra = extra.strip()
                line += extra

            if self.check_valid(line, self.vocab_featurizer.vocab_array +
                                self.bd) is not True:
                continue
            try:
                x, y = self.get_target(line)
            except Exception:
                continue
            trainx.append(x)
            trainy.append(y)
            if len(trainx) == self.batch:
                break

        inp_tokens = self.preprocess(trainx)
        e_bert_t, e_bert_s = self.bert_decode(trainx)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        trainy = self.process_punc(trainy)
        inp_tokens = self.pad(inp_tokens)
        trainy = self.pad(trainy)
        e_features = self.pad(e_features, 2)
        inp_tokens = np.array(inp_tokens)
        trainy = np.array(trainy)
        e_features = np.array(e_features, dtype='float32')

        return inp_tokens, trainy, e_features

    def generator(self, train=True):
        while True:
            x, y, features = self.generate(train)
            if x.shape[1] != y.shape[1] or y.shape[1] != features.shape[1]:
                logging.info('bad batch, skipping')
                continue
            yield x, y, features
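
Since return_data_types and return_data_shape mirror exactly what generator yields, the loader can be wrapped in a tf.data pipeline directly. A minimal sketch, assuming a config dict with the keys read in init_all (tf as imported above):

loader = Punc_DataLoader(config, training=True)
dataset = tf.data.Dataset.from_generator(
    lambda: loader.generator(train=True),
    output_types=loader.return_data_types(),
    output_shapes=loader.return_data_shape(),
).prefetch(tf.data.AUTOTUNE)
steps_per_epoch = loader.get_per_epoch_steps()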