Example #1
def build_trained_model(args):
    if args.device_map != "cpu":
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_map
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    token_dict = {}
    with codecs.open(args.bert_vocab, "r", encoding="utf-8") as f:
        for line in f:
            token = line.strip()
            token_dict[token] = len(token_dict)

    tokenizer = Tokenizer(token_dict)

    with codecs.open(os.path.join(args.model_path, "tag2id.pkl"), "rb") as f:
        tag2id = pickle.load(f)
    with codecs.open(os.path.join(args.model_path, "id2tag.pkl"), "rb") as f:
        id2tag = pickle.load(f)

    mask_tag = "X"
    crf_loss = CRF_Loss(tag2id=tag2id, mask_tag=mask_tag).crf_loss
    crf_accuracy = CRF_Accuracy(tag2id=tag2id, mask_tag=mask_tag).crf_accuracy

    custom_objects["CRF"] = CRF
    custom_objects["crf_loss"] = crf_loss
    custom_objects["crf_accuracy"] = crf_accuracy

    model = load_model(os.path.join(args.model_path, args.model_name),
                       custom_objects=custom_objects)

    viterbi_decoder = Viterbi(model, len(id2tag))

    return tokenizer, id2tag, viterbi_decoder
Example #2
def process_data(data_file='./data/classify_data.txt'):
    with open(data_file, encoding='utf-8') as f:
        datas = f.readlines()

    chars = set()
    labels = set()
    new_datas = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        text, label = data.rsplit(maxsplit=1)
        chars.update(set(text))
        labels.add(label)
        new_datas.append([text, label])
    del datas

    label2id = {lab: i for i, lab in enumerate(list(labels))}

    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])


    tokenizer = Tokenizer(token_dict)  # build the tokenizer

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)

    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(label2id, f)

    if not os.path.exists('./random_order.json'):
        random_order = list(range(len(new_datas)))
        random.shuffle(random_order)
        with open('./random_order.json', 'w') as f:
            json.dump(random_order, f, indent=4)
    else:
        with open('./random_order.json') as f:
            random_order = json.load(f)

    # split into training and validation sets at a 9:1 ratio
    train_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 == 0]

    return train_data, valid_data, tokenizer, keep_words, label2id
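# Added usage sketch, not part of the original example: the keep_words list
# saved above is typically passed back to the pretrained-model loader so the
# embedding matrix only keeps the reduced vocabulary. The keep_words argument
# (and the config_path/checkpoint_path globals) are assumptions about the
# surrounding project and the bert4keras version in use.
train_data, valid_data, tokenizer, keep_words, label2id = process_data()
model = load_pretrained_model(config_path, checkpoint_path,
                              keep_words=keep_words)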
Example #3
class Processor(object):
    def __init__(self, train_path, token_dict):
        self.train_path = train_path
        self.tokenizer = Tokenizer(token_dict)

    def get_tags(self):
        tags = set()
        train_data = self.get_data(self.train_path)
        for item in train_data:
            for tag in item[1].split(" "):
                tags.add(tag)
        # use "X" as the padding tag
        self.tag2id = {tag: i for i, tag in enumerate(tags)}
        self.tag2id["X"] = len(self.tag2id)
        self.id2tag = {v: k for k, v in self.tag2id.items()}
        return self.tag2id, self.id2tag

    def get_data(self, path):
        with codecs.open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data

    def get_bert_inputs(self, path, max_len):
        srcs = self.get_data(path)
        src_data, src_tags = [], []
        for item in srcs:
            src_data.append(item[0])
            src_tags.append(item[1])
        tokens, segs, tags = [], [], []
        for item in src_data:
            res = self.tokenizer.encode(item, first_length=max_len)
            tokens.append(np.array(res[0]))
            segs.append(np.array(res[1]))
        max_len -= 2  # reserve room for the [CLS] and [SEP] positions
        for item in src_tags:
            len_item = len(item.split(" "))
            if len_item >= max_len:
                # truncate and mark the [CLS]/[SEP] positions with the pad tag "X"
                tags.append(["X"] + item.split(" ")[:max_len] + ["X"])
            else:
                # pad the tail (and the [SEP] position) with "X"
                tags.append(["X"] + item.split(" ") + ["X"] *
                            (max_len - len_item + 1))
        # map tags to ids, dropping the leading "X", and add a trailing axis
        tags = [[self.tag2id[item] for item in term[1:]] for term in tags]
        tags = np.expand_dims(tags, axis=-1)
        return tokens, segs, tags
Example #4
def load_data(filename):
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D
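# Added note: load_data expects one example per line in the form
# "text1<TAB>text2<TAB>label", where label is an integer (0/1 for LCQMC).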


# load the datasets
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')

# build the tokenizer
tokenizer = Tokenizer(dict_path)


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
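# Added usage sketch: seq_padding right-pads a batch of variable-length id
# sequences to the length of the longest one, e.g.
#   seq_padding([[1, 2, 3], [4, 5]])
#   -> array([[1, 2, 3],
#             [4, 5, 0]])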


class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
Example #5
def read_texts():
    txts = glob.glob('../../thuctc/THUCNews/*/*.txt')
    np.random.shuffle(txts)
    for txt in txts:
        d = open(txt, encoding='utf-8').read()
        d = d.replace(u'\u3000', ' ')
        d = d.split('\n')
        if len(d) > 1:
            title = d[0].strip()
            content = '\n'.join(d[1:]).strip()
            if len(title) <= max_output_len:
                yield content[:max_input_len], title


_token_dict = load_vocab(dict_path)  # load the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(seq2seq_config):

    tokens = json.load(open(seq2seq_config))

else:

    def _batch_texts():
        texts = []
        for text in read_texts():
            texts.extend(text)
            if len(texts) == 1000:
                yield texts
                texts = []
        if texts:
            yield texts
Example #6
#! -*- coding: utf-8 -*-
# sanity check: feature extraction

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer
from keras.models import load_model
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
Example #7
    def get_tokenizer(cls):
        if cls.tokenizer is not None:
            return cls.tokenizer
        else:
            cls.tokenizer = Tokenizer(cls.get_token_dict()[0])
            return cls.tokenizer
Example #8
from bert4keras.bert import load_pretrained_model, set_gelu
from bert4keras.utils import Tokenizer, load_vocab
from bert4keras.train import PiecewiseLinearLearningRate
import pandas as pd

set_gelu('tanh')  # switch the gelu version

maxlen = 100
config_path = '/root/kg/bert/albert_base_zh/bert_config.json'
checkpoint_path = '/root/kg/bert/albert_base_zh/bert_model.ckpt'
dict_path = '/root/kg/bert/albert_base_zh/vocab.txt'

neg = pd.read_excel('datasets/neg.xls', header=None)
pos = pd.read_excel('datasets/pos.xls', header=None)
data, tokens = [], {}

_token_dict = load_vocab(dict_path)  # load the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

for d in neg[0]:
    data.append((d, 0))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

for d in pos[0]:
    data.append((d, 1))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

tokens = {i: j for i, j in tokens.items() if j >= 4}
token_dict, keep_words = {}, []  # keep_words is the sub-vocabulary kept from BERT's original vocab

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
Example #9
#! -*- coding: utf-8 -*-
# sanity check: masked language modeling (MLM)

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
import numpy as np

config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = Tokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path,
                              with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out the two characters of "技术"
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is exactly "技术"
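# Added sketch: besides the argmax above, the top candidate tokens for each
# masked position can be inspected; only the probas/tokenizer objects defined
# in this example are reused.
for pos in (3, 4):
    top5 = probas[pos].argsort()[-5:][::-1]
    print(pos, [tokenizer.decode([i]) for i in top5])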
Example #10
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from keras.models import Model
# The bert4keras import paths below vary across library versions and are an
# assumption for this snippet.
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer


def similarity_count(vec1, vec2, mode='cos'):
    if mode == 'eu':
        return euclidean_distances([vec1, vec2])[0][1]
    if mode == 'cos':
        return cosine_similarity([vec1, vec2])[0][1]


maxlen = 128
config_path = 'albert_tiny_zh_google/albert_config_tiny_g.json'
checkpoint_path = 'albert_tiny_zh_google/albert_model.ckpt'
dict_path = 'albert_tiny_zh_google/vocab.txt'

tokenizer = Tokenizer(dict_path)

# load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)

model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'目前的局势,止暴制乱,刻不容缓')
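# Added usage sketch: with with_pool=True the model returns a pooled sentence
# vector, so the encoded pairs above can be scored with similarity_count.
vec1 = model.predict([np.array([token_ids1]), np.array([segment_ids1])])[0]
vec2 = model.predict([np.array([token_ids2]), np.array([segment_ids2])])[0]
print(similarity_count(vec1, vec2, mode='cos'))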
Example #11
#! -*- coding: utf-8 -*-
# sanity check: feature extraction

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
from keras.models import load_model
import numpy as np


config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = Tokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
Example #12
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from keras.models import Model
# The bert4keras import paths below vary across library versions and are an
# assumption for this snippet.
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer


def similarity_count(vec1, vec2, mode='cos'):
    if mode == 'eu':
        return euclidean_distances([vec1, vec2])[0][1]
    if mode == 'cos':
        return cosine_similarity([vec1, vec2])[0][1]


maxlen = 128
config_path = 'albert_tiny_zh_google/albert_config_tiny_g.json'
checkpoint_path = 'albert_tiny_zh_google/albert_model.ckpt'
dict_path = 'albert_tiny_zh_google/vocab.txt'

tokenizer = Tokenizer(dict_path)

# load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)

model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京逛一逛天安门')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'我想到天安门广场走一走')
Example #13
                # so only the first beam needs to be considered; no need to loop over the rest.
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # pick the new top-k candidates
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == self.token_dict.get("[SEP]"):
                return self.tokenizer.decode(target_ids[best_one])
        # if no end token shows up within max_output_len steps, return the best sequence found so far
        return self.tokenizer.decode(target_ids[np.argmax(target_scores)])
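# Added illustration: a minimal, self-contained numpy version of the top-k
# candidate expansion performed above (toy log-probabilities, beam width 2).
import numpy as np

toy_log_probas = np.log([[0.6, 0.3, 0.1],
                         [0.2, 0.5, 0.3]])             # one row per live beam
beams = [([0], 0.0), ([1], 0.0)]                       # (token ids, cumulative score)
candidates = [(ids + [k], score + toy_log_probas[j][k])
              for j, (ids, score) in enumerate(beams)
              for k in range(toy_log_probas.shape[1])]
topk = sorted(candidates, key=lambda c: c[1])[-2:]     # keep the 2 best beams
print(topk)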


def get_token_dict(token_file):
    with open(token_file, "r", encoding="utf-8") as f:
        token_list = f.readlines()
    token_dict = {word.strip(): id_ for id_, word in enumerate(token_list)}
    return token_dict


if __name__ == "__main__":
    dict_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
    token_dict = get_token_dict(dict_path)
    tokenizer = Tokenizer(token_dict)
    seq_model = trans_infer(tokenizer, token_dict)
    # ans = seq_model.gen_trans(input_.lower(), topk)
    print(seq_model.gen_trans("NLP简直太神奇了".lower(), 2))
Example #14
File: app.py  Project: wp931120/nlpapp
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace with the untrained [unused1] token
            else:
                R.append('[UNK]')  # everything else becomes [UNK]
        return R

dict_path = '/opt/developer/wp/wzcq/roberta_wwm/vocab.txt'
token_dict = get_token_dict(dict_path)
tokenizer = OurTokenizer(token_dict)


trans_dic_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
token_dict_trans = get_token_dict(trans_dic_path)
trans_tokenizer = Tokenizer(token_dict_trans)

@app.route('/')
def hello_world():
    data = {}
    return render_template("ci.html", **data)

@app.route('/mc')
def machine_read():
    return render_template('mc.html')

@app.route('/ci')
def generate_ci():
    return render_template('ci.html')

@app.route('/trans')
Example #15
    def __init__(self, train_path, token_dict):
        self.train_path = train_path
        self.tokenizer = Tokenizer(token_dict)