Python Tokenizer.Tokenizerの例

プログラミング言語: Python

名前空間/パッケージ名: bert4keras.utils

クラス/型: Tokenizer

メソッド/関数: Tokenizer

hotexamples.comのコード掲載数: 10

Python Tokenizer.Tokenizer - 10件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのbert4keras.utils.Tokenizer.Tokenizerの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

Tokenizer(10)

encode(5)

tokenize(2)

decode(1)

よく使われるメソッド

Tokenizer (10)

encode (5)

tokenize (2)

decode (1)

コード例 #1

ファイルを表示

ファイル: predict.py プロジェクト: aiedward/keras-bert-ner-1

def build_trained_model(args):
    if args.device_map != "cpu":
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_map
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    token_dict = {}
    with codecs.open(args.bert_vocab, "r", encoding="utf-8") as f:
        for line in f:
            token = line.strip()
            token_dict[token] = len(token_dict)

    tokenizer = Tokenizer(token_dict)

    with codecs.open(os.path.join(args.model_path, "tag2id.pkl"), "rb") as f:
        tag2id = pickle.load(f)
    with codecs.open(os.path.join(args.model_path, "id2tag.pkl"), "rb") as f:
        id2tag = pickle.load(f)

    mask_tag = "X"
    crf_loss = CRF_Loss(tag2id=tag2id, mask_tag=mask_tag).crf_loss
    crf_accuracy = CRF_Accuracy(tag2id=tag2id, mask_tag=mask_tag).crf_accuracy

    custom_objects["CRF"] = CRF
    custom_objects["crf_loss"] = crf_loss
    custom_objects["crf_accuracy"] = crf_accuracy

    model = load_model(os.path.join(args.model_path, args.model_name),
                       custom_objects=custom_objects)

    viterbi_decoder = Viterbi(model, len(id2tag))

    return tokenizer, id2tag, viterbi_decoder

コード例 #2

ファイルを表示

ファイル: task_classify_albert_20191019.py プロジェクト: gswyhq/bert4keras

def process_data(data_file='./data/classify_data.txt'):
    with open(data_file, encoding='utf-8')as f:
        datas = f.readlines()

    chars = set()
    labels = set()
    new_datas = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        text, label = data.rsplit(maxsplit=1)
        chars.update(set(text))
        labels.add(label)
        new_datas.append([text, label])
    del datas

    label2id = {lab: i for i, lab in enumerate(list(labels))}

    _token_dict = load_vocab(dict_path) # 读取词典
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])


    tokenizer = Tokenizer(token_dict) # 建立分词器

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)

    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(label2id, f)

    if not os.path.exists('./random_order.json'):
        random_order = [i for i in range(len(new_datas))]
        random.shuffle(random_order)
        json.dump(
            random_order,
            open('./random_order.json', 'w'),
            indent=4
        )
    else:
        random_order = json.load(open('./random_order.json'))

    # 按照9:1的比例划分训练集和验证集
    train_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 == 0]

    return train_data, valid_data, tokenizer, keep_words, label2id

コード例 #3

ファイルを表示

def load_data(filename):
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D


# 加载数据集
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')

# 建立分词器
tokenizer = Tokenizer(dict_path)


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size

コード例 #4

ファイルを表示

def read_texts():
    txts = glob.glob('../../thuctc/THUCNews/*/*.txt')
    np.random.shuffle(txts)
    for txt in txts:
        d = open(txt).read()
        d = d.decode('utf-8').replace(u'\u3000', ' ')
        d = d.split('\n')
        if len(d) > 1:
            title = d[0].strip()
            content = '\n'.join(d[1:]).strip()
            if len(title) <= max_output_len:
                yield content[:max_input_len], title


_token_dict = load_vocab(dict_path)  # 读取词典
_tokenizer = Tokenizer(_token_dict)  # 建立临时分词器

if os.path.exists(seq2seq_config):

    tokens = json.load(open(seq2seq_config))

else:

    def _batch_texts():
        texts = []
        for text in read_texts():
            texts.extend(text)
            if len(texts) == 1000:
                yield texts
                texts = []
        if texts:

コード例 #5

ファイルを表示

ファイル: basic_extract_features.py プロジェクト: dailyncepu/bert4keras

#! -*- coding: utf-8 -*-
# 测试代码可用性: 提取特征

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer
from keras.models import load_model
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # 建立分词器
model = load_pretrained_model(config_path, checkpoint_path)  # 建立模型，加载权重

# 编码测试
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
输出：
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636

コード例 #6

ファイルを表示

 def get_tokenizer(cls,):
     if cls.tokenizer is not None:
         return cls.tokenizer
     else:
         cls.tokenizer = Tokenizer(cls.get_token_dict()[0])
         return cls.tokenizer

コード例 #7

ファイルを表示

ファイル: basic_masked_language_model.py プロジェクト: lgstd/bert4keras

#! -*- coding: utf-8 -*-
# 测试代码可用性: MLM

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
import numpy as np

config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # 读取词典
tokenizer = Tokenizer(token_dict)  # 建立分词器
model = load_pretrained_model(config_path, checkpoint_path,
                              with_mlm=True)  # 建立模型，加载权重

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask掉“技术”
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# 用mlm模型预测被mask掉的部分
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # 结果正是“技术”

コード例 #8

ファイルを表示

                # 所以只需要看第一个，不需要遍历后面的。
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # 从中选出新的topk
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == self.token_dict.get("[SEP]"):
                return self.tokenizer.decode(target_ids[best_one])
        # 如果max_output_len字都找不到结束符，直接返回
        return self.tokenizer.decode(target_ids[np.argmax(target_scores)])


def get_token_dict(token_file):
    with open(token_file, "r") as f:
        token_list = f.readlines()
        token_dict = {word.strip(): id_ for id_, word in enumerate(token_list)}
    return token_dict


if __name__ == "__main__":
    dict_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
    token_dict = get_token_dict(dict_path)
    tokenizer = Tokenizer(token_dict)
    seq_model = trans_infer(tokenizer, token_dict)
    # ans = seq_model.gen_trans(input_.lower(), topk)
    print(seq_model.gen_trans("NLP简直太神奇了".lower(), 2))

コード例 #9

ファイルを表示

ファイル: app.py プロジェクト: wp931120/nlpapp

            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]') # space类用未经训练的[unused1]表示
            else:
                R.append('[UNK]') # 剩余的字符是[UNK]
        return R

dict_path = '/opt/developer/wp/wzcq/roberta_wwm/vocab.txt'
token_dict = get_token_dict(dict_path)
tokenizer = OurTokenizer(token_dict)


trans_dic_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
token_dict_trans = get_token_dict(trans_dic_path)
trans_tokenizer = Tokenizer(token_dict_trans)

@app.route('/')
def hello_world():
    data = {}
    return render_template("ci.html", **data)

@app.route('/mc')
def machine_read():
    return render_template('mc.html')

@app.route('/ci')
def generate_ci():
    return render_template('ci.html')

@app.route('/trans')

コード例 #10

ファイルを表示

 def __init__(self, train_path, token_dict):
     self.train_path = train_path
     self.tokenizer = Tokenizer(token_dict)