Example #1
# -*- coding: utf-8 -*-
"""
@Time    : 2021/6/1 11:05
@Author  : huangkai21
@file    : basic_feature_extract.py
"""
# Sanity check: feature extraction
"""
Convert every token in the vocab into a vector and save the result as a JSON dictionary.
"""
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import to_array

def to_array(*args):
    """批量转numpy的array
    """
    results = [np.array(a) for a in args]
    if len(args) == 1:
        return results[0]
    else:
        return results

config_path = r'E:\code\chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = r'E:\code\chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = r'E:\code\chinese_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
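
# The snippet above is cut off before the feature dump its docstring promises.
# A minimal, hypothetical sketch of that remaining step (vocab token -> vector
# -> JSON); the output file name is illustrative, and the vector is taken from
# the token's own position in the sequence output.
import json

model = build_transformer_model(config_path, checkpoint_path)  # build the model, load the weights

features = {}
for token in tokenizer._token_dict:
    token_ids, segment_ids = tokenizer.encode(token)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    vec = model.predict([token_ids, segment_ids])[0, 1]  # position 1 holds the token itself
    features[token] = vec.tolist()

with open('vocab_features.json', 'w', encoding='utf-8') as f:
    json.dump(features, f, ensure_ascii=False)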
Example #2
import numpy as np
from bert import tokenization
from tqdm import tqdm
from config import Config
import pandas as pd
import os
from bert4keras.tokenizers import Tokenizer
from bert.data_utils import split_text
vocab_file = Config().vocab_file
do_lower_case = True
re_tokenzier = Tokenizer(vocab_file, do_lower_case)
config = Config()


def load_data(data_file):
    """
    读取数据
    :param file:
    :return:
    """
    data_df = pd.read_csv(data_file)
    data_df.fillna('', inplace=True)
    lines = list(
        zip(list(data_df['id']), list(data_df['question']),
            list(data_df['context']), list(data_df['answer']),
            list(data_df['answer_start'])))
    return lines


def create_example(lines):
    examples = []
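    # The function is cut off here. A sketch of a plausible continuation,
    # assuming one record per CSV row with the fields returned by load_data
    # (not the original code):
    for qid, question, context, answer, answer_start in lines:
        examples.append({'id': qid, 'question': question, 'context': context,
                         'answer': answer, 'answer_start': answer_start})
    return examples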
Example #3
else:
    random_order = json.load(open('../random_order.json'))

# split out the validation set
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data)  # mix SogouQA and WebQA at a 2:1 ratio

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        """单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP]
        """
        batch_token_ids, batch_segment_ids = [], []
        for is_end, D in self.sample(random):
            question = D['question']
            answers = [p['answer'] for p in D['passages'] if p['answer']]
            passage = np.random.choice(D['passages'])['passage']
            passage = re.sub(u' |、|;|,', ',', passage)
            final_answer = ''
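            # The generator is cut off here. A sketch (not the original lines) of
            # one way to assemble the sample format named in the docstring;
            # max_p_len, max_q_len and max_a_len are assumed hyperparameters.
            p_ids, _ = tokenizer.encode(passage, maxlen=max_p_len)      # [CLS] passage [SEP]
            qa_ids, _ = tokenizer.encode(question, final_answer,
                                         maxlen=max_q_len + max_a_len)  # [CLS] question [SEP] answer [SEP]
            token_ids = p_ids + qa_ids[1:]                              # drop the second [CLS]
            segment_ids = [0] * len(p_ids) + [1] * len(qa_ids[1:])
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)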
Example #4
                tmp = []
                for k in query_sign_map["natural_query"]:
                    query = query_sign_map["natural_query"][k]
                    tmp.append((query, s, k))
                tmps.append(tmp)
    test_data.append(tmps)

# load the dataset
#data = json.load(open("./mrc_base_data_seed{}.json".format(seed_value),encoding="utf-8"))
#train_data = data["train"]

#valid_data = data["valid"]
#test_data = data["test"]
#print(test_data[0])
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Build QA-style NER samples: (query, text, start_labels, end_labels).
# The query and the [CLS]/[SEP] positions must be masked out; using the query
# length, the mask is built as [0]*len(query) + [0] + [1]*len(text) + [0].


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=True):
        batch_token_ids, batch_segment_ids, batch_mask, batch_start_labels,  batch_end_labels= [],[],[],[],[]
        for is_end, (query, text, start_label, end_label, T,
                     label) in self.sample(random):
            """
            token_ids, segment_ids = tokenizer.encode(
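
# The loop body above is cut off mid-call. A sketch (not the original code) of
# the mask construction described in the comments, for one (query, text) pair:
# mask out [CLS], the query and both [SEP]s, keep the text positions.
token_ids, segment_ids = tokenizer.encode(query, text)  # [CLS] query [SEP] text [SEP]
q_len = len(tokenizer.encode(query)[0])                 # length of [CLS] query [SEP]
mask = [0] * q_len + [1] * (len(token_ids) - q_len - 1) + [0]
assert len(mask) == len(token_ids)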
Example #5
from bert4keras.snippets import uniout
import tensorflow as tf
from bert4keras.backend import K
from flask import Flask, request, render_template, send_file

app = Flask(__name__)

graph = tf.get_default_graph()
sess = K.get_session()
set_session = K.set_session

config_path = r'GPT_large-tf\gpt_config.json'
checkpoint_path = r'GPT_large-tf\gpt_model.ckpt'
dict_path = r'GPT_large-tf\vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
speakers = [
    tokenizer.token_to_id('[speaker1]'),
    tokenizer.token_to_id('[speaker2]')
]

model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='GPT_OpenAI')  # build the model, load the weights


# model.save("gpt.h5")
class ChatBot(AutoRegressiveDecoder):
    """基于随机采样对话机器人
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
Example #6
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT2 model
# Write-up: https://kexue.fm/archives/7292

import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout

config_path = '/root/gpt2/config.json'
checkpoint_path = '/root/gpt2/model.ckpt-100000'
dict_path = '/root/gpt2/vocab.txt'

tokenizer = Tokenizer(
    dict_path, token_start=None, token_end=None, do_lower_case=True
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml'
)  # build the model, load the weights


class ArticleCompletion(AutoRegressiveDecoder):
    """基于随机采样的文章续写
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return model.predict(token_ids)[:, -1]
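
    def generate(self, text, n=1, topp=0.95):
        # A hypothetical completion (a sketch in the style of the other
        # bert4keras decoders shown on this page, not the original code):
        # sample continuations with the inherited random_sample helper.
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp)  # nucleus sampling
        return [text + tokenizer.decode(ids) for ids in results]


article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,   # assumed id of "。" in this vocab; adjust for your dict
    maxlen=256,
)
print(article_completion.generate(u'今天天气不错'))
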
#! -*- coding: utf-8 -*-
# Sanity check: feature extraction

import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array

config_path = './chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model, load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_ids, segment_ids = to_array([token_ids], [segment_ids])

print('\n ===== predicting =====\n')
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
  ...]]
"""
#! -*- coding: utf-8 -*-
# Sanity check: MLM

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                with_mlm=True)  # build the model, load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术" ("technology")
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
Example #10
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


# labelled data
train_data = load_data('data/china-people-daily-ner-corpus/example.train')
valid_data = load_data('data/china-people-daily-ner-corpus/example.dev')
test_data = load_data('data/china-people-daily-ner-corpus/example.test')

# build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)


class data_generator(DataGenerator):
    """数据生成器
    """

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_start, batch_end = [], [], [], []
        for is_end, item in self.sample(random):
            for k, v in query_mapping.items():

                query_token_ids, query_segment_ids = tokenizer.encode(v)
                token_ids = query_token_ids.copy()
                start = query_segment_ids.copy()
                end = query_segment_ids.copy()
class AlbertNerModel(object):
    # model=None
    def __init__(self,
                 model_name: str,
                 path: str,
                 config_path: str,
                 checkpoint_path: str,
                 dict_path: str,
                 layers: int = 0,
                 unshared: bool = False):
        """
        Albert 初始化参数
        :param model_name: 模型名称,albert_base/albert_small/albert_tiny, 不推荐albertbase/albertsmall/alberttiny
        :param path: 权重路径
        :param config_path: 预训练模型配置文件
        :param checkpoint_path: 预训练模型文件
        :param dict_path: 预训练模型字典
        :param layers: 可选自定义层数,base最大12层,small最大6层,tiny最大4层
        :param unshared: 是否以Bert形式做层分解,默认为否
        """
        if tf.__version__ >= '2.0':
            raise RuntimeError('TensorFlow 2.x is not supported yet')
        self.weight_path = path
        self.__maxlen = 256
        self.__crf_lr_multiplier = 1000

        if str(model_name).upper() == 'ALBERT_BASE' or str(
                model_name).upper() == 'ALBERTBASE':
            self.albert_layers = 12
        elif str(model_name).upper() == 'ALBERT_SMALL' or str(
                model_name).upper() == 'ALBERTSMALL':
            self.albert_layers = 6
        elif str(model_name).upper() == 'ALBERT_TINY' or str(
                model_name).upper() == 'ALBERTTINY':
            self.albert_layers = 4
        if layers > 0:
            self.albert_layers = layers
        self.pretrain_name = model_name
        self.config = config_path
        self.checkpoint = checkpoint_path
        self.dict = dict_path
        self.unshared = unshared

        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)
        # label mapping
        labels = ['PER', 'LOC', 'ORG']
        id2label = dict(enumerate(labels))
        # label2id={j: i for i,j in id2label.items()}
        self.__id2label = id2label
        self.__num_labels = len(labels) * 2 + 1
        # label2id = {j: i for i, j in id2label.items()}
        assert self.config and self.checkpoint and self.dict
        # self.__crf= ConditionalRandomField(lr_multiplier=self.crf_lr_multiplier)
        self.__crf = None
        self._model = None

# region To make multi-model configuration and debugging easier, every config parameter gets a setter; rebuild the model after changing the configuration

    def set_layers(self, value):
        self.albert_layers = value

    def set_unshared(self, value):
        self.unshared = value

    def set_dict_path(self, path):
        self.dict = path
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)

    def set_checkpoint_path(self, path):
        self.checkpoint = path

    def set_config_path(self, path):
        self.config = path

    def set_weight_path(self, weight_path):
        self.weight_path = weight_path
# endregion

    @property
    def maxlen(self):
        return self.__maxlen

    @maxlen.setter
    def maxlen(self, value):
        self.__maxlen = value

    @property
    def crf_lr_multiplier(self):
        return self.__crf_lr_multiplier

    @crf_lr_multiplier.setter
    def crf_lr_multiplier(self, value):
        self.__crf_lr_multiplier = value

    @property
    def albert_model(self):
        return self._model

    @albert_model.setter
    def albert_model(self, model_path: str):
        from keras.models import load_model
        from keras.utils import CustomObjectScope
        # self.__model=load_model(model_path,custom_objects={'ConditionalRandomField':
        #                             ConditionalRandomField,
        #                         'sparse_loss':ConditionalRandomField.sparse_loss},
        #                         compile=False)  ## either way of loading the custom loss works
        with CustomObjectScope({
                'ConditionalRandomField':
                ConditionalRandomField,
                'sparse_loss':
                ConditionalRandomField.sparse_loss
        }):
            self._model = load_model(model_path)
            ## Important: the CRF layer name below is what this machine/server produced; if your model names it differently, change it according to the model topology.
            self.__crf = self._model.get_layer('conditional_random_field_1')
            assert isinstance(self.__crf, ConditionalRandomField)

    @albert_model.deleter
    def albert_model(self):
        K.clear_session()
        del self._model

    def build_albert_model(self):
        del self.albert_model
        file_name = f'albert_{self.pretrain_name}_pretrain.h5'  ## the loaded pretrained model was saved beforehand as an .h5 to speed up loading
        if os.path.exists(file_name):
            pretrain_model = load_model(file_name, compile=False)
        else:
            pretrain_model = build_transformer_model(
                config_path=self.config,
                checkpoint_path=self.checkpoint,
                model='albert_unshared' if self.unshared else 'albert',
                return_keras_model=True)

        if not self.unshared:
            output_layer = 'Transformer-FeedForward-Norm'
            output = pretrain_model.get_layer(output_layer).get_output_at(
                self.albert_layers - 1)
        else:
            output_layer = 'Transformer-%s-FeedForward-Norm' % (
                self.albert_layers - 1)
            output = pretrain_model.get_layer(output_layer).output
        output = Dense(self.__num_labels)(output)
        self.__crf = ConditionalRandomField(
            lr_multiplier=self.crf_lr_multiplier)
        output = self.__crf(output)
        model = Model(pretrain_model.input, output)
        model.load_weights(self.weight_path)
        self._model = model

    def viterbi_decode(self, nodes, trans, starts=[0], ends=[0]):
        """Viterbi算法求最优路径
        """
        num_labels = len(trans)
        non_starts = []
        non_ends = []
        if starts is not None:
            for i in range(num_labels):
                if i not in starts:
                    non_starts.append(i)
        if ends is not None:
            for i in range(num_labels):
                if i not in ends:
                    non_ends.append(i)
        # preprocessing
        nodes[0, non_starts] -= np.inf
        nodes[-1, non_ends] -= np.inf
        labels = np.arange(num_labels).reshape((1, -1))
        scores = nodes[0].reshape((-1, 1))
        # scores[1:] -= np.inf  # the first label must be 0
        paths = labels
        for l in range(1, len(nodes)):
            M = scores + trans + nodes[l].reshape((1, -1))
            idxs = M.argmax(0)
            scores = M.max(0).reshape((-1, 1))
            paths = np.concatenate([paths[:, idxs], labels], 0)
        return paths[:, scores[:, 0].argmax()]  # the optimal path

    def recognize(self, text):
        """
        # 识别实体
        :param text:
        :return: entities list
        """
        tokens = self.tokenizer.tokenize(text)
        while len(tokens) > 512:
            tokens.pop(-2)
        try:
            mapping = self.tokenizer.rematch(text, tokens)
            token_ids = self.tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            nodes = self._model.predict([[token_ids], [segment_ids]])[0]
            # print('nodes:',nodes)
            _trans = K.eval(self.__crf.trans)
            labels = self.viterbi_decode(nodes, trans=_trans)
            entities, starting = [], False
            for i, label in enumerate(labels):
                if label > 0:
                    if label % 2 == 1:
                        starting = True
                        entities.append([[i],
                                         self.__id2label[(label - 1) // 2]])
                    elif starting:
                        entities[-1][0].append(i)
                    else:
                        starting = False
                else:
                    starting = False

            return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                    for w, l in entities]
        except:
            import traceback
            traceback.print_exc()
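
# A hypothetical usage sketch of the class above; every path and the weights
# file name below is a placeholder.
ner = AlbertNerModel(model_name='albert_tiny',
                     path='albert_tiny_ner_crf.weights',
                     config_path='albert_config_tiny.json',
                     checkpoint_path='albert_model.ckpt',
                     dict_path='vocab.txt')
ner.build_albert_model()
print(ner.recognize(u'王小明在北京的清华大学读书。'))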
Example #12
                elif this_flag=='O' and last_flag != 'O':
                    d.append([char,'O'])
                elif this_flag[0] == 'B':
                    d.append([char,this_flag[2:]])
                else:
                    d[-1][0] +=char
                last_flag = this_flag
            D.append(d)

    return D

train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

tokenizer = Tokenizer(token_dict=dict_path,do_lower_case=True)


# label mapping
labels = ['PER','LOC','ORG']
id2label = dict(enumerate(labels))
label2id = {j:i for i,j in id2label.items()}
num_labels = len(labels) * 2 + 1  # one B and one I tag per label, plus O

class data_generator(DataGenerator):

    def __iter__(self,random=False):
        batch_token_ids, batch_labels = [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
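                # The loop is cut off here. A sketch of the standard BIO encoding
                # used in the bert4keras NER examples (not the original lines):
                w_token_ids = tokenizer.encode(w)[0][1:-1]  # word pieces without [CLS]/[SEP]
                if l == 'O':
                    labels += [0] * len(w_token_ids)
                else:
                    B = label2id[l] * 2 + 1
                    I = label2id[l] * 2 + 2
                    labels += [B] + [I] * (len(w_token_ids) - 1)
                token_ids += w_token_ids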
Example #13
taskname = sys.argv[1]
dataset_dir = sys.argv[2]
base_model_path = sys.argv[3]
output_model_path = sys.argv[4]
mode = sys.argv[5]

config_path = os.path.join(base_model_path, 'bert_config.json')
checkpoint_path = os.path.join(base_model_path, 'bert_model.ckpt')
dict_path = os.path.join(base_model_path, 'vocab.txt')

# How the data is loaded; each record looks like:
# {"id": 16, "content": "你也不用说对不起,只是,,,,若相惜", "label": "sadness"}
label_list = ['like', 'happiness', 'sadness', 'anger', 'disgust']
label2index = {label: i for i, label in enumerate(label_list)}
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Map each label to a reserved [unused*] token id.
like_id = tokenizer.token_to_id(u'[unused10]')
happiness_id = tokenizer.token_to_id(u'[unused11]')
sadness_id = tokenizer.token_to_id(u'[unused12]')
anger_id = tokenizer.token_to_id(u'[unused13]')
disgust_id = tokenizer.token_to_id(u'[unused14]')

label2tokenid_dict = {
    'like': like_id, 'happiness': happiness_id, 'sadness': sadness_id,
    'anger': anger_id, 'disgust': disgust_id,
}
from bert4keras.tokenizers import Tokenizer
from config import config
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf
import numpy as np
from Load_Data import load_data
from tqdm import tqdm

tokenizer = Tokenizer(os.path.join(config.model_dir, 'vocab.txt'),
                      do_lower_case=True)


def _tokenize(text):
    token_id, seg_id = tokenizer.encode(text)
    return token_id, seg_id


def _pad_seuqences(tokens):
    return tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=config.seq_maxlen, truncating='post', padding='post')


# tokenize into two separate token sequences
def tokenize_data(data):
    token_ids_1 = []
    token_ids_2 = []
    seg_ids_1 = []
    seg_ids_2 = []
    tags = []
    for sent in tqdm(data):
Example #15
def get_tokenizer(dict_path, pre_tokenize=None):
    """建立分词器
    """
    return Tokenizer(dict_path, do_lower_case=True, pre_tokenize=pre_tokenize)
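
# Hypothetical usage of the helper above with word-level pre-tokenization via
# jieba (mirroring how pre_tokenize is used elsewhere on this page); the vocab
# path is a placeholder.
import jieba

tokenizer = get_tokenizer('vocab.txt', pre_tokenize=lambda s: jieba.cut(s, HMM=False))
print(tokenizer.tokenize(u'今天天气不错'))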
Example #16

# load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# simulate labelled and unlabelled data
train_frac = 0.01  # fraction of labelled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description (prompt)
prefix = u'很满意。'
mask_idx = 1
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """对输入进行随机mask
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
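            # The function is cut off here; the PET-style bert4keras examples it
            # follows typically continue along these lines (a sketch, not the
            # original code):
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(len(tokenizer._token_dict)))
            target.append(t)
        else:
            source.append(t)
            target.append(0)
    return source, target
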
import argparse
import random
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
parser = argparse.ArgumentParser(description="training set index")
parser.add_argument("--train_set_index", "-t", help="training set index", type=str, default="0")
args = parser.parse_args()
train_set_index = args.train_set_index
assert train_set_index in {"0", "1", "2", "3", "4", "all"}, 'train_set_index must in {"0", "1", "2", "3", "4", "all"}'
from tqdm import tqdm
config_path = 'path/language_model/chinese_roberta_wwm_ext_L-12_H-768_A-12/config.json'
checkpoint_path = 'path/language_model/nezha-gpt/cn_gpt'
dict_path = 'path/language_model/nezha-gpt/vocab.txt'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

label_en2zh ={'银行': '银行',
 '社区服务': '社区',
 '电商': '电商',
 '支付': '支付',
 '经营养成': '经营',
 '卡牌': '卡牌',
 '借贷': '借贷',
 '驾校': '驾校',
 '理财': '理财',
 '职考': '职考',
 '新闻': '新闻',
 '旅游资讯': '旅游',
 '公共交通': '交通',
 '魔幻': '魔幻',
domain_label2id, domain_id2label, intent_label2id, intent_id2label, slot_label2id, slot_id2label = json.load(
    open('data/labels.json', 'r', encoding='utf-8'))

# load the data
data = json.load(open('data/train.json', 'r', encoding='utf-8'))
random.shuffle(data)
valid_data = data[:len(data) // 8]
train_data = data[len(data) // 8:]

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=vocab_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

tokenizer = Tokenizer(token_dict)


# data iterator
class MyDataGenerator(DataGenerator):
    def __init__(self, data, batch_size=32, buffer_size=None):
        super(MyDataGenerator, self).__init__(data, batch_size, buffer_size)

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, Y1, Y2, Y3 = [], [], [], [], []

        for is_end, item in self.sample(random):

            text = item['text']
            token_ids, segment_ids = tokenizer.encode(first_text=text.lower())
valid_data = load_data('datasets/opposts/dev_32.json')
test_data = load_data('datasets/opposts/test.json')

# simulate labelled and unlabelled data
train_frac = 1  # 0.01  # fraction of labelled data
print("0.length of train_data:", len(train_data))  # 16883
num_labeled = int(len(train_data) * train_frac)
# unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
print("1.num_labeled data used:", num_labeled, " ;train_data:",
      len(train_data))  # 168

# train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description (prompt)
mask_idx = 1  # 5
unused_length = 9
desc = [
    '[unused%s]' % i for i in range(1, unused_length)
]  # desc: ['[unused1]', '[unused2]', ..., '[unused8]']
desc.insert(
    mask_idx - 1, '[MASK]'
)  # desc: ['[MASK]', '[unused1]', '[unused2]', ..., '[unused8]']
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids

pos_id = tokenizer.token_to_id(u'很')  # id of the positive token; default u'很'
neg_id = tokenizer.token_to_id(u'不')  # id of the negative token; default u'不'
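
# In the PET-style bert4keras examples this setup follows, the prompt ids are
# typically spliced into each encoded sample like this (a sketch, not the
# original continuation; text and maxlen=128 are illustrative assumptions):
text = u'这个手机很好用'
token_ids, segment_ids = tokenizer.encode(text, maxlen=128)
token_ids = token_ids[:1] + desc_ids + token_ids[1:]  # [CLS] + prompt + text ...
segment_ids = [0] * len(desc_ids) + segment_ids       # position mask_idx now holds the [MASK]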
Example #20
from window_layers import WindowEmbedding, WindowEmbeddingforword

epochs = 500
batch_size = 1024
# bert_layers = 12
learing_rate = 1e-4  # the fewer BERT layers, the larger the learning rate should be
seq_crf_lr_multiplier = 1e-2  # enlarge the CRF layer learning rate if necessary
tag_crf_lr_multiplier = 1e-2
vocab_size = 21128

# BERT configuration
# config_path = '../../Q_A/publish/bert_config.json'
# checkpoint_path = '../../Q_A/publish/bert_model.ckpt'
dict_path = '../../Q_A/publish/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=False)

labels = [
    'O', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R20', 'R21', 'R22', 'R23',
    'R24', 'R25', 'R30', 'R31', 'R90', 'R99', 'X'
]
seg_labels = ['O', 'B', 'I', 'E']

id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels)

id2seglabel = dict(enumerate(seg_labels))
seglabel2id = {j: i for i, j in id2seglabel.items()}
num_seglabels = len(seg_labels)
Example #21
valid_datas = []
for i in range(5):
    valid_data = load_data('{}/dev_{}.json'.format(path,str(i)))
    valid_datas.append(valid_data)
test_data = load_data('{}/test_public.json'.format(path))

# simulate labelled and unlabelled data
train_frac = 1  # 0.01  # fraction of labelled data
print("0.length of train_data:", len(train_data))  # 16883
num_labeled = int(len(train_data) * train_frac)
train_data = train_data[:num_labeled]
print("1.num_labeled data used:", num_labeled, " ;train_data:", len(train_data))  # 168
# train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description (prompt)
mask_idxs = [1, 2]  # [7, 8]  # mask_idx = 1  # 5
unused_length = 9  # 6 and 13 gave no improvement
desc = ['[unused%s]' % i for i in range(1, unused_length)]  # desc: ['[unused1]', '[unused2]', ..., '[unused8]']
for mask_id in mask_idxs:
    desc.insert(mask_id - 1, '[MASK]')  # with mask_idxs = [1, 2] the two [MASK]s end up at the front of desc
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids

# pos_id = tokenizer.token_to_id(u'很')  # id of the positive token; default u'很'
# neg_id = tokenizer.token_to_id(u'不')  # id of the negative token; default u'不'


def random_masking(token_ids):
    """对输入进行mask
Example #22
dict_path = os.path.join(base_model_path, 'vocab.txt')

# How the data is loaded; each record looks like:
# {"id": 16, "content": "你也不用说对不起,只是,,,,若相惜", "label": "sadness"}
label_list = ['like', 'happiness', 'sadness', 'anger', 'disgust']
label_en2zh_dict = {
    'like': '喜欢',
    "happiness": "开心",
    "sadness": "伤心",
    "anger": "愤怒",
    "disgust": "厌恶"
}
label_zh_list = [label_en2zh_dict[label_en] for label_en in label_list]
label2index = {label: i for i, label in enumerate(label_list)}
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# label2tokenid_dict= {'like':[like_id_1,like_id_2],'happiness':[happiness_id_1,happiness_id_2],'sadness':[sadness_id_1,sadness_id_2],
#                      'anger':[anger_id_1,anger_id_2],'disgust':[disgust_id_1,disgust_id_2]}
label2tokenid_dict = {
}  # {'neutral':[neutral_id_1,neutral_id_2],'entailment':[entailment_id_1,entailment_id_2],'contradiction':[contradiction_id_1,contradiction_id_2]}
for label_en in label_list:
    # label_en= # 'neutral'
    label_zh = label_en2zh_dict[label_en]
    char_id_list = []
    for index, char_zh in enumerate(label_zh):
        char_id_list.append(tokenizer.token_to_id(char_zh))
    label2tokenid_dict[
        label_en] = char_id_list  # e.g. 'neutral':[neutral_id_1,neutral_id_2]
print("###label2tokenid_dict:", label2tokenid_dict)
Example #23

# load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# simulate labelled and unlabelled data
train_frac = 0.01  # fraction of labelled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description (prompt)
mask_idx = 5
desc = ['[unused%s]' % i for i in range(1, 9)]
desc.insert(mask_idx - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """对输入进行随机mask
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
Example #24
import jieba
from roformer import RoFormerTokenizer
from bert4keras.tokenizers import Tokenizer

dict_path = 'pretrained_models/chinese_roformer_base'
text = "12312格ab局A B cdA,.567 861351 684!今天萨达天 气非常好王企。文保鹅按时发放了的撒这些seqetvgsa国内拉手的喀什。、]P[,./()*7656&【;,‘"
#text = "这里基本保留了唐宋遗留下来的坊巷格局和大量明清古建筑,其中各级文保单位29处,被誉为“里坊制度的活化石”“明清建筑博物馆”!"
bert4keras_tokenizer = Tokenizer(
    dict_path + "/vocab.txt",
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False))
roformer_tokenizer = RoFormerTokenizer.from_pretrained(dict_path)

bert4keras_tokenizer_input_ids = bert4keras_tokenizer.encode(text)[0]
roformer_tokenizer_input_ids = roformer_tokenizer.encode(text)

print(bert4keras_tokenizer_input_ids == roformer_tokenizer_input_ids)
from bert4keras.optimizers import Adam
from bert4keras.backend import K, batch_gather, keras
from bert4keras.layers import LayerNormalization
from keras.layers import *
from keras.models import Model
# import os, sys
# sys.path.append(os.getcwd())
from test4nlp.event_extract_chq.config import event_extract_config as Config
from test4nlp.event_extract_chq.train.utils.data_utils import get_data, data_generator
from tqdm import tqdm
import json
import numpy as np

train_data, valid_data = get_data(Config.read_data_path)
# build the tokenizer
tokenizer = Tokenizer(Config.dict_path, do_lower_case=True)


def extrac_trigger(inputs):
    "根据subject_ids从output中取出subject的向量表征"
    output, trigger_ids = inputs
    trigger_ids = K.cast(trigger_ids, 'int32')
    start = batch_gather(output, trigger_ids[:, :1])
    end = batch_gather(output, trigger_ids[:, 1:])
    trigger = K.concatenate([start, end], 2)
    return trigger[:, 0]


def build_model():
    bert_model = build_transformer_model(
        config_path=Config.config_path,
Example #26
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # the length must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# vocab and tokenizer from the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# drop the counts and keep only the token list
tokens = [token for token, count in tokens]

# build the new token -> id mapping and the new vocab
Example #27
keep_tokens = [2] * 106 + keep_tokens
keep_tokens_inv = {j: i for i, j in enumerate(keep_tokens)}

compound_tokens = []
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if t not in new_token_dict:
        new_token_dict[t] = len(new_token_dict)
        ids = [keep_tokens_inv.get(i, 0) for i in sp_tokenizer.encode(t)[0]]
        compound_tokens.append(ids)

save_vocab(dict_path_2, new_token_dict)

# build the tokenizer
tokenizer = Tokenizer(new_token_dict,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def corpus():
    """语料生成器
    """
    while True:
        f = '/root/data_pretrain/data_shuf.json'
        with open(f) as f:
            for l in f:
                l = json.loads(l)
                for texts in text_process(l['text']):
                    yield texts

Example #28
# read the schema
with open('/home/ycy/HBT/data/schema.json',encoding='utf8') as f:
    id2predicate, predicate2id, n = {}, {}, 0
    predicate2type = {}
    for l in f:
        l = json.loads(l)
        predicate2type[l['predicate']] = (l['subject_type'], l['object_type'])
        for k, _ in sorted(l['object_type'].items()):
            key = l['predicate'] + '_' + k
            id2predicate[n] = key
            predicate2id[key] = n
            n += 1
# tokenizer = OurTokenizer(vocab_file=BERT_PATH + "vocab.txt")
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
tokenizer_k = Tokenizer(os.path.join(model_path, 'vocab.txt'), do_lower_case=True)

def search(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到,返回第一个下标;否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


def sequence_padding(inputs, length=None, padding=0):
    """Numpy helper: pad sequences to the same length.
    """
    if length is None:
        length = max([len(x) for x in inputs])
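    # The helper is cut off here; the sequence_padding in bert4keras.snippets
    # finishes roughly like this (a sketch for 1-D inputs; assumes numpy is
    # imported as np):
    outputs = []
    for x in inputs:
        x = x[:length]
        x = np.pad(x, (0, length - len(x)), 'constant', constant_values=padding)
        outputs.append(x)
    return np.array(outputs)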
Example #29
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


# labelled data
train_data = load_data('../data/ner/train.txt')
valid_data = load_data('../data/ner/dev.txt')
test_data = load_data('../data/ner/test.txt')
# train_data = load_data('../data/example_.train')
# valid_data = load_data('../data/example.dev')
# test_data = load_data('../data/example.test')

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# label mapping
print(labels)
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
Example #30
import os

os.environ["TF_KERAS"] = "1"
import tensorflow as tf
from bert4keras.snippets import to_array
from bert4keras.tokenizers import Tokenizer
from luwu.core.models.text_classifier.transformers import TransformerTextClassification

# path to the vocab file
dict_path = ""
# path where the trained model is saved
model_path = ""
# the text to classify
sentence = ""
# id -> label mapping
id_label_dict = {0: "类别1", 1: "类别2", 2: "类别3"}

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# load the model
model = tf.keras.models.load_model(model_path)

# encode the text
token_ids, segment_ids = tokenizer.encode(sentence)
token_ids, segment_ids = to_array([token_ids], [segment_ids])

# predict
outputs = model.predict([token_ids, segment_ids])
index = int(tf.argmax(outputs[0]))
print("Predicted class: {}".format(id_label_dict[index]))