# -*- coding: utf-8 -*-
"""
@Time    : 2021/6/1 11:05
@Author  : huangkai21
@file    : basic_feature_extract.py
"""
# Test code usability: feature extraction
"""
Convert every token in the vocab into a vector and save it as a JSON dict.
"""
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import to_array


def to_array(*args):
    """Convert a batch of inputs to numpy arrays."""
    results = [np.array(a) for a in args]
    if len(args) == 1:
        return results[0]
    else:
        return results


config_path = r'E:\code\chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = r'E:\code\chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = r'E:\code\chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
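# --- A minimal sketch (not part of the original file) of the goal stated above: embed
# every vocab token with the BERT encoder and dump the vectors into a JSON dict.
# The output filename 'vocab_vectors.json' and the mean-pooling choice are assumptions.
import json

model = build_transformer_model(config_path, checkpoint_path)  # build the model, load weights

vocab_vectors = {}
for token in tokenizer._token_dict:
    token_ids, segment_ids = tokenizer.encode(token)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    vectors = model.predict([token_ids, segment_ids])[0]  # (seq_len, 768)
    vocab_vectors[token] = vectors.mean(axis=0).tolist()  # average over [CLS]/token/[SEP]

with open('vocab_vectors.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_vectors, f, ensure_ascii=False)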
import numpy as np
from bert import tokenization
from tqdm import tqdm
from config import Config
import pandas as pd
import os
from bert4keras.tokenizers import Tokenizer
from bert.data_utils import split_text

vocab_file = Config().vocab_file
do_lower_case = True
re_tokenzier = Tokenizer(vocab_file, do_lower_case)
config = Config()


def load_data(data_file):
    """
    Read the data.
    :param data_file:
    :return:
    """
    data_df = pd.read_csv(data_file)
    data_df.fillna('', inplace=True)
    lines = list(
        zip(list(data_df['id']), list(data_df['question']), list(data_df['context']),
            list(data_df['answer']), list(data_df['answer_start'])))
    return lines


def create_example(lines):
    examples = []
else:
    random_order = json.load(open('../random_order.json'))

# split out the validation set
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data)  # mix SogouQA and WebQA at a 2:1 ratio

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        """Single-sample format: [CLS]passage[SEP]question[SEP]answer[SEP]"""
        batch_token_ids, batch_segment_ids = [], []
        for is_end, D in self.sample(random):
            question = D['question']
            answers = [p['answer'] for p in D['passages'] if p['answer']]
            passage = np.random.choice(D['passages'])['passage']
            passage = re.sub(u' |、|;|,', ',', passage)
            final_answer = ''
        tmp = []
        for k in query_sign_map["natural_query"]:
            query = query_sign_map["natural_query"][k]
            tmp.append((query, s, k))
        tmps.append(tmp)
    test_data.append(tmps)

# load the dataset
# data = json.load(open("./mrc_base_data_seed{}.json".format(seed_value), encoding="utf-8"))
# train_data = data["train"]
# valid_data = data["valid"]
# test_data = data["test"]
# print(test_data[0])

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Build QA-style NER samples: (query, text, start_labels, end_labels).
# Also mask out the query and the [CLS]/[SEP] tokens: use the query length to build
# the mask as [0]*len(query) + [0] + [1]*len(text) + [0].


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=True):
        batch_token_ids, batch_segment_ids, batch_mask, batch_start_labels, batch_end_labels = [], [], [], [], []
        for is_end, (query, text, start_label, end_label, T, label) in self.sample(random):
            """
            token_ids, segment_ids = tokenizer.encode(
from bert4keras.snippets import uniout
import tensorflow as tf
from bert4keras.backend import K
from flask import Flask, request, render_template, send_file

app = Flask(__name__)

graph = tf.get_default_graph()
sess = K.get_session()
set_session = K.set_session

config_path = r'GPT_large-tf\gpt_config.json'
checkpoint_path = r'GPT_large-tf\gpt_model.ckpt'
dict_path = r'GPT_large-tf\vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
speakers = [
    tokenizer.token_to_id('[speaker1]'),
    tokenizer.token_to_id('[speaker2]')
]

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='GPT_OpenAI'
)  # build the model, load the weights
# model.save("gpt.h5")


class ChatBot(AutoRegressiveDecoder):
    """Chatbot based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT2 model
# Introduction: https://kexue.fm/archives/7292

import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout

config_path = '/root/gpt2/config.json'
checkpoint_path = '/root/gpt2/model.ckpt-100000'
dict_path = '/root/gpt2/vocab.txt'

tokenizer = Tokenizer(
    dict_path, token_start=None, token_end=None, do_lower_case=True
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml'
)  # build the model, load the weights


class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return model.predict(token_ids)[:, -1]
#! -*- coding: utf-8 -*-
# Test code usability: feature extraction

import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array

config_path = './chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model, load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_ids, segment_ids = to_array([token_ids], [segment_ids])

print('\n ===== predicting =====\n')
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352   0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154   0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673   0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    def set_dict_path(self, path):
        self.dict = path
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)
#! -*- coding: utf-8 -*-
# Test code usability: MLM

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
)  # build the model, load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# predict the masked positions with the MLM head
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
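# --- Optional extension (not in the original snippet): inspect the top-k candidates for
# each masked position instead of only the argmax. Reuses `probas` and `tokenizer` from
# above; k=5 is an arbitrary choice for illustration.
top_k = 5
for pos in (3, 4):
    candidate_ids = probas[pos].argsort()[-top_k:][::-1]  # ids sorted by descending probability
    candidates = [tokenizer.decode([int(i)]) for i in candidate_ids]
    print('position %d top-%d candidates: %s' % (pos, top_k, candidates))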
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


# labeled data
train_data = load_data('data/china-people-daily-ner-corpus/example.train')
valid_data = load_data('data/china-people-daily-ner-corpus/example.dev')
test_data = load_data('data/china-people-daily-ner-corpus/example.test')

# build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_start, batch_end = [], [], [], []
        for is_end, item in self.sample(random):
            for k, v in query_mapping.items():
                query_token_ids, query_segment_ids = tokenizer.encode(v)
                token_ids = query_token_ids.copy()
                start = query_segment_ids.copy()
                end = query_segment_ids.copy()
class AlbertNerModel(object):
    # model=None
    def __init__(self,
                 model_name: str,
                 path: str,
                 config_path: str,
                 checkpoint_path: str,
                 dict_path: str,
                 layers: int = 0,
                 unshared: bool = False):
        """
        Albert initialization parameters
        :param model_name: model name, albert_base/albert_small/albert_tiny;
                           albertbase/albertsmall/alberttiny are discouraged
        :param path: path to the fine-tuned weights
        :param config_path: pretrained model config file
        :param checkpoint_path: pretrained model checkpoint
        :param dict_path: pretrained model vocab
        :param layers: optional custom layer count; base up to 12, small up to 6, tiny up to 4
        :param unshared: whether to unshare the layers Bert-style, defaults to False
        """
        if tf.__version__ >= '2.0':
            raise RuntimeError('tensorflow 2.0 and above is not supported yet')
        self.weight_path = path
        self.__maxlen = 256
        self.__crf_lr_multiplier = 1000
        if str(model_name).upper() == 'ALBERT_BASE' or str(model_name).upper() == 'ALBERTBASE':
            self.albert_layers = 12
        elif str(model_name).upper() == 'ALBERT_SMALL' or str(model_name).upper() == 'ALBERTSMALL':
            self.albert_layers = 6
        elif str(model_name).upper() == 'ALBERT_TINY' or str(model_name).upper() == 'ALBERTTINY':
            self.albert_layers = 4
        if layers > 0:
            self.albert_layers = layers
        self.pretrain_name = model_name
        self.config = config_path
        self.checkpoint = checkpoint_path
        self.dict = dict_path
        self.unshared = unshared
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)
        # label mapping
        labels = ['PER', 'LOC', 'ORG']
        id2label = dict(enumerate(labels))
        # label2id = {j: i for i, j in id2label.items()}
        self.__id2label = id2label
        self.__num_labels = len(labels) * 2 + 1
        assert self.config and self.checkpoint and self.dict
        # self.__crf = ConditionalRandomField(lr_multiplier=self.crf_lr_multiplier)
        self.__crf = None
        self._model = None

    # region: to make multi-model configuration easier to debug, every config parameter
    # gets a setter; after re-configuring, the model must be rebuilt.
    def set_layers(self, value):
        self.albert_layers = value

    def set_unshared(self, value):
        self.unshared = value

    def set_dict_path(self, path):
        self.dict = path
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)

    def set_checkpoint_path(self, path):
        self.checkpoint = path

    def set_config_path(self, path):
        self.config = path

    def set_weight_path(self, weight_path):
        self.weight_path = weight_path
    # endregion

    @property
    def maxlen(self):
        return self.__maxlen

    @maxlen.setter
    def maxlen(self, value):
        self.__maxlen = value

    @property
    def crf_lr_multiplier(self):
        return self.__crf_lr_multiplier

    @crf_lr_multiplier.setter
    def crf_lr_multiplier(self, value):
        self.__crf_lr_multiplier = value

    @property
    def albert_model(self):
        return self._model

    @albert_model.setter
    def albert_model(self, model_path: str):
        from keras.models import load_model
        from keras.utils import CustomObjectScope
        # self._model = load_model(model_path,
        #                          custom_objects={'ConditionalRandomField': ConditionalRandomField,
        #                                          'sparse_loss': ConditionalRandomField.sparse_loss},
        #                          compile=False)  # either way of loading the custom loss works
        with CustomObjectScope({
                'ConditionalRandomField': ConditionalRandomField,
                'sparse_loss': ConditionalRandomField.sparse_loss
        }):
            self._model = load_model(model_path)
        # Important: on my machine and server the CRF layer in the model has the name used
        # below; if your name differs, change it according to the model topology.
        self.__crf = self._model.get_layer('conditional_random_field_1')
        assert isinstance(self.__crf, ConditionalRandomField)

    @albert_model.deleter
    def albert_model(self):
        K.clear_session()
        del self._model

    def build_albert_model(self):
        del self.albert_model
        # to speed up loading, the pretrained model was saved beforehand as an .h5 file
        file_name = f'albert_{self.pretrain_name}_pretrain.h5'
        if os.path.exists(file_name):
            pretrain_model = load_model(file_name, compile=False)
        else:
            pretrain_model = build_transformer_model(
                config_path=self.config,
                checkpoint_path=self.checkpoint,
                model='albert_unshared' if self.unshared else 'albert',
                return_keras_model=True)
        if not self.unshared:
            output_layer = 'Transformer-FeedForward-Norm'
            output = pretrain_model.get_layer(output_layer).get_output_at(self.albert_layers - 1)
        else:
            output_layer = 'Transformer-%s-FeedForward-Norm' % (self.albert_layers - 1)
            output = pretrain_model.get_layer(output_layer).output
        output = Dense(self.__num_labels)(output)
        self.__crf = ConditionalRandomField(lr_multiplier=self.crf_lr_multiplier)
        output = self.__crf(output)
        model = Model(pretrain_model.input, output)
        model.load_weights(self.weight_path)
        self._model = model

    def viterbi_decode(self, nodes, trans, starts=[0], ends=[0]):
        """Viterbi algorithm for the optimal path."""
        num_labels = len(trans)
        non_starts = []
        non_ends = []
        if starts is not None:
            for i in range(num_labels):
                if i not in starts:
                    non_starts.append(i)
        if ends is not None:
            for i in range(num_labels):
                if i not in ends:
                    non_ends.append(i)
        # preprocessing
        nodes[0, non_starts] -= np.inf
        nodes[-1, non_ends] -= np.inf
        labels = np.arange(num_labels).reshape((1, -1))
        scores = nodes[0].reshape((-1, 1))
        # scores[1:] -= np.inf  # the first label must be 0
        paths = labels
        for l in range(1, len(nodes)):
            M = scores + trans + nodes[l].reshape((1, -1))
            idxs = M.argmax(0)
            scores = M.max(0).reshape((-1, 1))
            paths = np.concatenate([paths[:, idxs], labels], 0)
        return paths[:, scores[:, 0].argmax()]  # the optimal path

    def recognize(self, text):
        """
        Recognize entities.
        :param text:
        :return: entities list
        """
        tokens = self.tokenizer.tokenize(text)
        while len(tokens) > 512:
            tokens.pop(-2)
        try:
            mapping = self.tokenizer.rematch(text, tokens)
            token_ids = self.tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            nodes = self._model.predict([[token_ids], [segment_ids]])[0]
            # print('nodes:', nodes)
            _trans = K.eval(self.__crf.trans)
            labels = self.viterbi_decode(nodes, trans=_trans)
            entities, starting = [], False
            for i, label in enumerate(labels):
                if label > 0:
                    if label % 2 == 1:
                        starting = True
                        entities.append([[i], self.__id2label[(label - 1) // 2]])
                    elif starting:
                        entities[-1][0].append(i)
                    else:
                        starting = False
                else:
                    starting = False
            return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                    for w, l in entities]
        except:
            import traceback
            traceback.print_exc()
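# --- A hedged usage sketch (not part of the original file): build the model and run
# recognition once. All file paths below are placeholders, and 'albert_small' is an
# assumed size; replace them with your own pretrained/fine-tuned files.
if __name__ == '__main__':
    ner = AlbertNerModel(
        model_name='albert_small',
        path='weights/albert_small_ner_crf.weights',          # fine-tuned weights (placeholder)
        config_path='albert_small_zh/albert_config.json',     # pretrained config (placeholder)
        checkpoint_path='albert_small_zh/albert_model.ckpt',  # pretrained checkpoint (placeholder)
        dict_path='albert_small_zh/vocab.txt',                # pretrained vocab (placeholder)
    )
    ner.build_albert_model()
    print(ner.recognize(u'中国科学院位于北京市海淀区。'))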
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])
                elif this_flag[0] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

tokenizer = Tokenizer(token_dict=dict_path, do_lower_case=True)

# label mapping
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1  # tags * B/I + O


class data_generator(DataGenerator):
    def __iter__(self, random=False):
        batch_token_ids, batch_labels = [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
taskname = sys.argv[1]
dataset_dir = sys.argv[2]
base_model_path = sys.argv[3]
output_model_path = sys.argv[4]
mode = sys.argv[5]

config_path = os.path.join(base_model_path, 'bert_config.json')
checkpoint_path = os.path.join(base_model_path, 'bert_model.ckpt')
dict_path = os.path.join(base_model_path, 'vocab.txt')

# data format to load
# {"id": 16, "content": "你也不用说对不起,只是,,,,若相惜", "label": "sadness"}
label_list = ['like', 'happiness', 'sadness', 'anger', 'disgust']
label2index = {label: i for i, label in enumerate(label_list)}

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

like_id = tokenizer.token_to_id(u'[unused10]')       # id of the verbalizer token for "like"
happiness_id = tokenizer.token_to_id(u'[unused11]')  # id of the verbalizer token for "happiness"
sadness_id = tokenizer.token_to_id(u'[unused12]')    # id of the verbalizer token for "sadness"
anger_id = tokenizer.token_to_id(u'[unused13]')      # id of the verbalizer token for "anger"
disgust_id = tokenizer.token_to_id(u'[unused14]')    # id of the verbalizer token for "disgust"

label2tokenid_dict = {
    'like': like_id,
    'happiness': happiness_id,
from bert4keras.tokenizers import Tokenizer
from config import config
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf
import numpy as np
from Load_Data import load_data
from tqdm import tqdm

tokenizer = Tokenizer(os.path.join(config.model_dir, 'vocab.txt'), do_lower_case=True)


def _tokenize(text):
    token_id, seg_id = tokenizer.encode(text)
    return token_id, seg_id


def _pad_seuqences(tokens):
    return tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=config.seq_maxlen, truncating='post', padding='post')


# split each pair into two token sequences
def tokenize_data(data):
    token_ids_1 = []
    token_ids_2 = []
    seg_ids_1 = []
    seg_ids_2 = []
    tags = []
    for sent in tqdm(data):
def get_tokenizer(dict_path, pre_tokenize=None):
    """Build the tokenizer."""
    return Tokenizer(dict_path, do_lower_case=True, pre_tokenize=pre_tokenize)
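# --- A hedged usage sketch (not part of the original file): word-level pre-tokenization
# with jieba, mirroring how pre_tokenize is used elsewhere in this collection. The vocab
# path below is a placeholder.
import jieba

tokenizer = get_tokenizer(
    'chinese_L-12_H-768_A-12/vocab.txt',
    pre_tokenize=lambda s: jieba.cut(s, HMM=False))
token_ids, segment_ids = tokenizer.encode(u'今天天气不错')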
# load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description
prefix = u'很满意。'
mask_idx = 1
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """Apply random masking to the input."""
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
import argparse
import random
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

parser = argparse.ArgumentParser(description="training set index")
parser.add_argument("--train_set_index", "-t", help="training set index", type=str, default="0")
args = parser.parse_args()
train_set_index = args.train_set_index
assert train_set_index in {"0", "1", "2", "3", "4", "all"}, \
    'train_set_index must be in {"0", "1", "2", "3", "4", "all"}'

from tqdm import tqdm

config_path = 'path/language_model/chinese_roberta_wwm_ext_L-12_H-768_A-12/config.json'
checkpoint_path = 'path/language_model/nezha-gpt/cn_gpt'
dict_path = 'path/language_model/nezha-gpt/vocab.txt'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

label_en2zh = {
    '银行': '银行',
    '社区服务': '社区',
    '电商': '电商',
    '支付': '支付',
    '经营养成': '经营',
    '卡牌': '卡牌',
    '借贷': '借贷',
    '驾校': '驾校',
    '理财': '理财',
    '职考': '职考',
    '新闻': '新闻',
    '旅游资讯': '旅游',
    '公共交通': '交通',
    '魔幻': '魔幻',
domain_label2id, domain_id2label, intent_label2id, intent_id2label, slot_label2id, slot_id2label = json.load(
    open('data/labels.json', 'r', encoding='utf-8'))

# load the data
data = json.load(open('data/train.json', 'r', encoding='utf-8'))
random.shuffle(data)
valid_data = data[:len(data) // 8]
train_data = data[len(data) // 8:]

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=vocab_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])
tokenizer = Tokenizer(token_dict)


# data iterator
class MyDataGenerator(DataGenerator):
    def __init__(self, data, batch_size=32, buffer_size=None):
        super(MyDataGenerator, self).__init__(data, batch_size, buffer_size)

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, Y1, Y2, Y3 = [], [], [], [], []
        for is_end, item in self.sample(random):
            text = item['text']
            token_ids, segment_ids = tokenizer.encode(first_text=text.lower())
valid_data = load_data('datasets/opposts/dev_32.json')
test_data = load_data('datasets/opposts/test.json')

# simulate labeled and unlabeled data
train_frac = 1  # 0.01  # fraction of labeled data
print("0.length of train_data:", len(train_data))  # 16883
num_labeled = int(len(train_data) * train_frac)
# unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
print("1.num_labeled data used:", num_labeled, " ;train_data:", len(train_data))  # 168
# train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description
mask_idx = 1  # 5
unused_length = 9
# desc: ['[unused1]', '[unused2]', ..., '[unused8]']
desc = ['[unused%s]' % i for i in range(1, unused_length)]
# after the insert, desc: ['[MASK]', '[unused1]', '[unused2]', ..., '[unused8]']
desc.insert(mask_idx - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids
pos_id = tokenizer.token_to_id(u'很')  # id of the positive token, default u'很'
neg_id = tokenizer.token_to_id(
from window_layers import WindowEmbedding, WindowEmbeddingforword

epochs = 500
batch_size = 1024
# bert_layers = 12
learing_rate = 1e-4  # the smaller bert_layers is, the larger the learning rate should be
seq_crf_lr_multiplier = 1e-2  # enlarge the CRF layer's learning rate when necessary
tag_crf_lr_multiplier = 1e-2
vocab_size = 21128

# bert config
# config_path = '../../Q_A/publish/bert_config.json'
# checkpoint_path = '../../Q_A/publish/bert_model.ckpt'
dict_path = '../../Q_A/publish/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=False)

labels = [
    'O', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R20', 'R21', 'R22', 'R23',
    'R24', 'R25', 'R30', 'R31', 'R90', 'R99', 'X'
]
seg_labels = ['O', 'B', 'I', 'E']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels)
id2seglabel = dict(enumerate(seg_labels))
seglabel2id = {j: i for i, j in id2seglabel.items()}
num_seglabels = len(seg_labels)
valid_datas = []
for i in range(5):
    valid_data = load_data('{}/dev_{}.json'.format(path, str(i)))
    valid_datas.append(valid_data)
test_data = load_data('{}/test_public.json'.format(path))

# simulate labeled and unlabeled data
train_frac = 1  # 0.01  # fraction of labeled data
print("0.length of train_data:", len(train_data))  # 16883
num_labeled = int(len(train_data) * train_frac)
train_data = train_data[:num_labeled]
print("1.num_labeled data used:", num_labeled, " ;train_data:", len(train_data))  # 168
# train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description
mask_idxs = [1, 2]  # [7, 8]
# mask_idx = 1  # 5
unused_length = 9  # 6 and 13 gave no improvement
# desc: ['[unused1]', '[unused2]', ..., '[unused8]']
desc = ['[unused%s]' % i for i in range(1, unused_length)]
for mask_id in mask_idxs:
    # after the inserts, desc: ['[MASK]', '[MASK]', '[unused1]', ..., '[unused8]']
    desc.insert(mask_id - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids
# pos_id = tokenizer.token_to_id(u'很')  # id of the positive token, default u'很'
# neg_id = tokenizer.token_to_id(u'不')  # id of the negative token, default u'不'


def random_masking(token_ids):
    """Apply masking to the input
dict_path = os.path.join(base_model_path, 'vocab.txt')

# data format to load
# {"id": 16, "content": "你也不用说对不起,只是,,,,若相惜", "label": "sadness"}
label_list = ['like', 'happiness', 'sadness', 'anger', 'disgust']
label_en2zh_dict = {
    'like': '喜欢',
    "happiness": "开心",
    "sadness": "伤心",
    "anger": "愤怒",
    "disgust": "厌恶"
}
label_zh_list = [label_en2zh_dict[label_en] for label_en in label_list]
label2index = {label: i for i, label in enumerate(label_list)}

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# label2tokenid_dict = {'like': [like_id_1, like_id_2], 'happiness': [happiness_id_1, happiness_id_2],
#                       'sadness': [sadness_id_1, sadness_id_2], 'anger': [anger_id_1, anger_id_2],
#                       'disgust': [disgust_id_1, disgust_id_2]}
label2tokenid_dict = {}  # e.g. {'neutral': [neutral_id_1, neutral_id_2], ...}
for label_en in label_list:  # e.g. label_en = 'neutral'
    label_zh = label_en2zh_dict[label_en]
    char_id_list = []
    for index, char_zh in enumerate(label_zh):
        char_id_list.append(tokenizer.token_to_id(char_zh))
    label2tokenid_dict[label_en] = char_id_list  # e.g. 'neutral': [neutral_id_1, neutral_id_2]
print(
    "###label2tokenid_dict:", label2tokenid_dict
# load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corresponding task description
mask_idx = 5
desc = ['[unused%s]' % i for i in range(1, 9)]
desc.insert(mask_idx - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """Apply random masking to the input."""
    rands = np.random.random(len(token_ids))
    source, target = [], []
import jieba
from roformer import RoFormerTokenizer
from bert4keras.tokenizers import Tokenizer

dict_path = 'pretrained_models/chinese_roformer_base'
text = "12312格ab局A B cdA,.567 861351 684!今天萨达天 气非常好王企。文保鹅按时发放了的撒这些seqetvgsa国内拉手的喀什。、]P[,./()*7656&【;,‘"
# text = "这里基本保留了唐宋遗留下来的坊巷格局和大量明清古建筑,其中各级文保单位29处,被誉为“里坊制度的活化石”“明清建筑博物馆”!"

bert4keras_tokenizer = Tokenizer(
    dict_path + "/vocab.txt",
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False))
roformer_tokenizer = RoFormerTokenizer.from_pretrained(dict_path)

bert4keras_tokenizer_input_ids = bert4keras_tokenizer.encode(text)[0]
roformer_tokenizer_input_ids = roformer_tokenizer.encode(text)
print(bert4keras_tokenizer_input_ids == roformer_tokenizer_input_ids)
from bert4keras.optimizers import Adam
from bert4keras.backend import K, batch_gather, keras
from bert4keras.layers import LayerNormalization
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from keras.layers import *
from keras.models import Model
# import os, sys
# sys.path.append(os.getcwd())
from test4nlp.event_extract_chq.config import event_extract_config as Config
from test4nlp.event_extract_chq.train.utils.data_utils import get_data, data_generator
from tqdm import tqdm
import json
import numpy as np

train_data, valid_data = get_data(Config.read_data_path)

# build the tokenizer
tokenizer = Tokenizer(Config.dict_path, do_lower_case=True)


def extrac_trigger(inputs):
    """Take the trigger's vector representation out of the output according to trigger_ids."""
    output, trigger_ids = inputs
    trigger_ids = K.cast(trigger_ids, 'int32')
    start = batch_gather(output, trigger_ids[:, :1])
    end = batch_gather(output, trigger_ids[:, 1:])
    trigger = K.concatenate([start, end], 2)
    return trigger[:, 0]


def build_model():
    bert_model = build_transformer_model(
        config_path=Config.config_path,
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # the length must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# vocab and tokenizer of the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# drop the counts and keep only the token list
tokens = [token for token, count in tokens]

# build the new token->id mapping and the new vocab
keep_tokens = [2] * 106 + keep_tokens
keep_tokens_inv = {j: i for i, j in enumerate(keep_tokens)}
compound_tokens = []
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if t not in new_token_dict:
        new_token_dict[t] = len(new_token_dict)
        ids = [keep_tokens_inv.get(i, 0) for i in sp_tokenizer.encode(t)[0]]
        compound_tokens.append(ids)

save_vocab(dict_path_2, new_token_dict)

# build the tokenizer
tokenizer = Tokenizer(
    new_token_dict,
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def corpus():
    """Corpus generator."""
    while True:
        f = '/root/data_pretrain/data_shuf.json'
        with open(f) as f:
            for l in f:
                l = json.loads(l)
                for texts in text_process(l['text']):
                    yield texts
# read the schema
with open('/home/ycy/HBT/data/schema.json', encoding='utf8') as f:
    id2predicate, predicate2id, n = {}, {}, 0
    predicate2type = {}
    for l in f:
        l = json.loads(l)
        predicate2type[l['predicate']] = (l['subject_type'], l['object_type'])
        for k, _ in sorted(l['object_type'].items()):
            key = l['predicate'] + '_' + k
            id2predicate[n] = key
            predicate2id[key] = n
            n += 1

# tokenizer = OurTokenizer(vocab_file=BERT_PATH + "vocab.txt")
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
tokenizer_k = Tokenizer(os.path.join(model_path, 'vocab.txt'), do_lower_case=True)


def search(pattern, sequence):
    """Find the subsequence `pattern` inside `sequence`.
    Return the index of the first match, or -1 if it is not found.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


def sequence_padding(inputs, length=None, padding=0):
    """Numpy helper: pad sequences to the same length."""
    if length is None:
        length = max([len(x) for x in inputs])
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


# labeled data
train_data = load_data('../data/ner/train.txt')
valid_data = load_data('../data/ner/dev.txt')
test_data = load_data('../data/ner/test.txt')
# train_data = load_data('../data/example_.train')
# valid_data = load_data('../data/example.dev')
# test_data = load_data('../data/example.test')

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# label mapping
print(labels)
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
import os os.environ["TF_KERAS"] = "1" import tensorflow as tf from bert4keras.snippets import to_array from bert4keras.tokenizers import Tokenizer from luwu.core.models.text_classifier.transformers import TransformerTextClassification # 词汇表地址 dict_path = "" # 训练好的模型保存路径 model_path = "" # 要预测的文本 sentence = "" # 编号->标签的映射 id_label_dict = {0: "类别1", 1: "类别2", 2: "类别3"} # 建立分词器 tokenizer = Tokenizer(dict_path, do_lower_case=True) # 加载模型 model = tf.keras.models.load_model(model_path) # 处理文本数据 token_ids, segment_ids = tokenizer.encode(sentence) token_ids, segment_ids = to_array([token_ids], [segment_ids]) # 预测 outputs = model.predict([token_ids, segment_ids]) index = int(tf.argmax(outputs[0])) print("当前文本类别为:{}".format(id_label_dict[index]))