num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
print("length of unlabeled_data0:", len(unlabeled_data))
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data
print("length of train_data1:", len(train_data))

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern)
prefix = u'相近的两个句子。'  # alternatives: u'相近的两个句子的意思。', u'满意。'
mask_idx = 1

# 0: neutral, 1: entailment, 2: contradiction
neutral_id = tokenizer.token_to_id(u'无')
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')
label_list = ['neutral', 'entailment', 'contradiction']
label2tokenid_dict = {'neutral': neutral_id, 'entailment': pos_id, 'contradiction': neg_id}
label_tokenid_list = [label2tokenid_dict[x] for x in label_list]


def random_masking(token_ids):
    """Randomly mask the input.
    BERT masks 15% of the tokens; unlike an auto-encoder it only predicts the
    masked tokens instead of reconstructing the whole input.
    Of the masked positions: 80% become [MASK], 10% keep the original token,
    10% become a random token.
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
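    # Hedged sketch of the rest of the function (assumed; it follows the
    # standard 80/10/10 scheme described in the docstring and matches the
    # partial loops visible in the other snippets of this collection).
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)  # 80% of masked positions: [MASK]
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)                          # 10% of masked positions: keep the token
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)  # 10%: random token
            target.append(t)
        else:
            source.append(t)                          # unmasked position
            target.append(0)                          # 0 = not predicted
    return source, target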
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# Simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern tokens)
desc = ['[unused%s]' % i for i in range(1, 9)]
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids = []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            # Template: [CLS] + desc[:4] + text + desc[4:] + label token
            # (fixed: the original slice token_ids[:-1] duplicated [CLS] inside the sequence)
            token_ids = token_ids[:1] + desc_ids[:4] + token_ids[1:-1]
            token_ids = token_ids + desc_ids[4:]
            if label == 0:
                token_ids = token_ids + [neg_id]
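            # Hedged sketch of the rest of the generator (assumed, not from the
            # original file): handle the positive label the same way, then
            # collect and yield padded batches. Assumes sequence_padding is
            # imported from bert4keras.snippets.
            elif label == 1:
                token_ids = token_ids + [pos_id]
            batch_token_ids.append(token_ids)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                yield batch_token_ids
                batch_token_ids = []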
parser = argparse.ArgumentParser(description="training set index")
parser.add_argument("--train_set_index", "-t", help="training set index", type=str, default="0")
args = parser.parse_args()
train_set_index = args.train_set_index
assert train_set_index in {"0", "1", "2", "3", "4", "all"}, 'train_set_index must be in {"0", "1", "2", "3", "4", "all"}'

from tqdm import tqdm

config_path = '/home/stark/workdir/language_model/chinese_roberta_wwm_ext_L-12_H-768_A-12/config.json'
checkpoint_path = '/home/stark/workdir/language_model/nezha-gpt/cn_gpt'
dict_path = '/home/stark/workdir/language_model/nezha-gpt/vocab.txt'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

label_dict = {"1": "一", "2": "二", "3": "三", "4": "四", "5": "五", "6": "六", "7": "七"}
labels = [v for k, v in label_dict.items()]
labels_ids = [tokenizer.token_to_id(v) for v in labels]

maxlen = 256
batch_size = 16
num_per_val_file = 42
acc_list = []


# Data loading helper
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for idx, l in enumerate(f):
            # print("l:", l)
            sample = json.loads(l.strip())
            answer = sample["answer"]
            sentence = sample["content"]
            candidates = sample["candidates"]
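            # Hedged sketch of the remaining conversion (assumed; the exact
            # semantics of "answer" depend on the dataset). Here it is treated
            # as a 0-based index into `candidates`, mapped to the Chinese
            # numeral that the MLM head will predict.
            label = label_dict[str(int(answer) + 1)]
            D.append((sentence, candidates, label))
    return D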
label_list = ['neutral', 'entailment', 'contradiction']
# 0: neutral, 1: entailment, 2: contradiction
label_en2zh_dict = {'neutral': '并且', 'entailment': '所以', 'contradiction': '但是'}
label_zh_list = [label_en2zh_dict[label_en] for label_en in label_list]
label2index = {label: i for i, label in enumerate(label_list)}

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Map each English label to the token ids of its two-character Chinese label word,
# e.g. 'neutral': [neutral_id_1, neutral_id_2]
label2tokenid_dict = {}
for label_en in label_list:
    label_zh = label_en2zh_dict[label_en]
    char_id_list = []
    for index, char_zh in enumerate(label_zh):
        char_id_list.append(tokenizer.token_to_id(char_zh))
    label2tokenid_dict[label_en] = char_id_list
# print("###label2tokenid_dict:", label2tokenid_dict)  # {'neutral': [704, 4989], 'entailment': [1259, 1419], 'contradiction': [4757, 4688]}

# One [id_1, id_2] pair per label, in label_list order
label_tokenid_list = [label2tokenid_dict[x] for x in label_list]
token_id_list_1 = [x[0] for x in label_tokenid_list]  # first characters of the label words
token_id_list_2 = [x[1] for x in label_tokenid_list]  # second characters of the label words

# Task description (pattern)
mask_idxs = [1, 2]
unused_length = 9
desc = ['[unused%s]' % i for i in range(1, unused_length)]
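
# A hedged sketch (not from the original file) of how a prediction can be read
# off the two [MASK] positions: restrict each position's MLM distribution to
# the candidate first/second characters and combine them. Assumes numpy is
# imported as np and `y_pred` has shape [seq_len, vocab_size] for one sample.
def decode_two_char_label(y_pred):
    p1 = y_pred[mask_idxs[0], token_id_list_1]  # P(first char of each label word)
    p2 = y_pred[mask_idxs[1], token_id_list_2]  # P(second char of each label word)
    return label_list[int(np.argmax(p1 * p2))]
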
# Simulate labeled and unlabeled data
train_frac = 1  # TODO 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
print("length of unlabeled_data0:", len(unlabeled_data))
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data
print("length of train_data1:", len(train_data))

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern)
prefix = u'很满意。'
mask_idx = 1
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """Randomly mask the input.
    BERT masks 15% of the tokens; unlike an auto-encoder it only predicts the
    masked tokens instead of reconstructing the whole input.
    Of the masked positions: 80% become [MASK], 10% keep the original token,
    10% become a random token.
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
# train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern)
mask_idx = 1  # 5
unused_length = 9
desc = ['[unused%s]' % i for i in range(1, unused_length)]
# desc: ['[unused1]', '[unused2]', ..., '[unused8]']
desc.insert(mask_idx - 1, '[MASK]')
# desc: ['[MASK]', '[unused1]', '[unused2]', ..., '[unused8]']
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids
pos_id = tokenizer.token_to_id(u'很')  # id of the positive label token (default u'很'; an [unused] token such as '[unused9]' also works)
neg_id = tokenizer.token_to_id(u'不')  # id of the negative label token (default u'不'; an [unused] token such as '[unused10]' also works)


def random_masking(token_ids):
    """Mask the input.
    BERT masks 15% of the tokens; unlike an auto-encoder it only predicts the
    masked tokens instead of reconstructing the whole input.
    Of the masked positions: 80% become [MASK], 10% keep the original token,
    10% become a random token.
    """
    rands = np.random.random(len(token_ids))  # uniform values in [0, 1), one per token
# Load CDial-GPT with bert4keras
# https://github.com/bojone/CDial-GPT-tf
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout

config_path = '/root/kg/bert/GPT_LCCC-base-tf/gpt_config.json'
checkpoint_path = '/root/kg/bert/GPT_LCCC-base-tf/gpt_model.ckpt'
dict_path = '/root/kg/bert/GPT_LCCC-base-tf/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
speakers = [
    tokenizer.token_to_id('[speaker1]'),
    tokenizer.token_to_id('[speaker2]')
]

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt_openai'
)  # build the model and load the weights


class ChatBot(AutoRegressiveDecoder):
    """Chatbot based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        curr_segment_ids = np.zeros_like(output_ids) + token_ids[0, -1]
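        # Hedged sketch of the rest of predict (assumed; it mirrors the
        # AutoRegressiveDecoder usage in the roformer snippet later in this
        # collection): append the generated ids, extend the segment ids with
        # the current speaker, and return the distribution for the last position.
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, curr_segment_ids], 1)
        return self.last_token(model).predict([token_ids, segment_ids])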
val_mask_idxs = [d[2] for d in valid_data]
test_mask_idxs = [d[2] for d in test_data]
val_labels_list = [encode_candidates(d[3]) for d in valid_data]
test_labels_list = [encode_candidates(d[3]) for d in test_data]
# train_data = train_data + unlabeled_data

# Task description (pattern)
# mask_idxs = [5, 6, 7, 8]  # [7, 8]
# mask_idx = 1  # 5
unused_length = 1  # do not add [unused] tokens; each one added costs a few points of accuracy
desc = ['[unused%s]' % i for i in range(1, unused_length)]
# desc: [] (empty, since unused_length == 1)
# for mask_id in mask_idxs:
#     desc.insert(mask_id - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids


def random_masking(token_ids):
    """Mask the input.
    BERT masks 15% of the tokens; unlike an auto-encoder it only predicts the
    masked tokens instead of reconstructing the whole input.
    Of the masked positions: 80% become [MASK], 10% keep the original token,
    10% become a random token.
    """
    rands = np.random.random(len(token_ids))  # uniform values in [0, 1), one per token
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:  # 80% of masked positions use [MASK]
            source.append(tokenizer._token_mask_id)
            target.append(t)
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='roformer',
    application='lm',
)  # build the model and load the weights


class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        segment_ids = np.zeros_like(token_ids)
        return self.last_token(model).predict([token_ids, segment_ids])

    def generate(self, text, n=1, topp=0.95):
        token_ids = tokenizer.encode(text)[0][:-1]
        results = self.random_sample([token_ids], n, topp=topp)  # random (nucleus) sampling
        return [text + tokenizer.decode(ids) for ids in results]


article_completion = ArticleCompletion(
    start_id=None,
    end_id=tokenizer.token_to_id(u'。'),
    maxlen=256,
    minlen=128
)
print(article_completion.generate(u'今天天气不错'))
# train_frac = 1  # TODO 0.01  # fraction of labeled data
# num_labeled = int(len(train_data) * train_frac)
# unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
# print("length of unlabeled_data0:", len(unlabeled_data))
# train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data
# print("length of train_data1:", len(train_data))

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern)
prefix = u'相近的两个句子。'
mask_idx = 1
neg_id = tokenizer.token_to_id(u'无')
pos_id = tokenizer.token_to_id(u'很')
label_list = ['false', 'true']  # 0: false, 1: true
label2tokenid_dict = {'false': neg_id, 'true': pos_id}
label_tokenid_list = [label2tokenid_dict[x] for x in label_list]


def random_masking(token_ids):
    """Randomly mask the input.
    BERT masks 15% of the tokens; unlike an auto-encoder it only predicts the
    masked tokens instead of reconstructing the whole input.
    Of the masked positions: 80% become [MASK], 10% keep the original token,
    10% become a random token.
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
"""
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
import os
from config import *
import json

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern)
desc = ['[unused%s]' % i for i in range(1, 9)]
desc_ids = [tokenizer.token_to_id(t) for t in desc]
mask_id = tokenizer.token_to_id("[MASK]")

current_task_name = TaskName.CECMMNT


def get_labels_ids():
    # print("TASK_NAME:", TASK_NAME, "-------------------")
    label_words = labels_map[current_task_name]
    labels_ids = []
    for words in label_words:
        ids = []
        for w in words:
            ids.append(tokenizer.token_to_id(w))
        labels_ids.append(ids)
    # Package the prediction results into a zip (this sits in the `if` branch
    # whose condition appears earlier in the file)
    z = zipfile.ZipFile('result.zip', 'w', zipfile.ZIP_DEFLATED)
    startdir = "./result"
    for dirpath, dirnames, filenames in os.walk(startdir):
        for filename in filenames:
            z.write(os.path.join(dirpath, filename))
    z.close()
else:
    print("测试单条样本")  # "Test a single sample"
    text = " 每粒装0.4g 口服。经期或经前5天一次3~5粒,一日3次,经后可继续服用,一次3~5粒,一日2~3次。 通调气血,止痛调经。用于经期腹痛及因寒所致的月经失调 广东和平药业有限公司 用于经期腹痛及因寒冷所致的月经失调 尚不明确。 尚不明确。 非处方药物(甲类),国家医保目录(乙类) "
    query = "找出头晕,心悸,小腹胀痛等疾病症状"
    label = "SYMPTOM"
    total_model.load_weights('best_model.weights')

    token_ids = []
    segment_ids = []
    token_ids.append(tokenizer.token_to_id("[CLS]"))
    segment_ids.append(0)
    for i in query:
        token_ids.append(tokenizer.token_to_id(i))
        segment_ids.append(0)
    token_ids.append(tokenizer.token_to_id("[SEP]"))
    segment_ids.append(0)
    for i in text:
        token_ids.append(tokenizer.token_to_id(i))
        segment_ids.append(1)  # fixed: was tokenizer.token_to_id(1); the passage belongs to segment 1
    token_ids.append(tokenizer.token_to_id("[SEP]"))
    segment_ids.append(1)
    mask = [0] + [0] * len(query) + [0] + [1] * len(text) + [0]

    start_logits, end_logits = model.predict(
        [np.array([token_ids]),
         np.array([segment_ids])])  # hedged completion of the truncated call; the original may also feed `mask`
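    # A hedged decoding sketch (assumed, not from the original file): treat the
    # logits as per-token scores of shape [1, seq_len] and map the best
    # start/end back to character offsets in `text` ([CLS] + query + [SEP]
    # precede the passage, hence the offset).
    offset = len(query) + 2
    start = int(np.argmax(start_logits[0]))
    end = int(np.argmax(end_logits[0]))
    if offset <= start <= end < offset + len(text):
        print("predicted span:", text[start - offset:end - offset + 1])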
base_model_path = sys.argv[3]
output_model_path = sys.argv[4]
mode = sys.argv[5]
config_path = os.path.join(base_model_path, 'bert_config.json')
checkpoint_path = os.path.join(base_model_path, 'bert_model.ckpt')
dict_path = os.path.join(base_model_path, 'vocab.txt')

# Data loading helper
# Sample record: {"id": 16, "content": "你也不用说对不起,只是,,,,若相惜", "label": "sadness"}
label_list = ['like', 'happiness', 'sadness', 'anger', 'disgust']
label2index = {label: i for i, label in enumerate(label_list)}

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Map each emotion label to a reserved [unused] token id used as its label word
like_id = tokenizer.token_to_id(u'[unused10]')
happiness_id = tokenizer.token_to_id(u'[unused11]')
sadness_id = tokenizer.token_to_id(u'[unused12]')
anger_id = tokenizer.token_to_id(u'[unused13]')
disgust_id = tokenizer.token_to_id(u'[unused14]')
label2tokenid_dict = {
    'like': like_id,
    'happiness': happiness_id,
    'sadness': sadness_id,
    'anger': anger_id,
    'disgust': disgust_id
}
# train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (pattern)
mask_idx = 1  # 5
unused_length = 9
desc = ['[unused%s]' % i for i in range(1, unused_length)]
# desc: ['[unused1]', '[unused2]', ..., '[unused8]']
desc.insert(mask_idx - 1, '[MASK]')
# desc: ['[MASK]', '[unused1]', '[unused2]', ..., '[unused8]']
desc_ids = [tokenizer.token_to_id(t) for t in desc]  # convert tokens to ids
# pos_id = tokenizer.token_to_id(u'很')
# neg_id = tokenizer.token_to_id(u'不')
neutral_id = tokenizer.token_to_id(u'无')
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """Mask the input.
    BERT masks 15% of the tokens; unlike an auto-encoder it only predicts the
    masked tokens instead of reconstructing the whole input.
    Of the masked positions: 80% become [MASK], 10% keep the original token,
    10% become a random token.
    """
    rands = np.random.random(len(token_ids))
from bert4keras.tokenizers import Tokenizer, load_vocab
import json
import numpy as np

dict_path = "vocab.txt"
tokenizer = Tokenizer(load_vocab(dict_path))
maskID = tokenizer.token_to_id(tokenizer._token_mask)


def write_Json(content, fileName):
    with open(fileName, "w") as f:
        json.dump(content, f, indent=2)


def read_json(fileName):
    with open(fileName, "r") as fp:
        return json.load(fp)


def cal_mask(inputs, corrupts, labels):
    """Return 1.0 at positions where labels == 1 and inputs agree with corrupts."""
    assert inputs.shape == corrupts.shape and corrupts.shape == labels.shape
    masked = (labels == 1)
    correct = (inputs == corrupts)
    masked = masked.astype(float)    # np.float was removed in recent NumPy versions
    correct = correct.astype(float)
    mask = masked * correct
    return mask
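
# A small usage sketch with toy ids (assumed, not from the original file):
# position 1 is flagged by `labels` and unchanged between inputs and corrupts,
# position 2 is flagged but differs, so only position 1 survives in the mask.
if __name__ == "__main__":
    inputs = np.array([[101, 2769, 4263, 102]])
    corrupts = np.array([[101, 2769, 3299, 102]])
    labels = np.array([[0, 1, 1, 0]])
    print(cal_mask(inputs, corrupts, labels))  # [[0. 1. 0. 0.]]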