def __iter__(self, random=False): """标签含义 0: 单字词; 1: 多字词首字; 2: 多字词中间; 3: 多字词末字 """ batch_token_ids, batch_segment_ids, batch_labels = [], [], [] for is_end, item in self.sample(random): token_ids, labels = [tokenizer._token_start_id], [0] for w in item: w_token_ids = tokenizer.encode(w)[0][1:-1] if len(token_ids) + len(w_token_ids) < maxlen: token_ids += w_token_ids if len(w_token_ids) == 1: labels += [0] else: labels += [1] + [2] * (len(w_token_ids) - 2) + [3] else: break token_ids += [tokenizer._token_end_id] labels += [0] segment_ids = [0] * len(token_ids) batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) batch_labels.append(labels) if len(batch_token_ids) == self.batch_size or is_end: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) batch_labels = sequence_padding(batch_labels) yield [batch_token_ids, batch_segment_ids], batch_labels batch_token_ids, batch_segment_ids, batch_labels = [], [], []
def __iter__(self, random=False):
    # TODO: here `random` means whether to apply random masking to the original text.
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if label != 2:
            # token_ids[:1] is [CLS]; insert the pattern (desc_ids) right after it.
            token_ids = token_ids[:1] + desc_ids + token_ids[1:]
            segment_ids = [0] * len(desc_ids) + segment_ids
        if random:  # currently unused
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        # tokenizer.encode(label)[0][1:-1] would map a label text such as "农业" to its
        # vocabulary ids, e.g. [1093, 689], since tokenizer.encode(label) returns
        # ([101, 1093, 689, 102], [0, 0, 0, 0]) = [CLS, 农, 业, SEP].
        for i, mask_id in enumerate(mask_idxs):
            source_ids[mask_id] = tokenizer._token_mask_id
            # token_to_id gives the same per-character result as tokenizer.encode.
            target_ids[mask_id] = tokenizer.token_to_id(label[i])
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            # padding
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

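# random_masking is referenced above (and in several generators below) but not defined
# in these snippets. A plausible sketch following the standard BERT MLM recipe
# (15% of positions: 80% -> [MASK], 10% -> random token, 10% unchanged); the actual
# implementation in the source project may differ. It assumes the global `tokenizer`.
import numpy as np

def random_masking(token_ids):
    """Randomly mask the input; returns (source, target), with target = 0
    at positions that carry no MLM loss."""
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)  # replace with [MASK]
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)                         # keep the original token
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)  # random token
            target.append(t)
        else:
            source.append(t)
            target.append(0)                         # no loss at this position
    return source, target
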
def __iter__(self, random=False): """单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP] """ batch_token_ids, batch_segment_ids = [], [] for is_end, D in self.sample(random): question = D['question'] answers = [p['answer'] for p in D['passages'] if p['answer']] passage = np.random.choice(D['passages'])['passage'] passage = re.sub(u' |、|;|,', ',', passage) final_answer = '' for answer in answers: if all( [a in passage[:max_p_len - 2] for a in answer.split(' ')]): final_answer = answer.replace(' ', ',') break qa_token_ids, qa_segment_ids = tokenizer.encode( question, final_answer, max_length=max_qa_len + 1) p_token_ids, p_segment_ids = tokenizer.encode(passage, max_length=max_p_len) token_ids = p_token_ids + qa_token_ids[1:] segment_ids = p_segment_ids + qa_segment_ids[1:] batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) if len(batch_token_ids) == self.batch_size or is_end: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) yield [batch_token_ids, batch_segment_ids], None batch_token_ids, batch_segment_ids = [], []
def __iter__(self, random=False): """标签含义 0: 单字词; 1: 多字词首字; 2: 多字词中间; 3: 多字词末字 """ idxs = list(range(len(self.data))) if random: np.random.shuffle(idxs) batch_token_ids, batch_segment_ids, batch_labels = [], [], [] for i in idxs: token_ids, labels = [tokenizer._token_cls_id], [0] for w in self.data[i]: w_token_ids = tokenizer.encode(w)[0][1:-1] if len(token_ids) + len(w_token_ids) < maxlen: token_ids += w_token_ids if len(w_token_ids) == 1: labels += [0] else: labels += [1] + [2] * (len(w_token_ids) - 2) + [3] else: break token_ids += [tokenizer._token_sep_id] labels += [0] segment_ids = [0] * len(token_ids) batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) batch_labels.append(labels) if len(batch_token_ids) == self.batch_size or i == idxs[-1]: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) batch_labels = sequence_padding(batch_labels) yield [batch_token_ids, batch_segment_ids], batch_labels batch_token_ids, batch_segment_ids, batch_labels = [], [], []
def __iter__(self, random=True): """ 单条样本格式: [cls]错误词汇[sep][mask][mask]..[sep] :param random: :return: """ batch_tokens_ids, batch_segment_ids, batch_right_token_ids = [], [], [] for is_end, D in self.sample(random): wrong, right = D right_token_ids, _ = tokenizer.encode(first_text=right) wrong_token_ids, _ = tokenizer.encode(first_text=wrong) token_ids = wrong_token_ids token_ids += [tokenizer._token_mask_id] * max_len token_ids += [tokenizer._token_end_id] segemnt_ids = [0] * len(token_ids) batch_tokens_ids.append(token_ids) batch_segment_ids.append(segemnt_ids) batch_right_token_ids.append(right_token_ids[1:]) if len(batch_tokens_ids) == self.batch_size or is_end: batch_tokens_ids = sequence_padding(batch_tokens_ids) batch_segment_ids = sequence_padding(batch_segment_ids) batch_right_token_ids = sequence_padding( batch_right_token_ids, max_len) yield [batch_tokens_ids, batch_segment_ids], batch_right_token_ids batch_tokens_ids, batch_segment_ids, batch_right_token_ids = [], [], []
def __iter__(self, random=False):
    batch_ori_sentence = []
    batch_token_ids, batch_segment_ids = [], []
    batch_start, batch_end, batch_insert_pos, batch_start_ner, batch_end_ner = [], [], [], [], []
    for is_end, d in self.sample(random):
        ori_sentence, sentence, token_type, pointer = (
            d["ori_sentence"], d["sentence"], d["token_type"], d["pointer"])
        token_ids, segment_ids = sentence, token_type
        batch_ori_sentence.append(ori_sentence)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        start, end, insert_pos, start_ner, end_ner = pointer
        batch_start.append(start)
        batch_end.append(end)
        batch_insert_pos.append(insert_pos)
        batch_start_ner.append(start_ner)
        batch_end_ner.append(end_ner)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_ori_sentence = np.array(batch_ori_sentence)
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids, padding=1)
            batch_start = np.array(batch_start)
            batch_end = np.array(batch_end)
            batch_insert_pos = np.array(batch_insert_pos)
            batch_start_ner = np.array(batch_start_ner)
            batch_end_ner = np.array(batch_end_ner)
            yield [
                batch_token_ids, batch_segment_ids, batch_start, batch_end,
                batch_insert_pos, batch_start_ner, batch_end_ner
            ], batch_ori_sentence
            batch_ori_sentence = []
            batch_token_ids, batch_segment_ids = [], []
            batch_start, batch_end, batch_insert_pos, batch_start_ner, batch_end_ner = [], [], [], [], []

def evaluate_report(df_data):
    model = tf.keras.models.load_model('{}-model.h5'.format(model_name))
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer once, not per sample
    true_y_list = [i for i in df_data["new_label"].tolist()]
    pred_y_list = []
    for text in df_data["text"].tolist():
        token_ids, segment_ids = tokenizer.encode(first_text=text, maxlen=maxlen)
        token_list = sequence_padding([token_ids])
        segment_list = sequence_padding([segment_ids])
        label = model.predict([np.array(token_list), np.array(segment_list)]).argmax(axis=1)
        pred_y_list.append(label[0])
    with open("label.json", "r", encoding="utf-8") as f:
        labels = json.loads(f.read())
    target_name_list = list(labels.values())
    report = classification_report(true_y_list, pred_y_list,
                                   target_names=target_name_list,
                                   digits=4, output_dict=True)
    print(report)
    df = pd.DataFrame(report).transpose()
    df.to_csv("{}-report.csv".format(model_type), encoding='utf_8_sig', index=True)

def gen_synonyms(text, n=100, k=20):
    """Generate n paraphrase candidates for `text`, then return the k most similar ones.

    Method: generate with the seq2seq model, then score similarity with the encoder
    and sort.

    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]
    r = [text] + r
    X, S = [], []
    for t in r:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = sequence_padding(X)
    S = sequence_padding(S)
    Z = encoder.predict([X, S])
    # L2-normalize so the dot product below equals cosine similarity.
    Z /= (Z**2).sum(axis=1, keepdims=True)**0.5
    argsort = np.dot(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]

def __iter__(self, r=False):
    idxs = list(range(len(self.data)))
    np.random.shuffle(idxs)  # samples are always shuffled here, regardless of `r`
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for i in idxs:
        line = self.data.loc[i]
        # Randomly swap the two sentences as a simple augmentation.
        if random.random() < 0.5:
            s1 = line['s1'].replace('***', '*')
            s2 = line['s2'].replace('***', '*')
        else:
            s2 = line['s1'].replace('***', '*')
            s1 = line['s2'].replace('***', '*')
        token_ids, segment_ids = tokenizer.encode(s1, s2, max_length=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([line['label']])
        if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids, batch_labels], None
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []

def __iter__(self, _):
    batch_token_ids, batch_segment_ids = [], []
    fpI = open(self.data + '/in.txt', 'r', encoding='utf-8')
    fpO = open(self.data + '/out.txt', 'r', encoding='utf-8')
    for lineI in fpI:
        lineI = lineI.rstrip()
        lineO = fpO.readline().rstrip()
        # Add both (in, out) and (out, in) orderings to the batch.
        token_ids, segment_ids = tokenizer.encode(lineI, lineO, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        token_ids, segment_ids = tokenizer.encode(lineO, lineI, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        if len(batch_token_ids) == self.batch_size:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            yield [batch_token_ids, batch_segment_ids], None
            batch_token_ids, batch_segment_ids = [], []
    fpI.close()
    fpO.close()
    # Flush the final (possibly partial) batch.
    if batch_token_ids:
        batch_token_ids = sequence_padding(batch_token_ids)
        batch_segment_ids = sequence_padding(batch_segment_ids)
        yield [batch_token_ids, batch_segment_ids], None

def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        if len(label) == 2:
            text = prefix + text
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if random:
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        if len(label) == 2:
            label_ids = tokenizer.encode(label)[0][1:-1]
            for i, j in zip(mask_idxs, label_ids):
                source_ids[i] = tokenizer._token_mask_id
                target_ids[i] = j
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids = [], []
    batch_output_ids, batch_labels = [], []
    for is_end, (question, equation, answer) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(question, equation, maxlen=maxlen)
        idx = token_ids.index(tokenizer._token_end_id) + 1
        masked_token_ids = random_masking(token_ids)
        source_labels, target_labels = masked_token_ids[:idx], token_ids[idx:]
        labels = source_labels + target_labels[1:]
        batch_token_ids.append(masked_token_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(token_ids)
        batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [
                batch_token_ids, batch_segment_ids,
                batch_output_ids, batch_labels
            ], None
            batch_token_ids, batch_segment_ids = [], []
            batch_output_ids, batch_labels = [], []

def __iter__(self, random=False):
    # TODO: here `random` means whether to apply random masking to the original text.
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if label != 2:
            token_ids = token_ids[:1] + desc_ids + token_ids[1:]
            segment_ids = [0] * len(desc_ids) + segment_ids
        if random:  # currently unused
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        # 0: neutral, 1: entailment, 2: contradiction
        if label == 0:
            source_ids[mask_idx] = tokenizer._token_mask_id
            target_ids[mask_idx] = neutral_id
        elif label == 1:
            source_ids[mask_idx] = tokenizer._token_mask_id
            target_ids[mask_idx] = pos_id
        elif label == 2:
            source_ids[mask_idx] = tokenizer._token_mask_id
            target_ids[mask_idx] = neg_id
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            # padding
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

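# desc_ids, mask_idx, neutral_id, pos_id and neg_id above are globals defined elsewhere
# in the script. A purely hypothetical setup consistent with how they are used here
# (the actual pattern text, mask position and label characters may differ):
desc = u'下面两句话是什么关系?'                  # hypothetical pattern inserted after [CLS]
desc_ids = tokenizer.encode(desc)[0][1:-1]       # pattern token ids without [CLS]/[SEP]
mask_idx = 1                                     # hypothetical position of the [MASK] token
neutral_id = tokenizer.token_to_id(u'或')        # character predicted for "neutral"
pos_id = tokenizer.token_to_id(u'是')            # character predicted for "entailment"
neg_id = tokenizer.token_to_id(u'非')            # character predicted for "contradiction"
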
def data_score(self, text_path):
    time_start = time.time()
    # Accuracy on the test set.
    _, _, _, test_data = json_data_process(text_path)
    y_pred = []
    y_true = []
    for text, label in test_data:
        y_true.append(self.index2label[label])
        token_ids, segment_ids = self.tokenizer.encode(text, maxlen=self.max_len)
        token_ids = sequence_padding([token_ids], length=self.max_len)
        segment_ids = sequence_padding([segment_ids], length=self.max_len)
        pre = self.model.predict([token_ids, segment_ids])
        pred = str(np.argmax(pre[0]))
        y_pred.append(self.index2label[pred])
    print("data pred ok!")
    # Evaluation report.
    target_names = [str(label) for label in self.labels]
    report_predict = classification_report(y_true, y_pred,
                                           target_names=target_names, digits=9)
    print(report_predict)
    print("Elapsed time: " + str(time.time() - time_start))

def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if label != 2:
            token_ids = token_ids[:1] + desc_ids + token_ids[1:]
            segment_ids = [0] * len(desc_ids) + segment_ids
        if random:  # currently unused
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        # Map the label name directly to its vocabulary token id, e.g.
        # label2tokenid_dict = {'like': like_id, 'happiness': happiness_id,
        #                       'sadness': sadness_id, 'anger': anger_id, 'disgust': disgust_id}
        source_ids[mask_idx] = tokenizer._token_mask_id
        target_ids[mask_idx] = label2tokenid_dict[label]
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            # padding
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

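# label2tokenid_dict above maps each emotion label name to the vocabulary id of one
# representative character. A hypothetical construction for illustration; the characters
# actually chosen in the source project may differ:
label2char = {
    'like': u'喜', 'happiness': u'乐', 'sadness': u'悲',
    'anger': u'怒', 'disgust': u'厌',
}
label2tokenid_dict = {
    label: tokenizer.token_to_id(char) for label, char in label2char.items()
}
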
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if label != 2:
            token_ids = token_ids[:1] + desc_ids + token_ids[1:]
            segment_ids = [0] * len(desc_ids) + segment_ids
        if random:
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        if label == 0:
            source_ids[mask_idx] = tokenizer._token_mask_id
            target_ids[mask_idx] = neg_id
        elif label == 1:
            source_ids[mask_idx] = tokenizer._token_mask_id
            target_ids[mask_idx] = pos_id
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

def __iter__(self, random=False): """单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP] """ idxs = list(range(len(self.data))) if random: np.random.shuffle(idxs) batch_token_ids, batch_segment_ids = [], [] for i in idxs: D = self.data[i] question = ''.join(D['question_tokens']) question = re.sub(u' |、|;|,', ',', question)[:max_q_len] start = np.argwhere(D['answer_feature'] == 1)[0][0] end = np.argwhere(D['answer_feature'] == 1)[0][-1] answer = ''.join(D['passage_tokens'][start:end + 1]) answer = re.sub(u' |、|;|,', ',', answer)[:max_a_len] passage = ''.join(D['passage_tokens']) passage = re.sub(u' |、|;|,', ',', passage) qa_token_ids, qa_segment_ids = tokenizer.encode( answer, question, max_length=max_qa_len + 1) p_token_ids, p_segment_ids = tokenizer.encode(passage, max_length=max_p_len) token_ids = p_token_ids + qa_token_ids[1:] segment_ids = p_segment_ids + qa_segment_ids[1:] batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) if len(batch_token_ids) == self.batch_size or i == idxs[-1]: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) yield [batch_token_ids, batch_segment_ids], None batch_token_ids, batch_segment_ids = [], []
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids = [], []
    for is_end, d in self.sample(random):
        text, synonyms = d['text'], d['synonyms']
        synonyms = [text] + synonyms
        np.random.shuffle(synonyms)
        text, synonym = synonyms[:2]
        text, synonym = truncate(text), truncate(synonym)
        self.some_samples.append(text)
        if len(self.some_samples) > 1000:
            self.some_samples.pop(0)
        # Add sentence A and sentence B to the sequence in order; both
        # [CLS] SENT_a [SEP] SENT_b [SEP] and [CLS] SENT_b [SEP] SENT_a [SEP]
        # are used for training.
        token_ids, segment_ids = tokenizer.encode(text, synonym, max_length=maxlen * 2)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        token_ids, segment_ids = tokenizer.encode(synonym, text, max_length=maxlen * 2)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            yield [batch_token_ids, batch_segment_ids], None
            batch_token_ids, batch_segment_ids = [], []

def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if label != 2:
            token_ids = token_ids[:1] + desc_ids + token_ids[1:]
            segment_ids = [0] * len(desc_ids) + segment_ids
        if random:  # currently unused
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        # Fill both mask positions with [MASK] in the source.
        source_ids[mask_idxs[0]] = tokenizer._token_mask_id
        source_ids[mask_idxs[1]] = tokenizer._token_mask_id
        # Look up the two gold label characters, e.g. target_id_1 = 839 ("伤")
        # and target_id_2 = 2552 ("心") for the label "伤心".
        target_id_1 = label2tokenid_dict[label][0]
        target_id_2 = label2tokenid_dict[label][1]
        target_ids[mask_idxs[0]] = target_id_1  # correct character for the first [MASK]
        target_ids[mask_idxs[1]] = target_id_2  # correct character for the second [MASK]
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            # padding
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

def __iter__(self, random=False):
    idxs = list(range(len(self.data)))
    if random:
        np.random.shuffle(idxs)
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for i in idxs:
        token_ids, labels = [tokenizer._token_cls_id], [0]
        for w, l in self.data[i]:
            w_token_ids = tokenizer.encode(w)[0][1:-1]
            if len(token_ids) + len(w_token_ids) < maxlen:
                token_ids += w_token_ids
                if l == 'O':
                    labels += [0] * len(w_token_ids)
                else:
                    B = class2id[l] * 2 + 1
                    I = class2id[l] * 2 + 2
                    labels += ([B] + [I] * (len(w_token_ids) - 1))
            else:
                break
        token_ids += [tokenizer._token_sep_id]
        labels += [0]
        segment_ids = [0] * len(token_ids)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []

def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    # is_end marks whether this is the last sample.
    for is_end, (text, arguments) in self.sample(random):
        # Note: the tokenizer is not customized here, so tokens such as "4000" stay as-is.
        # The longest text is 113 characters, 115 including [CLS] and [SEP].
        token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
        # labels covers the full sequence, including the leading [CLS] and trailing [SEP].
        labels = [0] * len(token_ids)
        for argument in arguments.items():
            # Encode the argument text (e.g. "雀巢") and locate it inside token_ids
            # (positions are already offset by 1 because of [CLS]).
            a_token_ids = tokenizer.encode(argument[0])[0][1:-1]
            start_index = search(a_token_ids, token_ids)
            if start_index != -1:  # found
                labels[start_index] = label2id[argument[1]] * 2 + 1
                for i in range(1, len(a_token_ids)):
                    labels[start_index + i] = label2id[argument[1]] * 2 + 2
        # Example labels: [0, 363, 364, 0, 0, 365, 366, 0, 0, ..., 0]
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []

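# search(pattern, sequence) above (also used by the relation-extraction generators
# further down) is assumed to return the start index of the first occurrence of
# `pattern` inside `sequence`, or -1 if it is absent. A minimal sketch of that behavior:
def search(pattern, sequence):
    """Return the first index where `pattern` occurs as a sub-list of `sequence`, else -1."""
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1
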
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
    for is_end, (text, label) in self.sample(random):
        if len(label) == 2:  # the label is a two-character text
            text = prefix + text  # prepend the pattern text
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        if random:
            source_ids, target_ids = random_masking(token_ids)
        else:
            source_ids, target_ids = token_ids[:], token_ids[:]
        if len(label) == 2:  # the label is a two-character text
            # Map the label to vocabulary ids, e.g. label_ids = [1093, 689], since
            # tokenizer.encode(label) = ([101, 1093, 689, 102], [0, 0, 0, 0])
            # corresponds to [CLS, 农, 业, SEP].
            label_ids = tokenizer.encode(label)[0][1:-1]
            for i, label_id_ in zip(mask_idxs, label_ids):
                # e.g. i = 7 (index of the first [MASK]) gets 1093 ("农"),
                #      i = 8 (index of the second [MASK]) gets 689 ("业").
                source_ids[i] = tokenizer._token_mask_id
                target_ids[i] = label_id_
        batch_token_ids.append(source_ids)
        batch_segment_ids.append(segment_ids)
        batch_output_ids.append(target_ids)
        if len(batch_token_ids) == self.batch_size or is_end:
            # pad and yield batch by batch
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_output_ids = sequence_padding(batch_output_ids)
            yield [batch_token_ids, batch_segment_ids, batch_output_ids], None
            batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []

def __iter__(self, random=False): """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP] """ idxs = list(range(len(self.data))) batch_token_ids, batch_segment_ids = [], [] for i in idxs: text, question, answer = self.data[i] text_begin, text_end, _ = split_str(text, answer) text_cut_len = max(0, len(text) - 507 - 132) text_combine = delete_text(text, len(text_begin), len(text_end), text_cut_len) # text_b_token_ids, _ = tokenizer.encode(text_begin, max_length=375) # text_e_token_ids, _ = tokenizer.encode(text_end, max_length=375) # answer_token_ids, _ = tokenizer.encode(answer, max_length=256) token_ids = text_b_token_ids[:min(len(text_begin), 375)] + \ answer_token_ids[:min(len(answer), 256)] + text_e_token_ids[:min(len(text_end),375)] text_b_token_ids[:min(len(text_begin), 375)].insert(0, '[CLS]') text_b_token_ids[:min(len(text_begin), 375)].append('[SEP]') answer_token_ids[:min(len(answer), 256)] # text_token_ids, _ = tokenizer.encode(text_combine, max_length=375) question_token_ids, _ = tokenizer.encode(question, max_length=132) token_ids = text_token_ids + question_token_ids segment_ids = [0] * (len(token_ids) - len(question_token_ids[1:])) segment_ids += [1] * (len(question_token_ids[1:])) batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) if len(batch_token_ids) == self.batch_size or i == idxs[-1]: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) yield [batch_token_ids, batch_segment_ids], None batch_token_ids, batch_segment_ids = [], []
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, d in self.sample(random):
        tokens = self.tokenizer.tokenize(d[0], maxlen=self.maxlen)
        mapping = self.tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = self.tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = self.categories.index(label) * 2 + 1
                labels[start + 1:end + 1] = self.categories.index(label) * 2 + 2
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []

def predict(self, text):
    token_ids, segment_ids = self.tokenizer.encode(text)
    token_ids = sequence_padding([token_ids], length=self.max_len)
    segment_ids = sequence_padding([segment_ids], length=self.max_len)
    pre = self.model.predict([token_ids, segment_ids])
    res = self.index2label.get(str(np.argmax(pre[0])))
    return res

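# Hypothetical usage of predict(); the classifier class name and its construction are
# assumptions, only the predict() call itself comes from the snippet above:
clf = TextClassifier()                          # assumed wrapper holding tokenizer, model, index2label, max_len
print(clf.predict(u'这部电影的剧情非常精彩'))    # e.g. -> 'positive' (illustrative output)
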
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, item in self.sample(random):
        token_ids, labels = [tokenizer._token_start_id], [0]
        for w, l in item:
            w_token_ids = tokenizer.encode(w)[0][1:-1]
            if len(token_ids) + len(w_token_ids) < maxlen:
                token_ids += w_token_ids
                if l == 'O':
                    labels += [0] * len(w_token_ids)
                else:
                    B = label2id[l] * 2 + 1
                    I = label2id[l] * 2 + 2
                    labels += ([B] + [I] * (len(w_token_ids) - 1))
            else:
                break
        token_ids += [tokenizer._token_end_id]
        labels += [0]
        segment_ids = [0] * len(token_ids)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []

def __iter__(self, random=False): """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]。""" batch_token_ids, batch_segment_ids, batch_o_token_ids = [], [], [] for is_end, (p, q, a) in self.sample(random): p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len) a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len) q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len) token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:] segment_ids = [0] * (len(p_token_ids) + len(a_token_ids[1:])) segment_ids += [1] * (len(token_ids) - len(p_token_ids) - len(a_token_ids[1:])) o_token_ids = token_ids if np.random.random() > 0.5: token_ids = [ t if s == 0 or (s == 1 and np.random.random() > 0.3) else np.random.choice(token_ids) for t, s in zip(token_ids, segment_ids) ] batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) batch_o_token_ids.append(o_token_ids) if len(batch_token_ids) == self.batch_size or is_end: batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) batch_o_token_ids = sequence_padding(batch_o_token_ids) yield [batch_token_ids, batch_segment_ids, batch_o_token_ids], None batch_token_ids, batch_segment_ids, batch_o_token_ids = [], [], []
def __iter__(self, random=True):
    idxs = list(range(len(self.data)))
    if random:
        np.random.shuffle(idxs)
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for i in idxs:
        _, _, text1, text2, label = self.data[i]
        token_ids, segment_ids = tokenizer.encode(text1, text2, max_length=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
        if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []

def __iter__(self, random=False):
    idxs = list(range(len(self.data)))
    if random:
        np.random.shuffle(idxs)
    batch_token_ids, batch_segment_ids, batch_subject_labels = [], [], []
    for i in idxs:
        d = self.data[i]
        token_ids, segment_ids = tokenizer.encode(d['text'], max_length=maxlen)
        # Collect the triples as {s: [(o_start, o_end, p)]}
        spoes = {}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            s_idx = search(s, token_ids)
            o_idx = search(o, token_ids)
            if s_idx != -1 and o_idx != -1:
                s = (s_idx, s_idx + len(s) - 1)
                o = (o_idx, o_idx + len(o) - 1, p)  # [o_start, o_end, predicate]
                if s not in spoes:
                    spoes[s] = []
                spoes[s].append(o)
        if spoes:
            # Per-predicate subject labels, shape [seq_len, num_predicates, 2].
            subject_labels = np.zeros((len(token_ids), len(predicate2id), 2))
            for s in spoes:
                for o_s, o_e, p in spoes[s]:
                    subject_labels[s[0], p, 0] = 1
                    subject_labels[s[1], p, 1] = 1
            # Build the batch.
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_subject_labels.append(subject_labels)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_subject_labels = sequence_padding(
                    batch_subject_labels,
                    padding=np.zeros((len(predicate2id), 2)))
                yield [
                    batch_token_ids, batch_segment_ids, batch_subject_labels
                ], None
                batch_token_ids, batch_segment_ids, batch_subject_labels = [], [], []

def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids = [], []
    batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
    for is_end, d in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(d['text'], max_length=maxlen)
        # Collect the triples as {s: [(o, p)]}
        spoes = {}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            s_idx = search(s, token_ids)
            o_idx = search(o, token_ids)
            if s_idx != -1 and o_idx != -1:
                s = (s_idx, s_idx + len(s) - 1)
                o = (o_idx, o_idx + len(o) - 1, p)
                if s not in spoes:
                    spoes[s] = []
                spoes[s].append(o)
        if spoes:
            # Subject labels: start/end indicator per position.
            subject_labels = np.zeros((len(token_ids), 2))
            for s in spoes:
                subject_labels[s[0], 0] = 1
                subject_labels[s[1], 1] = 1
            # Randomly pick one subject.
            start, end = np.array(list(spoes.keys())).T
            start = np.random.choice(start)
            end = np.random.choice(end[end >= start])
            subject_ids = (start, end)
            # Object labels for the chosen subject.
            object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
            for o in spoes.get(subject_ids, []):
                object_labels[o[0], o[2], 0] = 1
                object_labels[o[1], o[2], 1] = 1
            # Build the batch.
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_subject_labels.append(subject_labels)
            batch_subject_ids.append(subject_ids)
            batch_object_labels.append(object_labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_subject_labels = sequence_padding(
                    batch_subject_labels, padding=np.zeros(2))
                batch_subject_ids = np.array(batch_subject_ids)
                batch_object_labels = sequence_padding(
                    batch_object_labels,
                    padding=np.zeros((len(predicate2id), 2)))
                yield [
                    batch_token_ids, batch_segment_ids,
                    batch_subject_labels, batch_subject_ids, batch_object_labels
                ], None
                batch_token_ids, batch_segment_ids = [], []
                batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []