def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, item in self.get_sample():
        token_ids, labels = [tokenizer._token_start_id], [0]
        for w, l in item:
            w_token_ids = tokenizer.encode(w)[0][1:-1]
            if len(token_ids) + len(w_token_ids) < maxlen:
                token_ids += w_token_ids
                if l == 'O':
                    labels += [0] * len(w_token_ids)
                else:
                    # BIE tagging: each entity type owns three consecutive
                    # label ids; 0 is reserved for 'O'
                    B = label2id[l] * 3 + 1
                    I = label2id[l] * 3 + 2
                    E = label2id[l] * 3 + 3
                    labels += ([B] + [I] * (len(w_token_ids) - 2) +
                               [E] * int(len(w_token_ids) > 1))
            else:
                break
        token_ids += [tokenizer._token_end_id]
        labels += [0]
        segment_ids = [0] * len(token_ids)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
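# All of the __iter__ methods in this file assume a common generator base
# class supplying `batch_size` and `get_sample`. That base class is not shown;
# the sketch below is a hypothetical reconstruction inferred purely from how
# it is called here, not a confirmed implementation.
import numpy as np

class DataGenerator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size

    def get_sample(self, shuffle=False):
        """Yield (is_end, item) pairs; is_end is True for the last sample."""
        indices = list(range(len(self.data)))
        if shuffle:
            np.random.shuffle(indices)
        for n, i in enumerate(indices):
            yield n == len(indices) - 1, self.data[i]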
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked = [], [], [], []
    for is_end, item in self.get_sample(shuffle):
        source_tokens, target_tokens, segment_ids = random_masking(item)
        # target ids are 0 at unmasked positions, so nonzero marks a masked token
        is_masked = [0 if i == 0 else 1 for i in target_tokens]
        batch_token_ids.append(source_tokens)
        batch_segment_ids.append(segment_ids)
        batch_target_ids.append(target_tokens)
        batch_is_masked.append(is_masked)
        # batch_nsp.append([label])
        if is_end or len(batch_token_ids) == self.batch_size:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_target_ids = pad_sequences(batch_target_ids)
            batch_is_masked = pad_sequences(batch_is_masked)
            # batch_nsp = sequence_padding(batch_nsp)
            yield [batch_token_ids, batch_segment_ids,
                   batch_target_ids, batch_is_masked], None
            batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked = [], [], [], []
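# `random_masking` is used above but not defined in this file. The sketch
# below is a guess at a BERT-style implementation with the usual 80/10/10
# replacement split; the real helper (which in the generator above also
# returns segment ids) may differ. `tokenizer._token_mask_id` and
# `tokenizer._vocab_size` follow the attribute naming used elsewhere here.
def random_masking(token_ids, mask_rate=0.15):
    """Return (source, target); target is 0 wherever no prediction is needed."""
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < mask_rate * 0.8:
            source.append(tokenizer._token_mask_id)  # replace with [MASK]
            target.append(t)
        elif r < mask_rate * 0.9:
            source.append(t)                         # keep the original token
            target.append(t)
        elif r < mask_rate:
            source.append(np.random.randint(0, tokenizer._vocab_size))
            target.append(t)                         # replace with a random token
        else:
            source.append(t)
            target.append(0)                         # not masked, no target
    return source, target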
def __iter__(self, random=False):
    # Labels: 0 = single-character word, 1 = start of a multi-character word,
    # 2 = middle, 3 = end
    batch_tokens, batch_segs, batch_labels = [], [], []
    for is_end, item in self.get_sample():
        token_ids, labels = [tokenizer._token_start_id], [0]
        for word in item:
            token_id = tokenizer.encode(word)[0][1:-1]
            if len(token_ids) + len(token_id) > maxlen:
                break
            if len(token_id) == 1:
                labels += [0]
            else:
                labels += [1] + [2] * (len(token_id) - 2) + [3]
            token_ids += token_id
        token_ids.append(tokenizer._token_end_id)
        labels.append(0)
        batch_tokens.append(token_ids)
        batch_segs.append([0] * len(token_ids))
        batch_labels.append(labels)
        if len(batch_tokens) >= self.batch_size or is_end:
            batch_tokens = pad_sequences(batch_tokens)
            batch_segs = pad_sequences(batch_segs)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_tokens, batch_segs], batch_labels
            batch_tokens, batch_segs, batch_labels = [], [], []
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, (text, label, label_des) in self.get_sample(shuffle):
        if not self.seq2seq:
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        else:
            # append the label description as a second segment for seq2seq training
            text_token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
            label_token_ids = tokenizer.encode(label_des, maxlen=max_label + 2)[0][1:]
            token_ids = text_token_ids + label_token_ids
            segment_ids = [0] * len(text_token_ids) + [1] * len(label_token_ids)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            # if self.seq2seq:
            #     yield [batch_token_ids, batch_segment_ids], None
            # else:
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, item in self.get_sample(shuffle):
        context, question, answers = item[1:]
        context = self.random_generator(context)
        token_ids, segment_ids = tokenizer.encode(question, context, maxlen=maxlen)
        qt = tokenizer.tokenize(question)
        token_ids = self.random_padding(token_ids, len(qt))
        a = np.random.choice(answers)
        a_token_ids = tokenizer.encode(a)[0][1:-1]
        start_index = search(a_token_ids, token_ids)
        # skip samples whose answer span cannot be located after augmentation
        if start_index != -1:
            labels = [[start_index], [start_index + len(a_token_ids) - 1]]
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked, batch_nsp = [], [], [], [], []
    for is_end, item in self.get_sample(shuffle):
        # NSP-style objective: with probability 0.5, shuffle the reply and
        # label the pair as negative (0)
        label = 1
        p = np.random.random()
        if p < 0.5:
            label = 0
            item = shuffle_reply(item)
        source_tokens, target_tokens, segment_ids = random_masking(item)
        is_masked = [0 if i == 0 else 1 for i in target_tokens]
        batch_token_ids.append(source_tokens)
        batch_segment_ids.append(segment_ids)
        batch_target_ids.append(target_tokens)
        batch_is_masked.append(is_masked)
        batch_nsp.append([label])
        if is_end or len(batch_token_ids) == self.batch_size:
            batch_token_ids = pad_sequences(batch_token_ids, maxlen=maxlen)
            batch_segment_ids = pad_sequences(batch_segment_ids, maxlen=maxlen)
            batch_target_ids = pad_sequences(batch_target_ids, maxlen=maxlen)
            batch_is_masked = pad_sequences(batch_is_masked, maxlen=maxlen)
            batch_nsp = pad_sequences(batch_nsp)
            yield [batch_token_ids, batch_segment_ids, batch_target_ids,
                   batch_is_masked, batch_nsp], None
            batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked, batch_nsp = [], [], [], [], []
def __iter__(self, shuffle=False):
    batch_tokens, batch_segments, batch_targets = [], [], []
    for is_end, (text, _, label) in self.get_sample(shuffle):
        text = pattern + text
        token_ids, seg_ids = tokenizer.encode(text, maxlen=maxlen)
        # random masking during training only
        if shuffle:
            source_tokens, target_tokens = self.random_masking(token_ids)
        else:
            source_tokens, target_tokens = token_ids[:], token_ids[:]
        # mask the label positions inside the pattern
        if len(label) == 2:
            label_ids = tokenizer.encode(label)[0][1:-1]
            for m, l in zip(mask_idx, label_ids):
                source_tokens[m] = tokenizer._token_mask_id
                target_tokens[m] = l
        batch_tokens.append(source_tokens)
        batch_segments.append(seg_ids)
        batch_targets.append(target_tokens)
        if len(batch_tokens) == self.batch_size or is_end:
            batch_tokens = pad_sequences(batch_tokens)
            batch_segments = pad_sequences(batch_segments)
            batch_targets = pad_sequences(batch_targets)
            yield [batch_tokens, batch_segments, batch_targets], None
            batch_tokens, batch_segments, batch_targets = [], [], []
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, item in self.get_sample():
        context, questions, answers = item[1:]
        if not isinstance(questions, list):
            question = questions
        else:
            # half the time take the first question, otherwise sample one at random
            question = questions[0] if np.random.random() > 0.5 else np.random.choice(questions)
        token_ids, segment_ids = tokenizer.encode(question, context, maxlen=maxlen)
        a = np.random.choice(answers)
        a_token_ids = tokenizer.encode(a)[0][1:-1]
        start_index = search(a_token_ids, token_ids)
        # skip samples whose answer cannot be found within the truncated input
        if start_index != -1:
            labels = [[start_index], [start_index + len(a_token_ids) - 1]]
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
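# Both span-extraction generators above rely on a `search` helper that is not
# defined in this file. A minimal sketch of what it presumably does: find the
# first occurrence of a sub-sequence and return its start index, or -1.
def search(pattern, sequence):
    n = len(pattern)
    for i in range(len(sequence) - n + 1):
        if sequence[i:i + n] == pattern:
            return i
    return -1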
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, (text, label) in self.get_sample(shuffle):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
def __iter__(self):
    batch_token_ids, batch_segment_ids, batch_labels, batch_logits = [], [], [], []
    for is_end, (text, label, logits) in self.get_sample():
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(label)
        batch_logits.append(logits)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            batch_logits = pad_sequences(batch_logits)
            yield [batch_token_ids, batch_segment_ids], [batch_labels, batch_logits]
            batch_token_ids, batch_segment_ids, batch_labels, batch_logits = [], [], [], []
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, (text, label, label_des) in self.get_sample():
        if self.data_augmentation:
            text = self.generate_text(text)
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            if not self.transfer:
                yield [batch_token_ids, batch_segment_ids], batch_labels
            else:
                # in transfer mode the labels are fed as an input rather than a target
                yield [batch_token_ids, batch_segment_ids, batch_labels], None
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
def __iter__(self, shuffle=False): """[CLS]context[SEP]answer[SEP]question[SEP]""" batch_token_ids, batch_segment_ids, batch_labels = [], [], [] for is_end, item in self.get_sample(shuffle): context, question, answer = item[1:] c_token_ids, _ = tokenizer.encode(context, maxlen=max_context_len + 1) q_token_ids, _ = tokenizer.encode(question, maxlen=max_question_len) a_token_ids, _ = tokenizer.encode(answer, maxlen=max_answer_len) token_ids = c_token_ids + a_token_ids[1:] + q_token_ids[1:] segment_ids = [0] * len(c_token_ids) + [1] * (len(token_ids) - len(c_token_ids)) batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) if len(batch_token_ids) == self.batch_size or is_end: batch_token_ids = pad_sequences(batch_token_ids) batch_segment_ids = pad_sequences(batch_segment_ids) yield [batch_token_ids, batch_segment_ids], None batch_token_ids, batch_segment_ids = [], []
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []
    for is_end, (q_id, q, r_id, r, label) in self.get_sample(shuffle):
        label = int(label) if label is not None else None
        if label is not None or self.prefix:
            q = pattern + q
        token_ids, segment_ids = tokenizer.encode(q, r, maxlen=maxlen)
        if shuffle:
            source_tokens, target_tokens = random_masking(token_ids)
        else:
            source_tokens, target_tokens = token_ids[:], token_ids[:]
        if label is not None:
            # labelled sample: write the verbalized label into the mask positions
            label_ids = tokenizer.encode(id2label[label])[0][1:-1]
            for m, lb in zip(mask_idx, label_ids):
                source_tokens[m] = tokenizer._token_mask_id
                target_tokens[m] = lb
        elif self.prefix:
            # unlabelled sample: just mask the positions to be predicted
            for i in mask_idx:
                source_tokens[i] = tokenizer._token_mask_id
        batch_token_ids.append(source_tokens)
        batch_segment_ids.append(segment_ids)
        batch_target_ids.append(target_tokens)
        if is_end or len(batch_token_ids) == self.batch_size:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_target_ids = pad_sequences(batch_target_ids)
            yield [batch_token_ids, batch_segment_ids, batch_target_ids], None
            batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []
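# The two pattern-based generators above depend on module-level `pattern`,
# `mask_idx`, and `id2label` definitions that are not shown. The values below
# are purely illustrative placeholders for how the three fit together: the
# prompt is prepended to the input, `mask_idx` points at the token positions
# holding the verbalized label, and each label word must span exactly
# len(mask_idx) tokens.
pattern = 'XX。'                   # hypothetical prompt with a two-token label slot
mask_idx = [1, 2]                  # positions of the label tokens after [CLS]
id2label = {0: '不同', 1: '相同'}  # illustrative two-character label words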
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_original_token_ids = [], [], []
    for is_end, item in self.get_sample(shuffle):
        context, question, answers = item[1:]
        answer = np.random.choice(answers)
        token_ids, _ = tokenizer.encode(answer, context,
                                        maxlen=maxlen - max_question_len - 1)
        segment_ids = [0] * len(token_ids)
        question_token_ids = tokenizer.encode(question)[0][1:]
        token_ids = token_ids + question_token_ids
        segment_ids += [1] * len(question_token_ids)
        original_tokens = token_ids
        # with probability 0.5 build a negative sample by randomly replacing
        # decoder-side (segment 1) tokens, each with probability 0.3
        is_negative = np.random.random() > 0.5
        if is_negative:
            token_ids = [
                token if seg == 0 or np.random.random() > 0.3
                else np.random.choice(token_ids)
                for token, seg in zip(token_ids, segment_ids)
            ]
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_original_token_ids.append(original_tokens)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_original_token_ids = pad_sequences(batch_original_token_ids)
            yield [batch_token_ids, batch_segment_ids,
                   batch_original_token_ids], None
            batch_token_ids, batch_segment_ids, batch_original_token_ids = [], [], []
def __iter__(self, shuffle=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, (text, label, label_des) in self.get_sample(shuffle):
        if not self.sim:
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
        else:
            # similarity mode: emit the text and its label description as two
            # separate samples sharing the same label
            text_token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
            label_token_ids = tokenizer.encode(label_des, maxlen=max_label + 2)[0][1:]
            token_ids = [text_token_ids, label_token_ids]
            segment_ids = [[0] * len(text_token_ids), [0] * len(label_token_ids)]
            batch_token_ids.extend(token_ids)
            batch_segment_ids.extend(segment_ids)
            batch_labels.extend([[label]] * 2)
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = pad_sequences(batch_token_ids)
            batch_segment_ids = pad_sequences(batch_segment_ids)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
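# `pad_sequences` is assumed throughout. The call sites pad token ids, labels,
# and even the nested [[start], [end]] span labels, so the real helper likely
# behaves like bert4keras's sequence_padding (right-padding with zeros) rather
# than Keras's default left-padding. A simplified sketch for the flat case:
def pad_sequences(batch, maxlen=None, value=0):
    maxlen = maxlen or max(len(seq) for seq in batch)
    return np.array([
        list(seq)[:maxlen] + [value] * (maxlen - len(seq))
        for seq in batch
    ])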