def _pad_batch_records(self, batch_records):
    """Pad a batch of records into dense model-input arrays.

    Returns [padded_token_ids, padded_text_type_ids, padded_position_ids,
    padded_task_ids, input_mask, seq_lens].
    """
    batch_token_ids = [rec.token_ids for rec in batch_records]
    batch_text_type_ids = [rec.text_type_ids for rec in batch_records]
    batch_position_ids = [rec.position_ids for rec in batch_records]

    # Token ids also yield the attention mask and per-example lengths.
    padded_token_ids, input_mask, seq_lens = pad_batch_data(
        batch_token_ids,
        pad_idx=self.pad_id,
        return_input_mask=True,
        return_seq_lens=True)
    padded_text_type_ids = pad_batch_data(
        batch_text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        batch_position_ids, pad_idx=self.pad_id)
    # Every position carries the same task id.
    padded_task_ids = np.full_like(
        padded_token_ids, self.task_id, dtype="int64")

    return [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        padded_task_ids, input_mask, seq_lens
    ]
def _pad_batch_records(self, batch_records):
    """Pad a classification batch.

    Returns [padded_token_ids, padded_text_type_ids, padded_position_ids,
    input_mask, batch_labels, batch_qids]; batch_qids is an empty (0, 1)
    array when the records carry no qid.
    """
    token_ids = [rec.token_ids for rec in batch_records]
    text_type_ids = [rec.text_type_ids for rec in batch_records]
    position_ids = [rec.position_ids for rec in batch_records]

    labels = [rec.label_id for rec in batch_records]
    labels = np.array(labels).astype("int64").reshape([-1, 1])

    # qids are optional; emit an empty column when absent.
    if batch_records[0].qid is not None:
        qids = [rec.qid for rec in batch_records]
        qids = np.array(qids).astype("int64").reshape([-1, 1])
    else:
        qids = np.array([]).astype("int64").reshape([-1, 1])

    padded_token_ids, input_mask = pad_batch_data(
        token_ids, pad_idx=self.pad_id, return_input_mask=True)
    padded_text_type_ids = pad_batch_data(text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(position_ids, pad_idx=self.pad_id)

    return [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        input_mask, labels, qids
    ]
def _pad_batch_records(self, batch_records):
    """Pad a batch to `self.max_seq_len` and assemble inputs, including
    the graph adjacency matrix and head ids.

    Returns [token_ids, text_type_ids, position_ids, task_ids, input_mask]
    (+ [labels, qids] when not in inference mode) followed by
    [adjacency_matrix, head_ids].
    """
    batch_token_ids = [record.token_ids for record in batch_records]
    batch_text_type_ids = [
        record.text_type_ids for record in batch_records
    ]
    batch_position_ids = [record.position_ids for record in batch_records]
    # Per-record adjacency matrices for the graph component.
    batch_adjacency_matrix = [
        record.adjacency_matrix for record in batch_records
    ]
    # Assumes every record's head_ids has the same length — TODO confirm,
    # otherwise np.array would build a ragged object array.
    batch_head_ids = np.array(
        [record.head_ids for record in batch_records]).astype("int64")
    if not self.is_inference:
        batch_labels = [record.label_id for record in batch_records]
        if self.is_classify:
            batch_labels = np.array(batch_labels).astype("int64").reshape(
                [-1, 1])
        elif self.is_regression:
            batch_labels = np.array(batch_labels).astype(
                "float32").reshape([-1, 1])
        # BUG FIX: explicit None check — a legitimate qid of 0 is falsy
        # and would otherwise be replaced by the empty placeholder
        # (the sibling reader already uses `is not None`).
        if batch_records[0].qid is not None:
            batch_qids = [record.qid for record in batch_records]
            batch_qids = np.array(batch_qids).astype("int64").reshape(
                [-1, 1])
        else:
            batch_qids = np.array([]).astype("int64").reshape([-1, 1])

    # Pad every batch to max_seq_len so all batches share one shape.
    padded_token_ids, input_mask = pad_batch_data(
        batch_token_ids,
        max_len=self.max_seq_len,
        pad_idx=self.pad_id,
        return_input_mask=True)
    padded_text_type_ids = pad_batch_data(
        batch_text_type_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        batch_position_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
    padded_task_ids = np.ones_like(
        padded_token_ids, dtype="int64") * self.task_id
    padded_adjacency_matrix = pad_batch_graphs(
        batch_adjacency_matrix, max_len=self.max_seq_len)

    return_list = [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        padded_task_ids, input_mask,
    ]
    if not self.is_inference:
        return_list += [batch_labels, batch_qids]
    # Graph features always come last.
    return_list += [padded_adjacency_matrix, batch_head_ids]
    return return_list
def _pad_batch_records(self, batch_records):
    """Pad a sequence-labeling batch.

    Returns [padded_token_ids, padded_text_type_ids, padded_position_ids,
    input_mask, padded_label_ids, batch_seq_lens].
    """
    token_ids = [rec.token_ids for rec in batch_records]
    text_type_ids = [rec.text_type_ids for rec in batch_records]
    position_ids = [rec.position_ids for rec in batch_records]
    label_ids = [rec.label_ids for rec in batch_records]

    padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
        token_ids,
        pad_idx=self.pad_id,
        return_input_mask=True,
        return_seq_lens=True)
    padded_text_type_ids = pad_batch_data(text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(position_ids, pad_idx=self.pad_id)
    # Labels are padded with the last index in label_map — presumably the
    # "other"/ignore class; verify against the label map definition.
    padded_label_ids = pad_batch_data(
        label_ids, pad_idx=len(self.label_map) - 1)

    return [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        input_mask, padded_label_ids, batch_seq_lens
    ]
def _pad_batch_records(self, batch_records):
    """Pad a classification batch, inference-aware.

    Returns [token_ids, text_type_ids, position_ids, task_ids, input_mask]
    plus [labels, qids] when not in inference mode.
    """
    batch_token_ids = [record.token_ids for record in batch_records]
    batch_text_type_ids = [record.text_type_ids for record in batch_records]
    batch_position_ids = [record.position_ids for record in batch_records]
    if not self.is_inference:
        batch_labels = [record.label_id for record in batch_records]
        batch_labels = np.array(batch_labels).astype("int64").reshape(
            [-1, 1])
        # BUG FIX: explicit None check — a legitimate qid of 0 is falsy
        # and would otherwise be replaced by the empty placeholder
        # (the sibling reader already uses `is not None`).
        if batch_records[0].qid is not None:
            batch_qids = [record.qid for record in batch_records]
            batch_qids = np.array(batch_qids).astype("int64").reshape(
                [-1, 1])
        else:
            batch_qids = np.array([]).astype("int64").reshape([-1, 1])

    # padding
    padded_token_ids, input_mask = pad_batch_data(
        batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
    padded_text_type_ids = pad_batch_data(
        batch_text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        batch_position_ids, pad_idx=self.pad_id)
    padded_task_ids = np.ones_like(
        padded_token_ids, dtype="int64") * self.task_id

    return_list = [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        padded_task_ids, input_mask
    ]
    if not self.is_inference:
        return_list += [batch_labels, batch_qids]
    return return_list
def _pad_batch_records(self, batch_records, is_training):
    """Pad a span-extraction (MRC) batch.

    At training time the gold start/end positions come from the records;
    at predict time zero placeholders are fed instead.

    Returns [token_ids, text_type_ids, position_ids, task_ids, input_mask,
    start_positions, end_positions, unique_ids].
    """
    token_ids = [rec.token_ids for rec in batch_records]
    text_type_ids = [rec.text_type_ids for rec in batch_records]
    position_ids = [rec.position_ids for rec in batch_records]

    if is_training:
        starts = np.array([rec.start_position for rec in batch_records])
        ends = np.array([rec.end_position for rec in batch_records])
        starts = starts.astype("int64").reshape([-1, 1])
        ends = ends.astype("int64").reshape([-1, 1])
    else:
        # No gold spans at predict time.
        n = len(token_ids)
        starts = np.zeros(shape=[n, 1], dtype="int64")
        ends = np.zeros(shape=[n, 1], dtype="int64")
    unique_ids = np.array(
        [rec.unique_id for rec in batch_records]).astype("int64").reshape(
            [-1, 1])

    padded_token_ids, input_mask = pad_batch_data(
        token_ids, pad_idx=self.pad_id, return_input_mask=True)
    padded_text_type_ids = pad_batch_data(text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(position_ids, pad_idx=self.pad_id)
    padded_task_ids = np.ones_like(
        padded_token_ids, dtype="int64") * self.task_id

    return [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        padded_task_ids, input_mask, starts, ends, unique_ids
    ]
def _pad_batch_records(self, batch_records):
    """Pad a tagging batch for an attention-bias style model.

    Returns [padded_token_ids, padded_text_type_ids, padded_position_ids,
    self_attn_bias, padded_label_ids, batch_seq_lens].
    """
    token_ids = [rec.token_ids for rec in batch_records]
    text_type_ids = [rec.text_type_ids for rec in batch_records]
    position_ids = [rec.position_ids for rec in batch_records]
    label_ids = [rec.label_ids for rec in batch_records]
    # Record original lengths before padding.
    seq_lens = np.array(
        [len(rec.token_ids) for rec in batch_records]).astype(
            "int64").reshape([-1, 1])

    padded_token_ids, self_attn_bias = pad_batch_data(
        token_ids,
        pad_idx=self.pad_id,
        return_next_sent_pos=False,
        return_attn_bias=True)
    padded_text_type_ids = pad_batch_data(text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(position_ids, pad_idx=self.pad_id)
    # Labels are padded with the last index in label_map — presumably the
    # "other"/ignore class; verify against the label map definition.
    padded_label_ids = pad_batch_data(
        label_ids, pad_idx=len(self.label_map) - 1)

    return [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        self_attn_bias, padded_label_ids, seq_lens
    ]
def _pad_batch_records(self, batch_records):
    """Pad a classification batch for the next-sentence-position model.

    Returns [token_ids, text_type_ids, position_ids, self_attn_bias,
    labels, next_sent_index, qids].
    """
    batch_token_ids = [record.token_ids for record in batch_records]
    batch_text_type_ids = [
        record.text_type_ids for record in batch_records
    ]
    batch_position_ids = [record.position_ids for record in batch_records]
    batch_labels = [record.label_id for record in batch_records]
    batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
    # BUG FIX: explicit None check — a legitimate qid of 0 is falsy and
    # would otherwise be replaced by the empty placeholder (the sibling
    # reader already uses `is not None`).
    if batch_records[0].qid is not None:
        batch_qids = [record.qid for record in batch_records]
        batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
    else:
        batch_qids = np.array([]).astype("int64").reshape([-1, 1])

    # padding
    padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
        batch_token_ids,
        pad_idx=self.pad_id,
        return_next_sent_pos=True,
        return_attn_bias=True)
    padded_text_type_ids = pad_batch_data(
        batch_text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        batch_position_ids, pad_idx=self.pad_id)

    return_list = [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        self_attn_bias, batch_labels, next_sent_index, batch_qids
    ]
    return return_list
def _pad_batch_records(self, batch_records):
    """Pad a batch with per-token multi-label targets and
    token-to-original-text index maps.

    Returns [token_ids, text_type_ids, position_ids, task_ids, input_mask,
    padded_label_ids, batch_seq_lens, batch_example_index,
    padded_tok_to_orig_start_index, padded_tok_to_orig_end_index].
    """
    batch_token_ids = [record.token_ids for record in batch_records]
    batch_text_type_ids = [
        record.text_type_ids for record in batch_records
    ]
    batch_position_ids = [record.position_ids for record in batch_records]
    batch_label_ids = [record.label_ids for record in batch_records]
    batch_example_index = [
        record.example_index for record in batch_records
    ]
    batch_tok_to_orig_start_index = [
        record.tok_to_orig_start_index for record in batch_records
    ]
    batch_tok_to_orig_end_index = [
        record.tok_to_orig_end_index for record in batch_records
    ]

    # padding
    padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
        batch_token_ids,
        pad_idx=self.pad_id,
        return_input_mask=True,
        return_seq_lens=True)
    padded_text_type_ids = pad_batch_data(
        batch_text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        batch_position_ids, pad_idx=self.pad_id)

    # Label rows are expanded per-token vectors; pad with the "outside"
    # label (index 0 hot) so padded positions fall on that class.
    outside_label = np.array([1] + [0] * (self.num_labels - 1))
    max_len = max(len(inst) for inst in batch_label_ids)
    padded_label_ids = []
    # FIX (cleanup): the loop index from enumerate() was never used.
    for inst in batch_label_ids:
        inst = np.concatenate(
            (np.array(inst),
             np.tile(outside_label, ((max_len - len(inst)), 1))),
            axis=0)
        padded_label_ids.append(inst)
    padded_label_ids = np.stack(padded_label_ids).astype("float32")

    # NOTE(review): index maps are padded to the label max_len — this
    # assumes they always have the same per-record length as label_ids;
    # confirm against the record builder.
    padded_tok_to_orig_start_index = np.array([
        inst + [0] * (max_len - len(inst))
        for inst in batch_tok_to_orig_start_index
    ])
    padded_tok_to_orig_end_index = np.array([
        inst + [0] * (max_len - len(inst))
        for inst in batch_tok_to_orig_end_index
    ])

    padded_task_ids = np.ones_like(
        padded_token_ids, dtype="int64") * self.task_id

    return_list = [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        padded_task_ids, input_mask, padded_label_ids, batch_seq_lens,
        batch_example_index, padded_tok_to_orig_start_index,
        padded_tok_to_orig_end_index
    ]
    return return_list
def _pad_batch_records(self, batch_records):
    """Pad a dual word-level + char-level batch.

    batch_records: list of Records. Word-level ids go through
    pad_batch_data, char-level ids through self._pad_batch_char_data,
    and labels are padded with the last label index. Returns the
    thirteen arrays in the order the downstream feed expects.
    """
    word_token_ids = [rec.token_ids for rec in batch_records]
    word_text_type_ids = [rec.text_type_ids for rec in batch_records]
    word_position_ids = [rec.position_ids for rec in batch_records]
    char_ids = [rec.char_ids for rec in batch_records]
    char_text_type_ids = [rec.char_text_type_ids for rec in batch_records]
    char_position_ids = [rec.char_position_ids for rec in batch_records]
    label_ids = [rec.label_ids for rec in batch_records]

    # Word-level padding.
    padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
        word_token_ids,
        pad_idx=self.pad_id,
        return_input_mask=True,
        return_seq_lens=True)
    padded_text_type_ids = pad_batch_data(
        word_text_type_ids, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        word_position_ids, pad_idx=self.pad_id)

    # Char-level padding.
    padded_char_ids, char_input_mask, batch_word_lens = \
        self._pad_batch_char_data(
            char_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_word_lens=True)
    padded_char_text_type_ids = self._pad_batch_char_data(
        char_text_type_ids, pad_idx=self.pad_id)
    padded_char_position_ids = self._pad_batch_char_data(
        char_position_ids, pad_idx=self.pad_id)

    padded_label_ids = pad_batch_data(
        label_ids, pad_idx=len(self.label_map) - 1)
    padded_task_ids = np.ones_like(
        padded_token_ids, dtype="int64") * self.task_id
    padded_char_task_ids = np.ones_like(
        padded_char_ids, dtype="int64") * self.task_id

    return [
        padded_token_ids, padded_text_type_ids, padded_position_ids,
        padded_task_ids, input_mask, padded_label_ids, batch_seq_lens,
        padded_char_ids, char_input_mask, batch_word_lens,
        padded_char_text_type_ids, padded_char_position_ids,
        padded_char_task_ids
    ]
def pad_batch(self, token_ids, text_type_ids, position_ids):
    """Pad a single example (wrapped as a batch of one) to max_seq_len.

    Returns (padded_token_ids, padded_position_ids, padded_text_type_ids,
    input_mask) — note position ids come BEFORE text type ids here,
    unlike the _pad_batch_records siblings.
    """
    batch_token_ids = [token_ids]
    batch_text_type_ids = [text_type_ids]
    batch_position_ids = [position_ids]
    # NOTE(review): a sibling reader passes this length as `max_len=` —
    # confirm pad_batch_data actually accepts a `max_seq_len` keyword.
    padded_token_ids, input_mask = pad_batch_data(
        batch_token_ids,
        max_seq_len=self.max_seq_len,
        pad_idx=self.pad_id,
        return_input_mask=True)
    padded_text_type_ids = pad_batch_data(
        batch_text_type_ids,
        max_seq_len=self.max_seq_len,
        pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        batch_position_ids,
        max_seq_len=self.max_seq_len,
        pad_idx=self.pad_id)
    return padded_token_ids, padded_position_ids, padded_text_type_ids, input_mask
def next_predict_batch(self, batch_size):
    """Return the next prediction batch of up to `batch_size` examples.

    Returns None (and rewinds the cursor to 0) once the data is
    exhausted; otherwise a tuple (token_ids, sent_ids, pos_ids,
    input_mask, mask_pos, mask_label) ready to feed.
    """
    if self.pos >= len(self.data):
        self.pos = 0
        return None
    chunk = self.data[self.pos:self.pos + batch_size]
    self.pos += batch_size
    (token_ids_batch, sent_ids_batch, pos_ids_batch, input_mask_batch,
     mask_pos_batch, mask_label_batch) = self.parse_batch(chunk)
    # Pad the variable-length id sequences to the batch max length;
    # the input mask is rebuilt by pad_batch_data.
    token_ids_batch, input_mask_batch = pad_batch_data(
        token_ids_batch, pad_idx=self.pad_id, return_input_mask=True)
    sent_ids_batch = pad_batch_data(sent_ids_batch, pad_idx=self.pad_id)
    pos_ids_batch = pad_batch_data(pos_ids_batch, pad_idx=self.pad_id)
    mask_pos_batch = np.array(mask_pos_batch).astype("int64").reshape(
        [-1, 1])
    mask_label_batch = np.array(mask_label_batch).astype("int64").reshape(
        [-1, 1])
    # NOTE: mask_pos is not padded here, whereas the ERNIE reference
    # implementation appears to pad it — confirm before reuse.
    return (token_ids_batch, sent_ids_batch, pos_ids_batch,
            input_mask_batch, mask_pos_batch, mask_label_batch)
def _pad_batch_records(self, batch_records):
    """Pad a (query, left, right) triplet batch and assemble inputs.

    Returns the padded query/left/right groups (token, text type,
    position, task ids and input mask for each), followed by
    [labels, types, qids] in training/eval mode or [qids, ent_ids]
    at inference time.
    """
    batch_token_query_ids = [
        record.token_query_ids for record in batch_records
    ]
    batch_text_type_query_ids = [
        record.text_type_query_ids for record in batch_records
    ]
    batch_position_query_ids = [
        record.position_query_ids for record in batch_records
    ]
    batch_token_left_ids = [
        record.token_left_ids for record in batch_records
    ]
    batch_text_type_left_ids = [
        record.text_type_left_ids for record in batch_records
    ]
    batch_position_left_ids = [
        record.position_left_ids for record in batch_records
    ]
    batch_token_right_ids = [
        record.token_right_ids for record in batch_records
    ]
    batch_text_type_right_ids = [
        record.text_type_right_ids for record in batch_records
    ]
    batch_position_right_ids = [
        record.position_right_ids for record in batch_records
    ]
    # BUG FIX: explicit None check — a legitimate qid of 0 is falsy and
    # would otherwise be replaced by the empty placeholder (a sibling
    # reader already uses `is not None`).
    if batch_records[0].qid is not None:
        batch_qids = [record.qid for record in batch_records]
        batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
    else:
        batch_qids = np.array([]).astype("int64").reshape([-1, 1])
    if not self.is_inference:
        batch_labels = [record.label_id for record in batch_records]
        batch_types = [record.type_id for record in batch_records]
        if self.is_classify:
            batch_labels = np.array(batch_labels).astype("int64").reshape(
                [-1, 1])
            batch_types = np.array(batch_types).astype("int64").reshape(
                [-1, 1])
        elif self.is_regression:
            batch_labels = np.array(batch_labels).astype(
                "float32").reshape([-1, 1])
            # NOTE(review): batch_types stays a plain Python list in the
            # regression branch — confirm the downstream feed accepts it.
    else:
        # BUG FIX: same explicit None check for ent_id (0 is a valid id).
        if batch_records[0].ent_id is not None:
            batch_ent_ids = [record.ent_id for record in batch_records]
            batch_ent_ids = np.array(batch_ent_ids).reshape([-1, 1])
        else:
            batch_ent_ids = np.array([]).reshape([-1, 1])

    # padding: query segment
    padded_token_query_ids, input_query_mask = pad_batch_data(
        batch_token_query_ids,
        pad_idx=self.pad_id,
        return_input_mask=True)
    padded_text_type_query_ids = pad_batch_data(
        batch_text_type_query_ids, pad_idx=self.pad_id)
    padded_position_query_ids = pad_batch_data(
        batch_position_query_ids, pad_idx=self.pad_id)
    padded_task_query_ids = np.ones_like(
        padded_token_query_ids, dtype="int64") * self.task_id

    # padding: left segment
    padded_token_left_ids, input_left_mask = pad_batch_data(
        batch_token_left_ids,
        pad_idx=self.pad_id,
        return_input_mask=True)
    padded_text_type_left_ids = pad_batch_data(
        batch_text_type_left_ids, pad_idx=self.pad_id)
    padded_position_left_ids = pad_batch_data(
        batch_position_left_ids, pad_idx=self.pad_id)
    padded_task_left_ids = np.ones_like(
        padded_token_left_ids, dtype="int64") * self.task_id

    # padding: right segment
    padded_token_right_ids, input_right_mask = pad_batch_data(
        batch_token_right_ids,
        pad_idx=self.pad_id,
        return_input_mask=True)
    padded_text_type_right_ids = pad_batch_data(
        batch_text_type_right_ids, pad_idx=self.pad_id)
    padded_position_right_ids = pad_batch_data(
        batch_position_right_ids, pad_idx=self.pad_id)
    padded_task_right_ids = np.ones_like(
        padded_token_right_ids, dtype="int64") * self.task_id

    return_list = [
        padded_token_query_ids, padded_text_type_query_ids,
        padded_position_query_ids, padded_task_query_ids, input_query_mask,
        padded_token_left_ids, padded_text_type_left_ids,
        padded_position_left_ids, padded_task_left_ids, input_left_mask,
        padded_token_right_ids, padded_text_type_right_ids,
        padded_position_right_ids, padded_task_right_ids, input_right_mask,
    ]
    if not self.is_inference:
        return_list += [batch_labels, batch_types, batch_qids]
    else:
        return_list += [batch_qids, batch_ent_ids]
    return return_list