def _pad_batch_records(self, batch_records): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [ record.text_type_ids for record in batch_records ] batch_position_ids = [record.position_ids for record in batch_records] # padding padded_token_ids, input_mask, seq_lens = pad_batch_data( batch_token_ids, pad_idx=self.pad_id, return_input_mask=True, return_seq_lens=True) padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) padded_task_ids = np.ones_like(padded_token_ids, dtype="int64") * self.task_id return_list = [ padded_token_ids, padded_text_type_ids, padded_position_ids, padded_task_ids, input_mask, seq_lens ] return return_list
def _pad_batch_records(self, batch_records, is_training): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [ record.text_type_ids for record in batch_records ] batch_position_ids = [record.position_ids for record in batch_records] if is_training: batch_start_position = [ record.start_position for record in batch_records ] batch_end_position = [ record.end_position for record in batch_records ] batch_start_position = np.array(batch_start_position).astype( "int64").reshape([-1, 1]) batch_end_position = np.array(batch_end_position).astype( "int64").reshape([-1, 1]) else: batch_size = len(batch_token_ids) batch_start_position = np.zeros(shape=[batch_size, 1], dtype="int64") batch_end_position = np.zeros(shape=[batch_size, 1], dtype="int64") batch_unique_ids = [record.unique_id for record in batch_records] batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape( [-1, 1]) # padding padded_token_ids, input_mask = pad_batch_data(batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) padded_task_ids = np.ones_like(padded_token_ids, dtype="int64") * self.task_id return_list = [ padded_token_ids, padded_text_type_ids, padded_position_ids, padded_task_ids, input_mask, batch_start_position, batch_end_position, batch_unique_ids ] return return_list
def _pad_batch_records(self, batch_records): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [ record.text_type_ids for record in batch_records ] batch_position_ids = [record.position_ids for record in batch_records] if not self.is_inference: batch_labels = [record.label_id for record in batch_records] if self.is_classify: batch_labels = np.array(batch_labels).astype("int64").reshape( [-1, 1]) elif self.is_regression: batch_labels = np.array(batch_labels).astype( "float32").reshape([-1, 1]) if batch_records[0].qid: batch_qids = [record.qid for record in batch_records] batch_qids = np.array(batch_qids).astype("int64").reshape( [-1, 1]) else: batch_qids = np.array([]).astype("int64").reshape([-1, 1]) # padding padded_token_ids, input_mask = pad_batch_data(batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) padded_task_ids = np.ones_like(padded_token_ids, dtype="int64") * self.task_id return_list = [ padded_token_ids, padded_text_type_ids, padded_position_ids, padded_task_ids, input_mask ] if not self.is_inference: return_list += [batch_labels, batch_qids] return return_list