Code example #1
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(padded_token_ids,
                                       dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask, seq_lens
        ]

        return return_list
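
All of these snippets delegate the actual padding to a pad_batch_data helper from the ERNIE/LARK batching utilities, which is not shown on this page and whose keyword arguments differ slightly between forks (return_seq_lens, return_attn_bias, max_len / max_seq_len, ...). The following is only a minimal sketch of the behavior the call sites assume: pad every id sequence to the longest one in the batch with pad_idx, and optionally return a float input mask and the per-example sequence lengths.

import numpy as np


def pad_batch_data(insts,
                   pad_idx=0,
                   return_input_mask=False,
                   return_seq_lens=False):
    """Minimal sketch of the padding helper assumed by the snippets.

    Pads each id list in `insts` to the length of the longest one with
    `pad_idx`; optionally also returns an input mask (1.0 for real
    tokens, 0.0 for padding) and the original sequence lengths.
    """
    return_list = []
    max_len = max(len(inst) for inst in insts)

    # [batch_size, max_len, 1] int64 ids, padded with pad_idx
    padded_ids = np.array(
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
    return_list += [padded_ids.astype("int64").reshape([-1, max_len, 1])]

    if return_input_mask:
        input_mask = np.array(
            [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
        return_list += [input_mask.astype("float32").reshape([-1, max_len, 1])]

    if return_seq_lens:
        seq_lens = np.array([len(inst) for inst in insts])
        return_list += [seq_lens.astype("int64").reshape([-1, 1])]

    return return_list if len(return_list) > 1 else return_list[0]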
Code example #2
File: task_reader.py  Project: yxWisdom/LARK
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_labels = [record.label_id for record in batch_records]
        batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])

        if batch_records[0].qid is not None:
            batch_qids = [record.qid for record in batch_records]
            batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
        else:
            batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(batch_token_ids,
                                                      pad_idx=self.pad_id,
                                                      return_input_mask=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             pad_idx=self.pad_id)

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            input_mask, batch_labels, batch_qids
        ]

        return return_list
Code example #3
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        # added: collect batch_adjacency_matrix
        batch_adjacency_matrix = [
            record.adjacency_matrix for record in batch_records
        ]
        # added: collect batch_head_ids
        batch_head_ids = np.array(
            [record.head_ids for record in batch_records]).astype("int64")

        if not self.is_inference:
            batch_labels = [record.label_id for record in batch_records]
            if self.is_classify:
                batch_labels = np.array(batch_labels).astype("int64").reshape(
                    [-1, 1])
            elif self.is_regression:
                batch_labels = np.array(batch_labels).astype(
                    "float32").reshape([-1, 1])

            if batch_records[0].qid:
                batch_qids = [record.qid for record in batch_records]
                batch_qids = np.array(batch_qids).astype("int64").reshape(
                    [-1, 1])
            else:
                batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        # padding
        # added: max_len=self.max_seq_len, so every batch is padded to the maximum sequence length
        padded_token_ids, input_mask = pad_batch_data(batch_token_ids,
                                                      max_len=self.max_seq_len,
                                                      pad_idx=self.pad_id,
                                                      return_input_mask=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              max_len=self.max_seq_len,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             max_len=self.max_seq_len,
                                             pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(padded_token_ids,
                                       dtype="int64") * self.task_id
        padded_adjacency_matrix = pad_batch_graphs(batch_adjacency_matrix,
                                                   max_len=self.max_seq_len)

        return_list = [
            padded_token_ids,
            padded_text_type_ids,
            padded_position_ids,
            padded_task_ids,
            input_mask,
        ]
        if not self.is_inference:
            return_list += [batch_labels, batch_qids]
        # added: also return padded_adjacency_matrix and batch_head_ids
        return_list += [padded_adjacency_matrix, batch_head_ids]

        return return_list
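
Code example #3 additionally calls pad_batch_graphs, a fork-specific helper that is not shown here. A hypothetical sketch of what the call site appears to assume, namely zero-padding each record's square adjacency matrix to [max_len, max_len] and stacking the results into one batch array:

import numpy as np


def pad_batch_graphs(adjacency_matrices, max_len):
    """Hypothetical sketch: zero-pad each [seq_len, seq_len] adjacency
    matrix to [max_len, max_len] and stack them into a single
    [batch_size, max_len, max_len] float array."""
    padded = np.zeros((len(adjacency_matrices), max_len, max_len),
                      dtype="float32")
    for i, adj in enumerate(adjacency_matrices):
        adj = np.asarray(adj)
        n = adj.shape[0]
        padded[i, :n, :n] = adj
    return padded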
Code example #4
File: task_reader.py  Project: yxWisdom/LARK
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             pad_idx=self.pad_id)
        padded_label_ids = pad_batch_data(batch_label_ids,
                                          pad_idx=len(self.label_map) - 1)

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            input_mask, padded_label_ids, batch_seq_lens
        ]
        return return_list
Code example #5
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]

        if not self.is_inference:
            batch_labels = [record.label_id for record in batch_records]
            batch_labels = np.array(batch_labels).astype("int64").reshape(
                [-1, 1])

            if batch_records[0].qid:
                batch_qids = [record.qid for record in batch_records]
                batch_qids = np.array(batch_qids).astype("int64").reshape(
                    [-1, 1])
            else:
                batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(
            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(
            padded_token_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask
        ]
        if not self.is_inference:
            return_list += [batch_labels, batch_qids]

        return return_list
Code example #6
File: task_reader.py  Project: zw331/DDParser
    def _pad_batch_records(self, batch_records, is_training):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]
        if is_training:
            batch_start_position = [record.start_position for record in batch_records]
            batch_end_position = [record.end_position for record in batch_records]
            batch_start_position = np.array(batch_start_position).astype("int64").reshape([-1, 1])
            batch_end_position = np.array(batch_end_position).astype("int64").reshape([-1, 1])

        else:
            batch_size = len(batch_token_ids)
            batch_start_position = np.zeros(shape=[batch_size, 1], dtype="int64")
            batch_end_position = np.zeros(shape=[batch_size, 1], dtype="int64")

        batch_unique_ids = [record.unique_id for record in batch_records]
        batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape([-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(padded_token_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids, padded_task_ids, input_mask,
            batch_start_position, batch_end_position, batch_unique_ids
        ]

        return return_list
Code example #7
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]
        batch_seq_lens = [len(record.token_ids) for record in batch_records]

        # padding
        padded_token_ids, self_attn_bias = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_next_sent_pos=False,
            return_attn_bias=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             pad_idx=self.pad_id)
        padded_label_ids = pad_batch_data(batch_label_ids,
                                          pad_idx=len(self.label_map) - 1)
        batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape(
            [-1, 1])

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            self_attn_bias, padded_label_ids, batch_seq_lens
        ]
        return return_list
Code example #8
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_labels = [record.label_id for record in batch_records]
        batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])

        if batch_records[0].qid:
            batch_qids = [record.qid for record in batch_records]
            batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
        else:
            batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        # padding
        padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_next_sent_pos=True,
            return_attn_bias=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             pad_idx=self.pad_id)

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            self_attn_bias, batch_labels, next_sent_index, batch_qids
        ]

        return return_list
Code example #9
File: task_reader.py  Project: zhhezhhe/Research
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]
        batch_example_index = [
            record.example_index for record in batch_records
        ]
        batch_tok_to_orig_start_index = [
            record.tok_to_orig_start_index for record in batch_records
        ]
        batch_tok_to_orig_end_index = [
            record.tok_to_orig_end_index for record in batch_records
        ]
        # padding
        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(batch_text_type_ids,
                                              pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(batch_position_ids,
                                             pad_idx=self.pad_id)

        # label padding for the expanded (one-hot) label dimension
        outside_label = np.array([1] + [0] * (self.num_labels - 1))
        max_len = max(len(inst) for inst in batch_label_ids)
        padded_label_ids = []
        for i, inst in enumerate(batch_label_ids):
            inst = np.concatenate((np.array(inst),
                                   np.tile(outside_label,
                                           ((max_len - len(inst)), 1))),
                                  axis=0)
            padded_label_ids.append(inst)
        padded_label_ids = np.stack(padded_label_ids).astype("float32")

        padded_tok_to_orig_start_index = np.array([
            inst + [0] * (max_len - len(inst))
            for inst in batch_tok_to_orig_start_index
        ])
        padded_tok_to_orig_end_index = np.array([
            inst + [0] * (max_len - len(inst))
            for inst in batch_tok_to_orig_end_index
        ])

        padded_task_ids = np.ones_like(padded_token_ids,
                                       dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens,
            batch_example_index, padded_tok_to_orig_start_index,
            padded_tok_to_orig_end_index
        ]
        return return_list
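
The label-padding block in code example #9 pads one-hot label sequences with an all-"outside" row instead of a scalar pad id, so the padded labels keep their expanded [batch_size, max_len, num_labels] shape. A small standalone demonstration of the same idea (num_labels and the label values below are made up for illustration):

import numpy as np

num_labels = 3  # hypothetical label-set size; index 0 is the "outside" label
outside_label = np.array([1] + [0] * (num_labels - 1))

# two one-hot label sequences of different lengths (made-up data)
batch_label_ids = [
    [[0, 1, 0], [0, 0, 1]],  # length 2
    [[1, 0, 0]],             # length 1
]
max_len = max(len(inst) for inst in batch_label_ids)

padded_label_ids = []
for inst in batch_label_ids:
    pad_rows = np.tile(outside_label, (max_len - len(inst), 1))
    padded_label_ids.append(np.concatenate((np.array(inst), pad_rows), axis=0))
padded_label_ids = np.stack(padded_label_ids).astype("float32")

print(padded_label_ids.shape)  # (2, 2, 3)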
Code example #10
    def _pad_batch_records(self, batch_records):
        """
        batch_records: List of Records
        """
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]

        batch_char_ids = [record.char_ids for record in batch_records]
        batch_char_text_type_ids = [record.char_text_type_ids for record in batch_records]
        batch_char_position_ids = [record.char_position_ids for record in batch_records]

        batch_label_ids = [record.label_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)

        padded_char_ids, char_input_mask, batch_word_lens = self._pad_batch_char_data(
            batch_char_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_word_lens=True)
        padded_char_text_type_ids = self._pad_batch_char_data(
            batch_char_text_type_ids, pad_idx=self.pad_id)
        padded_char_position_ids = self._pad_batch_char_data(
            batch_char_position_ids, pad_idx=self.pad_id)

        padded_label_ids = pad_batch_data(
            batch_label_ids, pad_idx=len(self.label_map) - 1)
        padded_task_ids = np.ones_like(
            padded_token_ids, dtype="int64") * self.task_id
        padded_char_task_ids = np.ones_like(
            padded_char_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens,
            padded_char_ids, char_input_mask, batch_word_lens, 
            padded_char_text_type_ids, padded_char_position_ids, padded_char_task_ids
        ]
        return return_list
Code example #11
    def pad_batch(self, token_ids, text_type_ids, position_ids):
        batch_token_ids = [token_ids]
        batch_text_type_ids = [text_type_ids]
        batch_position_ids = [position_ids]

        padded_token_ids, input_mask = pad_batch_data(
            batch_token_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id,
            return_input_mask=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id)
        return padded_token_ids, padded_position_ids, padded_text_type_ids, input_mask
Code example #12
    def next_predict_batch(self, batch_size):
        if self.pos >= len(self.data):
            self.pos = 0
            return None
        else:
            batch = self.data[self.pos:self.pos + batch_size]
            self.pos += batch_size
            token_ids_batch, sent_ids_batch, pos_ids_batch, input_mask_batch, mask_pos_batch, mask_label_batch = self.parse_batch(
                batch)
            # apply padding
            token_ids_batch, input_mask_batch = pad_batch_data(
                token_ids_batch, pad_idx=self.pad_id, return_input_mask=True)
            sent_ids_batch = pad_batch_data(sent_ids_batch,
                                            pad_idx=self.pad_id)
            pos_ids_batch = pad_batch_data(pos_ids_batch, pad_idx=self.pad_id)
            # input_mask_batch = pad_batch_data(input_mask_batch, pad_idx=self.pad_id)
            mask_pos_batch = np.array(mask_pos_batch).astype("int64").reshape(
                [-1, 1])
            mask_label_batch = np.array(mask_label_batch).astype(
                "int64").reshape([-1, 1])
            # mask_pos_batch is not padded here, whereas ERNIE appears to pad it
            return token_ids_batch, sent_ids_batch, pos_ids_batch, input_mask_batch, mask_pos_batch, mask_label_batch
Code example #13
    def _pad_batch_records(self, batch_records):
        """_pad_batch_records"""
        batch_token_query_ids = [
            record.token_query_ids for record in batch_records
        ]
        batch_text_type_query_ids = [
            record.text_type_query_ids for record in batch_records
        ]
        batch_position_query_ids = [
            record.position_query_ids for record in batch_records
        ]
        batch_token_left_ids = [
            record.token_left_ids for record in batch_records
        ]
        batch_text_type_left_ids = [
            record.text_type_left_ids for record in batch_records
        ]
        batch_position_left_ids = [
            record.position_left_ids for record in batch_records
        ]
        batch_token_right_ids = [
            record.token_right_ids for record in batch_records
        ]
        batch_text_type_right_ids = [
            record.text_type_right_ids for record in batch_records
        ]
        batch_position_right_ids = [
            record.position_right_ids for record in batch_records
        ]

        if batch_records[0].qid:
            batch_qids = [record.qid for record in batch_records]
            batch_qids = np.array(batch_qids).astype("int64").reshape(
                [-1, 1], )
        else:
            batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        if not self.is_inference:
            batch_labels = [record.label_id for record in batch_records]
            batch_types = [record.type_id for record in batch_records]
            if self.is_classify:
                batch_labels = np.array(batch_labels).astype("int64").reshape(
                    [-1, 1], )
                batch_types = np.array(batch_types).astype("int64").reshape(
                    [-1, 1], )
            elif self.is_regression:
                batch_labels = np.array(batch_labels).astype(
                    "float32").reshape([-1, 1], )
        else:
            if batch_records[0].ent_id:
                batch_ent_ids = [record.ent_id for record in batch_records]
                batch_ent_ids = np.array(batch_ent_ids).reshape([-1, 1], )
            else:
                batch_ent_ids = np.array([]).reshape([-1, 1])

        # padding
        padded_token_query_ids, input_query_mask = pad_batch_data(
            batch_token_query_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
        )
        padded_text_type_query_ids = pad_batch_data(
            batch_text_type_query_ids,
            pad_idx=self.pad_id,
        )
        padded_position_query_ids = pad_batch_data(
            batch_position_query_ids,
            pad_idx=self.pad_id,
        )
        padded_task_query_ids = np.ones_like(
            padded_token_query_ids,
            dtype="int64",
        ) * self.task_id

        padded_token_left_ids, input_left_mask = pad_batch_data(
            batch_token_left_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
        )
        padded_text_type_left_ids = pad_batch_data(
            batch_text_type_left_ids,
            pad_idx=self.pad_id,
        )
        padded_position_left_ids = pad_batch_data(
            batch_position_left_ids,
            pad_idx=self.pad_id,
        )
        padded_task_left_ids = np.ones_like(
            padded_token_left_ids,
            dtype="int64",
        ) * self.task_id

        padded_token_right_ids, input_right_mask = pad_batch_data(
            batch_token_right_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
        )
        padded_text_type_right_ids = pad_batch_data(
            batch_text_type_right_ids,
            pad_idx=self.pad_id,
        )
        padded_position_right_ids = pad_batch_data(
            batch_position_right_ids,
            pad_idx=self.pad_id,
        )
        padded_task_right_ids = np.ones_like(
            padded_token_right_ids,
            dtype="int64",
        ) * self.task_id

        return_list = [
            padded_token_query_ids,
            padded_text_type_query_ids,
            padded_position_query_ids,
            padded_task_query_ids,
            input_query_mask,
            padded_token_left_ids,
            padded_text_type_left_ids,
            padded_position_left_ids,
            padded_task_left_ids,
            input_left_mask,
            padded_token_right_ids,
            padded_text_type_right_ids,
            padded_position_right_ids,
            padded_task_right_ids,
            input_right_mask,
        ]
        if not self.is_inference:
            return_list += [batch_labels, batch_types, batch_qids]
        else:
            return_list += [batch_qids, batch_ent_ids]
        return return_list