Example no. 1
0
    def tensorize(
        self, rows: List[List[str]]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Convert multiple rows of raw inputs into model input tensors.

        Args:
            rows: 1) each row is a list of raw inputs, in most cases a
                single text or a pair of texts.
                2) each row is a list of preprocessed tokens; other
                operations (for example: bpe) may still be applied to it.

        Returns:
            Padded token ids, the padding mask, and padded segment labels.
        """

        tokens_2d: List[List[int]] = []
        segment_labels_2d: List[List[int]] = []

        for row in rows:
            # numberize() yields (token_ids, segment_labels, seq_len); the
            # sequence length is not needed here, pad_2d_mask infers padding
            # from the row lengths themselves.
            token_ids, seg_labels, _seq_len = self.numberize(row)
            tokens_2d.append(token_ids)
            segment_labels_2d.append(seg_labels)

        tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=self.vocab.pad_idx)
        segment_labels, _ = pad_2d_mask(segment_labels_2d,
                                        pad_value=self.vocab.pad_idx)
        return tokens, pad_mask, segment_labels
Example no. 2
0
 def tensorize(
     self,
     tokens_2d: List[List[int]],
     segment_labels_2d: List[List[int]],
     seq_lens_1d: List[int],
     positions_2d: List[List[int]],
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """Batch instance-level vectors into padded model-input tensors."""
     tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=self.vocab.pad_idx)
     padded_segments = pad_2d(segment_labels_2d, seq_lens=seq_lens_1d, pad_idx=0)
     segment_labels = torch.tensor(padded_segments, dtype=torch.long)
     padded_positions = pad_2d(positions_2d, seq_lens=seq_lens_1d, pad_idx=0)
     positions = torch.tensor(padded_positions, dtype=torch.long)

     # An empty device string means: leave the tensors where they are.
     if self.device == "":
         return tokens, pad_mask, segment_labels, positions
     return (
         tokens.to(self.device),
         pad_mask.to(self.device),
         segment_labels.to(self.device),
         positions.to(self.device),
     )
Example no. 3
0
    def tensorize(
        self, rows: List[List[str]]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Numberize every input row, then pad the results into batch tensors."""
        numberized_rows = [self.numberize(row) for row in rows]
        tokens_2d = [n[0] for n in numberized_rows]
        segment_labels_2d = [n[1] for n in numberized_rows]
        seq_len_2d = [n[2] for n in numberized_rows]  # collected but unused below

        pad_idx = self.vocab.pad_idx
        tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)
        segment_labels, _ = pad_2d_mask(segment_labels_2d, pad_value=pad_idx)
        return tokens, pad_mask, segment_labels
Example no. 4
0
    def tensorize(
        self,
        texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[List[str]]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        """Numberize raw texts/tokens and pad them into batch-level tensors."""
        tokens_2d: List[List[int]] = []
        seq_len_2d: List[int] = []
        start_indices_2d: List[List[int]] = []
        end_indices_2d: List[List[int]] = []
        positions_2d: List[List[int]] = []

        for idx in range(self.batch_size(texts, tokens)):
            row_tokens, row_len, row_starts, row_ends, row_positions = (
                self.numberize(
                    self.get_texts_by_index(texts, idx),
                    self.get_tokens_by_index(tokens, idx),
                )
            )
            tokens_2d.append(row_tokens)
            seq_len_2d.append(row_len)
            start_indices_2d.append(row_starts)
            end_indices_2d.append(row_ends)
            positions_2d.append(row_positions)

        pad_idx = self.vocab.pad_idx
        tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)

        def _pad_to_tensor(rows_2d: List[List[int]]) -> torch.Tensor:
            # The three index/position outputs share identical padding.
            return torch.tensor(
                pad_2d(rows_2d, seq_lens=seq_len_2d, pad_idx=pad_idx),
                dtype=torch.long,
            )

        start_indices = _pad_to_tensor(start_indices_2d)
        end_indices = _pad_to_tensor(end_indices_2d)
        positions = _pad_to_tensor(positions_2d)

        # An empty device string means: leave the tensors where they are.
        if self.device == "":
            return tokens, pad_mask, start_indices, end_indices, positions
        return (
            tokens.to(self.device),
            pad_mask.to(self.device),
            start_indices.to(self.device),
            end_indices.to(self.device),
            positions.to(self.device),
        )
Example no. 5
0
    def tensorize(
        self,
        texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[List[str]]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Process raw inputs into model input tensors; two input formats
        are supported:
            1) multiple rows of texts (single sentence or a pair)
            2) multiple rows of pre-processed tokens (single sentence or a pair)

        Calls numberize() once per row, then pads the numberized results
        into batch-level tensors.
        """
        numberized = [
            self.numberize(
                self.get_texts_by_index(texts, idx),
                self.get_tokens_by_index(tokens, idx),
            )
            for idx in range(self.batch_size(texts, tokens))
        ]
        tokens_2d = [n[0] for n in numberized]
        segment_labels_2d = [n[1] for n in numberized]
        seq_len_2d = [n[2] for n in numberized]
        positions_2d = [n[3] for n in numberized]

        pad_idx = self.vocab.pad_idx
        tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)
        segment_labels = torch.tensor(
            pad_2d(segment_labels_2d, seq_lens=seq_len_2d, pad_idx=pad_idx),
            dtype=torch.long,
        )
        positions = torch.tensor(
            pad_2d(positions_2d, seq_lens=seq_len_2d, pad_idx=pad_idx),
            dtype=torch.long,
        )

        # An empty device string means: leave the tensors where they are.
        if self.device == "":
            return tokens, pad_mask, segment_labels, positions
        return (
            tokens.to(self.device),
            pad_mask.to(self.device),
            segment_labels.to(self.device),
            positions.to(self.device),
        )
Example no. 6
0
    def tensorize(
        self,
        texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[List[str]]]] = None,
        languages: Optional[List[List[str]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Numberize inputs (with per-row languages) and pad into tensors.

        Args:
            texts: rows of raw texts (single sentence or a pair), or None.
            tokens: rows of pre-processed tokens, or None.
            languages: per-row language labels; when None, every position
                defaults to ``self.default_language``.

        Returns:
            Padded token ids, padding mask, language ids and positions,
            moved to ``self.device`` unless it is the empty string.
        """
        # unwrap Optional
        batch_size: int = self.batch_size(texts, tokens)
        row_size: int = self.row_size(texts, tokens)

        if languages is None:
            # Build one independent list per row. The original used
            # [[...]] * batch_size, which aliases the same inner list
            # across all rows — harmless while read-only, but fragile.
            languages = [
                [self.default_language] * row_size for _ in range(batch_size)
            ]

        tokens_2d: List[List[int]] = []
        languages_2d: List[List[int]] = []
        seq_len_2d: List[int] = []
        positions_2d: List[List[int]] = []

        for idx in range(batch_size):
            numberized: Tuple[List[int], List[int], int,
                              List[int]] = self.numberize(
                                  self.get_texts_by_index(texts, idx),
                                  self.get_tokens_by_index(tokens, idx),
                                  languages[idx],
                              )
            tokens_2d.append(numberized[0])
            languages_2d.append(numberized[1])
            seq_len_2d.append(numberized[2])
            positions_2d.append(numberized[3])

        tokens, pad_mask = pad_2d_mask(tokens_2d,
                                       pad_value=self.token_vocab.pad_idx)
        # Use a distinct name instead of rebinding the `languages`
        # parameter (List[List[str]]) to a Tensor of ids.
        language_ids = torch.tensor(pad_2d(languages_2d,
                                           seq_lens=seq_len_2d,
                                           pad_idx=0),
                                    dtype=torch.long)
        positions = torch.tensor(pad_2d(positions_2d,
                                        seq_lens=seq_len_2d,
                                        pad_idx=0),
                                 dtype=torch.long)

        # An empty device string means: leave the tensors where they are.
        if self.device == "":
            return tokens, pad_mask, language_ids, positions
        return (
            tokens.to(self.device),
            pad_mask.to(self.device),
            language_ids.to(self.device),
            positions.to(self.device),
        )
Example no. 7
0
    def tensorize(
        self,
        texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[List[str]]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Numberize each input row and pad into batch-level tensors."""
        numberized = [
            self.numberize(
                self.get_texts_by_index(texts, i),
                self.get_tokens_by_index(tokens, i),
            )
            for i in range(self.batch_size(texts, tokens))
        ]
        tokens_2d = [n[0] for n in numberized]
        segment_labels_2d = [n[1] for n in numberized]
        seq_len_2d = [n[2] for n in numberized]
        positions_2d = [n[3] for n in numberized]

        pad_idx = self.vocab.pad_idx
        tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)
        segment_labels = torch.tensor(
            pad_2d(segment_labels_2d, seq_lens=seq_len_2d, pad_idx=pad_idx),
            dtype=torch.long,
        )
        positions = torch.tensor(
            pad_2d(positions_2d, seq_lens=seq_len_2d, pad_idx=pad_idx),
            dtype=torch.long,
        )
        return tokens, pad_mask, segment_labels, positions
Example no. 8
0
    def tensorize(
        self,
        texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[List[str]]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        """Numberize inputs and pad every output with pad_2d_mask, honoring
        the tensorizer's sequence/batch padding controls."""
        tokens_2d: List[List[int]] = []
        start_indices_2d: List[List[int]] = []
        end_indices_2d: List[List[int]] = []
        positions_2d: List[List[int]] = []

        for i in range(self.batch_size(texts, tokens)):
            row: Tuple[List[int], List[int], List[int],
                       List[int]] = self.numberize(
                           self.get_texts_by_index(texts, i),
                           self.get_tokens_by_index(tokens, i),
                       )
            tokens_2d.append(row[0])
            start_indices_2d.append(row[1])
            end_indices_2d.append(row[2])
            positions_2d.append(row[3])

        def _pad(rows_2d: List[List[int]], pad_value: int):
            # All four outputs share identical padding-control settings.
            return pad_2d_mask(
                rows_2d,
                pad_value=pad_value,
                seq_padding_control=self.seq_padding_control,
                max_seq_pad_len=self.max_seq_len,
                batch_padding_control=self.batch_padding_control,
            )

        tokens, pad_mask = _pad(tokens_2d, self.vocab.pad_idx)
        start_indices, _ = _pad(start_indices_2d, 0)
        end_indices, _ = _pad(end_indices_2d, 0)
        positions, _ = _pad(positions_2d, 0)

        # An empty device string means: leave the tensors where they are.
        if self.device == "":
            return tokens, pad_mask, start_indices, end_indices, positions
        return (
            tokens.to(self.device),
            pad_mask.to(self.device),
            start_indices.to(self.device),
            end_indices.to(self.device),
            positions.to(self.device),
        )