def tensorize(
    self, rows: List[List[str]]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Convert multiple rows of raw inputs into model input tensors.

    Args:
        rows: 1) each row is a list of raw inputs, in most cases a single
            text or a pair of texts.
            2) each row is a list of preprocessed tokens; we could still
            apply other operations (for example: bpe) on it.

    Returns:
        Padded token ids, the corresponding pad mask, and padded segment
        labels.
    """
    tokens_2d: List[List[int]] = []
    segment_labels_2d: List[List[int]] = []
    for row in rows:
        # numberize() yields (token_ids, segment_labels, seq_len); the
        # sequence length is not needed here because pad_2d_mask infers
        # padding from the row lengths themselves.
        token_ids, seg_labels, _ = self.numberize(row)
        tokens_2d.append(token_ids)
        segment_labels_2d.append(seg_labels)
    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=self.vocab.pad_idx)
    segment_labels, _ = pad_2d_mask(segment_labels_2d, pad_value=self.vocab.pad_idx)
    return tokens, pad_mask, segment_labels
def tensorize(
    self,
    tokens_2d: List[List[int]],
    segment_labels_2d: List[List[int]],
    seq_lens_1d: List[int],
    positions_2d: List[List[int]],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Convert instance level vectors into batch level tensors.

    Token ids are padded with the vocab pad index; segment labels and
    positions are padded with 0. Tensors are moved to ``self.device``
    when one is configured.
    """
    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=self.vocab.pad_idx)
    segment_labels = torch.tensor(
        pad_2d(segment_labels_2d, seq_lens=seq_lens_1d, pad_idx=0), dtype=torch.long
    )
    positions = torch.tensor(
        pad_2d(positions_2d, seq_lens=seq_lens_1d, pad_idx=0), dtype=torch.long
    )
    batch = (tokens, pad_mask, segment_labels, positions)
    if self.device == "":
        return batch
    # Move every output tensor to the configured device in one pass.
    return tuple(t.to(self.device) for t in batch)
def tensorize(
    self, rows: List[List[str]]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Numberize each input row, then pad token ids and segment labels
    into batch tensors plus a pad mask."""
    # numberize() returns (token_ids, segment_labels, seq_len) per row.
    numberized = [self.numberize(row) for row in rows]
    tokens_2d: List[List[int]] = [entry[0] for entry in numberized]
    segment_labels_2d: List[List[int]] = [entry[1] for entry in numberized]
    pad_idx = self.vocab.pad_idx
    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)
    segment_labels, _ = pad_2d_mask(segment_labels_2d, pad_value=pad_idx)
    return tokens, pad_mask, segment_labels
def tensorize(
    self,
    texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[List[str]]]] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Numberize every row (from raw texts or pre-tokenized input), then
    pad token ids, start/end indices and positions into batch tensors,
    moving them to ``self.device`` when one is configured."""
    # Each numberize() result is
    # (token_ids, seq_len, start_indices, end_indices, positions).
    numberized = [
        self.numberize(
            self.get_texts_by_index(texts, idx),
            self.get_tokens_by_index(tokens, idx),
        )
        for idx in range(self.batch_size(texts, tokens))
    ]
    tokens_2d: List[List[int]] = [entry[0] for entry in numberized]
    seq_lens: List[int] = [entry[1] for entry in numberized]
    start_indices_2d: List[List[int]] = [entry[2] for entry in numberized]
    end_indices_2d: List[List[int]] = [entry[3] for entry in numberized]
    positions_2d: List[List[int]] = [entry[4] for entry in numberized]

    pad_idx = self.vocab.pad_idx
    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)

    def _pad_to_tensor(rows_2d: List[List[int]]) -> torch.Tensor:
        # Pad to per-row sequence lengths and wrap as a long tensor.
        return torch.tensor(
            pad_2d(rows_2d, seq_lens=seq_lens, pad_idx=pad_idx), dtype=torch.long
        )

    start_indices = _pad_to_tensor(start_indices_2d)
    end_indices = _pad_to_tensor(end_indices_2d)
    positions = _pad_to_tensor(positions_2d)

    outputs = (tokens, pad_mask, start_indices, end_indices, positions)
    if self.device == "":
        return outputs
    return tuple(t.to(self.device) for t in outputs)
def tensorize(
    self,
    texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[List[str]]]] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Process raw inputs into model input tensors, it supports two input formats:
        1) multiple rows of texts (single sentence or a pair)
        2) multiple rows of pre-processed tokens (single sentence or a pair)

    This function should handle the logic of calling numberize() and also
    padding the numberized result.
    """
    # Each numberize() result is
    # (token_ids, segment_labels, seq_len, positions).
    numberized = [
        self.numberize(
            self.get_texts_by_index(texts, idx),
            self.get_tokens_by_index(tokens, idx),
        )
        for idx in range(self.batch_size(texts, tokens))
    ]
    tokens_2d: List[List[int]] = [entry[0] for entry in numberized]
    segment_labels_2d: List[List[int]] = [entry[1] for entry in numberized]
    seq_lens: List[int] = [entry[2] for entry in numberized]
    positions_2d: List[List[int]] = [entry[3] for entry in numberized]

    pad_idx = self.vocab.pad_idx
    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)
    segment_labels = torch.tensor(
        pad_2d(segment_labels_2d, seq_lens=seq_lens, pad_idx=pad_idx),
        dtype=torch.long,
    )
    positions = torch.tensor(
        pad_2d(positions_2d, seq_lens=seq_lens, pad_idx=pad_idx),
        dtype=torch.long,
    )

    outputs = (tokens, pad_mask, segment_labels, positions)
    if self.device == "":
        return outputs
    return tuple(t.to(self.device) for t in outputs)
def tensorize(
    self,
    texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[List[str]]]] = None,
    languages: Optional[List[List[str]]] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Numberize every row (raw texts or pre-tokenized input, plus
    per-row language tags) and pad the results into batch tensors.

    Args:
        texts: optional rows of raw text inputs.
        tokens: optional rows of pre-processed tokens.
        languages: optional per-row language tags; defaults to
            ``self.default_language`` for every slot when omitted.

    Returns:
        Padded token ids, pad mask, padded language ids, and padded
        positions — moved to ``self.device`` when one is configured.
    """
    batch_size: int = self.batch_size(texts, tokens)
    if languages is None:
        row_size: int = self.row_size(texts, tokens)
        # Build a fresh inner list per row: the original
        # [[...] * row_size] * batch_size aliased one list across all rows.
        languages = [[self.default_language] * row_size for _ in range(batch_size)]

    tokens_2d: List[List[int]] = []
    languages_2d: List[List[int]] = []
    seq_len_2d: List[int] = []
    positions_2d: List[List[int]] = []
    for idx in range(batch_size):
        # numberize() returns (token_ids, language_ids, seq_len, positions).
        numberized: Tuple[List[int], List[int], int, List[int]] = self.numberize(
            self.get_texts_by_index(texts, idx),
            self.get_tokens_by_index(tokens, idx),
            languages[idx],
        )
        tokens_2d.append(numberized[0])
        languages_2d.append(numberized[1])
        seq_len_2d.append(numberized[2])
        positions_2d.append(numberized[3])

    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=self.token_vocab.pad_idx)
    # Use a distinct name for the tensor instead of rebinding the
    # `languages` parameter (which holds List[List[str]]) to a Tensor.
    language_tensor = torch.tensor(
        pad_2d(languages_2d, seq_lens=seq_len_2d, pad_idx=0), dtype=torch.long
    )
    positions = torch.tensor(
        pad_2d(positions_2d, seq_lens=seq_len_2d, pad_idx=0), dtype=torch.long
    )
    if self.device == "":
        return tokens, pad_mask, language_tensor, positions
    else:
        return (
            tokens.to(self.device),
            pad_mask.to(self.device),
            language_tensor.to(self.device),
            positions.to(self.device),
        )
def tensorize(
    self,
    texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[List[str]]]] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Numberize every row, then pad token ids, segment labels and
    positions into batch tensors plus a pad mask."""
    # Each numberize() result is
    # (token_ids, segment_labels, seq_len, positions).
    numberized = [
        self.numberize(
            self.get_texts_by_index(texts, idx),
            self.get_tokens_by_index(tokens, idx),
        )
        for idx in range(self.batch_size(texts, tokens))
    ]
    tokens_2d: List[List[int]] = [entry[0] for entry in numberized]
    segment_labels_2d: List[List[int]] = [entry[1] for entry in numberized]
    seq_lens: List[int] = [entry[2] for entry in numberized]
    positions_2d: List[List[int]] = [entry[3] for entry in numberized]

    pad_idx = self.vocab.pad_idx
    tokens, pad_mask = pad_2d_mask(tokens_2d, pad_value=pad_idx)
    segment_labels = torch.tensor(
        pad_2d(segment_labels_2d, seq_lens=seq_lens, pad_idx=pad_idx),
        dtype=torch.long,
    )
    positions = torch.tensor(
        pad_2d(positions_2d, seq_lens=seq_lens, pad_idx=pad_idx),
        dtype=torch.long,
    )
    return tokens, pad_mask, segment_labels, positions
def tensorize(
    self,
    texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[List[str]]]] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Numberize every row, then pad token ids, start/end indices and
    positions into batch tensors under the configured padding controls,
    moving them to ``self.device`` when one is configured."""
    tokens_2d: List[List[int]] = []
    start_indices_2d: List[List[int]] = []
    end_indices_2d: List[List[int]] = []
    positions_2d: List[List[int]] = []
    for idx in range(self.batch_size(texts, tokens)):
        # numberize() returns
        # (token_ids, start_indices, end_indices, positions).
        token_ids, starts, ends, pos = self.numberize(
            self.get_texts_by_index(texts, idx),
            self.get_tokens_by_index(tokens, idx),
        )
        tokens_2d.append(token_ids)
        start_indices_2d.append(starts)
        end_indices_2d.append(ends)
        positions_2d.append(pos)

    # The same padding controls apply to every padded output.
    pad_control_kwargs = dict(
        seq_padding_control=self.seq_padding_control,
        max_seq_pad_len=self.max_seq_len,
        batch_padding_control=self.batch_padding_control,
    )
    tokens, pad_mask = pad_2d_mask(
        tokens_2d, pad_value=self.vocab.pad_idx, **pad_control_kwargs
    )
    start_indices, _ = pad_2d_mask(start_indices_2d, pad_value=0, **pad_control_kwargs)
    end_indices, _ = pad_2d_mask(end_indices_2d, pad_value=0, **pad_control_kwargs)
    positions, _ = pad_2d_mask(positions_2d, pad_value=0, **pad_control_kwargs)

    outputs = (tokens, pad_mask, start_indices, end_indices, positions)
    if self.device == "":
        return outputs
    return tuple(t.to(self.device) for t in outputs)