Example #1
# From HanLP: a method of a transformer-based tokenizer component, so `self`
# is the component. Imports it relies on (module paths assumed from HanLP's layout):
import logging

from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizer

from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.transform import FieldLength, TransformList
from hanlp.datasets.tokenization.txt import TextTokenizingDataset, generate_token_span_tuple
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer


def build_dataloader(self,
                     data,
                     transform: TransformList = None,
                     training=False,
                     device=None,
                     logger: logging.Logger = None,
                     tokenizer: PreTrainedTokenizer = None,
                     **kwargs) -> DataLoader:
    # A transformer tokenizer is mandatory for this component.
    assert tokenizer
    # Cache the dataset when `data` is a file path; generate sample indices
    # when it is an in-memory list so outputs can be restored to input order.
    dataset = TextTokenizingDataset(data, cache=isinstance(data, str), delimiter=self.config.sent_delimiter,
                                    generate_idx=isinstance(data, list),
                                    max_seq_len=self.config.max_seq_len,
                                    sent_delimiter=self.config.sent_delimiter,
                                    transform=[
                                        # Subword-tokenize the 'text' field into 'text_input_ids'.
                                        TransformerSequenceTokenizer(tokenizer,
                                                                     'text',
                                                                     ret_prefix_mask=True,
                                                                     ret_subtokens=True),
                                        # Record sequence length; delta=-2 presumably excludes the
                                        # two special tokens ([CLS]/[SEP]) a BERT-style tokenizer adds.
                                        FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
                                        generate_token_span_tuple])
    # Batch with a length-aware sampler and pad every field to the longest
    # sequence in its batch before moving tensors to `device`.
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids', 'text'),
                                                 shuffle=training),
        device=device,
        dataset=dataset)
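The PadSequenceDataLoader returned in Example #1 pads each variable-length field (here, text_input_ids) to the longest sequence in its batch so samples stack into rectangular tensors. A minimal standalone sketch of that padding behavior, written with plain PyTorch rather than HanLP's actual implementation:

import torch
from torch.nn.utils.rnn import pad_sequence

# Two tokenized sentences of different lengths (the ids are illustrative only).
batch = [torch.tensor([101, 1920, 102]),
         torch.tensor([101, 1920, 4511, 3698, 102])]
# Pad the shorter sequence with zeros so the batch forms one rectangular tensor.
padded = pad_sequence(batch, batch_first=True, padding_value=0)
print(padded.shape)  # torch.Size([2, 5])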
Example #2
def build_dataset(self, data, **kwargs):
    # Thin factory: forward any dataset options (cache, delimiter,
    # max_seq_len, transform, ...) straight to the dataset class.
    return TextTokenizingDataset(data, **kwargs)
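Because build_dataset forwards everything to TextTokenizingDataset, the dataset can be fed either a plain-text file path (cached on first load) or an in-memory list, mirroring the isinstance checks in Example #1. A minimal usage sketch, assuming the hanlp.datasets.tokenization.txt module path and that in-memory samples are dicts keyed by 'text' (both are assumptions, not verified API details):

from hanlp.datasets.tokenization.txt import TextTokenizingDataset

# Hypothetical in-memory input; generate_idx records each sample's original
# position so predictions can be restored to input order after batching.
samples = [{'text': '商品和服务'}, {'text': '欢迎使用HanLP'}]
dataset = TextTokenizingDataset(samples, generate_idx=True)
print(dataset[0])  # the first sample dict, with its generated index attached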