def test_lm_tokenizer_without_padding(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) # ^ causal lm dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512) examples = [dataset[i] for i in range(len(dataset))] with self.assertRaises(ValueError): # Expect error due to padding token missing on gpt2: data_collator.collate_batch(examples) dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True) examples = [dataset[i] for i in range(len(dataset))] batch = data_collator.collate_batch(examples) self.assertIsInstance(batch, dict) self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512))) self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
def test_lm_tokenizer_with_padding(self): tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") data_collator = DataCollatorForLanguageModeling(tokenizer) # ^ masked lm dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512) examples = [dataset[i] for i in range(len(dataset))] batch = data_collator.collate_batch(examples) self.assertIsInstance(batch, dict) self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107))) self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107))) dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True) examples = [dataset[i] for i in range(len(dataset))] batch = data_collator.collate_batch(examples) self.assertIsInstance(batch, dict) self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512))) self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))