def test_data_collator_for_language_modeling(self):
    tokenizer = BertTokenizer(self.vocab_file)
    no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
    pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]

    # Causal LM (mlm=False): labels mirror input_ids, so shapes must match.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    # Without a pad token, collating unequal-length features must fail.
    tokenizer._pad_token = None
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing
        data_collator(pad_features)

    set_seed(42)  # For reproducibility
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    # Non-masked positions must carry the -100 ignore index.
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
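# --- Hedged usage sketch (not part of the test suite) ----------------------
# How the collator exercised above is typically driven in training code.
# AutoTokenizer and the checkpoint name "bert-base-uncased" are assumptions
# made for illustration; the test itself builds a BertTokenizer from a local
# vocab file instead.
def _example_causal_lm_collation():
    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    # Unequal-length encodings are padded to the longest sequence in the batch.
    features = [tokenizer("a short example"), tokenizer("a slightly longer example sentence")]
    batch = collator(features)
    # With mlm=False, labels copy input_ids, with pad positions set to -100.
    return batch["input_ids"].shape, batch["labels"].shape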
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    # pad_to_multiple_of=8 rounds the padded width up from 10 to 16.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))

    tokenizer._pad_token = None
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing
        data_collator(pad_features)

    set_seed(42)  # For reproducibility
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    # Same MLM checks with pad_to_multiple_of=8, so the width becomes 16.
    data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
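# Why the pad_to_multiple_of=8 cases above expect width 16: padding rounds the
# longest sequence length (10) up to the next multiple of 8. Illustrative
# helper only; this is not part of the library API.
def _expected_padded_length(max_len: int, multiple: int) -> int:
    import math

    # Smallest multiple of `multiple` that is >= max_len, e.g. (10, 8) -> 16.
    return math.ceil(max_len / multiple) * multiple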
# TensorFlow counterpart of the helper above (upstream it lives on a separate
# TF test class); shapes are checked via .shape.as_list().
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 10])

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 10])

    data_collator = DataCollatorForLanguageModeling(
        tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf"
    )
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 16])

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 16])

    tokenizer._pad_token = None
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing
        data_collator(pad_features)

    set_seed(42)  # For reproducibility
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(tf.reduce_any(masked_tokens))
    # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))

    batch = data_collator(pad_features, return_tensors="tf")
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(tf.reduce_any(masked_tokens))
    # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))

    data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(tf.reduce_any(masked_tokens))
    # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))

    batch = data_collator(pad_features, return_tensors="tf")
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(tf.reduce_any(masked_tokens))
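# The label checks left commented out in the TF variant above can be written
# with a NumPy round-trip. Hedged sketch of a hypothetical helper mirroring
# the PyTorch assertions; like them, it relies on the fixed seed, since a
# token selected for masking is not always replaced by [MASK].
def _assert_non_masked_labels_ignored(test_case, batch, tokenizer):
    labels = batch["labels"].numpy()
    masked_tokens = (batch["input_ids"] == tokenizer.mask_token_id).numpy()
    # Positions not replaced by [MASK] should carry the -100 ignore index.
    test_case.assertTrue(all(x == -100 for x in labels[~masked_tokens].tolist()))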