class ScheduleInitTest(unittest.TestCase):
    m = torch.nn.Linear(50, 50) if is_torch_available() else None
    optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None
    num_steps = 10

    def assertListAlmostEqual(self, list1, list2, tol):
        self.assertEqual(len(list1), len(list2))
        for a, b in zip(list1, list2):
            self.assertAlmostEqual(a, b, delta=tol)

    def test_constant_scheduler(self):
        scheduler = get_constant_schedule(self.optimizer)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [10.0] * self.num_steps
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

        scheduler = get_constant_schedule(self.optimizer)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_constant_scheduler(self):
        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_linear_scheduler(self):
        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_cosine_scheduler(self):
        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_cosine_hard_restart_scheduler(self):
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
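# ScheduleInitTest relies on two small helpers defined elsewhere in this
# module. The sketches below show plausible implementations (assuming `os`,
# `tempfile`, and `torch` are imported at the top of the file); the exact
# canonical implementations may differ.
def unwrap_schedule(scheduler, num_steps=10):
    # step first, then record, so the warmup values above start at 2.5 rather than 0
    lrs = []
    for _ in range(num_steps):
        scheduler.step()
        lrs.append(scheduler.get_lr())  # one list of per-group lrs per step
    return lrs


def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
    lrs = []
    for step in range(num_steps):
        scheduler.step()
        lrs.append(scheduler.get_lr())
        if step == num_steps // 2:
            # round-trip through state_dict to verify save/reload keeps the schedule
            with tempfile.TemporaryDirectory() as tmpdirname:
                file_name = os.path.join(tmpdirname, "schedule.bin")
                torch.save(scheduler.state_dict(), file_name)
                state_dict = torch.load(file_name)
                scheduler.load_state_dict(state_dict)
    return lrs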
class BertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (
        BertModel, BertForMaskedLM, BertForNextSentencePrediction,
        BertForPreTraining, BertForQuestionAnswering,
        BertForSequenceClassification, BertForTokenClassification
    ) if is_torch_available() else ()

    class BertModelTester(object):

        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
            device='cpu',
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.device = device

            # 1. superset of bert input/output descs
            # see BertPreTrainedModel doc
            self.input_ids_desc = IODescription(
                'input_ids', ['batch', 'max_seq_len_in_batch'], torch.int64,
                num_classes=self.vocab_size)
            self.attention_mask_desc = IODescription(
                'attention_mask', ['batch', 'max_seq_len_in_batch'],
                torch.int64, num_classes=2)
            self.token_type_ids_desc = IODescription(
                'token_type_ids', ['batch', 'max_seq_len_in_batch'],
                torch.int64, num_classes=2)
            self.position_ids_desc = IODescription(
                'position_ids', ['batch', 'max_seq_len_in_batch'],
                torch.int64, num_classes=self.max_position_embeddings)
            self.head_mask_desc = IODescription(
                'head_mask', [self.num_hidden_layers, self.num_attention_heads],
                torch.int64, num_classes=2)
            self.inputs_embeds_desc = IODescription(
                'inputs_embeds', ['batch', 'max_seq_len_in_batch', self.hidden_size],
                torch.float32)
            self.encoder_hidden_states_desc = IODescription(
                'encoder_hidden_states', ['batch', 'max_seq_len_in_batch', self.hidden_size],
                torch.float32)
            self.encoder_attention_mask_desc = IODescription(
                'encoder_attention_mask', ['batch', 'max_seq_len_in_batch'],
                torch.float32)

            # see BertForPreTraining doc
            self.masked_lm_labels_desc = IODescription(
                'masked_lm_labels', ['batch', 'max_seq_len_in_batch'],
                torch.int64, num_classes=self.vocab_size)
            self.next_sentence_label_desc = IODescription(
                'next_sentence_label', ['batch'], torch.int64, num_classes=2)

            # outputs
            self.loss_desc = IODescription('loss', [1], torch.float32)
            self.prediction_scores_desc = IODescription(
                'prediction_scores', ['batch', 'max_seq_len_in_batch', self.vocab_size],
                torch.float32)
            self.seq_relationship_scores_desc = IODescription(
                'seq_relationship_scores', ['batch', 2], torch.float32
            )  # IODescription('seq_relationship_scores', ['batch', 'max_seq_len_in_batch', 2], torch.float32)
            self.hidden_states_desc = IODescription(
                'hidden_states',
                [self.num_hidden_layers, 'batch', 'max_seq_len_in_batch', self.hidden_size],
                torch.float32)
            self.attentions_desc = IODescription(
                'attentions',
                [self.num_hidden_layers, 'batch', self.num_attention_heads,
                 'max_seq_len_in_batch', 'max_seq_len_in_batch'],
                torch.float32)
            self.last_hidden_state_desc = IODescription(
                'last_hidden_state', ['batch', 'max_seq_len_in_batch', self.hidden_size],
                torch.float32)
            self.pooler_output_desc = IODescription(
                'pooler_output', ['batch', self.hidden_size], torch.float32)

        # BertForPreTraining forward:
        # def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
        #             position_ids=None, head_mask=None, inputs_embeds=None,
        #             masked_lm_labels=None, next_sentence_label=None):
        #
        # create_and_check_bert_for_pretraining calls BertForPreTraining:
        # model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
        #       masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
        def BertForPreTraining_descs(self):
            return ModelDescription(
                [
                    self.input_ids_desc, self.attention_mask_desc,
                    self.token_type_ids_desc, self.masked_lm_labels_desc,
                    self.next_sentence_label_desc
                ],
                # loss_desc is returned only if both masked_lm_labels_desc and
                # next_sentence_label_desc are provided;
                # hidden_states_desc and attentions_desc should be included
                # depending on config.output_hidden_states and config.output_attentions
                [
                    self.loss_desc, self.prediction_scores_desc,
                    self.seq_relationship_scores_desc,
                    # hidden_states_desc, attentions_desc
                ])

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length],
                                   self.vocab_size).to(self.device)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length],
                                        vocab_size=2).to(self.device)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                            self.type_vocab_size).to(self.device)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size],
                                             self.type_sequence_label_size).to(self.device)
                token_labels = ids_tensor([self.batch_size, self.seq_length],
                                          self.num_labels).to(self.device)
                choice_labels = ids_tensor([self.batch_size],
                                           self.num_choices).to(self.device)

            config = BertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                is_decoder=False,
                initializer_range=self.initializer_range)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def prepare_config_and_inputs_for_decoder(self):
            config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = \
                self.prepare_config_and_inputs()

            config.is_decoder = True
            encoder_hidden_states = floats_tensor(
                [self.batch_size, self.seq_length, self.hidden_size])
            encoder_attention_mask = ids_tensor(
                [self.batch_size, self.seq_length], vocab_size=2)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_bert_model(self, config, input_ids, token_type_ids,
                                        input_mask, sequence_labels,
                                        token_labels, choice_labels):
            model = BertModel(config=config)
            model.to(input_ids.device)
            model.eval()

            sequence_output, pooled_output = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)

            # fails because there is no loss output
            model_desc = ModelDescription(
                [self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc],
                [self.last_hidden_state_desc, self.pooler_output_desc])

            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model, None, model_desc, "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription('Learning_Rate', [1], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=True if args_fp16 else False,
                allreduce_post_accumulation=True if args_allreduce_post_accumulation else False)

            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])

        def create_and_check_bert_model_as_decoder(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels,
                encoder_hidden_states, encoder_attention_mask):
            model = BertModel(config)
            model.eval()
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask)
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states)
            sequence_output, pooled_output = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])

        def create_and_check_bert_for_masked_lm(self, config, input_ids,
                                                token_type_ids, input_mask,
                                                sequence_labels, token_labels,
                                                choice_labels):
            model = BertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids,
                                            attention_mask=input_mask,
                                            token_type_ids=token_type_ids,
                                            masked_lm_labels=token_labels)

            #####
            model_desc = ModelDescription(
                [self.input_ids_desc, self.attention_mask_desc,
                 self.token_type_ids_desc, self.masked_lm_labels_desc],
                [self.loss_desc, self.prediction_scores_desc])

            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model, None, model_desc, "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription('Learning_Rate', [1], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=True if args_fp16 else False,
                allreduce_post_accumulation=True if args_allreduce_post_accumulation else False)

            model(input_ids,
                  attention_mask=input_mask,
                  token_type_ids=token_type_ids,
                  masked_lm_labels=token_labels)

        def create_and_check_bert_model_for_masked_lm_as_decoder(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels,
                encoder_hidden_states, encoder_attention_mask):
            model = BertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask)
            loss, prediction_scores = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                encoder_hidden_states=encoder_hidden_states)

            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_bert_for_next_sequence_prediction(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels):
            model = BertForNextSentencePrediction(config=config)
            model.eval()
            loss, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                next_sentence_label=sequence_labels)
            result = {
                "loss": loss,
                "seq_relationship_score": seq_relationship_score,
            }
            self.parent.assertListEqual(
                list(result["seq_relationship_score"].size()),
                [self.batch_size, 2])
            self.check_loss_output(result)

        def create_and_check_bert_for_pretraining(self, config, input_ids,
                                                  token_type_ids, input_mask,
                                                  sequence_labels, token_labels,
                                                  choice_labels):
            model = BertForPreTraining(config=config)
            model.eval()
            loss, prediction_scores, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels)

            model_desc = ModelDescription(
                [self.input_ids_desc, self.attention_mask_desc,
                 self.token_type_ids_desc, self.masked_lm_labels_desc,
                 self.next_sentence_label_desc],
                [self.loss_desc, self.prediction_scores_desc,
                 self.seq_relationship_scores_desc])

            import argparse
            args_ = argparse.Namespace(fp16=True, amp_opt_level='O1')

            from collections import namedtuple
            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len")
            args = MyArgs(local_rank=0,
                          world_size=1,
                          max_steps=100,
                          learning_rate=0.00001,
                          warmup_proportion=0.01,
                          batch_size=13,
                          seq_len=7)

            from train_with_ort_trainer import get_lr

            def get_lr_this_step(global_step):
                return get_lr(args, global_step)

            loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

            option_gradient_accumulation_steps = [8]
            option_fp16 = [True, False]
            option_allreduce_post_accumulation = True
            option_use_internal_get_lr_this_step = False
            option_use_internal_loss_scaler = False
            # TODO: test with fetches

            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for fp16 in option_fp16:
                    for option_split_batch in BatchArgsOption:
                        loss_ort, prediction_scores_ort, seq_relationship_score_ort = \
                            run_test(model, model_desc, self.device, args,
                                     gradient_accumulation_steps, fp16,
                                     option_allreduce_post_accumulation,
                                     get_lr_this_step,
                                     option_use_internal_get_lr_this_step,
                                     loss_scaler,
                                     option_use_internal_loss_scaler,
                                     option_split_batch)

                        print(loss_ort)
                        print(prediction_scores_ort)
                        print(seq_relationship_score_ort)

        def create_and_check_bert_for_question_answering(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels):
            model = BertForQuestionAnswering(config=config)
            model.eval()
            loss, start_logits, end_logits = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                start_positions=sequence_labels,
                end_positions=sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(list(result["start_logits"].size()),
                                        [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].size()),
                                        [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_bert_for_sequence_classification(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = BertForSequenceClassification(config)
            model.eval()
            loss, logits = model(input_ids,
                                 attention_mask=input_mask,
                                 token_type_ids=token_type_ids,
                                 labels=sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()),
                                        [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_bert_for_token_classification(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = BertForTokenClassification(config=config)
            model.eval()
            loss, logits = model(input_ids,
                                 attention_mask=input_mask,
                                 token_type_ids=token_type_ids,
                                 labels=token_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.seq_length, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_bert_for_multiple_choice(
                self, config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels):
            config.num_choices = self.num_choices
            model = BertForMultipleChoice(config=config)
            model.eval()
            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(
                -1, self.num_choices, -1).contiguous()
            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(
                -1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(
                -1, self.num_choices, -1).contiguous()
            loss, logits = model(multiple_choice_inputs_ids,
                                 attention_mask=multiple_choice_input_mask,
                                 token_type_ids=multiple_choice_token_type_ids,
                                 labels=choice_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()),
                                        [self.batch_size, self.num_choices])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask, sequence_labels,
             token_labels, choice_labels) = config_and_inputs
            inputs_dict = {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
                'attention_mask': input_mask
            }
            return config, inputs_dict

    def setUp(self):
        self.model_tester = BertModelTest.BertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)

    # def test_config(self):
    #     self.config_tester.run_common_tests()

    # def test_bert_model(self, use_cuda=False):
    #     # ^^ This could be a real fixture
    #     if use_cuda:
    #         self.model_tester.device = "cuda"
    #     config_and_inputs = self.model_tester.prepare_config_and_inputs()
    #     self.model_tester.create_and_check_bert_model(*config_and_inputs)

    # def test_bert_model_as_decoder(self):
    #     config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
    #     self.model_tester.create_and_check_bert_model_as_decoder(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)

    # def test_for_masked_lm_decoder(self):
    #     config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
    #     self.model_tester.create_and_check_bert_model_for_masked_lm_as_decoder(*config_and_inputs)

    # def test_for_multiple_choice(self):
    #     config_and_inputs = self.model_tester.prepare_config_and_inputs()
    #     self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)

    # def test_for_next_sequence_prediction(self):
    #     config_and_inputs = self.model_tester.prepare_config_and_inputs()
    #     self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)

    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)
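# BertModelTester assumes several helpers defined elsewhere in this module
# (ids_tensor, floats_tensor, map_optimizer_attributes, postprocess_model,
# run_test, BatchArgsOption). Minimal sketches of the first two follow; the
# Lamb attribute names and values in map_optimizer_attributes are assumptions
# about what ORTTrainer expects, not the canonical definitions.
import random


def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Create a random int64 tensor with values in [0, vocab_size)."""
    total_dims = 1
    for dim in shape:
        total_dims *= dim
    values = [random.randrange(vocab_size) for _ in range(total_dims)]
    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()


def map_optimizer_attributes(name):
    # Common BERT practice: exclude bias and LayerNorm weights from weight decay.
    no_decay = "bias" in name or "LayerNorm.weight" in name
    if no_decay:
        return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
    return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}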
class TapasModelTest(ModelTesterMixin, unittest.TestCase):

    all_model_classes = ((
        TapasModel,
        TapasForMaskedLM,
        TapasForQuestionAnswering,
        TapasForSequenceClassification,
    ) if is_torch_available() else None)
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = True
    test_head_masking = False

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = copy.deepcopy(inputs_dict)

        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
            inputs_dict = {
                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
                if isinstance(v, torch.Tensor) and v.ndim > 1 else v
                for k, v in inputs_dict.items()
            }

        if return_labels:
            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                inputs_dict["labels"] = torch.ones(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device)
            elif model_class in get_values(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length),
                    dtype=torch.long, device=torch_device)
                inputs_dict["aggregation_labels"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device)
                inputs_dict["numeric_values"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length),
                    dtype=torch.float, device=torch_device)
                inputs_dict["numeric_values_scale"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length),
                    dtype=torch.float, device=torch_device)
                inputs_dict["float_answer"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.float, device=torch_device)
            elif model_class in [
                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
            ]:
                inputs_dict["labels"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device)
            elif model_class in [
                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
            ]:
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length),
                    dtype=torch.long, device=torch_device)

        return inputs_dict

    def setUp(self):
        self.model_tester = TapasModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TapasConfig, dim=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        required=False,
        type=str,
        default="all",
        help="Model checkpoints to be provided to the AutoModel classes. "
        "Leave blank to benchmark the base version of all available model architectures.",
    )
    parser.add_argument("--torch",
                        required=False,
                        action="store_true",
                        help="Benchmark the PyTorch version of the models")
    parser.add_argument("--torch_cuda",
                        required=False,
                        action="store_true",
                        help="PyTorch only: run on available cuda devices")
    parser.add_argument(
        "--torchscript",
        required=False,
        action="store_true",
        help="PyTorch only: trace the models using torchscript",
    )
    parser.add_argument(
        "--tensorflow",
        required=False,
        action="store_true",
        help="Benchmark the TensorFlow version of the models. "
        "Will run on GPU if the correct dependencies are installed",
    )
    parser.add_argument("--xla",
                        required=False,
                        action="store_true",
                        help="TensorFlow only: use XLA acceleration.")
    parser.add_argument(
        "--amp",
        required=False,
        action="store_true",
        help="TensorFlow only: use automatic mixed precision acceleration.",
    )
    parser.add_argument("--fp16",
                        required=False,
                        action="store_true",
                        help="PyTorch only: use FP16 to accelerate inference.")
    parser.add_argument(
        "--keras_predict",
        required=False,
        action="store_true",
        help="Whether to use model.predict instead of model() to do a forward pass.",
    )
    parser.add_argument("--save_to_csv",
                        required=False,
                        action="store_true",
                        help="Save to a CSV file.")
    parser.add_argument("--csv_filename",
                        required=False,
                        default=None,
                        help="CSV filename used if saving results to csv.")
    parser.add_argument("--average_over",
                        required=False,
                        default=30,
                        type=int,
                        help="Times an experiment will be run.")
    args = parser.parse_args()

    if args.models == "all":
        args.models = [
            "gpt2",
            "bert-base-cased",
            "xlnet-base-cased",
            "xlm-mlm-en-2048",
            "transfo-xl-wt103",
            "openai-gpt",
            "distilbert-base-uncased",
            "distilgpt2",
            "roberta-base",
            "ctrl",
        ]
    else:
        args.models = args.models.split()

    print("Running with arguments", args)

    if args.torch:
        if is_torch_available():
            create_setup_and_compute(
                model_names=args.models,
                tensorflow=False,
                gpu=args.torch_cuda,
                torchscript=args.torchscript,
                fp16=args.fp16,
                save_to_csv=args.save_to_csv,
                csv_filename=args.csv_filename,
                average_over=args.average_over,
            )
        else:
            raise ImportError(
                "Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")

    if args.tensorflow:
        if is_tf_available():
            create_setup_and_compute(
                model_names=args.models,
                tensorflow=True,
                xla=args.xla,
                amp=args.amp,
                save_to_csv=args.save_to_csv,
                csv_filename=args.csv_filename,
                average_over=args.average_over,
            )
        else:
            raise ImportError(
                "Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
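# Example invocation of this benchmark entry point (the script file name is an
# assumption; adapt it to the actual module path):
#
#   python benchmark.py --models "bert-base-cased gpt2" --torch --torch_cuda \
#       --fp16 --save_to_csv --csv_filename results.csv
#
# With --models all (the default), the full checkpoint list above is benchmarked.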
class T5ModelTest(ModelTesterMixin, unittest.TestCase):

    all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else ()
    all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    is_encoder_decoder = True

    def setUp(self):
        self.model_tester = T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_shift_right(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_with_lm_head(*config_and_inputs)

    def test_decoder_model_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)

    def test_decoder_model_past_with_attn_mask(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)

    def test_generate_with_past_key_value_states(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_generate_with_past_key_value_states(*config_and_inputs)

    def test_encoder_decoder_shared_weights(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs)

    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
    def test_model_fp16_forward(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = T5Model.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_export_to_onnx(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        model = T5Model(config_and_inputs[0]).to(torch_device)
        with tempfile.TemporaryDirectory() as tmpdirname:
            torch.onnx.export(
                model,
                config_and_inputs[1],
                f"{tmpdirname}/t5_test.onnx",
                export_params=True,
                opset_version=9,
            )
class ElectraModelTest(ModelTesterMixin, unittest.TestCase):

    all_model_classes = ((
        ElectraModel,
        ElectraForPreTraining,
        ElectraForMaskedLM,
        ElectraForCausalLM,
        ElectraForMultipleChoice,
        ElectraForTokenClassification,
        ElectraForSequenceClassification,
        ElectraForQuestionAnswering,
    ) if is_torch_available() else ())
    all_generative_model_classes = (ElectraForCausalLM,) if is_torch_available() else ()
    fx_ready_model_classes = all_model_classes
    fx_dynamic_ready_model_classes = all_model_classes

    # special case for ForPreTraining model
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length),
                    dtype=torch.long, device=torch_device)
        return inputs_dict

    def setUp(self):
        self.model_tester = ElectraModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_electra_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_model(*config_and_inputs)

    def test_electra_model_as_decoder(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
        self.model_tester.create_and_check_electra_model_as_decoder(*config_and_inputs)

    def test_electra_model_various_embeddings(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        for type in ["absolute", "relative_key", "relative_key_query"]:
            config_and_inputs[0].position_embedding_type = type
            self.model_tester.create_and_check_electra_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs)

    def test_for_token_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs)

    def test_for_pre_training(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_for_sequence_classification(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_for_question_answering(*config_and_inputs)

    def test_for_multiple_choice(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = ElectraModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_for_causal_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
        self.model_tester.create_and_check_electra_for_causal_lm(*config_and_inputs)
class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

    all_model_classes = ((GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel,
                          GPT2ForSequenceClassification) if is_torch_available() else ())
    all_generative_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
    all_parallelizable_model_classes = (GPT2LMHeadModel,) if is_torch_available() else ()
    test_missing_keys = False
    test_model_parallel = True

    # special case for DoubleHeads model
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
            if model_class.__name__ == "GPT2DoubleHeadsModel":
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.num_choices,
                     self.model_tester.seq_length),
                    dtype=torch.long,
                    device=torch_device,
                )
                inputs_dict["input_ids"] = inputs_dict["labels"]
                inputs_dict["token_type_ids"] = inputs_dict["labels"]
                inputs_dict["mc_token_ids"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.num_choices),
                    dtype=torch.long,
                    device=torch_device,
                )
                inputs_dict["mc_labels"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device)
        return inputs_dict

    def setUp(self):
        self.model_tester = GPT2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_gpt2_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)

    def test_gpt2_model_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs)

    def test_gpt2_model_att_mask_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs)

    def test_gpt2_model_past_large_inputs(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs)

    def test_gpt2_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

    def test_gpt2_double_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

    def test_gpt2_sequence_classification_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs)

    def test_gpt2_gradient_checkpointing(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs(gradient_checkpointing=True)
        self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs)

    @slow
    def test_batch_generation(self):
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        model.to(torch_device)
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        tokenizer.padding_side = "left"

        # Define PAD Token = EOS Token = 50256
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

        # use different length sentences to test batching
        sentences = [
            "Hello, my dog is a little",
            "Today, I",
        ]

        inputs = tokenizer(sentences, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(torch_device)
        token_type_ids = torch.cat(
            [
                input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
                input_ids.new_full((input_ids.shape[0], 1), 500),
            ],
            dim=-1,
        )

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
        )

        outputs_tt = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
            token_type_ids=token_type_ids,
        )

        inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
        output_non_padded = model.generate(input_ids=inputs_non_padded)

        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
        inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
        output_padded = model.generate(input_ids=inputs_padded,
                                       max_length=model.config.max_length - num_paddings)

        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)

        expected_output_sentence = [
            "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
            "Today, I'm going to be doing a lot of research on this. I",
        ]
        self.assertListEqual(expected_output_sentence, batch_out_sentence)
        self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])

    @slow
    def test_batch_generation_2heads(self):
        model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
        model.to(torch_device)
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        tokenizer.padding_side = "left"

        # This tokenizer has no pad token, so we have to set it in some way
        # Define PAD Token = EOS Token = 50256
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

        # use different length sentences to test batching
        sentences = [
            "Hello, my dog is a little",
            "Today, I",
        ]

        inputs = tokenizer(sentences, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(torch_device)
        token_type_ids = torch.cat(
            [
                input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
                input_ids.new_full((input_ids.shape[0], 1), 500),
            ],
            dim=-1,
        )

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
        )

        outputs_tt = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
            token_type_ids=token_type_ids,
        )

        inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
        output_non_padded = model.generate(input_ids=inputs_non_padded)

        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
        inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
        output_padded = model.generate(input_ids=inputs_padded,
                                       max_length=model.config.max_length - num_paddings)

        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)

        expected_output_sentence = [
            "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
            "Today, I'm going to be doing a lot of research on this. I",
        ]
        self.assertListEqual(expected_output_sentence, batch_out_sentence)
        self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])

    @slow
    def test_model_from_pretrained(self):
        for model_name in GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = GPT2Model.from_pretrained(model_name)
            self.assertIsNotNone(model)
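# Note on the batch-generation tests above: left padding matters for
# decoder-only models because generation continues from the last position, so
# pad tokens must sit on the left of short sequences. An illustration of the
# setup (the shapes are the point, not the decoded values):
#
#   tokenizer.padding_side = "left"
#   batch = tokenizer(["Hello, my dog is a little", "Today, I"],
#                     return_tensors="pt", padding=True)
#   # batch["input_ids"][1] starts with pad ids and batch["attention_mask"][1]
#   # starts with zeros, so generate() ignores the padded positions.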
class CTRLModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    test_head_masking = False

    class CTRLModelTester(object):

        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = CTRLConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_ctrl_model(self, config, input_ids, input_mask,
                                        head_mask, token_type_ids, *args):
            model = CTRLModel(config=config)
            model.eval()

            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
            model(input_ids, token_type_ids=token_type_ids)
            sequence_output, presents = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "presents": presents,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertEqual(len(result["presents"]), config.n_layer)

        def create_and_check_lm_head_model(self, config, input_ids, input_mask,
                                           head_mask, token_type_ids, *args):
            model = CTRLLMHeadModel(config)
            model.eval()

            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}
            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, head_mask, token_type_ids,
             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
                'head_mask': head_mask
            }
            return config, inputs_dict

    def setUp(self):
        self.model_tester = CTRLModelTest.CTRLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_ctrl_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_ctrl_model(*config_and_inputs)

    def test_ctrl_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

    all_model_classes = ((GPTNeoModel, GPTNeoForCausalLM,
                          GPTNeoForSequenceClassification) if is_torch_available() else ())
    all_generative_model_classes = (GPTNeoForCausalLM,) if is_torch_available() else ()
    fx_compatible = True
    test_missing_keys = False
    test_pruning = False
    test_model_parallel = False

    # special case for DoubleHeads model
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
        return inputs_dict

    def setUp(self):
        self.model_tester = GPTNeoModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GPTNeoConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_gpt_neo_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt_neo_model(*config_and_inputs)

    def test_gpt_neo_model_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt_neo_model_past(*config_and_inputs)

    def test_gpt_neo_model_att_mask_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt_neo_model_attention_mask_past(*config_and_inputs)

    def test_gpt_neo_model_past_large_inputs(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt_neo_model_past_large_inputs(*config_and_inputs)

    def test_gpt_neo_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

    def test_gpt_neo_sequence_classification_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt_neo_for_sequence_classification(*config_and_inputs)

    def test_gpt_neo_gradient_checkpointing(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs,
                                                                 gradient_checkpointing=True)

    def _get_hidden_states(self):
        return torch.tensor(
            [[
                [0.4983, -0.7584, -1.6944, 0.5440],
                [2.6918, 0.4206, 0.4176, 0.2055],
                [-0.0071, -0.0405, -1.4920, -0.3630],
                [1.0492, 0.1599, -1.7648, 0.2419],
                [-1.8348, 2.0514, -0.1946, 0.3203],
                [0.7672, -1.1600, -1.7118, -0.9056],
                [0.2986, 0.5372, 0.7729, -0.1927],
                [0.0285, 0.2629, -1.1156, -1.1992],
            ]],
            dtype=torch.float32,
            device=torch_device,
        )

    def test_local_attn_probs(self):
        model = GPTNeoModel.from_pretrained("valhalla/gpt-neo-random-tiny").eval()
        layer = model.h[1].attn.attention.to(torch_device)
        hidden_states = self._get_hidden_states()
        hidden_states = torch.cat([hidden_states, hidden_states - 0.5], dim=2)

        batch_size, seq_length, _ = hidden_states.shape
        mask_tokens = 2
        attention_mask = torch.ones(batch_size, seq_length, device=torch_device, dtype=torch.long)
        attention_mask[:, -mask_tokens:] = 0  # don't attend to the last mask_tokens
        attention_mask = attention_mask.view(batch_size, -1)
        attention_mask = attention_mask[:, None, None, :]
        attention_mask = (1.0 - attention_mask) * -10000.0

        attn_probs = layer(hidden_states, attention_mask=attention_mask, output_attentions=True)[-1]

        # the last 2 tokens are masked, and should have 0 attn_probs
        self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0))

        # in local attention each token can only attend to the previous
        # window_size tokens (including itself); here window_size is 4, so a
        # token at index 5 can only attend to indices [2, 3, 4, 5], and the
        # attn_probs should be 0 for tokens [0, 1]
        self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0))
        self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0))
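# For reference, the banded pattern asserted in test_local_attn_probs can be
# constructed directly. This is an illustrative sketch (the helper name and
# the default window_size of 4 are assumptions based on the comments above),
# not GPT-Neo's actual mask code.
def local_causal_mask(seq_length, window_size=4):
    positions = torch.arange(seq_length)
    causal = positions[None, :] <= positions[:, None]  # no attending to the future
    in_window = positions[:, None] - positions[None, :] < window_size  # stay inside the window
    return causal & in_window  # True where attention is allowed

# e.g. local_causal_mask(8)[5] is True exactly at indices [2, 3, 4, 5]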
class DPRModelTest(ModelTesterMixin, unittest.TestCase):

    all_model_classes = ((
        DPRContextEncoder,
        DPRQuestionEncoder,
        DPRReader,
    ) if is_torch_available() else ())

    test_resize_embeddings = False
    test_missing_keys = False  # why?
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = DPRModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_context_encoder_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_context_encoder(*config_and_inputs)

    def test_question_encoder_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_question_encoder(*config_and_inputs)

    def test_reader_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_reader(*config_and_inputs)

    def test_init_changed_config(self):
        config = self.model_tester.prepare_config_and_inputs()[0]

        model = DPRQuestionEncoder(config=config)
        model.to(torch_device)
        model.eval()

        with tempfile.TemporaryDirectory() as tmp_dirname:
            model.save_pretrained(tmp_dirname)
            model = DPRQuestionEncoder.from_pretrained(tmp_dirname, projection_dim=512)

        self.assertIsNotNone(model)

    @slow
    def test_model_from_pretrained(self):
        for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRContextEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRQuestionEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRReader.from_pretrained(model_name)
            self.assertIsNotNone(model)
class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

    all_model_classes = ((
        XLMModel,
        XLMWithLMHeadModel,
        XLMForQuestionAnswering,
        XLMForSequenceClassification,
        XLMForQuestionAnsweringSimple,
        XLMForTokenClassification,
        XLMForMultipleChoice,
    ) if is_torch_available() else ())
    all_generative_model_classes = (
        (XLMWithLMHeadModel,) if is_torch_available() else ()
    )  # TODO (PVP): Check other models whether language generation is also applicable
    test_sequence_classification_problem_types = True

    # XLM has 2 QA models -> need to manually set the correct labels for one of them here
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
            if model_class.__name__ == "XLMForQuestionAnswering":
                inputs_dict["start_positions"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device)
                inputs_dict["end_positions"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device)

        return inputs_dict

    def setUp(self):
        self.model_tester = XLMModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_xlm_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_model(*config_and_inputs)

    def test_xlm_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)

    def test_xlm_simple_qa(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs)

    def test_xlm_qa(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_qa(*config_and_inputs)

    def test_xlm_sequence_classif(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)

    def test_xlm_token_classif(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_token_classif(*config_and_inputs)

    def test_xlm_for_multiple_choice(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)

    def _check_attentions_for_generate(self, batch_size, attentions, min_length,
                                       max_length, config, use_cache=False,
                                       num_beam_groups=1):
        self.assertIsInstance(attentions, tuple)
        self.assertListEqual(
            [isinstance(iter_attentions, tuple) for iter_attentions in attentions],
            [True] * len(attentions))
        self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups)

        for idx, iter_attentions in enumerate(attentions):
            # adds PAD dummy token
            tgt_len = min_length + idx + 1
            src_len = min_length + idx + 1

            expected_shape = (
                batch_size * num_beam_groups,
                config.num_attention_heads,
                tgt_len,
                src_len,
            )
            # check attn size
            self.assertListEqual(
                [layer_attention.shape for layer_attention in iter_attentions],
                [expected_shape] * len(iter_attentions))

    def _check_hidden_states_for_generate(self, batch_size, hidden_states,
                                          min_length, max_length, config,
                                          use_cache=False, num_beam_groups=1):
        self.assertIsInstance(hidden_states, tuple)
        self.assertListEqual(
            [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states],
            [True] * len(hidden_states),
        )
        self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups)

        for idx, iter_hidden_states in enumerate(hidden_states):
            # adds PAD dummy token
            seq_len = min_length + idx + 1
            expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size)
            # check hidden size
            self.assertListEqual(
                [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states],
                [expected_shape] * len(iter_hidden_states),
            )

    @slow
    def test_model_from_pretrained(self):
        for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = XLMModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
class T5ModelTest(ModelTesterMixin, unittest.TestCase):
    all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else ()
    all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    is_encoder_decoder = True

    class T5ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            encoder_seq_length=7,
            decoder_seq_length=9,
            is_training=True,
            use_attention_mask=True,
            use_labels=True,
            vocab_size=99,
            n_positions=14,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            d_ff=37,
            relative_attention_num_buckets=8,
            dropout_rate=0.1,
            initializer_factor=0.002,
            eos_token_id=1,
            pad_token_id=0,
            decoder_start_token_id=0,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.encoder_seq_length = encoder_seq_length
            self.decoder_seq_length = decoder_seq_length
            self.is_training = is_training
            self.use_attention_mask = use_attention_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope
            self.eos_token_id = eos_token_id
            self.pad_token_id = pad_token_id
            self.decoder_start_token_id = decoder_start_token_id

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
            attention_mask = None
            decoder_attention_mask = None
            if self.use_attention_mask:
                attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
            lm_labels = None
            if self.use_labels:
                lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor,
                eos_token_id=self.eos_token_id,
                bos_token_id=self.pad_token_id,
                pad_token_id=self.pad_token_id,
                decoder_start_token_id=self.decoder_start_token_id,
            )
            return (
                config,
                input_ids,
                decoder_input_ids,
                attention_mask,
                decoder_attention_mask,
                lm_labels,
            )

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def check_prepare_lm_labels_via_shift_left(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5Model(config=config)
            model.to(torch_device)
            model.eval()
            # make sure that lm_labels are correctly padded from the right
            lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
            # add causal pad token mask
            triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
            lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
            decoder_input_ids = model._shift_right(lm_labels)
            for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
                # first item
                self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
                if i < decoder_input_ids_slice.shape[-1]:
                    if i < decoder_input_ids.shape[-1] - 1:
                        # items before diagonal
                        self.parent.assertListEqual(
                            decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
                        )
                    # pad items after diagonal
                    if i < decoder_input_ids.shape[-1] - 2:
                        self.parent.assertListEqual(
                            decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
                        )
                else:
                    # all items after square
                    self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())

        def create_and_check_t5_model(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5Model(config=config)
            model.to(torch_device)
            model.eval()
            decoder_output, decoder_past, encoder_output = model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )
            decoder_output, decoder_past, encoder_output = model(
                input_ids=input_ids, decoder_input_ids=decoder_input_ids
            )
            result = {
                "encoder_output": encoder_output,
                "decoder_output": decoder_output,
                "decoder_past": decoder_past,
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
            )
            self.parent.assertEqual(len(decoder_past), 2)
            # decoder_past[0] should correspond to encoder output
            self.parent.assertTrue(torch.all(decoder_past[0][0] == encoder_output))
            # There should be `num_layers` key value embeddings stored in decoder_past[1]
            self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
            # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
            self.parent.assertEqual(len(decoder_past[1][0]), 4)

        def create_and_check_t5_with_lm_head(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5ForConditionalGeneration(config=config)
            model.to(torch_device)
            model.eval()
            outputs = model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                lm_labels=lm_labels,
            )
            loss, prediction_scores, _, _ = outputs
            self.parent.assertEqual(len(outputs), 4)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_t5_decoder_model_past(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5Model(config=config).get_decoder()
            model.to(torch_device)
            model.eval()
            # first forward pass
            output, past_key_value_states = model(input_ids, use_cache=True)
            # create hypothetical next token and extend to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
            # append to next input_ids
            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
            output_from_no_past = model(next_input_ids)[0]
            output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
            # select random slice
            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
            # test that outputs are equal for slice
            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

        def create_and_check_t5_decoder_model_attention_mask_past(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5Model(config=config).get_decoder()
            model.to(torch_device)
            model.eval()
            # create attention mask
            attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
            half_seq_length = input_ids.shape[-1] // 2
            attn_mask[:, half_seq_length:] = 0
            # first forward pass
            output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)
            # create hypothetical next token and extend to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
            # change a random masked slice from input_ids
            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
            random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
            input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
            # append to next input_ids and attn_mask
            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
            attn_mask = torch.cat(
                [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
                dim=1,
            )
            # get two different outputs
            output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
            output_from_past = model(
                next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask
            )[0]
            # select random slice
            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
            # test that outputs are equal for slice
            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

        def create_t5_and_check_t5_generate_with_past_key_value_states(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5ForConditionalGeneration(config=config)
            model.to(torch_device)
            model.eval()
            torch.manual_seed(0)
            output_without_past_cache = model.generate(
                input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
            )
            torch.manual_seed(0)
            output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
            self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))

        def create_and_check_t5_model_fp16_forward(
            self,
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ):
            model = T5Model(config=config)
            model.to(torch_device)
            model.half()
            model.eval()
            output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
            self.parent.assertFalse(torch.isnan(output).any().item())

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                decoder_input_ids,
                attention_mask,
                decoder_attention_mask,
                lm_labels,
            ) = config_and_inputs
            inputs_dict = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "decoder_input_ids": decoder_input_ids,
                "decoder_attention_mask": decoder_attention_mask,
                "use_cache": False,
            }
            return config, inputs_dict

    def setUp(self):
        self.model_tester = T5ModelTest.T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_shift_right(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs)

    def test_t5_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    def test_t5_decoder_model_past(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_decoder_model_past(*config_and_inputs)

    def test_t5_decoder_model_past_with_attn_mask(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_decoder_model_attention_mask_past(*config_and_inputs)

    def test_t5_generate_with_past_key_value_states(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_t5_and_check_t5_generate_with_past_key_value_states(*config_and_inputs)

    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
    def test_t5_model_fp16_forward(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model_fp16_forward(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = T5Model.from_pretrained(model_name)
            self.assertIsNotNone(model)
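# --- Illustrative sketch (not part of the original test file) ---
# What test_shift_right exercises: T5 derives decoder_input_ids from the
# labels by shifting them one position to the right and prepending
# decoder_start_token_id. `shift_right` below is a hypothetical standalone
# re-implementation for illustration, not the actual T5Model._shift_right
# (which additionally replaces any -100 entries with pad_token_id).
import torch

def shift_right(labels, decoder_start_token_id):
    shifted = labels.new_zeros(labels.shape)
    shifted[:, 1:] = labels[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    return shifted

labels = torch.tensor([[5, 6, 7, 1]])  # sequence ending in eos_token_id=1
assert shift_right(labels, decoder_start_token_id=0).tolist() == [[0, 5, 6, 7]]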
class CanineModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( CanineModel, CanineForMultipleChoice, CanineForQuestionAnswering, CanineForSequenceClassification, CanineForTokenClassification, ) if is_torch_available() else () ) test_torchscript = False test_mismatched_shapes = False test_resize_embeddings = False test_pruning = False def setUp(self): self.model_tester = CanineModelTester(self) # we set has_text_modality to False as the config has no vocab_size attribute self.config_tester = ConfigTester(self, config_class=CanineConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs.hidden_states # expected_num_layers equals num_hidden_layers of the deep encoder + 1, + 2 for the first shallow encoder, + 2 # for the final shallow encoder expected_num_layers = self.model_tester.num_hidden_layers + 1 + 2 + 2 self.assertEqual(len(hidden_states), expected_num_layers) seq_length = self.model_tester.seq_length for i in range(expected_num_layers): if (i < 2) or ((expected_num_layers - i) < 3): # the expected length of the hidden_states of the first and final shallow encoders # is equal to the seq_length self.assertListEqual( list(hidden_states[i].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) else: # the expected length of the hidden_states of the deep encoder need to be updated # for CANINE since the seq length is downsampled self.assertListEqual( list(hidden_states[i].shape[-2:]), [seq_length // config.downsampling_rate, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True seq_len = getattr(self.model_tester, "seq_length", None) for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) model.to(torch_device) 
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            # we add + 2 due to the 2 shallow encoders
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2)
            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            # we add + 2 due to the 2 shallow encoders
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2)
            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, seq_len, seq_len],
            )
            out_len = len(outputs)
            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            if hasattr(self.model_tester, "num_hidden_states_types"):
                added_hidden_states = self.model_tester.num_hidden_states_types
            else:
                added_hidden_states = 1
            self.assertEqual(out_len + added_hidden_states, len(outputs))
            self_attentions = outputs.attentions
            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers + 2)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, seq_len, seq_len],
            )

    def test_model_outputs_equivalence(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def set_nan_tensor_to_zero(t):
            t[t != t] = 0
            return t

        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
            with torch.no_grad():
                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()

                def recursive_check(tuple_object, dict_object):
                    if isinstance(tuple_object, (List, Tuple)):
                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif tuple_object is None:
                        return
                    else:
                        self.assertTrue(
                            torch.allclose(
                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
                            ),
                            msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}.",
                        )

                recursive_check(tuple_output, dict_output)

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
            check_equivalence(model, tuple_inputs, dict_inputs)

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            check_equivalence(model, tuple_inputs, dict_inputs)

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            check_equivalence(
                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
            )

    def test_headmasking(self):
        if not self.test_head_masking:
            return
        global_rng.seed(42)
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        global_rng.seed()
        inputs_dict["output_attentions"] = True
        config.output_hidden_states = True
        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            model.to(torch_device)
            model.eval()
            # Prepare head_mask
            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
            head_mask = torch.ones(
                self.model_tester.num_hidden_layers,
                self.model_tester.num_attention_heads,
                device=torch_device,
            )
            head_mask[0, 0] = 0
            head_mask[-1, :-1] = 0
            head_mask.requires_grad_(requires_grad=True)
            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
            inputs["head_mask"] = head_mask
            outputs = model(**inputs, return_dict=True)

            # Test that we can get a gradient back for importance score computation
            output = sum(t.sum() for t in outputs[0])
            output = output.sum()
            output.backward()
            multihead_outputs = head_mask.grad

            self.assertIsNotNone(multihead_outputs)
            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)

            def check_attentions_validity(attentions):
                # Remove Nan
                for t in attentions:
                    self.assertLess(
                        torch.sum(torch.isnan(t)), t.numel() / 4
                    )  # Check we don't have more than 25% nans (arbitrary)
                attentions = [
                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
                ]  # remove them (the test is less complete)
                self.assertAlmostEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(attentions[1][..., -1, :, :].flatten().sum().item(), 0.0)
                self.assertAlmostEqual(attentions[-2][..., -2, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(attentions[-2][..., -1, :, :].flatten().sum().item(), 0.0)

            check_attentions_validity(outputs.attentions)

    @unittest.skip("CANINE does not have a get_input_embeddings() method.")
    def test_inputs_embeds(self):
        # CANINE does not use inputs_embeds
        pass

    @unittest.skip("CANINE does not have a get_input_embeddings() method.")
    def test_model_common_attributes(self):
        pass

    @slow
    def test_model_from_pretrained(self):
        for model_name in CANINE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = CanineModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
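# --- Illustrative sketch (not part of the original test file) ---
# The hidden-states layout that test_hidden_states_output encodes for CANINE:
# two shallow encoders (2 states each) run at full character resolution,
# while the deep encoder (num_hidden_layers + 1 states) runs on a sequence
# downsampled by config.downsampling_rate. The helper name is illustrative.
def expected_hidden_state_seq_lengths(num_hidden_layers, seq_length, downsampling_rate):
    total = num_hidden_layers + 1 + 2 + 2
    return [
        seq_length if (i < 2 or (total - i) < 3) else seq_length // downsampling_rate
        for i in range(total)
    ]

# e.g. 5 deep layers over 16 characters with downsampling rate 4:
assert expected_hidden_state_seq_lengths(5, 16, 4) == [16, 16] + [4] * 6 + [16, 16]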
class LxmertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( LxmertModel, LxmertForPreTraining, LxmertForQuestionAnswering) if is_torch_available() else () test_head_masking = False test_pruning = False test_torchscript = False # overwrite function because qa models takes different input label shape def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) if return_labels: if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device) elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): # special case for models like BERT that use multi-loss training for PreTraining inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device) return inputs_dict def setUp(self): self.model_tester = LxmertModelTester(self) self.config_tester = ConfigTester(self, config_class=LxmertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_lxmert_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lxmert_model(*config_and_inputs) def test_lxmert_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lxmert_for_question_answering( *config_and_inputs) def test_lxmert_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lxmert_for_pretraining( *config_and_inputs) def test_lxmert_question_answering_labels_resize(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.resize_lxmert_num_qa_labels(*config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = LxmertModel.from_pretrained(model_name) model.to(torch_device) self.assertIsNotNone(model) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) seq_len = getattr(self.model_tester, "seq_length", None) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) chunk_length = getattr(self.model_tester, "chunk_length", None) if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) language_attentions, vision_attentions, cross_encoder_attentions = ( outputs[-3], outputs[-2], outputs[-1]) self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) self.assertEqual( len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, 
model_class)) language_attentions, vision_attentions, cross_encoder_attentions = ( outputs[-3], outputs[-2], outputs[-1]) self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) self.assertEqual( len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) attentions = [ language_attentions, vision_attentions, cross_encoder_attentions ] attention_shapes = [ [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], [ self.model_tester.num_attention_heads, self.model_tester.num_visual_features, self.model_tester.num_visual_features, ], [ self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features ], ] for attention, attention_shape in zip(attentions, attention_shapes): self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) out_len = len(outputs) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) # 2 hidden states were added self.assertEqual(out_len + 2, len(outputs)) language_attentions, vision_attentions, cross_encoder_attentions = ( outputs[-3], outputs[-2], outputs[-1]) self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) self.assertEqual( len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) attentions = [ language_attentions, vision_attentions, cross_encoder_attentions ] attention_shapes = [ [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], [ self.model_tester.num_attention_heads, self.model_tester.num_visual_features, self.model_tester.num_visual_features, ], [ self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features ], ] for attention, attention_shape in zip(attentions, attention_shapes): self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) language_hidden_states, vision_hidden_states = outputs[ -2], outputs[-1] self.assertEqual( len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1) self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1) seq_length = self.model_tester.seq_length num_visual_features = self.model_tester.num_visual_features self.assertListEqual( list(language_hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) self.assertListEqual( list(vision_hidden_states[0].shape[-2:]), [num_visual_features, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, 
model_class)

    def test_retain_grad_hidden_states_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
        config.output_attentions = True
        # no need to test all models as different heads yield the same functionality
        model_class = self.all_model_classes[0]
        model = model_class(config)
        model.to(torch_device)
        inputs = self._prepare_for_class(inputs_dict, model_class)
        outputs = model(**inputs)
        hidden_states_lang = outputs.language_hidden_states[0]
        attentions_lang = outputs.language_attentions[0]
        hidden_states_vision = outputs.vision_hidden_states[0]
        attentions_vision = outputs.vision_attentions[0]
        hidden_states_lang.retain_grad()
        attentions_lang.retain_grad()
        hidden_states_vision.retain_grad()
        attentions_vision.retain_grad()
        outputs.language_output.flatten()[0].backward(retain_graph=True)
        outputs.vision_output.flatten()[0].backward(retain_graph=True)
        self.assertIsNotNone(hidden_states_lang.grad)
        self.assertIsNotNone(attentions_lang.grad)
        self.assertIsNotNone(hidden_states_vision.grad)
        self.assertIsNotNone(attentions_vision.grad)

    def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict):
        tf_inputs_dict = {}
        for key, value in pt_inputs_dict.items():
            # skip key that does not exist in tf
            if isinstance(value, dict):
                tf_inputs_dict[key] = self.prepare_tf_inputs_from_pt_inputs(value)
            elif isinstance(value, (list, tuple)):
                tf_inputs_dict[key] = tuple(
                    self.prepare_tf_inputs_from_pt_inputs(iter_value) for iter_value in value
                )
            elif type(value) == bool:
                tf_inputs_dict[key] = value
            elif key == "input_values":
                tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32)
            elif key == "pixel_values":
                tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32)
            elif key == "input_features":
                tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32)
            # other general float inputs
            elif value.is_floating_point():
                tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32)
            else:
                tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.int32)
        return tf_inputs_dict
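# --- Illustrative sketch (not part of the original test file) ---
# Why test_retain_grad_hidden_states_attentions calls retain_grad() first:
# autograd only populates .grad on leaf tensors by default, so the gradient
# of an intermediate activation (such as a returned hidden state) is None
# unless it is explicitly retained before backward().
import torch

x = torch.randn(3, requires_grad=True)
hidden = x * 2          # non-leaf tensor, standing in for a model hidden state
hidden.retain_grad()    # without this line, hidden.grad would remain None
hidden.sum().backward()
assert hidden.grad is not None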
class PoolFormerModelTest(ModelTesterMixin, unittest.TestCase):
    all_model_classes = (PoolFormerModel, PoolFormerForImageClassification) if is_torch_available() else ()
    test_head_masking = False
    test_pruning = False
    test_resize_embeddings = False
    test_torchscript = False

    def setUp(self):
        self.model_tester = PoolFormerModelTester(self)
        self.config_tester = PoolFormerConfigTester(self, config_class=PoolFormerConfig)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip("PoolFormer does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    @unittest.skip("PoolFormer does not have get_input_embeddings and get_output_embeddings methods")
    def test_model_common_attributes(self):
        pass

    def test_retain_grad_hidden_states_attentions(self):
        # Since PoolFormer doesn't use attention, only the hidden states are checked
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
        # no need to test all models as different heads yield the same functionality
        model_class = self.all_model_classes[0]
        model = model_class(config)
        model.to(torch_device)
        inputs = self._prepare_for_class(inputs_dict, model_class)
        outputs = model(**inputs)
        output = outputs[0]
        hidden_states = outputs.hidden_states[0]
        hidden_states.retain_grad()
        output.flatten()[0].backward(retain_graph=True)
        self.assertIsNotNone(hidden_states.grad)

    def test_model_outputs_equivalence(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def set_nan_tensor_to_zero(t):
            t[t != t] = 0
            return t

        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
            with torch.no_grad():
                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()

                def recursive_check(tuple_object, dict_object):
                    if isinstance(tuple_object, (List, Tuple)):
                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif isinstance(tuple_object, Dict):
                        for tuple_iterable_value, dict_iterable_value in zip(
                            tuple_object.values(), dict_object.values()
                        ):
                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif tuple_object is None:
                        return
                    else:
                        self.assertTrue(
                            torch.allclose(
                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
                            ),
                            msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}.",
                        )

                recursive_check(tuple_output, dict_output)

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
            check_equivalence(model, tuple_inputs, dict_inputs)

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            check_equivalence(model, tuple_inputs, dict_inputs)

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]
            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    @unittest.skip("PoolFormer does not have attention")
    def test_attention_outputs(self):
        pass

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            hidden_states = outputs.hidden_states
            expected_num_layers = self.model_tester.num_encoder_blocks
            self.assertEqual(len(hidden_states), expected_num_layers)
            # verify the first hidden states (first block)
            self.assertListEqual(
                list(hidden_states[0].shape[-3:]),
                [
                    self.model_tester.hidden_sizes[0],
                    self.model_tester.image_size // 4,
                    self.model_tester.image_size // 4,
                ],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)
            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True
            check_hidden_states_output(inputs_dict, config, model_class)

    def test_training(self):
        if not self.model_tester.is_training:
            return
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True
        for model_class in self.all_model_classes:
            if model_class in get_values(MODEL_MAPPING):
                continue
            model = model_class(config)
            model.to(torch_device)
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            loss = model(**inputs).loss
            loss.backward()

    @slow
    def test_model_from_pretrained(self):
        for model_name in POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = PoolFormerModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
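# --- Illustrative sketch (not part of the original test file) ---
# The set_nan_tensor_to_zero helper used by test_model_outputs_equivalence
# relies on the IEEE-754 rule that NaN != NaN, so `t != t` is a boolean mask
# selecting exactly the NaN entries:
import torch

t = torch.tensor([1.0, float("nan"), 2.0])
t[t != t] = 0.0
assert t.tolist() == [1.0, 0.0, 2.0]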
class AlbertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (( AlbertModel, AlbertForPreTraining, AlbertForMaskedLM, AlbertForMultipleChoice, AlbertForSequenceClassification, AlbertForTokenClassification, AlbertForQuestionAnswering, ) if is_torch_available() else ()) # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device) inputs_dict["sentence_order_label"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device) return inputs_dict def setUp(self): self.model_tester = AlbertModelTester(self) self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice( *config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering( *config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification( *config_and_inputs) def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = AlbertModel.from_pretrained(model_name) self.assertIsNotNone(model)
class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (( ConvBertModel, ConvBertForMaskedLM, ConvBertForMultipleChoice, ConvBertForQuestionAnswering, ConvBertForSequenceClassification, ConvBertForTokenClassification, ) if is_torch_available() else ()) test_pruning = False test_head_masking = False def setUp(self): self.model_tester = ConvBertModelTester(self) self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice( *config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering( *config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification( *config_and_inputs) def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification( *config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = ConvBertModel.from_pretrained(model_name) self.assertIsNotNone(model) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.return_dict = True seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) chunk_length = getattr(self.model_tester, "chunk_length", None) if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) if chunk_length is not None: self.assertListEqual( list(attentions[0].shape[-4:]), [ self.model_tester.num_attention_heads / 
2, encoder_seq_length, chunk_length, encoder_key_length ], ) else: self.assertListEqual( list(attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length ], ) out_len = len(outputs) if self.is_encoder_decoder: correct_outlen = 5 # loss is at first position if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Question Answering model returns start_logits and end_logits if model_class in get_values( MODEL_FOR_QUESTION_ANSWERING_MAPPING): correct_outlen += 1 # start_logits and end_logits instead of only 1 output if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned self.assertEqual(out_len, correct_outlen) # decoder attentions decoder_attentions = outputs.decoder_attentions self.assertIsInstance(decoder_attentions, (list, tuple)) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length ], ) # cross attentions cross_attentions = outputs.cross_attentions self.assertIsInstance(cross_attentions, (list, tuple)) self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(cross_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, decoder_seq_length, encoder_key_length, ], ) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) if hasattr(self.model_tester, "num_hidden_states_types"): added_hidden_states = self.model_tester.num_hidden_states_types elif self.is_encoder_decoder: added_hidden_states = 2 else: added_hidden_states = 1 self.assertEqual(out_len + added_hidden_states, len(outputs)) self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) if chunk_length is not None: self.assertListEqual( list(self_attentions[0].shape[-4:]), [ self.model_tester.num_attention_heads / 2, encoder_seq_length, chunk_length, encoder_key_length ], ) else: self.assertListEqual( list(self_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length ], ) @slow @require_torch_gpu def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: # ConvBertForMultipleChoice behaves incorrectly in JIT environments. if model_class == ConvBertForMultipleChoice: return config.torchscript = True model = model_class(config=config) inputs_dict = self._prepare_for_class(inputs_dict, model_class) traced_model = torch.jit.trace( model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu"))) with tempfile.TemporaryDirectory() as tmp: torch.jit.save(traced_model, os.path.join(tmp, "traced_model.pt")) loaded = torch.jit.load(os.path.join(tmp, "traced_model.pt"), map_location=torch_device) loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device))
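# --- Illustrative sketch (not part of the original test file) ---
# The trace/save/load round trip performed by test_torchscript_device_change,
# reduced to a toy module: torch.jit.load(..., map_location=...) is what
# re-targets the serialized program to a different device.
import os
import tempfile
import torch

toy = torch.nn.Linear(4, 4).eval()
example = torch.randn(2, 4)
traced = torch.jit.trace(toy, example)
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "traced_model.pt")
    torch.jit.save(traced, path)
    loaded = torch.jit.load(path, map_location="cpu")
assert loaded(example).shape == (2, 4)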
class QDQBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (( QDQBertModel, QDQBertForMaskedLM, QDQBertForMultipleChoice, QDQBertForNextSentencePrediction, QDQBertForQuestionAnswering, QDQBertForSequenceClassification, QDQBertForTokenClassification, QDQBertLMHeadModel, ) if is_torch_available() else ()) all_generative_model_classes = ( QDQBertLMHeadModel, ) if is_torch_available() else () def setUp(self): self.model_tester = QDQBertModelTester(self) self.config_tester = ConfigTester(self, config_class=QDQBertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask, ) = self.model_tester.prepare_config_and_inputs_for_decoder() input_mask = None self.model_tester.create_and_check_model_as_decoder( config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask, ) def test_for_causal_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) def test_for_causal_lm_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_model_for_causal_lm_as_decoder( *config_and_inputs) def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_decoder_model_past_large_inputs( *config_and_inputs) def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice( *config_and_inputs) def test_for_next_sequence_prediction(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_next_sequence_prediction( *config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering( *config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification( *config_and_inputs) def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification( *config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in 
QDQBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = QDQBertModel.from_pretrained(model_name) self.assertIsNotNone(model) # Override def test_feed_forward_chunking(self): # feed forward chunking is not supported in QDQBert pass
class LongformerModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = False # pruning is not supported test_torchscript = False all_model_classes = (( LongformerModel, LongformerForMaskedLM, LongformerForSequenceClassification, LongformerForQuestionAnswering, LongformerForTokenClassification, LongformerForMultipleChoice, ) if is_torch_available() else ()) def setUp(self): self.model_tester = LongformerModelTester(self) self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_model_attention_mask_determinism(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_attention_mask_determinism( *config_and_inputs) def test_model_global_attention_mask(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_global_attention_mask( *config_and_inputs) def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_question_answering( ) self.model_tester.create_and_check_for_question_answering( *config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification( *config_and_inputs) def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification( *config_and_inputs) def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice( *config_and_inputs) def test_retain_grad_hidden_states_attentions(self): # longformer cannot keep gradients in attentions or hidden states return
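# --- Illustrative sketch (not part of the original test file) ---
# The convention behind test_model_global_attention_mask: Longformer expects
# a global_attention_mask shaped like input_ids, with 1 at positions that
# should attend globally (e.g. a CLS token) and 0 for local-only positions.
# Shapes below are illustrative.
import torch

input_ids = torch.randint(0, 99, (2, 8))
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1  # make the first token attend globally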
class XLMModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (( XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification, XLMForQuestionAnsweringSimple, ) if is_torch_available() else ()) all_generative_model_classes = ( (XLMWithLMHeadModel, ) if is_torch_available() else () ) # TODO (PVP): Check other models whether language generation is also applicable class XLMModelTester(object): def __init__( self, parent, batch_size=13, seq_length=7, is_training=True, use_input_lengths=True, use_token_type_ids=True, use_labels=True, gelu_activation=True, sinusoidal_embeddings=False, causal=False, asm=False, n_langs=2, vocab_size=99, n_special=0, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, summary_type="last", use_proj=True, scope=None, bos_token_id=0, ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length self.is_training = is_training self.use_input_lengths = use_input_lengths self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels self.gelu_activation = gelu_activation self.sinusoidal_embeddings = sinusoidal_embeddings self.asm = asm self.n_langs = n_langs self.vocab_size = vocab_size self.n_special = n_special self.summary_type = summary_type self.causal = causal self.use_proj = use_proj self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.n_langs = n_langs self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.summary_type = summary_type self.num_labels = num_labels self.num_choices = num_choices self.scope = scope self.bos_token_id = bos_token_id def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float() input_lengths = None if self.use_input_lengths: input_lengths = (ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 ) # small variation of seq_length token_type_ids = None if self.use_token_type_ids: token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) sequence_labels = None token_labels = None is_impossible_labels = None if self.use_labels: sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) is_impossible_labels = ids_tensor([self.batch_size], 2).float() config = XLMConfig( vocab_size=self.vocab_size, n_special=self.n_special, emb_dim=self.hidden_size, n_layers=self.num_hidden_layers, n_heads=self.num_attention_heads, dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, gelu_activation=self.gelu_activation, sinusoidal_embeddings=self.sinusoidal_embeddings, asm=self.asm, causal=self.causal, n_langs=self.n_langs, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, summary_type=self.summary_type, use_proj=self.use_proj, bos_token_id=self.bos_token_id, ) return ( config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ) def 
check_loss_output(self, result): self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_xlm_model( self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ): model = XLMModel(config=config) model.to(torch_device) model.eval() outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids) outputs = model(input_ids, langs=token_type_ids) outputs = model(input_ids) sequence_output = outputs[0] result = { "sequence_output": sequence_output, } self.parent.assertListEqual( list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]) def create_and_check_xlm_lm_head( self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ): model = XLMWithLMHeadModel(config) model.to(torch_device) model.eval() loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_xlm_simple_qa( self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ): model = XLMForQuestionAnsweringSimple(config) model.to(torch_device) model.eval() outputs = model(input_ids) outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) loss, start_logits, end_logits = outputs result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) def create_and_check_xlm_qa( self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ): model = XLMForQuestionAnswering(config) model.to(torch_device) model.eval() outputs = model(input_ids) start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs outputs = model( input_ids, start_positions=sequence_labels, end_positions=sequence_labels, cls_index=sequence_labels, is_impossible=is_impossible_labels, p_mask=input_mask, ) outputs = model( input_ids, start_positions=sequence_labels, end_positions=sequence_labels, cls_index=sequence_labels, is_impossible=is_impossible_labels, ) (total_loss, ) = outputs outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) (total_loss, ) = outputs result = { "loss": total_loss, "start_top_log_probs": start_top_log_probs, "start_top_index": start_top_index, "end_top_log_probs": end_top_log_probs, "end_top_index": end_top_index, "cls_logits": cls_logits, } self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]) self.parent.assertListEqual( list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]) self.parent.assertListEqual( list(result["end_top_log_probs"].size()), [ self.batch_size, model.config.start_n_top * model.config.end_n_top ], ) self.parent.assertListEqual( list(result["end_top_index"].size()), [ self.batch_size, model.config.start_n_top * model.config.end_n_top ], ) 
self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size]) def create_and_check_xlm_sequence_classif( self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ): model = XLMForSequenceClassification(config) model.to(torch_device) model.eval() (logits, ) = model(input_ids) loss, logits = model(input_ids, labels=sequence_labels) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask, ) = config_and_inputs inputs_dict = { "input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths } return config, inputs_dict def setUp(self): self.model_tester = XLMModelTest.XLMModelTester(self) self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) def test_config(self): self.config_tester.run_common_tests() def test_xlm_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_model(*config_and_inputs) def test_xlm_lm_head(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) def test_xlm_simple_qa(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs) def test_xlm_qa(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_qa(*config_and_inputs) def test_xlm_sequence_classif(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_sequence_classif( *config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model)
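# The testers above lean on an `ids_tensor` helper imported from the shared test
# utilities. A minimal sketch of what such a helper plausibly looks like, assuming
# it only needs to draw random token ids (the real implementation may take extra
# rng/name arguments):
import torch

def ids_tensor(shape, vocab_size):
    """Return a random int64 tensor of `shape` with values in [0, vocab_size)."""
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)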
class BARTModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( BartModel, BartForMaskedLM, BartForSequenceClassification) if is_torch_available() else () is_encoder_decoder = True # TODO(SS): fix the below in a separate PR test_pruning = False test_torchscript = False test_head_masking = False test_resize_embeddings = False # This requires inputs_dict['input_ids'] def setUp(self): self.model_tester = ModelTester(self) self.config_tester = ConfigTester(self, config_class=BartConfig) def test_config(self): self.config_tester.run_common_tests() def test_advanced_inputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) decoder_input_ids, decoder_attn_mask = _prepare_bart_decoder_inputs( config, inputs_dict["input_ids"]) model = BartModel(config) model.to(torch_device) model.eval() # test init self.assertTrue( (model.encoder.embed_tokens.weight == model.shared.weight ).all().item()) def _check_var(module): """Check that we initialized various parameters from N(0, config.init_std).""" self.assertAlmostEqual( torch.std(module.weight).item(), config.init_std, 2) _check_var(model.encoder.embed_tokens) _check_var(model.encoder.layers[0].self_attn.k_proj) _check_var(model.encoder.layers[0].fc1) _check_var(model.encoder.embed_positions) decoder_features_with_created_mask = model(**inputs_dict)[0] decoder_features_with_passed_mask = model( decoder_attention_mask=decoder_attn_mask, decoder_input_ids=decoder_input_ids, **inputs_dict)[0] _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask) useless_mask = torch.zeros_like(decoder_attn_mask) decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] self.assertTrue(isinstance( decoder_features, torch.Tensor)) # no hidden states or attentions self.assertEqual(decoder_features.size(), (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model)) if decoder_attn_mask.min().item() < -1e3: # some tokens were masked self.assertFalse( (decoder_features_with_created_mask == decoder_features ).all().item()) # Test different encoder attention masks decoder_features_with_long_encoder_mask = model( inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long())[0] _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) def test_save_load_strict(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model2, info = model_class.from_pretrained( tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) @unittest.skip("Passing inputs_embeds not implemented for Bart.") def test_inputs_embeds(self): pass
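# `test_advanced_inputs` compares decoder features with `_assert_tensors_equal`,
# which is imported from the test module. A hedged sketch of such a helper,
# assuming it simply wraps torch.allclose with a readable failure message:
import torch

def _assert_tensors_equal(a, b, atol=1e-12):
    """Fail with the maximum elementwise difference if `a` and `b` diverge."""
    if not torch.allclose(a, b, atol=atol):
        max_diff = (a - b).abs().max().item()
        raise AssertionError(f"tensors differ; max abs diff = {max_diff}")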
class FSMTModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( FSMTModel, FSMTForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = ( FSMTForConditionalGeneration, ) if is_torch_available() else () is_encoder_decoder = True # TODO(SS): fix the below in a separate PR test_pruning = False test_torchscript = True test_head_masking = False test_resize_embeddings = True # This requires inputs_dict['input_ids'] test_missing_keys = False # because FSMTForConditionalGeneration and FSMTModel now have identical state_dict def setUp(self): self.model_tester = ModelTester(self) self.langs = ["en", "ru"] config = { "langs": self.langs, "src_vocab_size": 10, "tgt_vocab_size": 20, } # XXX: hack to appease all other models requiring `vocab_size` config["vocab_size"] = 99 # no such thing in FSMT self.config_tester = ConfigTester(self, config_class=FSMTConfig, **config) def test_config(self): self.config_tester.run_common_tests() # XXX: override test_model_common_attributes / different Embedding type def test_model_common_attributes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), torch.nn.Embedding) model.set_input_embeddings(torch.nn.Embedding(10, 10)) x = model.get_output_embeddings() self.assertTrue( x is None or isinstance(x, torch.nn.modules.sparse.Embedding)) def test_initialization_more(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) model = FSMTModel(config) model.to(torch_device) model.eval() # test init # self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) def _check_var(module): """Check that we initialized various parameters from N(0, config.init_std).""" self.assertAlmostEqual( torch.std(module.weight).item(), config.init_std, 2) _check_var(model.encoder.embed_tokens) _check_var(model.encoder.layers[0].self_attn.k_proj) _check_var(model.encoder.layers[0].fc1) # XXX: different std for fairseq version of SinusoidalPositionalEmbedding # self.assertAlmostEqual(torch.std(model.encoder.embed_positions.weights).item(), config.init_std, 2) def test_advanced_inputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.use_cache = False inputs_dict["input_ids"][:, -2:] = config.pad_token_id decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs( config, inputs_dict["input_ids"]) model = FSMTModel(config).to(torch_device).eval() decoder_features_with_created_mask = model(**inputs_dict)[0] decoder_features_with_passed_mask = model( decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict)[0] _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask) useless_mask = torch.zeros_like(decoder_attn_mask) decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] self.assertTrue(isinstance( decoder_features, torch.Tensor)) # no hidden states or attentions self.assertEqual( decoder_features.size(), (self.model_tester.batch_size, self.model_tester.seq_length, config.tgt_vocab_size), ) if decoder_attn_mask.min().item() < -1e3: # some tokens were masked self.assertFalse( (decoder_features_with_created_mask == decoder_features ).all().item()) # Test different encoder attention masks decoder_features_with_long_encoder_mask = model( inputs_dict["input_ids"],
attention_mask=inputs_dict["attention_mask"].long())[0] _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) def test_save_load_strict(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model2, info = model_class.from_pretrained( tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) def test_save_load_no_save_keys(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: model = model_class(config) state_dict_no_save_keys = getattr(model, "state_dict_no_save_keys", None) if state_dict_no_save_keys is None: continue # check the keys are in the original state_dict for k in state_dict_no_save_keys: self.assertIn(k, model.state_dict()) # check that certain keys didn't get saved with the model with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME) state_dict_saved = torch.load(output_model_file) for k in state_dict_no_save_keys: self.assertNotIn(k, state_dict_saved) @unittest.skip("can't be implemented for FSMT due to dual vocab.") def test_resize_tokens_embeddings(self): pass @unittest.skip("Passing inputs_embeds not implemented for FSMT.") def test_inputs_embeds(self): pass @unittest.skip("model weights aren't tied in FSMT.") def test_tie_model_weights(self): pass
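# The FSMT test passes `invert_mask(decoder_attn_mask)` where the BART variant
# above passes the mask directly, so the two models evidently expect opposite
# mask conventions. A plausible sketch of the helper, assuming 1 means "attend"
# on the input side and True means "ignore" on the output side:
def invert_mask(attention_mask):
    """Flip a 0/1 keep-mask into a boolean ignore-mask."""
    assert attention_mask.dim() == 2
    return attention_mask.eq(0)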
def test_pt_tf_model_equivalence(self): if not is_torch_available(): return import torch import transformers config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning pt_model_class = getattr(transformers, pt_model_class_name) config.output_hidden_states = True tf_model = model_class(config) pt_model = pt_model_class(config) # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences pt_model.eval() pt_inputs_dict = dict( (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() ) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict, training=False) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() tf_nans = np.copy(np.isnan(tf_hidden_states)) pt_nans = np.copy(np.isnan(pt_hidden_states)) pt_hidden_states[tf_nans] = 0 tf_hidden_states[tf_nans] = 0 pt_hidden_states[pt_nans] = 0 tf_hidden_states[pt_nans] = 0 max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) # Debug info (remove when fixed) if max_diff >= 2e-2: print("===") print(model_class) print(config) print(inputs_dict) print(pt_inputs_dict) self.assertLessEqual(max_diff, 2e-2) # Check we can load pt model in tf and vice-versa with checkpoint => model functions with tempfile.TemporaryDirectory() as tmpdirname: pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") torch.save(pt_model.state_dict(), pt_checkpoint_path) tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences pt_model.eval() pt_inputs_dict = dict( (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() ) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict) tfo = tfo[0].numpy() pto = pto[0].numpy() tf_nans = np.copy(np.isnan(tfo)) pt_nans = np.copy(np.isnan(pto)) pto[tf_nans] = 0 tfo[tf_nans] = 0 pto[pt_nans] = 0 tfo[pt_nans] = 0 max_diff = np.amax(np.abs(tfo - pto)) self.assertLessEqual(max_diff, 2e-2)
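# The equivalence test above zeroes out NaN positions in both outputs before
# taking the maximum absolute difference, and repeats that pattern for the
# checkpoint round-trip. A small helper distilling the idea (the name is ours,
# not from the test module):
import numpy as np

def max_diff_ignoring_nans(tf_out, pt_out):
    """Max |tf - pt| with positions that are NaN in either output zeroed first."""
    tf_out = np.array(tf_out, copy=True)
    pt_out = np.array(pt_out, copy=True)
    nans = np.isnan(tf_out) | np.isnan(pt_out)
    tf_out[nans] = 0.0
    pt_out[nans] = 0.0
    return np.amax(np.abs(tf_out - pt_out))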
class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM) if is_torch_available() else () test_pruning = False test_headmasking = False test_torchscript = False def setUp(self): self.model_tester = Wav2Vec2ModelTester(self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True) self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_batched_inference(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_batch_inference(*config_and_inputs) def test_ctc_loss_inference(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_ctc_loss(*config_and_inputs) def test_train(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_training(*config_and_inputs) # Wav2Vec2 has no inputs_embeds def test_inputs_embeds(self): pass # `input_ids` is renamed to `input_values` def test_forward_signature(self): pass # Wav2Vec2 cannot resize token embeddings # since it has no tokens embeddings def test_resize_tokens_embeddings(self): pass # Wav2Vec2 has no inputs_embeds # and thus the `get_input_embeddings` fn # is not implemented def test_model_common_attributes(self): pass def test_retain_grad_hidden_states_attentions(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.output_hidden_states = True config.output_attentions = True # no need to test all models as different heads yield the same functionality model_class = self.all_model_classes[0] model = model_class(config) model.to(torch_device) # set layer drop to 0 model.config.layerdrop = 0.0 input_values = inputs_dict["input_values"] input_lengths = torch.tensor( [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device) output_lengths = model._get_feat_extract_output_lengths(input_lengths) labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) inputs_dict["attention_mask"] = torch.ones_like( inputs_dict["attention_mask"]) inputs_dict["labels"] = labels outputs = model(**inputs_dict) output = outputs[0] # Encoder-/Decoder-only models hidden_states = outputs.hidden_states[0] attentions = outputs.attentions[0] hidden_states.retain_grad() attentions.retain_grad() output.flatten()[0].backward(retain_graph=True) self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: if "conv.weight" in name or "masked_spec_embed" in name: self.assertTrue( -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg= f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg= f"Parameter {name} of model {model_class} seems not properly initialized", ) # overwrite from test_modeling_common def _mock_init_weights(self, module): if 
hasattr(module, "weight") and module.weight is not None: module.weight.data.fill_(3) if hasattr(module, "weight_g") and module.weight_g is not None: module.weight_g.data.fill_(3) if hasattr(module, "bias") and module.bias is not None: module.bias.data.fill_(3) @slow def test_model_from_pretrained(self): model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") self.assertIsNotNone(model)
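# `test_initialization` builds `configs_no_init` with `_config_zero_init` so
# that any parameter the model forgets to re-initialize shows up as exactly
# 0.0 or 1.0 after the mean-rounding check. A hypothetical sketch of that
# helper, assuming it shrinks every init-scale attribute on a config copy:
import copy

def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__:
        # shrink initializer_range / init_std style attributes to ~0
        if "initializer_range" in key or "init_std" in key:
            setattr(configs_no_init, key, 1e-10)
    return configs_no_init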
class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else () all_generative_model_classes = ( TransfoXLLMHeadModel, ) if is_torch_available() else () test_pruning = False test_torchscript = False test_resize_embeddings = False class TransfoXLModelTester(object): def __init__( self, parent, batch_size=13, seq_length=7, mem_len=30, clamp_len=15, is_training=True, use_labels=True, vocab_size=99, cutoffs=[10, 50, 80], hidden_size=32, d_embed=32, num_attention_heads=4, d_head=8, d_inner=128, div_val=2, num_hidden_layers=5, scope=None, seed=1, eos_token_id=0, ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length self.mem_len = mem_len self.key_length = seq_length + mem_len self.clamp_len = clamp_len self.is_training = is_training self.use_labels = use_labels self.vocab_size = vocab_size self.cutoffs = cutoffs self.hidden_size = hidden_size self.d_embed = d_embed self.num_attention_heads = num_attention_heads self.d_head = d_head self.d_inner = d_inner self.div_val = div_val self.num_hidden_layers = num_hidden_layers self.scope = scope self.seed = seed self.eos_token_id = eos_token_id def prepare_config_and_inputs(self): input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) lm_labels = None if self.use_labels: lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = TransfoXLConfig( vocab_size=self.vocab_size, mem_len=self.mem_len, clamp_len=self.clamp_len, cutoffs=self.cutoffs, d_model=self.hidden_size, d_embed=self.d_embed, n_head=self.num_attention_heads, d_head=self.d_head, d_inner=self.d_inner, div_val=self.div_val, n_layer=self.num_hidden_layers, eos_token_ids=self.eos_token_id, ) return (config, input_ids_1, input_ids_2, lm_labels) def set_seed(self): random.seed(self.seed) torch.manual_seed(self.seed) def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): model = TransfoXLModel(config) model.to(torch_device) model.eval() hidden_states_1, mems_1 = model(input_ids_1) hidden_states_2, mems_2 = model(input_ids_2, mems_1) outputs = { "hidden_states_1": hidden_states_1, "mems_1": mems_1, "hidden_states_2": hidden_states_2, "mems_2": mems_2, } return outputs def check_transfo_xl_model_output(self, result): self.parent.assertListEqual( list(result["hidden_states_1"].size()), [self.batch_size, self.seq_length, self.hidden_size], ) self.parent.assertListEqual( list(result["hidden_states_2"].size()), [self.batch_size, self.seq_length, self.hidden_size], ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): model = TransfoXLLMHeadModel(config) model.to(torch_device) model.eval() lm_logits_1, mems_1 = model(input_ids_1) loss_1, _, mems_1 = model(input_ids_1, labels=lm_labels) lm_logits_2, mems_2 = model(input_ids_2, mems=mems_1) loss_2, _, mems_2 = model(input_ids_2, labels=lm_labels, mems=mems_1) outputs = { "loss_1": loss_1, "mems_1": mems_1, "lm_logits_1": lm_logits_1, "loss_2": loss_2, "mems_2": mems_2, "lm_logits_2": lm_logits_2, } return outputs def 
check_transfo_xl_lm_head_output(self, result): self.parent.assertListEqual(list(result["loss_1"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual( list(result["lm_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size], ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) self.parent.assertListEqual(list(result["loss_2"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual( list(result["lm_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size], ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict def setUp(self): self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self) self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) def test_config(self): self.config_tester.run_common_tests() def test_transfo_xl_model(self): self.model_tester.set_seed() config_and_inputs = self.model_tester.prepare_config_and_inputs() output_result = self.model_tester.create_transfo_xl_model( *config_and_inputs) self.model_tester.check_transfo_xl_model_output(output_result) def test_transfo_xl_lm_head(self): self.model_tester.set_seed() config_and_inputs = self.model_tester.prepare_config_and_inputs() output_result = self.model_tester.create_transfo_xl_lm_head( *config_and_inputs) self.model_tester.check_transfo_xl_lm_head_output(output_result) @slow def test_model_from_pretrained(self): for model_name in list( TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = TransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model)
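# The shape checks above exercise Transformer-XL's segment-level recurrence:
# every forward pass returns updated `mems` that can be threaded into the next
# call, so attention reaches past the current window. An illustrative sketch
# (assumes `model = TransfoXLModel(config).eval()` and a long `input_ids`
# tensor of shape [batch, total_len]; none of these names come from the test):
mems = None
for segment in input_ids.split(model.config.mem_len, dim=1):
    hidden_states, mems = model(segment, mems=mems)[:2]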
class BARTModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( (BartModel, BartForConditionalGeneration, BartForSequenceClassification, BartForQuestionAnswering) if is_torch_available() else () ) all_generative_model_classes = (BartForConditionalGeneration,) if is_torch_available() else () is_encoder_decoder = True # TODO(SS): fix the below in a separate PR test_pruning = False test_torchscript = True test_head_masking = False test_resize_embeddings = True # This requires inputs_dict['input_ids'] test_missing_keys = False # because BartForConditionalGeneration and BartModel now have identical state_dict def setUp(self): self.model_tester = ModelTester(self) self.config_tester = ConfigTester(self, config_class=BartConfig) def test_config(self): self.config_tester.run_common_tests() def test_initialization_more(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = BartModel(config) model.to(torch_device) model.eval() # test init self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) def _check_var(module): """Check that we initialized various parameters from N(0, config.init_std).""" self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2) _check_var(model.encoder.embed_tokens) _check_var(model.encoder.layers[0].self_attn.k_proj) _check_var(model.encoder.layers[0].fc1) _check_var(model.encoder.embed_positions) def test_advanced_inputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.use_cache = False inputs_dict["input_ids"][:, -2:] = config.pad_token_id decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_bart_decoder_inputs( config, inputs_dict["input_ids"] ) model = BartModel(config).to(torch_device).eval() decoder_features_with_created_mask = model(**inputs_dict)[0] decoder_features_with_passed_mask = model( decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict )[0] _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask) useless_mask = torch.zeros_like(decoder_attn_mask) decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] self.assertTrue(isinstance(decoder_features, torch.Tensor)) # no hidden states or attentions self.assertEqual( decoder_features.size(), (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model) ) if decoder_attn_mask.min().item() < -1e3: # some tokens were masked self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item()) # Test different encoder attention masks decoder_features_with_long_encoder_mask = model( inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long() )[0] _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) def test_save_load_strict(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) @unittest.skip("Passing inputs_embeds not implemented for Bart.") def test_inputs_embeds(self): pass def test_tiny_model(self): model_name = "sshleifer/bart-tiny-random" tiny = AutoModel.from_pretrained(model_name) # same vocab size tok = AutoTokenizer.from_pretrained(model_name) # 
same tokenizer inputs_dict = tok.batch_encode_plus(["Hello my friends"], return_tensors="pt") with torch.no_grad(): tiny(**inputs_dict)
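# Both BART tests derive default decoder inputs with `_prepare_bart_decoder_inputs`
# (the two copies unpack two and three return values respectively, so the helper's
# signature evolved between versions). A simplified, explicitly hypothetical sketch
# of the core idea, assuming default decoder inputs are the encoder tokens shifted right:
import torch

def _prepare_bart_decoder_inputs_sketch(config, input_ids):
    decoder_input_ids = input_ids.new_zeros(input_ids.shape)
    decoder_input_ids[:, 1:] = input_ids[:, :-1].clone()
    decoder_input_ids[:, 0] = config.eos_token_id  # decoder start token (assumption)
    decoder_padding_mask = decoder_input_ids.eq(config.pad_token_id)
    return decoder_input_ids, decoder_padding_mask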
def test_pt_tf_model_equivalence(self): from transformers import is_torch_available if not is_torch_available(): return import torch import transformers for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( return_obj_labels="PreTraining" in model_class.__name__ ) pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning pt_model_class = getattr(transformers, pt_model_class_name) config.output_hidden_states = True config.task_obj_predict = False tf_model = model_class(config) pt_model = pt_model_class(config) # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences pt_model.eval() # Delete obj labels as we want to compute the hidden states and not the loss if "obj_labels" in inputs_dict: del inputs_dict["obj_labels"] def torch_type(key): if key in ("visual_feats", "visual_pos"): return torch.float32 else: return torch.long def recursive_numpy_convert(iterable): return_dict = {} for key, value in iterable.items(): if isinstance(value, dict): return_dict[key] = recursive_numpy_convert(value) else: if isinstance(value, (list, tuple)): # materialize as a tuple: a generator would be exhausted after a single pass return_dict[key] = tuple( torch.from_numpy(iter_value.numpy()).to(torch_type(key)) for iter_value in value ) else: return_dict[key] = torch.from_numpy(value.numpy()).to(torch_type(key)) return return_dict pt_inputs_dict = recursive_numpy_convert(self._prepare_for_class(inputs_dict, model_class)) # need to rename encoder-decoder "inputs" for PyTorch if "inputs" in pt_inputs_dict and self.is_encoder_decoder: pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) tf_hidden_states = tfo[0].numpy() pt_hidden_states = pto[0].numpy() import numpy as np tf_nans = np.copy(np.isnan(tf_hidden_states)) pt_nans = np.copy(np.isnan(pt_hidden_states)) pt_hidden_states[tf_nans] = 0 tf_hidden_states[tf_nans] = 0 pt_hidden_states[pt_nans] = 0 tf_hidden_states[pt_nans] = 0 max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) # Debug info (remove when fixed) if max_diff >= 2e-2: print("===") print(model_class) print(config) print(inputs_dict) print(pt_inputs_dict) self.assertLessEqual(max_diff, 6e-2) # Check we can load pt model in tf and vice-versa with checkpoint => model functions with tempfile.TemporaryDirectory() as tmpdirname: import os pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") torch.save(pt_model.state_dict(), pt_checkpoint_path) tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences pt_model.eval() pt_inputs_dict = dict( (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in self._prepare_for_class(inputs_dict, model_class).items() ) for key, value in pt_inputs_dict.items(): if key in ("visual_feats", "visual_pos"):
pt_inputs_dict[key] = value.to(torch.float32) else: pt_inputs_dict[key] = value.to(torch.long) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) tfo = tfo[0].numpy() pto = pto[0].numpy() tf_nans = np.copy(np.isnan(tfo)) pt_nans = np.copy(np.isnan(pto)) pto[tf_nans] = 0 tfo[tf_nans] = 0 pto[pt_nans] = 0 tfo[pt_nans] = 0 max_diff = np.amax(np.abs(tfo - pto)) self.assertLessEqual(max_diff, 6e-2)
class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForCTC) if is_torch_available() else () test_pruning = False test_headmasking = False test_torchscript = False def setUp(self): self.model_tester = Wav2Vec2ModelTester( self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True ) self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) # Wav2Vec2 has no inputs_embeds def test_inputs_embeds(self): pass # `input_ids` is renamed to `input_values` def test_forward_signature(self): pass # Wav2Vec2 cannot resize token embeddings # since it has no tokens embeddings def test_resize_tokens_embeddings(self): pass # Wav2Vec2 has no inputs_embeds # and thus the `get_input_embeddings` fn # is not implemented def test_model_common_attributes(self): pass def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: if "conv.weight" in name: self.assertTrue( -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg="Parameter {} of model {} seems not properly initialized".format(name, model_class), ) else: self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg="Parameter {} of model {} seems not properly initialized".format(name, model_class), ) @slow def test_model_from_pretrained(self): model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") self.assertIsNotNone(model)
# To check tensor placement, uncomment the next line: # tf.debugging.set_log_device_placement(True) from typing import List import timeit from transformers import is_tf_available, is_torch_available from time import time import argparse import csv if is_tf_available(): import tensorflow as tf from transformers import TFAutoModel if is_torch_available(): import torch from transformers import AutoModel from transformers import AutoConfig, AutoTokenizer input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as the Director of Hatcheries and Conditioning entered the room, in the scarcely breathing silence, the absent-minded, soliloquizing hum or whistle, of absorbed concentration. A troop of newly arrived students, very young, pink and callow, followed nervously, rather abjectly, at the Director's heels. Each of them carried a notebook, in which, whenever the great man spoke, he desperately scribbled. Straight from the
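# The fragment above wires up conditional TF/PyTorch imports, argparse/csv for
# reporting, and a sample passage to benchmark. A minimal sketch of how such a
# script might time a PyTorch forward pass (the checkpoint name and repeat
# counts are illustrative, not taken from the fragment):
if is_torch_available():
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    def forward():
        with torch.no_grad():
            model(input_ids)

    # best of 3 repeats, 10 forward passes each
    runtimes = timeit.repeat(forward, repeat=3, number=10)
    print(f"{min(runtimes) / 10:.4f}s per forward pass")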
class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (( BertModel, BertLMHeadModel, BertForMaskedLM, BertForMultipleChoice, BertForNextSentencePrediction, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, ) if is_torch_available() else ()) all_generative_model_classes = ( BertLMHeadModel, ) if is_torch_available() else () # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device) inputs_dict["next_sentence_label"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device) return inputs_dict def setUp(self): self.model_tester = BertModelTester(self) self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask, ) = self.model_tester.prepare_config_and_inputs_for_decoder() input_mask = None self.model_tester.create_and_check_model_as_decoder( config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask, ) def test_for_causal_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) def test_for_causal_lm_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder( ) self.model_tester.create_and_check_model_for_causal_lm_as_decoder( *config_and_inputs) def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice( *config_and_inputs) def test_for_next_sequence_prediction(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_next_sequence_prediction( *config_and_inputs) def test_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() 
self.model_tester.create_and_check_for_question_answering( *config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification( *config_and_inputs) def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification( *config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = BertModel.from_pretrained(model_name) self.assertIsNotNone(model)