def test_batched_nan_fp16(self):
    # a bug manifested starting at models facebook/opt-1.3b and larger when running batched
    # generations, therefore not using a tiny model, but the smallest model the problem was
    # seen with, which is opt-1.3b.
    # please refer to this github thread: https://github.com/huggingface/transformers/pull/17437 for more details
    model_name = "facebook/opt-1.3b"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name, use_fast=False, padding_side="left")

    model = OPTForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, use_cache=True).cuda()
    model = model.eval()

    batch = tokenizer(["Who are you?", "Joe Biden is the president of"], padding=True, return_tensors="pt")

    input_ids = batch["input_ids"].cuda()
    attention_mask = batch["attention_mask"].cuda()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        self.assertFalse(torch.isnan(outputs.logits[0]).any().item())  # the first logits could contain NaNs if it fails
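
# A minimal sketch (not part of the original suite) of the same NaN regression
# surfaced through generate() instead of a single forward pass. The helper name
# and the max_length value are illustrative assumptions; it reuses the fp16
# CUDA model and left-padded batch built in test_batched_nan_fp16 above.
def _check_batched_generate_no_nan(model, input_ids, attention_mask):
    with torch.no_grad():
        out = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=20,
            output_scores=True,
            return_dict_in_generate=True,
        )
    # out.scores holds one (batch, vocab) tensor per generated step;
    # each should be NaN-free if the fp16 fix holds
    return all(not torch.isnan(step_scores).any().item() for step_scores in out.scores)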
def test_generate_fp16(self):
    config, input_dict = self.model_tester.prepare_config_and_inputs()
    input_ids = input_dict["input_ids"]
    attention_mask = input_ids.ne(1).to(torch_device)
    model = OPTForCausalLM(config).eval().to(torch_device)
    if torch_device == "cuda":
        model.half()
    model.generate(input_ids, attention_mask=attention_mask)
    # without explicit inputs, generation starts from the BOS token
    model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
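
# A hedged companion sketch, assuming the same fixtures as test_generate_fp16:
# an easy way to confirm the half-precision path actually ran on CUDA is to
# check the dtype of the forward logits. The helper name is hypothetical.
def _fp16_logits_dtype_ok(model, input_ids, attention_mask):
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    # after model.half() the output head should produce float16 logits
    return logits.dtype == torch.float16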
def test_batch_generation(self):
    model_id = "facebook/opt-350m"

    tokenizer = GPT2Tokenizer.from_pretrained(model_id)
    model = OPTForCausalLM.from_pretrained(model_id)
    model.to(torch_device)

    tokenizer.padding_side = "left"

    # use different length sentences to test batching
    sentences = [
        "Hello, my dog is a little",
        "Today, I",
    ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(torch_device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"].to(torch_device),
    )

    inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
    output_non_padded = model.generate(input_ids=inputs_non_padded)

    num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
    inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
    output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

    batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
    padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)

    expected_output_sentence = [
        "Hello, my dog is a little bit of a dork.\nI'm a little bit",
        "Today, I was in the middle of a conversation with a friend about the",
    ]
    self.assertListEqual(expected_output_sentence, batch_out_sentence)
    self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence])
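
# Sketch of why tokenizer.padding_side = "left" matters above (an explanatory
# illustration, not from the original file): with right padding, the shorter
# prompt's final positions are pad tokens, and a decoder-only model would
# condition its first generated token on them; left padding keeps the real
# prompt tokens adjacent to the generation boundary. The helper name and
# default sentences are assumptions.
def _show_padding_layout(tokenizer, sentences=("Hello, my dog is a little", "Today, I")):
    tokenizer.padding_side = "right"
    right_ids = tokenizer(list(sentences), return_tensors="pt", padding=True)["input_ids"]
    tokenizer.padding_side = "left"
    left_ids = tokenizer(list(sentences), return_tensors="pt", padding=True)["input_ids"]
    # with right padding the shorter row ends in pad ids; with left padding it
    # ends in real prompt tokens, which is what generate() needs to continue from
    return right_ids, left_ids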
def test_logits(self):
    model = OPTForCausalLM.from_pretrained(self.path_model)
    model = model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(self.path_model)

    prompts = [
        "Today is a beautiful day and I want to",
        "In the city of",
        "Paris is the capital of France and",
        "Computers and mobile phones have taken",
    ]
    # verify that prompt without BOS token is identical to Metaseq -> add_special_tokens=False
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, add_special_tokens=False)
    logits = model(inputs.input_ids, attention_mask=inputs.attention_mask)[0].mean(dim=-1)
    # logits_meta = torch.load(self.path_logits_meta)
    logits_meta = torch.Tensor(
        [
            [1.3851, -13.8923, -10.5229, -10.7533, -0.2309, -10.2384, -0.5365, -9.0947, -5.1670],
            [-4.7073, -10.6276, -3.9415, -21.5242, -0.2822, -0.2822, -0.2822, -0.2822, -0.2822],
            [0.6247, -3.4229, -8.9179, -1.4297, -14.1650, 1.4146, -9.0218, -0.2703, -0.2703],
            [6.4783, -1.9913, -10.7926, -2.3336, 1.5092, -0.9974, -6.8213, 1.3477, 1.3477],
        ]
    )
    assert torch.allclose(logits, logits_meta, atol=1e-4)
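
# A minimal sketch of the reduction being checked above, assuming only torch
# (the helper and the toy shapes are illustrative): model(...)[0] has shape
# (batch, seq_len, vocab_size), and .mean(dim=-1) collapses the vocabulary
# axis so each (prompt, position) pair yields one scalar to compare against
# the Metaseq reference values in logits_meta.
def _mean_over_vocab_shape_example():
    fake_logits = torch.randn(4, 9, 50272)  # 50272 is the OPT vocabulary size
    reduced = fake_logits.mean(dim=-1)
    assert reduced.shape == (4, 9)  # matches the 4 prompts x 9 padded positions above
    return reduced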
def test_generation_post_attn_layer_norm(self):
    model_id = "facebook/opt-350m"

    EXPECTED_OUTPUTS = [
        "Today is a beautiful day and I want to",
        "In the city of San Francisco, the city",
        "Paris is the capital of France and the capital",
        "Computers and mobile phones have taken over the",
    ]

    predicted_outputs = []
    tokenizer = GPT2Tokenizer.from_pretrained(model_id)
    model = OPTForCausalLM.from_pretrained(model_id)

    for prompt in self.prompts:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids

        generated_ids = model.generate(input_ids, max_length=10)

        generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        predicted_outputs += generated_string

    self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS)
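
# Hedged note, as a sketch: max_length in generate() caps the total sequence,
# prompt tokens included, which is why EXPECTED_OUTPUTS above extend each
# prompt by only a few words. The helper below is illustrative, not part of
# the original suite.
def _max_length_includes_prompt(tokenizer, model, prompt="In the city of"):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    generated_ids = model.generate(input_ids, max_length=10)
    assert generated_ids.shape[-1] <= 10  # prompt + continuation together are capped
    return generated_ids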
def test_load_model(self):
    try:
        _ = OPTForCausalLM.from_pretrained(self.path_model)
    except BaseException:
        self.fail("Failed loading model")