Example #1
    def test_question_gen_inference(self):
        model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-squad-qg")
        model.to(torch_device)

        tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-squad-qg")

        INPUTS = [
            "Bill Gates [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.",
            "1975 [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.",
            "April 4, 1975 [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.",
        ]

        input_ids = tokenizer(INPUTS, truncation=True, padding=True, return_tensors="pt").input_ids
        input_ids = input_ids.to(torch_device)

        gen_output = model.generate(input_ids, num_beams=5, early_stopping=True)
        generated_questions = tokenizer.batch_decode(gen_output, skip_special_tokens=True)

        EXPECTED_QUESTIONS = [
            "along with paul allen, who founded microsoft?",
            "what year was microsoft founded?",
            "on what date was microsoft founded?",
        ]

        self.assertListEqual(
            EXPECTED_QUESTIONS,
            generated_questions,
        )
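The SQuAD question-generation checkpoint expects each input in the form "answer [SEP] context", as the INPUTS above show. A minimal helper for building such inputs (the function name is ours, purely illustrative):

def build_qg_input(answer: str, context: str) -> str:
    # microsoft/prophetnet-large-uncased-squad-qg expects "answer [SEP] context"
    return f"{answer} [SEP] {context}"

# e.g. build_qg_input("Bill Gates", "Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.")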
Example #2
    def create_and_check_generate_with_past_key_value_states(
        self,
        config,
        input_ids,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ):
        model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval()
        torch.manual_seed(0)
        output_without_past_cache = model.generate(
            input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
        )
        torch.manual_seed(0)
        output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
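The same with/without-cache equivalence can be exercised outside the test harness; a minimal sketch, assuming the public microsoft/prophetnet-large-uncased checkpoint in place of the test's synthetic config:

import torch
from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer

tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased").eval()

input_ids = tokenizer(["microsoft was founded in 1975 ."], return_tensors="pt").input_ids

torch.manual_seed(0)
without_cache = model.generate(input_ids, num_beams=2, max_length=5, do_sample=True, use_cache=False)
torch.manual_seed(0)
with_cache = model.generate(input_ids, num_beams=2, max_length=5, do_sample=True, use_cache=True)

# with a fixed seed, caching past key/value states must not change the sampled output
assert torch.all(with_cache == without_cache)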
Example #3
    def test_cnndm_inference(self):
        model = ProphetNetForConditionalGeneration.from_pretrained(
            "microsoft/prophetnet-large-uncased-cnndm")
        model.config.max_length = 512
        model.to(torch_device)

        tokenizer = ProphetNetTokenizer.from_pretrained(
            "microsoft/prophetnet-large-uncased-cnndm")

        ARTICLE_TO_SUMMARIZE = "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a high-level science and technology workforce, as deemed critical for development of China's economy, defense, and science and technology education. The establishment was hailed as \"A Major Event in the History of Chinese Education and Science.\" CAS has supported USTC by combining most of its institutes with the departments of the university. USTC is listed in the top 16 national key universities, becoming the youngest national key university.".lower(
        )
        input_ids = tokenizer([ARTICLE_TO_SUMMARIZE],
                              max_length=511,
                              return_tensors="pt").input_ids

        input_ids = input_ids.to(torch_device)

        summary_ids = model.generate(input_ids,
                                     num_beams=4,
                                     length_penalty=1.0,
                                     no_repeat_ngram_size=3,
                                     early_stopping=True)
        EXPECTED_SUMMARIZE_512 = "us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the top 16 national key universities ."
        generated_titles = [
            " ".join(
                tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True))
            for g in summary_ids
        ]
        self.assertListEqual(
            [EXPECTED_SUMMARIZE_512],
            generated_titles,
        )
        # actually 98 content tokens are used; max_length=99 also counts the special tokens.
        input_ids = tokenizer([ARTICLE_TO_SUMMARIZE],
                              max_length=99,
                              return_tensors="pt").input_ids
        input_ids = input_ids.to(torch_device)
        summary_ids = model.generate(input_ids,
                                     num_beams=4,
                                     length_penalty=1.0,
                                     no_repeat_ngram_size=3,
                                     early_stopping=True)
        EXPECTED_SUMMARIZE_100 = (
            "us ##tc was founded in beijing by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc "
            "' s founding mission was to develop a high - level science and technology workforce . [X_SEP] "
            'establishment hailed as " a major event in the history of chinese education and science "'
        )
        generated_titles = [
            " ".join(
                tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True))
            for g in summary_ids
        ]
        self.assertListEqual(
            [EXPECTED_SUMMARIZE_100],
            generated_titles,
        )
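Outside unittest, the same summarization flow reduces to a few lines; a minimal sketch, lowercasing the input because the checkpoint is uncased (the article text here is a shortened stand-in):

from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer

tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")

article = "USTC was founded in Beijing by the Chinese Academy of Sciences in September 1958."
inputs = tokenizer([article.lower()], max_length=511, truncation=True, return_tensors="pt")

summary_ids = model.generate(inputs.input_ids, num_beams=4, no_repeat_ngram_size=3, early_stopping=True)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))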
Example #4
    def check_fast_integration(
        self,
        config,
        *args,
    ):
        input_ids = torch.tensor([[7, 4, 78, 0, 24, 52, 43]], device=torch_device, dtype=torch.long)
        decoder_input_ids = torch.tensor([[12, 62, 25, 11, 47, 15, 14]], device=torch_device, dtype=torch.long)
        attention_mask = torch.tensor([[1, 1, 1, 0, 1, 0, 0]], device=torch_device, dtype=torch.long)
        decoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0, 1, 0]], device=torch_device, dtype=torch.long)
        lm_labels = torch.tensor([[62, 25, 11, 47, 15, 14, 24]], device=torch_device, dtype=torch.long)
        torch.manual_seed(0)
        config.ngram = 4
        model = ProphetNetForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
                labels=lm_labels,
                return_dict=True,
            )
        self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(128.2925, device=torch_device), atol=1e-3))

        expected_logit_slice = torch.tensor(
            [-0.1565, 0.0418, 0.1207, 0.0030, 0.0665, 0.0467, 0.0412], device=torch_device
        )
        self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3))
Example #5
    def __init__(self, config, dataset):
        super(ProphetNet, self).__init__(config, dataset)
        self.pretrained_model_path = config['pretrained_model_path']
        self.config = ProphetNetConfig.from_pretrained(
            self.pretrained_model_path)
        self.tokenizer = ProphetNetTokenizer.from_pretrained(
            self.pretrained_model_path)
        self.model = ProphetNetForConditionalGeneration.from_pretrained(
            self.pretrained_model_path, config=self.config)

        self.padding_token_idx = self.tokenizer.pad_token_id
        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
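Because the loss above is built with reduction='none', it yields one value per token, and the caller must mask padding and average; a minimal sketch of that step, with illustrative names not taken from the snippet:

import torch
import torch.nn as nn

def masked_lm_loss(logits, labels, pad_token_id):
    # logits: (batch, seq_len, vocab_size); labels: (batch, seq_len)
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_token_id, reduction='none')
    token_loss = loss_fn(logits.transpose(1, 2), labels)  # (batch, seq_len), zero at padding
    token_mask = (labels != pad_token_id).float()
    # average over real tokens of each sequence (assumes at least one non-padding token)
    return (token_loss * token_mask).sum(dim=1) / token_mask.sum(dim=1)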
Example #6
    def create_and_check_with_lm_head(
        self,
        config,
        input_ids,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ):
        model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval()
        outputs = model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )
        self.parent.assertEqual(len(outputs), 5)
        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
        self.parent.assertEqual(outputs["loss"].size(), ())
Example #7
    def setUp(self):
        """Load model, tokenizer and expected output."""

        self.tokenizer = ProphetNetTokenizer.from_pretrained(
            'microsoft/prophetnet-large-uncased')
        self.prophetnet_model = ProphetNetForConditionalGeneration.from_pretrained(
            'microsoft/prophetnet-large-uncased')

        self.source_path = 'tests/optimizer/transformers/data/cnndm_128.txt'

        # The expected output is generated based on transformers-v4.12.0 with
        # batch_size = 16.
        self.expected_output_path = 'tests/optimizer/transformers/data/expected_prophetnet_output.hypo'  # pylint: disable=line-too-long
        self.expected_outputs = []
        with open(self.expected_output_path, 'rt',
                  encoding="utf-8") as expected_output_file:
            for line in expected_output_file:
                self.expected_outputs.append(line.strip())

        self.batch_count = 0
Example #8
    def test_pretrained_checkpoint_hidden_states(self):
        model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")
        model.to(torch_device)

        # encoder-decoder outputs
        encoder_ids = torch.tensor(
            [[2871, 102, 2048, 3176, 2780, 1997, 2871, 26727, 2169, 2097, 12673, 1996, 8457, 2006,
              2049, 8240, 2859, 2799, 1012, 2023, 6512, 2038, 2174, 13977, 2195, 25962, 1012, 102]]
        ).to(torch_device)

        decoder_prev_ids = torch.tensor([[102, 2129, 2116, 2372, 2024, 2006, 2169, 1997, 2122, 2048, 2780, 1029]]).to(
            torch_device
        )
        output = model(
            input_ids=encoder_ids,
            attention_mask=None,
            encoder_outputs=None,
            decoder_input_ids=decoder_prev_ids,
            return_dict=True,
        )
        output_predicted_logits = output[0]
        expected_shape = torch.Size((1, 12, 30522))
        self.assertEqual(output_predicted_logits.shape, expected_shape)
        expected_slice = torch.tensor(
            [[[-7.6213, -7.9008, -7.9979], [-7.6834, -7.8467, -8.2187], [-7.5326, -7.4762, -8.1914]]]
        ).to(torch_device)
        self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))

        # encoder outputs
        encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
        expected_encoder_outputs_slice = torch.tensor(
            [[[-0.2526, -0.1951, -0.2185], [-0.8923, 0.2992, -0.4623], [-0.4585, 0.0165, -0.6652]]]
        ).to(torch_device)
        expected_shape_encoder = torch.Size((1, 28, 1024))
        self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
        self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))

        # decoder outputs
        decoder_outputs = model.prophetnet.decoder(
            decoder_prev_ids, encoder_hidden_states=encoder_outputs, return_dict=True
        )
        predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1)
        predicting_streams_logits = model.lm_head(predicting_streams)
        next_first_stream_logits = predicting_streams_logits[:, 0]
        self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
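The n-gram prediction streams probed above are also exposed on the model's return value; a minimal sketch of inspecting them through the return_dict output (attribute names as in transformers' ProphetNet output classes, shapes deliberately left vague in the comments):

import torch
from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer

tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased").eval()

enc = tokenizer(["hello world ."], return_tensors="pt")
with torch.no_grad():
    out = model(input_ids=enc.input_ids, decoder_input_ids=enc.input_ids, return_dict=True)

print(out.logits.shape)        # main stream: (batch, decoder_seq_len, vocab_size)
print(out.logits_ngram.shape)  # extra future-token streams predicted in parallel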
Example #9
from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration

model = ProphetNetForConditionalGeneration.from_pretrained(
    'microsoft/prophetnet-large-uncased-squad-qg')
tokenizer = ProphetNetTokenizer.from_pretrained(
    'microsoft/prophetnet-large-uncased-squad-qg')

FACT_TO_GENERATE_QUESTION_FROM = "Bread can be spread with butter, dipped into liquids such as gravy, olive oil, or soup; it can be topped with various sweet and savory spreads, or used to make sandwiches containing meats, cheeses, vegetables, and condiments."

inputs = tokenizer([FACT_TO_GENERATE_QUESTION_FROM], return_tensors='pt')

# Generate a question
question_ids = model.generate(inputs['input_ids'],
                              num_beams=5,
                              early_stopping=True)
print(tokenizer.batch_decode(question_ids, skip_special_tokens=True))
Example #10
def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, pytorch_dump_folder_path: str):
    """
    Copy/paste/tweak prophetnet's weights to our prophetnet structure.
    """
    if "xprophetnet" in prophetnet_checkpoint_path:
        prophet_old = XLMProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
        prophet, loading_info = XLMProphetNetForConditionalGeneration.from_pretrained(
            prophetnet_checkpoint_path, output_loading_info=True
        )
    else:
        prophet_old = ProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
        prophet, loading_info = ProphetNetForConditionalGeneration.from_pretrained(
            prophetnet_checkpoint_path, output_loading_info=True
        )

    special_keys = ["key_proj", "value_proj", "query_proj"]

    mapping = {
        "self_attn": "ngram_self_attn",
        "cross_attn": "encoder_attn",
        "cross_attn_layer_norm": "encoder_attn_layer_norm",
        "feed_forward_layer_norm": "final_layer_norm",
        "feed_forward": "",
        "intermediate": "fc1",
        "output": "fc2",
        "key_proj": "k_proj",
        "query_proj": "q_proj",
        "value_proj": "v_proj",
        "word_embeddings": "embed_tokens",
        "embeddings_layer_norm": "emb_layer_norm",
        "relative_pos_embeddings": "relative_linear",
        "ngram_embeddings": "ngram_input_embed",
        "position_embeddings": "embed_positions",
    }

    for key in loading_info["missing_keys"]:
        attributes = key.split(".")

        if attributes[0] == "lm_head":
            model = prophet
            old_model = prophet_old
        else:
            model = prophet.prophetnet
            old_model = prophet_old.model

        is_key_init = False
        for attribute in attributes:
            if attribute in mapping:
                old_attribute = mapping[attribute]
                if not hasattr(old_model, old_attribute) and len(old_attribute) > 0:
                    old_attribute = attribute
            elif hasattr(old_model, attribute):
                old_attribute = attribute

            if attribute == "weight":
                assert old_model.weight.shape == model.weight.shape, "Shapes have to match!"
                model.weight = old_model.weight
                logger.info(f"{attribute} is initialized.")
                is_key_init = True
                break
            elif attribute == "bias":
                assert old_model.bias.shape == model.bias.shape, "Shapes have to match!"
                model.bias = old_model.bias
                logger.info(f"{attribute} is initialized")
                is_key_init = True
                break
            elif attribute in special_keys and hasattr(old_model, "in_proj_weight"):
                embed_dim = old_model.in_proj_weight.shape[0] // 3
                param = getattr(model, attribute)
                assert param.weight.shape == old_model.in_proj_weight[:embed_dim, :].shape, "Shapes have to match"
                assert param.bias.shape == old_model.in_proj_bias[:embed_dim].shape, "Shapes have to match"
                if attribute == "query_proj":
                    model.query_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[:embed_dim, :])
                    model.query_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[:embed_dim])

                elif attribute == "key_proj":
                    model.key_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[embed_dim : 2 * embed_dim, :])
                    model.key_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[embed_dim : 2 * embed_dim])
                elif attribute == "value_proj":
                    model.value_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[2 * embed_dim :, :])
                    model.value_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[2 * embed_dim :])
                is_key_init = True
                break
            elif attribute == "position_embeddings":
                assert (
                    model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1]
                ), "Hidden size has to match"
                assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings."
                model.position_embeddings.weight = torch.nn.Parameter(old_model.embed_positions.weight[:512, :])
                is_key_init = True
                break

            if attribute.isdigit():
                model = model[int(attribute)]
                old_model = old_model[int(old_attribute)]
            else:
                model = getattr(model, attribute)

                # an empty mapping value (e.g. "feed_forward") means this level is flattened away in the old model
                if old_attribute != "":
                    if not hasattr(old_model, old_attribute):
                        raise ValueError(f"{old_model} does not have {old_attribute}")
                    old_model = getattr(old_model, old_attribute)

        if not is_key_init:
            raise ValueError(f"{key} was not correctly initialized!")

    print(f"Saving model to {pytorch_dump_folder_path}")
    prophet.save_pretrained(pytorch_dump_folder_path)
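Such conversion functions are usually invoked from a small CLI wrapper; a minimal argparse sketch (the flag names are ours, assumed rather than copied from the original script):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--prophetnet_checkpoint_path", type=str, required=True,
                        help="Path to the original (X)ProphetNet checkpoint directory.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True,
                        help="Where to write the converted transformers model.")
    args = parser.parse_args()
    convert_prophetnet_checkpoint_to_pytorch(args.prophetnet_checkpoint_path, args.pytorch_dump_folder_path)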