Example 1
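All snippets below are methods of a unittest TestCase and rely on a shared preamble that the excerpt omits. A minimal sketch of those imports (the exact origin of fb_hub is not shown in the excerpt; publicly, torch.hub.load('pytorch/fairseq', ...) plays the same role):

import contextlib
from io import StringIO

import torch
# `fb_hub` is the fairseq hub loader used by the original test suite; its
# import path is not part of the excerpt. The public equivalent is:
#   model = torch.hub.load('pytorch/fairseq', '<model_name>')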
    def test_roberta_large_wsc(self):
        with contextlib.redirect_stdout(StringIO()):
            roberta = fb_hub.load('roberta.large.wsc',
                                  user_dir='examples/roberta/wsc')
            roberta.eval()  # disable dropout

            ans = roberta.disambiguate_pronoun(
                'The _trophy_ would not fit in the brown suitcase because [it] was too big.'
            )
            self.assertTrue(ans)

            ans = roberta.disambiguate_pronoun(
                'The trophy would not fit in the brown _suitcase_ because [it] was too big.'
            )
            self.assertFalse(ans)

            ans = roberta.disambiguate_pronoun(
                'The city councilmen refused the demonstrators a permit because [they] feared violence.'
            )
            self.assertEqual(ans, 'The city councilmen')

            ans = roberta.disambiguate_pronoun(
                'The city councilmen refused the demonstrators a permit because [they] advocated violence.'
            )
            self.assertEqual(ans, 'demonstrators')
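Note the two calling conventions exercised above: when one candidate is marked with underscores (_trophy_), disambiguate_pronoun returns a boolean verdict for that candidate; with no candidate marked, it returns the predicted antecedent as a string. Outside the test harness, the same model can be loaded through torch.hub (a sketch, assuming network access and a fairseq checkout that provides examples/roberta/wsc):

import torch

roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc',
                         user_dir='examples/roberta/wsc')
roberta.eval()
roberta.disambiguate_pronoun(
    'The city councilmen refused the demonstrators a permit because [they] feared violence.')
# -> 'The city councilmen'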
Example 2
    def test_bart_large_mnli(self):
        with contextlib.redirect_stdout(StringIO()):
            # Download BART already finetuned for MNLI
            bart = fb_hub.load('bart.large.mnli')
            bart.eval()  # disable dropout for evaluation

            # Encode a pair of sentences and make a prediction
            tokens = bart.encode('BART is a seq2seq model.',
                                 'BART is not sequence to sequence.')
            prediction = bart.predict('mnli', tokens).argmax().item()
            self.assertEqual(prediction, 0)  # contradiction

            # Encode another pair of sentences
            tokens = bart.encode('BART is denoising autoencoder.',
                                 'BART is version of autoencoder.')
            prediction = bart.predict('mnli', tokens).argmax().item()
            self.assertEqual(prediction, 2)  # entailment

            # Test batched prediction
            from fairseq.data.data_utils import collate_tokens
            batch_of_pairs = [
                [
                    'BART is a seq2seq model.',
                    'BART is not sequence to sequence.'
                ],
                [
                    'BART is denoising autoencoder.',
                    'BART is version of autoencoder.'
                ],
            ]
            batch = collate_tokens(
                [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs],
                pad_idx=1)
            logprobs = bart.predict('mnli', batch)
            self.assertEqual(logprobs.argmax(dim=1).tolist(), [0, 2])
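The head's argmax indices map to MNLI labels; from the assertions above, 0 is contradiction and 2 is entailment, which leaves 1 for neutral. A small helper that turns the batched log-probabilities into label strings (the label ordering is inferred from those assertions, not read from the checkpoint):

def mnli_labels(logprobs):
    labels = ['contradiction', 'neutral', 'entailment']  # inferred ordering
    return [labels[i] for i in logprobs.argmax(dim=1).tolist()]

# mnli_labels(bart.predict('mnli', batch)) -> ['contradiction', 'entailment']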
Example 3
    def test_roberta_base(self):
        with contextlib.redirect_stdout(StringIO()):
            # Load RoBERTa
            roberta = fb_hub.load('roberta.base')
            roberta.eval()  # disable dropout

            # Apply Byte-Pair Encoding (BPE) to input text
            tokens = roberta.encode('Hello world!')
            self.assertEqual(tokens.tolist(), [0, 31414, 232, 328, 2])
            self.assertEqual(roberta.decode(tokens), 'Hello world!')

            # Extract the last layer's features
            last_layer_features = roberta.extract_features(tokens)
            self.assertEqual(last_layer_features.size(),
                             torch.Size([1, 5, 768]))

            # Extract all layers' features (layer 0 is the embedding layer)
            all_layers = roberta.extract_features(tokens,
                                                  return_all_hiddens=True)
            self.assertEqual(len(all_layers), 13)
            self.assertTrue(torch.all(all_layers[-1] == last_layer_features))

            # Register a new (randomly initialized) classification head
            roberta.register_classification_head('new_task', num_classes=3)
            logprobs = roberta.predict('new_task', tokens)  # noqa

            # Test mask filling
            res = roberta.fill_mask(
                'The first Star wars movie came out in <mask>', topk=3)
            self.assertEqual(len(res), 3)
            self.assertEqual(res[0][2], ' 1977')
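Judging by the res[0][2] index above, each topk entry is a triple whose last element is the predicted token (note the leading space, an artifact of the byte-level BPE). A sketch that assumes a (filled_text, score, token) layout:

for filled_text, score, token in res:
    print(f'{token!r}  score={float(score):.4f}  ->  {filled_text}')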
Example 4
    def test_transformer_lm_wmt19_en(self):
        with contextlib.redirect_stdout(StringIO()):
            # Load an English LM trained on WMT'19 News Crawl data
            en_lm = fb_hub.load('transformer_lm.wmt19.en')
            en_lm.eval()  # disable dropout

            # Sample from the language model (smoke test; the output is not asserted)
            en_lm.sample('Barack Obama',
                         beam=1,
                         sampling=True,
                         sampling_topk=10,
                         temperature=0.8)

            ppl = en_lm.score(
                'Barack Obama is coming to Sydney and New Zealand'
            )['positional_scores'].mean().neg().exp()
            self.assertAlmostEqual(ppl.item(), 15.1474, places=4)
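The chained expression above computes per-token perplexity: score() returns token-level log-probabilities under 'positional_scores', and perplexity is exp of the average negative log-likelihood. The same computation, unrolled:

out = en_lm.score('Barack Obama is coming to Sydney and New Zealand')
avg_nll = -out['positional_scores'].mean()  # mean negative log-likelihood per token
ppl = torch.exp(avg_nll)                    # perplexity ~= 15.1474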
Example 5
    def test_transformer_wmt14_en_fr(self):
        with contextlib.redirect_stdout(StringIO()):
            # Load an En-Fr Transformer model trained on WMT'14 data
            en2fr = fb_hub.load('transformer.wmt14.en-fr',
                                tokenizer='moses',
                                bpe='subword_nmt')
            en2fr.eval()  # disable dropout

            # Translate with beam search
            fr = en2fr.translate('Hello world!', beam=5)
            self.assertEqual(fr, 'Bonjour à tous !')

            # Manually tokenize
            en_toks = en2fr.tokenize('Hello world!')
            self.assertEqual(en_toks, 'Hello world !')

            # Manually apply BPE
            en_bpe = en2fr.apply_bpe(en_toks)
            self.assertEqual(en_bpe, 'H@@ ello world !')

            # Manually binarize
            en_bin = en2fr.binarize(en_bpe)
            self.assertEqual(en_bin.tolist(), [329, 14044, 682, 812, 2])

            # Generate five translations with top-k sampling
            fr_bin = en2fr.generate(en_bin,
                                    beam=5,
                                    sampling=True,
                                    sampling_topk=20)
            self.assertEqual(len(fr_bin), 5)

            # Convert one of the samples to a string and detokenize
            fr_sample = fr_bin[0]['tokens']
            fr_bpe = en2fr.string(fr_sample)
            fr_toks = en2fr.remove_bpe(fr_bpe)
            fr = en2fr.detokenize(fr_toks)
            self.assertEqual(fr, en2fr.decode(fr_sample))

            # Batched translation
            fr_batch = en2fr.translate(
                ['Hello world', 'The cat sat on the mat.'])
            self.assertEqual(
                fr_batch,
                ['Bonjour à tous.', 'Le chat était assis sur le tapis.'])
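The manual steps above are exactly what translate() composes end to end; chaining the same methods reproduces it:

def manual_translate(model, text, beam=5):
    # tokenize -> BPE -> binarize -> beam search -> string -> un-BPE -> detokenize
    tokens_bin = model.binarize(model.apply_bpe(model.tokenize(text)))
    hypos = model.generate(tokens_bin, beam=beam)
    best = hypos[0]['tokens']  # highest-scoring hypothesis
    return model.detokenize(model.remove_bpe(model.string(best)))

# manual_translate(en2fr, 'Hello world!') == en2fr.translate('Hello world!', beam=5)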
Example 6
    def test_transformer_wmt19_en_de_single_model(self):
        with contextlib.redirect_stdout(StringIO()):
            # Load an En-De Transformer model trained on WMT'19 data
            en2de = fb_hub.load('transformer.wmt19.en-de.single_model',
                                tokenizer='moses',
                                bpe='fastbpe')
            en2de.eval()  # disable dropout

            # Access the underlying TransformerModel
            self.assertIsInstance(en2de.models[0], torch.nn.Module)

            # Translate from English to German
            de = en2de.translate(
                'PyTorch Hub is a pre-trained model repository designed to facilitate research reproducibility.'
            )
            self.assertEqual(
                de,
                'PyTorch Hub ist ein vorgefertigtes Modell-Repository, das die Reproduzierbarkeit der Forschung erleichtern soll.'
            )
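Since en2de.models[0] is an ordinary torch.nn.Module, standard PyTorch introspection applies; for example, a quick parameter count:

n_params = sum(p.numel() for p in en2de.models[0].parameters())
print(f'{n_params / 1e6:.0f}M parameters')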
Example 7
    def test_roberta_large_mnli(self):
        with contextlib.redirect_stdout(StringIO()):
            # Download RoBERTa already finetuned for MNLI
            roberta = fb_hub.load('roberta.large.mnli')
            roberta.eval()  # disable dropout for evaluation

            # Encode a pair of sentences and make a prediction
            tokens = roberta.encode(
                'Roberta is a heavily optimized version of BERT.',
                'Roberta is not very optimized.')
            prediction = roberta.predict('mnli', tokens).argmax().item()
            self.assertEqual(prediction, 0)  # contradiction

            # Encode another pair of sentences
            tokens = roberta.encode(
                'Roberta is a heavily optimized version of BERT.',
                'Roberta is based on BERT.')
            prediction = roberta.predict('mnli', tokens).argmax().item()
            self.assertEqual(prediction, 2)  # entailment

            # Test batched prediction
            from fairseq.data.data_utils import collate_tokens
            batch_of_pairs = [
                [
                    'Roberta is a heavily optimized version of BERT.',
                    'Roberta is not very optimized.'
                ],
                [
                    'Roberta is a heavily optimized version of BERT.',
                    'Roberta is based on BERT.'
                ],
                ['potatoes are awesome.', 'I like to run.'],
                ['Mars is very far from earth.', 'Mars is very close.'],
            ]
            batch = collate_tokens(
                [roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs],
                pad_idx=1)
            logprobs = roberta.predict('mnli', batch)
            self.assertEqual(logprobs.argmax(dim=1).tolist(), [0, 2, 1, 0])
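collate_tokens right-pads the variable-length encodings to a common length so they stack into a single tensor; pad_idx=1 matches fairseq's default dictionary, where <pad> sits at index 1. A rough plain-torch equivalent (ignoring collate_tokens's optional left-padding and EOS handling):

def pad_to_batch(seqs, pad_idx=1):
    max_len = max(s.size(0) for s in seqs)
    batch = seqs[0].new_full((len(seqs), max_len), pad_idx)
    for i, s in enumerate(seqs):
        batch[i, :s.size(0)] = s
    return batch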
Example 8
    def test_camembert(self):
        with contextlib.redirect_stdout(StringIO()):
            camembert = fb_hub.load('camembert.v0')
            camembert.eval()  # disable dropout

            # Filling masks
            masked_line = 'Le camembert est <mask> :)'
            res = camembert.fill_mask(masked_line, topk=3)
            self.assertEqual(len(res), 3)
            self.assertEqual(res[0][2], ' délicieux')

            # Extract the last layer's features
            line = "J'aime le camembert!"
            tokens = camembert.encode(line)
            last_layer_features = camembert.extract_features(tokens)
            self.assertEqual(last_layer_features.size(),
                             torch.Size([1, 10, 768]))

            # Extract all layers' features (layer 0 is the embedding layer)
            all_layers = camembert.extract_features(tokens,
                                                    return_all_hiddens=True)
            self.assertEqual(len(all_layers), 13)
            self.assertTrue(torch.all(all_layers[-1] == last_layer_features))
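The extracted features also double as sentence representations; mean-pooling the last layer is one common recipe (an illustration, not something the test asserts):

# Collapse the token dimension to get one vector per sentence
sentence_embedding = last_layer_features.mean(dim=1)  # shape: [1, 768]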
Example 9
    def test_xlmr(self):
        with contextlib.redirect_stdout(StringIO()):
            xlmr = fb_hub.load('xlmr.large')
            xlmr.eval()  # disable dropout

            # Test SentencePiece encoding across languages
            en_tokens = xlmr.encode('Hello world!')
            self.assertEqual(en_tokens.tolist(), [0, 35378, 8999, 38, 2])
            xlmr.decode(en_tokens)  # 'Hello world!'

            zh_tokens = xlmr.encode('你好,世界')
            self.assertEqual(zh_tokens.tolist(), [0, 6, 124084, 4, 3221, 2])
            xlmr.decode(zh_tokens)  # '你好,世界'

            hi_tokens = xlmr.encode('नमस्ते दुनिया')
            self.assertEqual(hi_tokens.tolist(), [0, 68700, 97883, 29405, 2])
            xlmr.decode(hi_tokens)  # 'नमस्ते दुनिया'

            ar_tokens = xlmr.encode('مرحبا بالعالم')
            self.assertEqual(ar_tokens.tolist(),
                             [0, 665, 193478, 258, 1705, 77796, 2])
            xlmr.decode(ar_tokens)  # 'مرحبا بالعالم'

            fr_tokens = xlmr.encode('Bonjour le monde')
            self.assertEqual(fr_tokens.tolist(), [0, 84602, 95, 11146, 2])
            xlmr.decode(fr_tokens)  # 'Bonjour le monde'

            # Extract the last layer's features
            last_layer_features = xlmr.extract_features(zh_tokens)
            self.assertEqual(last_layer_features.size(),
                             torch.Size([1, 6, 1024]))

            # Extract all layers' features (layer 0 is the embedding layer)
            all_layers = xlmr.extract_features(zh_tokens,
                                               return_all_hiddens=True)
            self.assertEqual(len(all_layers), 25)
            self.assertTrue(torch.all(all_layers[-1] == last_layer_features))
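Because a single XLM-R model covers all of these languages, features for different languages live in one shared space; a sketch comparing mean-pooled sentence vectors across languages with cosine similarity (illustrative only):

import torch.nn.functional as F

def sent_vec(model, text):
    return model.extract_features(model.encode(text)).mean(dim=1)  # [1, 1024]

sim = F.cosine_similarity(sent_vec(xlmr, 'Hello world!'),
                          sent_vec(xlmr, 'Bonjour le monde'))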
Example 10
    def test_bart_large_cnn(self):
        with contextlib.redirect_stdout(StringIO()):
            # Download BART already finetuned for CNN/DailyMail summarization
            bart = fb_hub.load('bart.large.cnn')
            bart.eval()  # disable dropout for evaluation

            hypothesis = bart.sample([
                """This is the first time anyone has been \
recorded to run a full marathon of 42.195 kilometers \
(approximately 26 miles) under this pursued landmark time. \
It was not, however, an officially sanctioned world record, \
as it was not an "open race" of the IAAF. His time was \
1 hour 59 minutes 40.2 seconds. Kipchoge ran in Vienna, Austria. \
It was an event specifically designed to help Kipchoge \
break the two hour barrier. Kenyan runner Eliud Kipchoge \
has run a marathon in less than two hours."""
            ])

            # Verify the generated summary
            self.assertEqual(
                hypothesis[0],
                """Eliud Kipchoge has run a marathon in less than two hours. \
Kenyan ran in Vienna, Austria. It was not an officially sanctioned world record."""
            )
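sample() also accepts fairseq's usual generation arguments; the BART summarization recipe in the fairseq README, for instance, decodes with beam search, a length penalty, and n-gram blocking. A sketch with those documented values, where source_document stands in for the article text:

hypothesis = bart.sample(
    [source_document],       # source_document: placeholder for the article text
    beam=4,
    lenpen=2.0,              # length penalty: favors longer summaries
    max_len_b=140,           # cap on target length
    min_len=55,              # floor on target length
    no_repeat_ngram_size=3,  # block repeated trigrams
)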
Example 11
    def test_bart_large(self):
        with contextlib.redirect_stdout(StringIO()):
            # Load BART
            bart = fb_hub.load('bart.large')
            bart.eval()  # disable dropout

            # Apply Byte-Pair Encoding (BPE) to input text
            tokens = bart.encode('Hello world!')
            self.assertEqual(tokens.tolist(), [0, 31414, 232, 328, 2])
            self.assertEqual(bart.decode(tokens), 'Hello world!')

            # Extract the last layer's features
            last_layer_features = bart.extract_features(tokens)
            self.assertEqual(last_layer_features.size(),
                             torch.Size([1, 5, 1024]))

            # Extract all layers' features from the decoder (layer 0 is the embedding layer)
            all_layers = bart.extract_features(tokens, return_all_hiddens=True)
            self.assertEqual(len(all_layers), 13)
            self.assertTrue(torch.all(all_layers[-1] == last_layer_features))

            # Register a new (randomly initialized) classification head
            bart.register_classification_head('new_task', num_classes=3)
            logprobs = bart.predict('new_task', tokens)  # noqa
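The freshly registered head is randomly initialized, so its predictions are meaningless until it is trained. A minimal training-step sketch with a hypothetical gold label, using NLL loss on the log-probabilities that predict() returns:

import torch
import torch.nn.functional as F

target = torch.tensor([2])                   # hypothetical gold label
logprobs = bart.predict('new_task', tokens)  # [1, num_classes] log-probabilities
loss = F.nll_loss(logprobs, target)
loss.backward()  # gradients reach the new head (and the shared encoder/decoder)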