    def test_for_correctness_with_fixture(self):
        bpe_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=bpe_path)

        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt',
                  'r') as fin:
            sentences = fin.read().strip().split('\n')
        with open(
                self.FIXTURES_ROOT / 'openai_transformer' /
                'indexed_text.json', 'r') as fin:
            expected_indices = json.load(fin)

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        for k, sentence in enumerate(sentences):
            tokens = [
                token.text for token in nlp(text_standardize(sentence))
                if not token.is_space
            ]
            indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(),
                'openai_indexer')
            non_padded_indices = [
                i for i in indices['openai_indexer'] if i != 0
            ]
            assert non_padded_indices == expected_indices[k]
Example #2
    def setUp(self):
        super(TestOpenaiTransformerBytePairIndexer, self).setUp()

        encoder_path = self.TEST_DIR / u'encoder.json'
        bpe_path = self.TEST_DIR / u'vocab.bpe'
        transformer_model_path = self.TEST_DIR / u'model.tar.gz'

        symbols = [u"e", u"w", u"o", u"wo", u"."]
        byte_pairs = [
            (sym1, sym2 + end)
            for sym1 in symbols  # prefer earlier first symbol
            for sym2 in symbols  # if tie, prefer earlier second symbol
            for end in (u'</w>', u'')
        ]  # if tie, prefer ending a word
        encoding = dict(("{}{}".format(sym1, sym2), idx + 1)
                        for idx, (sym1, sym2) in enumerate(byte_pairs))

        with open(encoder_path, u'w') as encoder_file:
            json.dump(encoding, encoder_file)

        with open(bpe_path, u'w') as bpe_file:
            bpe_file.write(u"#version 0.0\n")
            for sym1, sym2 in byte_pairs:
                bpe_file.write(u"{} {}\n".format(sym1, sym2))
            bpe_file.write(u"\n")

        with tarfile.open(transformer_model_path, u'w') as tf:
            tf.add(encoder_path, u'model/encoder_bpe_40000.json')
            tf.add(bpe_path, u'model/vocab_40000.bpe')

        self.indexer = OpenaiTransformerBytePairIndexer(encoding, byte_pairs)
Example #3
    def setUp(self):
        super().setUp()

        encoder_path = self.TEST_DIR / 'encoder.json'
        bpe_path = self.TEST_DIR / 'vocab.bpe'
        transformer_model_path = self.TEST_DIR / 'model.tar.gz'

        symbols = ["e", "w", "o", "wo", "."]
        byte_pairs = [
            (sym1, sym2 + end)
            for sym1 in symbols  # prefer earlier first symbol
            for sym2 in symbols  # if tie, prefer earlier second symbol
            for end in ('</w>', '')
        ]  # if tie, prefer ending a word
        encoding = {
            f"{sym1}{sym2}": idx + 1
            for idx, (sym1, sym2) in enumerate(byte_pairs)
        }

        with open(encoder_path, 'w') as encoder_file:
            json.dump(encoding, encoder_file)

        with open(bpe_path, 'w') as bpe_file:
            bpe_file.write("#version 0.0\n")
            for sym1, sym2 in byte_pairs:
                bpe_file.write(f"{sym1} {sym2}\n")
            bpe_file.write("\n")

        with tarfile.open(transformer_model_path, 'w') as tf:
            tf.add(encoder_path, 'model/encoder_bpe_40000.json')
            tf.add(bpe_path, 'model/vocab_40000.bpe')

        self.indexer = OpenaiTransformerBytePairIndexer(encoding, byte_pairs)
        self.vocab = Vocabulary(non_padded_namespaces=['openai_transformer'])
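
# A quick sanity check on the fixture built in setUp above (a standalone
# sketch, not part of the test file): the toy vocabulary holds 5 * 5 * 2 = 50
# ranked byte pairs, and the encoder maps each concatenated pair to its rank
# plus one, leaving 0 for padding and for pieces that are not in the encoder.
symbols = ["e", "w", "o", "wo", "."]
byte_pairs = [(sym1, sym2 + end)
              for sym1 in symbols
              for sym2 in symbols
              for end in ('</w>', '')]
encoding = {f"{sym1}{sym2}": idx + 1
            for idx, (sym1, sym2) in enumerate(byte_pairs)}

assert len(byte_pairs) == 5 * 5 * 2
# Lowest ranks come first: earlier first symbol, then earlier second symbol,
# then the word-ending variant before the continuation.
assert byte_pairs[:4] == [("e", "e</w>"), ("e", "e"), ("e", "w</w>"), ("e", "w")]
# The same ids show up in the expected output of test_with_extra_tokens below.
assert encoding["ew"] == 4 and encoding["oe</w>"] == 21 and encoding["woe</w>"] == 31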
Example #4
    def test_openai_transformer_matches_tensorflow(self):
        model_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
        transformer = OpenaiTransformer(model_path=model_path)

        # get the test sentences
        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt',
                  'r') as fin:
            sentences = fin.read().strip().split('\n')

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        # make a batch of two sentences
        batch_indices = []
        batch_lengths = []
        for k, sentence in enumerate(sentences):
            tokens = [
                token.text for token in nlp(text_standardize(sentence))
                if not token.is_space
            ]
            indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(),
                'openai_indexer')
            batch_indices.append(indices['openai_indexer'])
            batch_lengths.append(
                len([i for i in indices['openai_indexer'] if i != 0]))
        batch_indices = torch.from_numpy(numpy.array(batch_indices))
        batch_size, num_timesteps = batch_indices.size()
        vocab_size = transformer.vocab_size - transformer.n_ctx
        positional_encodings = get_range_vector(num_timesteps,
                                                device=-1) + vocab_size

        # Combine the inputs with positional encodings
        batch_tensor = torch.stack(
            [
                batch_indices,  # (batch_size, num_timesteps)
                positional_encodings.expand(batch_size, num_timesteps)
            ],
            dim=-1)

        # run the LM
        transformer.eval()
        activations = transformer(batch_tensor)

        # load the expected activations
        expected_activations = []
        with h5py.File(
                self.FIXTURES_ROOT / 'openai_transformer' /
                'expected_embeddings.hdf5', 'r') as fin:
            expected_activations.append(fin['0'][...])
            expected_activations.append(fin['1'][...])

        # just check the top layer
        for k in range(2):
            actual = activations[-1][k, :batch_lengths[k], :].numpy()
            expected = expected_activations[k]
            numpy.testing.assert_almost_equal(expected, actual, decimal=5)
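
# A small sketch of the "combine the inputs with positional encodings" step in
# the test above, with toy numbers and plain torch rather than the AllenNLP
# helpers (illustration only, not code from the test). Position ids start at
# vocab_size, so token and positional embeddings can share one embedding
# matrix; the real model's token vocabulary is on the order of 40k
# (cf. encoder_bpe_40000.json in the fixtures).
import torch

token_ids = torch.tensor([[11, 42, 7],
                          [13, 5, 0]])                     # (batch_size=2, num_timesteps=3)
vocab_size = 100                                           # hypothetical small vocab
positions = torch.arange(token_ids.size(1)) + vocab_size   # tensor([100, 101, 102])

batch_tensor = torch.stack(
    [token_ids, positions.expand_as(token_ids)], dim=-1)   # (2, 3, 2)

assert batch_tensor.shape == (2, 3, 2)
assert batch_tensor[0, 1].tolist() == [42, 101]            # (token id, position id)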
    def test_openai_transformer_matches_tensorflow(self):
        model_path = "https://s3-us-west-2.amazonaws.com/allennlp/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
        transformer = OpenaiTransformer(model_path=model_path)

        # get the test sentences
        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt', 'r') as fin:
            sentences = fin.read().strip().split('\n')

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        # make a batch of two sentences
        batch_indices = []
        batch_lengths = []
        for k, sentence in enumerate(sentences):
            tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
            indices = indexer.tokens_to_indices(
                    [Token(token) for token in tokens], Vocabulary(), 'openai_indexer'
            )
            batch_indices.append(indices['openai_indexer'])
            batch_lengths.append(len([i for i in indices['openai_indexer'] if i != 0]))
        batch_indices = torch.from_numpy(numpy.array(batch_indices))
        batch_size, num_timesteps = batch_indices.size()
        vocab_size = transformer.vocab_size - transformer.n_ctx
        positional_encodings = get_range_vector(num_timesteps, device=-1) + vocab_size

        # Combine the inputs with positional encodings
        batch_tensor = torch.stack([
                batch_indices,   # (batch_size, num_timesteps)
                positional_encodings.expand(batch_size, num_timesteps)
        ], dim=-1)

        # run the LM
        transformer.eval()
        activations = transformer(batch_tensor)

        # load the expected activations
        expected_activations = []
        with h5py.File(self.FIXTURES_ROOT / 'openai_transformer' / 'expected_embeddings.hdf5', 'r') as fin:
            expected_activations.append(fin['0'][...])
            expected_activations.append(fin['1'][...])

        # just check the top layer
        for k in range(2):
            actual = activations[-1][k, :batch_lengths[k], :].numpy()
            expected = expected_activations[k]
            numpy.testing.assert_almost_equal(expected, actual, decimal=5)
Example #6
class TestOpenaiTransformerBytePairIndexer(AllenNlpTestCase):
    def setUp(self):
        super(TestOpenaiTransformerBytePairIndexer, self).setUp()

        encoder_path = self.TEST_DIR / u'encoder.json'
        bpe_path = self.TEST_DIR / u'vocab.bpe'
        transformer_model_path = self.TEST_DIR / u'model.tar.gz'

        symbols = [u"e", u"w", u"o", u"wo", u"."]
        byte_pairs = [
            (sym1, sym2 + end)
            for sym1 in symbols  # prefer earlier first symbol
            for sym2 in symbols  # if tie, prefer earlier second symbol
            for end in (u'</w>', u'')
        ]  # if tie, prefer ending a word
        encoding = dict(("{}{}".format(sym1, sym2), idx + 1)
                        for idx, (sym1, sym2) in enumerate(byte_pairs))

        with open(encoder_path, u'w') as encoder_file:
            json.dump(encoding, encoder_file)

        with open(bpe_path, u'w') as bpe_file:
            bpe_file.write(u"#version 0.0\n")
            for sym1, sym2 in byte_pairs:
                bpe_file.write(u"{} {}\n".format(sym1, sym2))
            bpe_file.write(u"\n")

        with tarfile.open(transformer_model_path, u'w') as tf:
            tf.add(encoder_path, u'model/encoder_bpe_40000.json')
            tf.add(bpe_path, u'model/vocab_40000.bpe')

        self.indexer = OpenaiTransformerBytePairIndexer(encoding, byte_pairs)

    def test_bpe(self):

        # [e, w, o, e</w>] -> best pair (e, w)
        # [ew, o, e</w>] -> best pair (o, e</w>)
        # [ew, oe</w>] -> done
        token = Token(u"ewoe")
        assert self.indexer.byte_pair_encode(token) == [u'ew', u'oe</w>']

        # Prefer "ew" to "we"
        token = Token(u"ewe")
        assert self.indexer.byte_pair_encode(token) == [u'ew', u'e</w>']

        # Prefer ending a word
        token = Token(u"eee")
        assert self.indexer.byte_pair_encode(token) == [u'e', u'ee</w>']

        # Encodes up to a single symbol when appropriate
        token = Token(u"woe")
        assert self.indexer.byte_pair_encode(token) == [u'woe</w>']

    def test_tokens_to_indices(self):
        tokens = [Token(u'ewoe'), Token(u'woe'), Token(u'ewe'), Token(u'ee')]

        indices = self.indexer.tokens_to_indices(tokens, None, u'test')

        assert set(indices.keys()) == set([u"test", u"test-offsets", u"mask"])

        text_tokens = indices[u'test']
        offsets = indices[u'test-offsets']

        assert text_tokens[:6] == [
            self.indexer.encoder.get(symbol, 0)
            for symbol in [u'ew', u'oe</w>'] + [u'woe</w>'] +
            [u'ew', u'e</w>'] + [u'ee</w>']
        ]

        assert offsets == [
            1,  # end of first word
            2,  # end of second word
            4,  # end of third word
            5,  # end of last word
        ]

    def test_raises_with_too_long_sentence(self):
        tokens = [Token(u'a') for _ in range(513)]

        with pytest.raises(RuntimeError):
            self.indexer.tokens_to_indices(tokens, None, u'should-fail')
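
# The merge traces in test_bpe above can be reproduced with a minimal greedy
# merge loop. This is a sketch of the standard byte-pair merging procedure,
# using the same toy ranks as setUp; it is not the indexer's actual code.
def toy_bpe_ranks():
    symbols = [u"e", u"w", u"o", u"wo", u"."]
    byte_pairs = [(sym1, sym2 + end)
                  for sym1 in symbols
                  for sym2 in symbols
                  for end in (u'</w>', u'')]
    return {pair: rank for rank, pair in enumerate(byte_pairs)}


def greedy_bpe(text, ranks):
    # Start from single characters, with '</w>' appended to the last one.
    word = list(text[:-1]) + [text[-1] + u'</w>']
    while len(word) > 1:
        pairs = [(word[i], word[i + 1]) for i in range(len(word) - 1)]
        best = min(pairs, key=lambda pair: ranks.get(pair, float('inf')))
        if best not in ranks:
            break                      # nothing mergeable is left
        i = pairs.index(best)          # merge the leftmost occurrence
        word = word[:i] + [best[0] + best[1]] + word[i + 2:]
    return word


ranks = toy_bpe_ranks()
assert greedy_bpe(u"ewoe", ranks) == [u'ew', u'oe</w>']
assert greedy_bpe(u"ewe", ranks) == [u'ew', u'e</w>']
assert greedy_bpe(u"eee", ranks) == [u'e', u'ee</w>']
assert greedy_bpe(u"woe", ranks) == [u'woe</w>']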
    def _create_indexer_vocab(self, tokens_to_add=None):
        # pylint: disable=attribute-defined-outside-init
        self.indexer = OpenaiTransformerBytePairIndexer(
            self.encoding, self.byte_pairs, tokens_to_add=tokens_to_add)
        self.vocab = Vocabulary(non_padded_namespaces=['openai_transformer'])

class TestOpenaiTransformerBytePairIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()

        encoder_path = self.TEST_DIR / 'encoder.json'
        bpe_path = self.TEST_DIR / 'vocab.bpe'
        transformer_model_path = self.TEST_DIR / 'model.tar.gz'

        symbols = ["e", "w", "o", "wo", "."]
        byte_pairs = [
            (sym1, sym2 + end)
            for sym1 in symbols  # prefer earlier first symbol
            for sym2 in symbols  # if tie, prefer earlier second symbol
            for end in ('</w>', '')
        ]  # if tie, prefer ending a word
        encoding = {
            f"{sym1}{sym2}": idx + 1
            for idx, (sym1, sym2) in enumerate(byte_pairs)
        }

        with open(encoder_path, 'w') as encoder_file:
            json.dump(encoding, encoder_file)

        with open(bpe_path, 'w') as bpe_file:
            bpe_file.write("#version 0.0\n")
            for sym1, sym2 in byte_pairs:
                bpe_file.write(f"{sym1} {sym2}\n")
            bpe_file.write("\n")

        with tarfile.open(transformer_model_path, 'w') as tf:
            tf.add(encoder_path, 'model/encoder_bpe_40000.json')
            tf.add(bpe_path, 'model/vocab_40000.bpe')

        self.encoding = encoding
        self.byte_pairs = byte_pairs

    def _create_indexer_vocab(self, tokens_to_add=None):
        # pylint: disable=attribute-defined-outside-init
        self.indexer = OpenaiTransformerBytePairIndexer(
            self.encoding, self.byte_pairs, tokens_to_add=tokens_to_add)
        self.vocab = Vocabulary(non_padded_namespaces=['openai_transformer'])
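        # Note: 'openai_transformer' is a non-padded namespace, so the
        # Vocabulary does not reserve padding/OOV tokens there; that is why
        # test_tokens_to_indices below can expect exactly 5 * 5 * 2 entries.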

    def test_bpe(self):
        self._create_indexer_vocab()

        # [e, w, o, e</w>] -> best pair (e, w)
        # [ew, o, e</w>] -> best pair (o, e</w>)
        # [ew, oe</w>] -> done
        token = Token("ewoe")
        assert self.indexer.byte_pair_encode(token) == ['ew', 'oe</w>']

        # Prefer "ew" to "we"
        token = Token("ewe")
        assert self.indexer.byte_pair_encode(token) == ['ew', 'e</w>']

        # Prefer ending a word
        token = Token("eee")
        assert self.indexer.byte_pair_encode(token) == ['e', 'ee</w>']

        # Encodes up to a single symbol when appropriate
        token = Token("woe")
        assert self.indexer.byte_pair_encode(token) == ['woe</w>']

    def test_tokens_to_indices(self):
        self._create_indexer_vocab()

        tokens = [Token('ewoe'), Token('woe'), Token('ewe'), Token('ee')]

        # vocab should be empty initially
        assert 'openai_transformer' not in self.vocab._index_to_token
        assert 'openai_transformer' not in self.vocab._token_to_index

        indices = self.indexer.tokens_to_indices(tokens, self.vocab, 'test')

        # vocab should be full now
        i2t = self.vocab._index_to_token.get('openai_transformer')
        t2i = self.vocab._token_to_index.get('openai_transformer')
        assert len(i2t) == 5 * 5 * 2
        assert len(t2i) == 5 * 5 * 2

        assert set(indices.keys()) == {"test", "test-offsets", "mask"}

        text_tokens = indices['test']
        offsets = indices['test-offsets']

        assert text_tokens[:6] == [
            self.indexer.encoder.get(symbol, 0)
            for symbol in ['ew', 'oe</w>'] + ['woe</w>'] + ['ew', 'e</w>'] +
            ['ee</w>']
        ]

        assert offsets == [
            1,  # end of first word
            2,  # end of second word
            4,  # end of third word
            5,  # end of last word
        ]

    def test_raises_with_too_long_sentence(self):
        self._create_indexer_vocab()

        tokens = [Token('a') for _ in range(513)]

        with pytest.raises(RuntimeError):
            self.indexer.tokens_to_indices(tokens, self.vocab, 'should-fail')

    def test_with_extra_tokens(self):
        self._create_indexer_vocab(tokens_to_add=["<start>", "<predict>"])
        tokens = [
            Token('<start>'),
            Token('ewoe'),
            Token('woe'),
            Token('ewe'),
            Token('ee'),
            Token('<predict>')
        ]
        indices = self.indexer.tokens_to_indices(tokens, self.vocab, 'openai')
        assert indices['openai'][:9] == [50, 4, 21, 31, 4, 0, 1, 51, 0]
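        # Reading off the expected values with the toy encoder from setUp
        # (the 50/51 ids for the two added tokens come from the assertion
        # itself): 50 = <start>, 4 = "ew", 21 = "oe</w>", 31 = "woe</w>",
        # 4 = "ew", 0 = "e</w>" (not in the encoder), 1 = "ee</w>",
        # 51 = <predict>, trailing 0 = padding.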

    @pytest.mark.skip()
    def test_for_correctness_with_fixture(self):
        bpe_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=bpe_path)

        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt',
                  'r') as fin:
            sentences = fin.read().strip().split('\n')
        with open(
                self.FIXTURES_ROOT / 'openai_transformer' /
                'indexed_text.json', 'r') as fin:
            expected_indices = json.load(fin)

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        for k, sentence in enumerate(sentences):
            tokens = [
                token.text for token in nlp(text_standardize(sentence))
                if not token.is_space
            ]
            indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(),
                'openai_indexer')
            non_padded_indices = [
                i for i in indices['openai_indexer'] if i != 0
            ]
            assert non_padded_indices == expected_indices[k]
Example #9
class TestOpenaiTransformerBytePairIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()

        encoder_path = self.TEST_DIR / 'encoder.json'
        bpe_path = self.TEST_DIR / 'vocab.bpe'
        transformer_model_path = self.TEST_DIR / 'model.tar.gz'

        symbols = ["e", "w", "o", "wo", "."]
        byte_pairs = [
            (sym1, sym2 + end)
            for sym1 in symbols  # prefer earlier first symbol
            for sym2 in symbols  # if tie, prefer earlier second symbol
            for end in ('</w>', '')
        ]  # if tie, prefer ending a word
        encoding = {
            f"{sym1}{sym2}": idx + 1
            for idx, (sym1, sym2) in enumerate(byte_pairs)
        }

        with open(encoder_path, 'w') as encoder_file:
            json.dump(encoding, encoder_file)

        with open(bpe_path, 'w') as bpe_file:
            bpe_file.write("#version 0.0\n")
            for sym1, sym2 in byte_pairs:
                bpe_file.write(f"{sym1} {sym2}\n")
            bpe_file.write("\n")

        with tarfile.open(transformer_model_path, 'w') as tf:
            tf.add(encoder_path, 'model/encoder_bpe_40000.json')
            tf.add(bpe_path, 'model/vocab_40000.bpe')

        self.indexer = OpenaiTransformerBytePairIndexer(encoding, byte_pairs)
        self.vocab = Vocabulary(non_padded_namespaces=['openai_transformer'])

    def test_bpe(self):

        # [e, w, o, e</w>] -> best pair (e, w)
        # [ew, o, e</w>] -> best pair (o, e</w>)
        # [ew, oe</w>] -> done
        token = Token("ewoe")
        assert self.indexer.byte_pair_encode(token) == ['ew', 'oe</w>']

        # Prefer "ew" to "we"
        token = Token("ewe")
        assert self.indexer.byte_pair_encode(token) == ['ew', 'e</w>']

        # Prefer ending a word
        token = Token("eee")
        assert self.indexer.byte_pair_encode(token) == ['e', 'ee</w>']

        # Encodes up to a single symbol when appropriate
        token = Token("woe")
        assert self.indexer.byte_pair_encode(token) == ['woe</w>']

    def test_tokens_to_indices(self):
        tokens = [Token('ewoe'), Token('woe'), Token('ewe'), Token('ee')]

        # vocab should be empty initially
        assert 'openai_transformer' not in self.vocab._index_to_token
        assert 'openai_transformer' not in self.vocab._token_to_index

        indices = self.indexer.tokens_to_indices(tokens, self.vocab, 'test')

        # vocab should be full now
        i2t = self.vocab._index_to_token.get('openai_transformer')
        t2i = self.vocab._token_to_index.get('openai_transformer')
        assert len(i2t) == 5 * 5 * 2
        assert len(t2i) == 5 * 5 * 2

        assert set(indices.keys()) == {"test", "test-offsets", "mask"}

        text_tokens = indices['test']
        offsets = indices['test-offsets']

        assert text_tokens[:6] == [
            self.indexer.encoder.get(symbol, 0)
            for symbol in ['ew', 'oe</w>'] + ['woe</w>'] + ['ew', 'e</w>'] +
            ['ee</w>']
        ]

        assert offsets == [
            1,  # end of first word
            2,  # end of second word
            4,  # end of third word
            5,  # end of last word
        ]

    def test_raises_with_too_long_sentence(self):
        tokens = [Token('a') for _ in range(513)]

        with pytest.raises(RuntimeError):
            self.indexer.tokens_to_indices(tokens, self.vocab, 'should-fail')
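
# The offsets checked in test_tokens_to_indices above are, per its comments,
# the 0-based position of each word's last BPE piece in the flattened
# sequence. A small sketch (an illustration, not code from the indexer) of
# how the four test tokens yield [1, 2, 4, 5]:
pieces_per_word = [2, 1, 2, 1]   # ewoe -> 2 pieces, woe -> 1, ewe -> 2, ee -> 1
offsets, end = [], -1
for count in pieces_per_word:
    end += count
    offsets.append(end)
assert offsets == [1, 2, 4, 5]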
Example #10
    def _create_indexer_vocab(self, tokens_to_add=None):

        self.indexer = OpenaiTransformerBytePairIndexer(
            self.encoding, self.byte_pairs, tokens_to_add=tokens_to_add)
        self.vocab = Vocabulary(non_padded_namespaces=["openai_transformer"])