Code example #1
    def test_for_correctness_with_fixture(self):
        bpe_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=bpe_path)

        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt',
                  'r') as fin:
            sentences = fin.read().strip().split('\n')
        with open(
                self.FIXTURES_ROOT / 'openai_transformer' /
                'indexed_text.json', 'r') as fin:
            expected_indices = json.load(fin)

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        for k, sentence in enumerate(sentences):
            tokens = [
                token.text for token in nlp(text_standardize(sentence))
                if not token.is_space
            ]
            indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(),
                'openai_indexer')
            non_padded_indices = [
                i for i in indices['openai_indexer'] if i != 0
            ]
            assert non_padded_indices == expected_indices[k]
Code example #2
    def test_openai_transformer_matches_tensorflow(self):
        model_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
        transformer = OpenaiTransformer(model_path=model_path)

        # get the test sentences
        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt',
                  'r') as fin:
            sentences = fin.read().strip().split('\n')

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        # make a batch of two sentences
        batch_indices = []
        batch_lengths = []
        for k, sentence in enumerate(sentences):
            tokens = [
                token.text for token in nlp(text_standardize(sentence))
                if not token.is_space
            ]
            indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(),
                'openai_indexer')
            batch_indices.append(indices['openai_indexer'])
            batch_lengths.append(
                len([i for i in indices['openai_indexer'] if i != 0]))
        batch_indices = torch.from_numpy(numpy.array(batch_indices))
        batch_size, num_timesteps = batch_indices.size()
        vocab_size = transformer.vocab_size - transformer.n_ctx
        positional_encodings = get_range_vector(num_timesteps,
                                                device=-1) + vocab_size

        # Combine the inputs with positional encodings
        batch_tensor = torch.stack(
            [
                batch_indices,  # (batch_size, num_timesteps)
                positional_encodings.expand(batch_size, num_timesteps)
            ],
            dim=-1)

        # run the LM
        transformer.eval()
        activations = transformer(batch_tensor)

        # load the expected activations
        expected_activations = []
        with h5py.File(
                self.FIXTURES_ROOT / 'openai_transformer' /
                'expected_embeddings.hdf5', 'r') as fin:
            expected_activations.append(fin['0'][...])
            expected_activations.append(fin['1'][...])

        # just check the top layer
        for k in range(2):
            actual = activations[-1][k, :batch_lengths[k], :].numpy()
            expected = expected_activations[k]
            numpy.testing.assert_almost_equal(expected, actual, decimal=5)
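The positional-encoding offset in this test is easier to see in isolation: token ids occupy [0, vocab_size) and position ids occupy [vocab_size, vocab_size + n_ctx) of a single shared embedding table, so the positions are just a range shifted past the token vocabulary. The sketch below is plain PyTorch rather than AllenNLP code, and the vocab_size/n_ctx values are assumptions for illustration only.

    import torch

    # Token ids live in [0, vocab_size); position ids live in
    # [vocab_size, vocab_size + n_ctx) of the same embedding table.
    vocab_size, n_ctx = 40478, 512          # assumed values, illustration only
    batch_size, num_timesteps = 2, 7

    token_ids = torch.randint(0, vocab_size, (batch_size, num_timesteps))
    positions = torch.arange(num_timesteps) + vocab_size

    # Pair each token id with its offset position id, as the test does.
    batch_tensor = torch.stack(
        [token_ids, positions.expand(batch_size, num_timesteps)], dim=-1)
    print(batch_tensor.shape)               # torch.Size([2, 7, 2])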
Code example #3
        def tokenize(sentence, tokenizer, indexer):
            if isinstance(sentence, str):
                tokens = sentence
            else:
                tokens = sentence["text"]

            tokens = text_standardize(tokens)

            tokens = strip_repeating_punctuation(tokens)

            tokens = textwrap.shorten(tokens, width=self._max_character_length)

            tokenized_text = tokenizer(tokens)

            stripped_tokens = []

            for token in tokenized_text:
                token_text = token.lower_
                token_len = len(token)

                # If sentences are pre-marked, keep every token and skip the checks below
                if self._marked_sentences:
                    stripped_tokens.append(token)
                elif token_len > self._max_word_length:
                    if len(token.pos_) > 0:
                        stripped_tokens.append(Token(token.pos_))
                elif token_len < self._min_check_word_length or token_text in self._allowed_tokens:
                    stripped_tokens.append(token)
                elif token_text not in self._tried_tokens:

                    lookup_tokens(token_text)

                    if token_text in self._allowed_tokens and len(token_text):
                        stripped_tokens.append(token)
                    else:
                        if len(token.pos_) > 0:
                            stripped_tokens.append(Token(token.pos_))
                        print(f"Rejected token: {token_text}")

            tokenized_text = stripped_tokens

            if self._add_start_end_token:
                tokenized_text.insert(0, Token(START_SYMBOL))
                tokenized_text.append(Token(END_SYMBOL))

            if len(tokenized_text) > self._max_sequence_length and self._truncate_sequences:
                tokenized_text = tokenized_text[:self._max_sequence_length]

            token_field = TextField(tokenized_text, indexer)

            return tokens, token_field
Code example #4
    def test_openai_transformer_matches_tensorflow(self):
        model_path = "https://s3-us-west-2.amazonaws.com/allennlp/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
        transformer = OpenaiTransformer(model_path=model_path)

        # get the test sentences
        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt', 'r') as fin:
            sentences = fin.read().strip().split('\n')

        # tokenize and check that indices are correct
        nlp = spacy.load('en_core_web_sm')

        # make a batch of two sentences
        batch_indices = []
        batch_lengths = []
        for k, sentence in enumerate(sentences):
            tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
            indices = indexer.tokens_to_indices(
                    [Token(token) for token in tokens], Vocabulary(), 'openai_indexer'
            )
            batch_indices.append(indices['openai_indexer'])
            batch_lengths.append(len([i for i in indices['openai_indexer'] if i != 0]))
        batch_indices = torch.from_numpy(numpy.array(batch_indices))
        batch_size, num_timesteps = batch_indices.size()
        vocab_size = transformer.vocab_size - transformer.n_ctx
        positional_encodings = get_range_vector(num_timesteps, device=-1) + vocab_size

        # Combine the inputs with positional encodings
        batch_tensor = torch.stack([
                batch_indices,   # (batch_size, num_timesteps)
                positional_encodings.expand(batch_size, num_timesteps)
        ], dim=-1)

        # run the LM
        transformer.eval()
        activations = transformer(batch_tensor)

        # load the expected activations
        expected_activations = []
        with h5py.File(self.FIXTURES_ROOT / 'openai_transformer' / 'expected_embeddings.hdf5', 'r') as fin:
            expected_activations.append(fin['0'][...])
            expected_activations.append(fin['1'][...])

        # just check the top layer
        for k in range(2):
            actual = activations[-1][k, :batch_lengths[k], :].numpy()
            expected = expected_activations[k]
            numpy.testing.assert_almost_equal(expected, actual, decimal=5)
Code example #5
 def _standardize(text):
     return text_standardize(ftfy.fix_text(text))
Code example #6
File: word_splitter.py  Project: ziaridoy20/allennlp
 def _standardize(text):
     return text_standardize(ftfy.fix_text(text))
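For context, a minimal usage sketch of this helper: ftfy.fix_text repairs mojibake and other encoding damage, and text_standardize then normalizes punctuation and whitespace before byte-pair tokenization. The import path for text_standardize and the sample input below are assumptions for illustration.

    import ftfy
    # Assumed import path, for illustration only; text_standardize is the
    # helper used by the byte-pair indexer in the examples above.
    from allennlp.data.token_indexers.openai_transformer_byte_pair_indexer import text_standardize

    def _standardize(text):
        return text_standardize(ftfy.fix_text(text))

    # Input with mojibake and irregular whitespace (illustrative only).
    raw = "The fox jumpedâ€¦   over the   lazy dog"
    print(_standardize(raw))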