def test_for_correctness_with_fixture(self):
    bpe_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
    indexer = OpenaiTransformerBytePairIndexer(model_path=bpe_path)

    with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt', 'r') as fin:
        sentences = fin.read().strip().split('\n')

    with open(self.FIXTURES_ROOT / 'openai_transformer' / 'indexed_text.json', 'r') as fin:
        expected_indices = json.load(fin)

    # tokenize and check that indices are correct
    nlp = spacy.load('en_core_web_sm')

    for k, sentence in enumerate(sentences):
        tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
        indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(), 'openai_indexer')
        non_padded_indices = [i for i in indices['openai_indexer'] if i != 0]
        assert non_padded_indices == expected_indices[k]
def test_openai_transformer_matches_tensorflow(self):
    model_path = "https://allennlp.s3.amazonaws.com/models/openai-transformer-lm-2018.07.23.tar.gz"
    indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
    transformer = OpenaiTransformer(model_path=model_path)

    # get the test sentences
    with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt', 'r') as fin:
        sentences = fin.read().strip().split('\n')

    # tokenize and check that indices are correct
    nlp = spacy.load('en_core_web_sm')

    # make a batch of two sentences
    batch_indices = []
    batch_lengths = []
    for k, sentence in enumerate(sentences):
        tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
        indices = indexer.tokens_to_indices(
                [Token(token) for token in tokens], Vocabulary(), 'openai_indexer')
        batch_indices.append(indices['openai_indexer'])
        batch_lengths.append(len([i for i in indices['openai_indexer'] if i != 0]))

    batch_indices = torch.from_numpy(numpy.array(batch_indices))
    batch_size, num_timesteps = batch_indices.size()
    vocab_size = transformer.vocab_size - transformer.n_ctx
    positional_encodings = get_range_vector(num_timesteps, device=-1) + vocab_size

    # Combine the inputs with positional encodings
    batch_tensor = torch.stack([
            batch_indices,   # (batch_size, num_timesteps)
            positional_encodings.expand(batch_size, num_timesteps)
    ], dim=-1)

    # run the LM
    transformer.eval()
    activations = transformer(batch_tensor)

    # load the expected activations
    expected_activations = []
    with h5py.File(self.FIXTURES_ROOT / 'openai_transformer' / 'expected_embeddings.hdf5', 'r') as fin:
        expected_activations.append(fin['0'][...])
        expected_activations.append(fin['1'][...])

    # just check the top layer
    for k in range(2):
        actual = activations[-1][k, :batch_lengths[k], :].numpy()
        expected = expected_activations[k]
        numpy.testing.assert_almost_equal(expected, actual, decimal=5)
def tokenize(self, sentence, tokenizer, indexer):
    if isinstance(sentence, str):
        tokens = sentence
    else:
        tokens = sentence["text"]

    tokens = text_standardize(tokens)
    tokens = strip_repeating_punctuation(tokens)
    tokens = textwrap.shorten(tokens, width=self._max_character_length)

    tokenized_text = tokenizer(tokens)

    stripped_tokens = []
    for token in tokenized_text:
        token_text = token.lower_
        token_len = len(token)

        # Disable the main checker
        if self._marked_sentences:
            stripped_tokens.append(token)
        elif token_len > self._max_word_length:
            # overly long tokens are replaced by their part-of-speech tag
            if len(token.pos_) > 0:
                stripped_tokens.append(Token(token.pos_))
        elif token_len < self._min_check_word_length or token_text in self._allowed_tokens:
            stripped_tokens.append(token)
        elif token_text not in self._tried_tokens:
            # presumably records token_text in self._tried_tokens / self._allowed_tokens
            lookup_tokens(token_text)
            if token_text in self._allowed_tokens and len(token_text):
                stripped_tokens.append(token)
            else:
                if len(token.pos_) > 0:
                    stripped_tokens.append(Token(token.pos_))
                print(f"Rejected token: {token_text}")
    tokenized_text = stripped_tokens

    if self._add_start_end_token:
        tokenized_text.insert(0, Token(START_SYMBOL))
        tokenized_text.append(Token(END_SYMBOL))

    if len(tokenized_text) > self._max_sequence_length and self._truncate_sequences:
        tokenized_text = tokenized_text[:self._max_sequence_length]

    token_field = TextField(tokenized_text, indexer)
    return tokens, token_field
def _standardize(text):
    return text_standardize(ftfy.fix_text(text))