def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    """
    Run the model on a batch of instances and return one output dict per instance.

    The instances are wrapped in a ``Dataset``, indexed with ``self.vocab``,
    turned into a tensor dict (with ``for_training=False``) and passed through
    ``self(...)`` followed by ``self.decode(...)``.  Every output value that is a
    ``torch.autograd.Variable`` is moved to the CPU and converted to a numpy
    array; other values are left as they are.  Each output is then split along
    its first axis so that the i-th returned dict holds the i-th element of
    every output.

    Batching the instances this way is typically faster than calling
    :func:`forward_on_instance` once per instance.

    Parameters
    ----------
    instances : ``List[Instance]``
        The instances to run the model on.
    cuda_device : ``int``
        Forwarded unchanged to ``Dataset.as_tensor_dict``.

    Returns
    -------
    A list with one ``{output_name: value}`` dict per input instance.
    """
    batch = Dataset(instances)
    batch.index_instances(self.vocab)
    tensors = batch.as_tensor_dict(cuda_device=cuda_device, for_training=False)
    outputs = self.decode(self(**tensors))

    per_instance: List[Dict[str, numpy.ndarray]] = [{} for _ in batch.instances]

    # Iterate over a snapshot of the items because entries of ``outputs`` are
    # reassigned inside the loop.
    for key, value in list(outputs.items()):
        if isinstance(value, torch.autograd.Variable):
            value = value.data.cpu().numpy()
        outputs[key] = value
        # Pair each per-instance dict with the matching element of this output.
        for result, element in zip(per_instance, value):
            result[key] = element

    return per_instance
def test_elmo_bilm(self):
    """
    Check ``_ElmoBiLm`` top-layer activations against the stored reference embeddings.

    Sentences and expected embeddings come from
    ``self._load_sentences_embeddings()``.  The sentences are fed through the
    biLM in batches of three; for every batch the mask lengths must equal the
    whitespace token counts, and the top layer (after removing sentence
    boundaries) must match the expected embeddings within ``atol=1.0e-6``.
    """
    batch_size = 3

    # Raw sentences and the reference embeddings they should produce.
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # The model under test, built from the fixture files.
    bilm = _ElmoBiLm(os.path.join(FIXTURES, 'options.json'),
                     os.path.join(FIXTURES, 'lm_weights.hdf5'))

    # One instance per sentence, ordered group by group so that consecutive
    # instances line up with the batches produced below.
    char_indexer = ELMoTokenCharactersIndexer()
    instances = [
            Instance({"elmo": TextField([Token(word) for word in text.split()],
                                        {'character_ids': char_indexer})})
            for group in zip(*sentences)
            for text in group
    ]

    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())

    batches = BasicIterator(batch_size)(dataset, num_epochs=1, shuffle=False)
    for batch_index, tensors in enumerate(batches):
        bilm_output = bilm(tensors['elmo']['character_ids'])
        top_layer, mask = remove_sentence_boundaries(
                bilm_output['activations'][2],
                bilm_output['mask']
        )

        # The mask must cover exactly the tokens of each sentence.
        lengths = mask.data.numpy().sum(axis=1)
        expected_lengths = [
                len(sentences[k][batch_index].split()) for k in range(batch_size)
        ]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # Compare the unpadded part of each sentence with its reference.
        for k in range(batch_size):
            actual = top_layer[k, :lengths[k], :].data.numpy()
            expected = expected_lm_embeddings[k][batch_index]
            self.assertTrue(numpy.allclose(actual, expected, atol=1.0e-6))
def _sentences_to_ids(self, sentences):
    """
    Build the ELMo ``character_ids`` array for ``sentences``.

    Each sentence is iterated element by element and every element is wrapped
    in a ``Token`` (presumably each sentence is a list of token strings --
    confirm against callers).  The sentences are turned into single-field
    instances, indexed against an empty ``Vocabulary`` and converted with
    ``Dataset.as_tensor_dict()``; the ``['elmo']['character_ids']`` entry of
    that dict is returned.
    """
    char_indexer = ELMoTokenCharactersIndexer()

    # One instance per sentence, holding a single TextField named 'elmo'.
    instances = [
            Instance({'elmo': TextField([Token(word) for word in sentence],
                                        {'character_ids': char_indexer})})
            for sentence in sentences
    ]

    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict['elmo']['character_ids']
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch of padded
    character ids.

    Every sentence becomes an ``Instance`` with one ``TextField`` named
    ``"elmo"``, indexed by the ``indexer`` object defined outside this
    function.  The instances are indexed against an empty ``Vocabulary`` and
    the ``['elmo']['character_ids']`` entry of ``Dataset.as_tensor_dict()`` is
    returned.
    """
    instances = [
            Instance({"elmo": TextField([Token(word) for word in sentence],
                                        {'character_ids': indexer})})
            for sentence in batch
    ]

    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict['elmo']['character_ids']