def batch_to_embeddings(self, batch): """ Parameters ---------- batch : ``List[List[str]]``, required A list of tokenized sentences. Returns ------- A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024) and the second a mask (batch_size, num_timesteps). """ character_ids = batch_to_ids(batch) if self.cuda_device >= 0: character_ids = character_ids.cuda(device=self.cuda_device) bilm_output = self.elmo_bilm(character_ids) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] # without_bos_eos is a 3-element list of (activation, mask) tensor pairs, # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps) # respectively. without_bos_eos = [ remove_sentence_boundaries(layer, mask_with_bos_eos) for layer in layer_activations ] # Converts the list of (activation, mask) tensor pairs to a single tensor of activations. activations = torch.cat( [ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1) # The mask is the same for each ELMo vector, so just take the first. mask = without_bos_eos[0][1] return activations, mask
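# A minimal usage sketch for batch_to_embeddings above, assuming it lives on
# AllenNLP 0.x's ElmoEmbedder (which wraps _ElmoBiLm the same way this method does);
# with no arguments the embedder falls back to the default pretrained options/weights.
from allennlp.commands.elmo import ElmoEmbedder

embedder = ElmoEmbedder()
activations, mask = embedder.batch_to_embeddings([["I", "ate", "an", "apple"], ["A", "cat"]])
# activations: (2, 3, 4, 1024) -- batch of 2, 3 biLM layers, padded to 4 timesteps
# mask: (2, 4) with zeros over the padding of the shorter sentence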
def embed_batch( self, batch: List[List[str]], batch_metas: List[Dict]) -> List[Tuple[Dict, torch.Tensor]]: """ Parameters ---------- batch : ``List[List[str]]``, required A list of tokenized sentences. batch_metas : ``List[Dict]``, required A list of metadata dicts, one per sentence: sentence_id: str verb_indices: List[int] Returns ------- A list of (verb_id, embedding) pairs, one per verb, where verb_id records the sentence id and verb index, and embedding is the top-layer biLM activation at that verb's position. """ character_ids = batch_to_ids(batch) if self.cuda_device >= 0: character_ids = character_ids.cuda(device=self.cuda_device) bilm_output = self.elmo_bilm(character_ids) layer_activations_with_bos_eos = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] top_activations = remove_sentence_boundaries( layer_activations_with_bos_eos[2], mask_with_bos_eos)[0] results = [] for i, meta in enumerate(batch_metas): sid = meta["sentence_id"] for vi in meta["verb_indices"]: verb_id = {"sentenceId": sid, "verbIndex": vi} results.append((verb_id, top_activations[i, vi])) return results
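# A hypothetical driver for embed_batch above: `embedder` stands in for an instance of
# the enclosing class, and batch_metas parallels the batch, one dict per sentence.
batch = [["The", "cat", "sat", "."], ["Dogs", "bark", "."]]
batch_metas = [
    {"sentence_id": "s0", "verb_indices": [2]},  # "sat"
    {"sentence_id": "s1", "verb_indices": [1]},  # "bark"
]
for verb_id, vector in embedder.embed_batch(batch, batch_metas):
    print(verb_id["sentenceId"], verb_id["verbIndex"], vector.shape)  # e.g. torch.Size([1024])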
def test_elmo_token_representation(self): # Load the test words and convert to char ids with open(os.path.join(self.fixtures_path, 'vocab_test.txt'), 'r') as fin: tokens = fin.read().strip().split('\n') indexer = ELMoTokenCharactersIndexer() indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens] # There are 457 tokens. Reshape into 10 batches of 50 tokens. sentences = [] for k in range(10): sentences.append( indexer.pad_token_sequence( indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={} ) ) batch = Variable(torch.from_numpy(numpy.array(sentences))) elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file) elmo_token_embedder_output = elmo_token_embedder(batch) # Reshape back to a list of words and compare with ground truth. Need to also # remove <S>, </S> actual_embeddings = remove_sentence_boundaries( elmo_token_embedder_output['token_embedding'], elmo_token_embedder_output['mask'] )[0].data.numpy() actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1]) embedding_file = os.path.join(self.fixtures_path, 'elmo_token_embeddings.hdf5') with h5py.File(embedding_file, 'r') as fin: expected_embeddings = fin['embedding'][...] assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
def test_elmo_token_representation(self): # Load the test words and convert to char ids with open(os.path.join(self.elmo_fixtures_path, 'vocab_test.txt'), 'r') as fin: tokens = fin.read().strip().split('\n') indexer = ELMoTokenCharactersIndexer() indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens] # There are 457 tokens. Reshape into 10 batches of 50 tokens. sentences = [] for k in range(10): sentences.append( indexer.pad_token_sequence( indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={} ) ) batch = torch.from_numpy(numpy.array(sentences)) elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file) elmo_token_embedder_output = elmo_token_embedder(batch) # Reshape back to a list of words and compare with ground truth. Need to also # remove <S>, </S> actual_embeddings = remove_sentence_boundaries( elmo_token_embedder_output['token_embedding'], elmo_token_embedder_output['mask'] )[0].data.numpy() actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1]) embedding_file = os.path.join(self.elmo_fixtures_path, 'elmo_token_embeddings.hdf5') with h5py.File(embedding_file, 'r') as fin: expected_embeddings = fin['embedding'][...] assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
def batch_to_embeddings(self, batch: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]: """ Parameters ---------- batch : ``List[List[str]]``, required A list of tokenized sentences. Returns ------- A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024) and the second a mask (batch_size, num_timesteps). """ character_ids = batch_to_ids(batch) if self.cuda_device >= 0: character_ids = character_ids.cuda(device=self.cuda_device) bilm_output = self.elmo_bilm(character_ids) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] # without_bos_eos is a 3-element list of (activation, mask) tensor pairs, # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps) # respectively. without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos) for layer in layer_activations] # Converts the list of (activation, mask) tensor pairs to a single tensor of activations. activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1) # The mask is the same for each ELMo vector, so just take the first. mask = without_bos_eos[0][1] return activations, mask
def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: """ Given a list of tokens, this method precomputes word representations by running just the character convolutions and highway layers of elmo, essentially creating uncontextual word vectors. On subsequent forward passes, the word ids are looked up from an embedding, rather than being computed on the fly via the CNN encoder. This function sets 3 attributes: _word_embedding : ``torch.Tensor`` The word embedding for each word in the tokens passed to this method. _bos_embedding : ``torch.Tensor`` The embedding for the BOS token. _eos_embedding : ``torch.Tensor`` The embedding for the EOS token. Parameters ---------- tokens : ``List[str]``, required. A list of tokens to precompute character convolutions for. """ tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens timesteps = 32 batch_size = 32 chunked_tokens = lazy_groups_of(iter(tokens), timesteps) all_embeddings = [] device = get_device_of(next(self.parameters())) for batch in lazy_groups_of(chunked_tokens, batch_size): # Shape (batch_size, timesteps, 50) batched_tensor = batch_to_ids(batch) # NOTE: This device check is for when a user calls this method having # already placed the model on a device. If this is called in the # constructor, it will probably happen on the CPU. This isn't too bad, # because it's only a few convolutions and will likely be very fast. if device >= 0: batched_tensor = batched_tensor.cuda(device) output = self._token_embedder(batched_tensor) token_embedding = output["token_embedding"] mask = output["mask"] token_embedding, _ = remove_sentence_boundaries(token_embedding, mask) all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1))) full_embedding = torch.cat(all_embeddings, 0) # We might have some trailing embeddings from padding in the batch, so # we clip the embedding and lookup to the right size. full_embedding = full_embedding[:len(tokens), :] embedding = full_embedding[2:len(tokens), :] vocab_size, embedding_dim = list(embedding.size()) from allennlp.modules.token_embedders import Embedding # type: ignore self._bos_embedding = full_embedding[0, :] self._eos_embedding = full_embedding[1, :] self._word_embedding = Embedding(vocab_size, # type: ignore embedding_dim, weight=embedding.data, trainable=self._requires_grad, padding_index=0)
def create_cached_cnn_embeddings(self, tokens): """ Given a list of tokens, this method precomputes word representations by running just the character convolutions and highway layers of elmo, essentially creating uncontextual word vectors. On subsequent forward passes, the word ids are looked up from an embedding, rather than being computed on the fly via the CNN encoder. This function sets 3 attributes: _word_embedding : ``torch.Tensor`` The word embedding for each word in the tokens passed to this method. _bos_embedding : ``torch.Tensor`` The embedding for the BOS token. _eos_embedding : ``torch.Tensor`` The embedding for the EOS token. Parameters ---------- tokens : ``List[str]``, required. A list of tokens to precompute character convolutions for. """ tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens timesteps = 32 batch_size = 32 chunked_tokens = lazy_groups_of(iter(tokens), timesteps) all_embeddings = [] device = get_device_of(next(self.parameters())) for batch in lazy_groups_of(chunked_tokens, batch_size): # Shape (batch_size, timesteps, 50) batched_tensor = batch_to_ids(batch) # NOTE: This device check is for when a user calls this method having # already placed the model on a device. If this is called in the # constructor, it will probably happen on the CPU. This isn't too bad, # because it's only a few convolutions and will likely be very fast. if device >= 0: batched_tensor = batched_tensor.cuda(device) output = self._token_embedder(batched_tensor) token_embedding = output["token_embedding"] mask = output["mask"] token_embedding, _ = remove_sentence_boundaries(token_embedding, mask) all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1))) full_embedding = torch.cat(all_embeddings, 0) # We might have some trailing embeddings from padding in the batch, so # we clip the embedding and lookup to the right size. full_embedding = full_embedding[:len(tokens), :] embedding = full_embedding[2:len(tokens), :] vocab_size, embedding_dim = list(embedding.size()) from allennlp.modules.token_embedders import Embedding # type: ignore self._bos_embedding = full_embedding[0, :] self._eos_embedding = full_embedding[1, :] self._word_embedding = Embedding(vocab_size, # type: ignore embedding_dim, weight=embedding.data, trainable=self._requires_grad, padding_index=0)
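# Both versions above chunk the vocabulary with lazy_groups_of (32 tokens per pseudo-
# sentence, 32 sentences per batch). A minimal sketch of that utility, matching the
# behavior AllenNLP's version provides:
from itertools import islice
from typing import Iterator, List, TypeVar

T = TypeVar("T")

def lazy_groups_of(iterator: Iterator[T], group_size: int) -> Iterator[List[T]]:
    """Lazily yield successive lists of up to ``group_size`` items from ``iterator``."""
    # iter() with a sentinel keeps calling the lambda until it returns an empty list.
    return iter(lambda: list(islice(iterator, group_size)), [])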
def forward( self, inputs: torch.Tensor, word_inputs: torch.Tensor = None, prevs=None, rev_prevs=None ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: # reshape the input if needed original_shape = inputs.size() if len(original_shape) > 3: timesteps, num_characters = original_shape[-2:] reshaped_inputs = inputs.view(-1, timesteps, num_characters) else: reshaped_inputs = inputs # capture the word-input shape so the final reshape can restore it if word_inputs is not None: original_word_size = word_inputs.size() reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1]) if len(original_word_size) > 2 else word_inputs else: original_word_size = None reshaped_word_inputs = word_inputs # run the biLM bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs, prevs, rev_prevs) layer_activations = bilm_output["activations"] mask_with_bos_eos = bilm_output["mask"] # compute the elmo representations representations = [] for i in range(len(self._scalar_mixes)): scalar_mix = getattr(self, "scalar_mix_{}".format(i)) representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos) if self._keep_sentence_boundaries: processed_representation = representation_with_bos_eos processed_mask = mask_with_bos_eos else: representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) processed_representation = representation_without_bos_eos processed_mask = mask_without_bos_eos representations.append(self._dropout(processed_representation)) # reshape if necessary if word_inputs is not None and len(original_word_size) > 2: mask = processed_mask.view(original_word_size) elmo_representations = [ representation.view(original_word_size + (-1, )) for representation in representations ] elif len(original_shape) > 3: mask = processed_mask.view(original_shape[:-1]) elmo_representations = [ representation.view(original_shape[:-1] + (-1, )) for representation in representations ] else: mask = processed_mask elmo_representations = representations return {"elmo_representations": elmo_representations, "mask": mask}
def forward( self, # pylint: disable=arguments-differ inputs: torch.Tensor ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: """ Parameters ---------- inputs : ``torch.autograd.Variable`` Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. We also accept tensors with additional optional dimensions: ``(batch_size, dim0, dim1, ..., dimn, timesteps, 50)`` Returns ------- Dict with keys: ``'elmo_representations'``: ``List[torch.autograd.Variable]`` A ``num_output_representations`` list of ELMo representations for the input sequence. Each representation is shape ``(batch_size, timesteps, embedding_dim)`` ``'mask'``: ``torch.autograd.Variable`` Shape ``(batch_size, timesteps)`` long tensor with sequence mask. """ # reshape the input if needed original_shape = inputs.size() timesteps, num_characters = original_shape[-2:] if len(original_shape) > 3: reshaped_inputs = inputs.view(-1, timesteps, num_characters) else: reshaped_inputs = inputs # run the biLM bilm_output = self._elmo_lstm(reshaped_inputs) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] # compute the elmo representations representations = [] for i in range(len(self._scalar_mixes)): scalar_mix = getattr(self, 'scalar_mix_{}'.format(i)) representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos) representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) representations.append( self._dropout(representation_without_bos_eos)) # reshape if necessary if len(original_shape) > 3: mask = mask_without_bos_eos.view(original_shape[:-1]) elmo_representations = [ representation.view(original_shape[:-1] + (-1, )) for representation in representations ] else: mask = mask_without_bos_eos elmo_representations = representations return {'elmo_representations': elmo_representations, 'mask': mask}
def test_elmo_bilm(self): # get the raw data sentences, expected_lm_embeddings = self._load_sentences_embeddings() # load the test model options_file = os.path.join(FIXTURES, 'options.json') weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5') elmo_bilm = _ElmoBiLm(options_file, weight_file) # Deal with the data. indexer = ELMoTokenCharactersIndexer() # For each sentence, first create a TextField, then create an instance instances = [] for batch in zip(*sentences): for sentence in batch: tokens = [Token(token) for token in sentence.split()] field = TextField(tokens, {'character_ids': indexer}) instance = Instance({"elmo": field}) instances.append(instance) dataset = Dataset(instances) vocab = Vocabulary() dataset.index_instances(vocab) # Now finally we can iterate through batches. iterator = BasicIterator(3) for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)): batch_tensor = Variable(torch.from_numpy(batch['elmo']['character_ids'])) lm_embeddings = elmo_bilm(batch_tensor) top_layer_embeddings, mask = remove_sentence_boundaries( lm_embeddings['activations'][2], lm_embeddings['mask'] ) # check the mask lengths lengths = mask.data.numpy().sum(axis=1) batch_sentences = [sentences[k][i] for k in range(3)] expected_lengths = [ len(sentence.split()) for sentence in batch_sentences ] self.assertEqual(lengths.tolist(), expected_lengths) # get the expected embeddings and compare! expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)] for k in range(3): self.assertTrue( numpy.allclose( top_layer_embeddings[k, :lengths[k], :].data.numpy(), expected_top_layer[k], atol=1.0e-6 ) )
def forward(self, # pylint: disable=arguments-differ inputs: torch.Tensor) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: """ Parameters ---------- inputs : ``torch.autograd.Variable`` Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. We also accept tensors with additional optional dimensions: ``(batch_size, dim0, dim1, ..., dimn, timesteps, 50)`` Returns ------- Dict with keys: ``'elmo_representations'``: ``List[torch.autograd.Variable]`` A ``num_output_representations`` list of ELMo representations for the input sequence. Each representation is shape ``(batch_size, timesteps, embedding_dim)`` ``'mask'``: ``torch.autograd.Variable`` Shape ``(batch_size, timesteps)`` long tensor with sequence mask. """ # reshape the input if needed original_shape = inputs.size() timesteps, num_characters = original_shape[-2:] if len(original_shape) > 3: reshaped_inputs = inputs.view(-1, timesteps, num_characters) else: reshaped_inputs = inputs # run the biLM bilm_output = self._elmo_lstm(reshaped_inputs) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] # compute the elmo representations representations = [] for i in range(len(self._scalar_mixes)): scalar_mix = getattr(self, 'scalar_mix_{}'.format(i)) representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos) representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos ) representations.append(self._dropout(representation_without_bos_eos)) # reshape if necessary if len(original_shape) > 3: mask = mask_without_bos_eos.view(original_shape[:-1]) elmo_representations = [representation.view(original_shape[:-1] + (-1, )) for representation in representations] else: mask = mask_without_bos_eos elmo_representations = representations return {'elmo_representations': elmo_representations, 'mask': mask}
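# Typical construction and use of the Elmo module whose forward pass is shown above;
# the options/weights paths are placeholders for locally downloaded files.
from allennlp.modules.elmo import Elmo, batch_to_ids

elmo = Elmo("options.json", "weights.hdf5", num_output_representations=2, dropout=0.0)
character_ids = batch_to_ids([["First", "sentence", "."], ["Another", "."]])
output = elmo(character_ids)
representations = output["elmo_representations"]  # two tensors, each (2, 3, embedding_dim)
mask = output["mask"]                             # (2, 3), zeros over padding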
def forward(self, inputs): """ Parameters ---------- inputs: ``torch.Tensor``, required. Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. Returns ------- Dict with keys: ``'elmo_representations'``: ``List[torch.Tensor]`` One entry per biLM layer, each of shape ``(batch_size, timesteps, output_dim)``. ``'mask'``: ``torch.Tensor`` Shape ``(batch_size, timesteps)`` sequence mask. """ # reshape the input if needed original_shape = inputs.size() if len(original_shape) > 3: timesteps, num_characters = original_shape[-2:] reshaped_inputs = inputs.view(-1, timesteps, num_characters) else: reshaped_inputs = inputs # run the biLM (no cached word inputs, hence the explicit None) bilm_output = self._elmo_lstm(reshaped_inputs, None) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] word_embedding_and_hiddens = torch.cat(layer_activations, dim=-1) assert self.output_dim * len( layer_activations) == word_embedding_and_hiddens.size(-1) # compute the elmo representations representation_with_bos_eos = word_embedding_and_hiddens representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) processed_representation = representation_without_bos_eos processed_mask = mask_without_bos_eos # slice the concatenated layers back into one representation per layer out_representations = [] out_representations.append( processed_representation[:, :, :self.output_dim]) if len(layer_activations) > 1: for i in range(1, len(layer_activations)): out_representations.append( processed_representation[:, :, self.output_dim * i:self.output_dim * (i + 1)]) return { 'elmo_representations': out_representations, 'mask': processed_mask }
def forward( self, # type: ignore tokens: torch.Tensor, ) -> Dict[str, torch.Tensor]: """ # Parameters tokens : `torch.Tensor` Shape `(batch_size, timesteps, ...)` of token ids representing the current batch. These must have been produced using the same indexer the LM was trained on. # Returns The bidirectional language model representations for the input sequence, shape `(batch_size, timesteps, embedding_dim)` """ if self._bos_indices is not None: num_wrapping_dims = max(tokens.dim() - 2, 0) mask = get_text_field_mask({"": { "": tokens }}, num_wrapping_dims=num_wrapping_dims) tokens, mask = add_sentence_boundary_token_ids( tokens, mask, self._bos_indices, self._eos_indices) source = {self._token_name: {"token_characters": tokens}} result_dict = self._lm(source) # shape (batch_size, timesteps, embedding_size) noncontextual_token_embeddings = result_dict[ "noncontextual_token_embeddings"] contextual_embeddings = result_dict["lm_embeddings"] # Typically the non-contextual embeddings are smaller than the contextualized embeddings. # Since we're averaging all the layers we need to make their dimensions match. Simply # repeating the non-contextual embeddings is a crude, but effective, way to do this. duplicated_character_embeddings = torch.cat( [noncontextual_token_embeddings] * self._character_embedding_duplication_count, -1) averaged_embeddings = self._scalar_mix( [duplicated_character_embeddings] + contextual_embeddings) # Add dropout averaged_embeddings = self._dropout(averaged_embeddings) if self._remove_bos_eos: averaged_embeddings, _ = remove_sentence_boundaries( averaged_embeddings, result_dict["mask"]) return averaged_embeddings
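# The "crude but effective" dimension matching above simply tiles the smaller embedding.
# A standalone illustration with assumed sizes (a 512-dim char CNN vs. 1024-dim LM layers):
import torch

noncontextual = torch.randn(2, 7, 512)                    # (batch, timesteps, cnn_dim)
duplication_count = 1024 // noncontextual.size(-1)        # 2
duplicated = torch.cat([noncontextual] * duplication_count, dim=-1)
assert duplicated.shape == (2, 7, 1024)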
def embed(self, tokens): character_ids = batch_to_ids([tokens]) bilm_out = self._elmo_bilm(character_ids) wo_bos_eos = [ remove_sentence_boundaries(layer, bilm_out['mask']) for layer in bilm_out['activations'] ] emb = torch.cat([ele[0][:, None] for ele in wo_bos_eos], dim=1) sep = int(wo_bos_eos[0][1][0, :].sum()) emb = emb[0, :, :sep, :].detach()[self._level] return emb
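# Hypothetical call to embed above, where `embedder` stands in for an instance of the
# enclosing class: self._level selects one of the three biLM layers, so a 4-token
# sentence yields a (4, 1024) tensor under the default 1024-dim biLM.
sentence_vectors = embedder.embed(["The", "cat", "sat", "."])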
def test_elmo_bilm(self): # get the raw data sentences, expected_lm_embeddings = self._load_sentences_embeddings() # load the test model elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file) # Deal with the data. indexer = ELMoTokenCharactersIndexer() # For each sentence, first create a TextField, then create an instance instances = [] for batch in zip(*sentences): for sentence in batch: tokens = [Token(token) for token in sentence.split()] field = TextField(tokens, {'character_ids': indexer}) instance = Instance({"elmo": field}) instances.append(instance) vocab = Vocabulary() # Now finally we can iterate through batches. iterator = BasicIterator(3) iterator.index_with(vocab) for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)): lm_embeddings = elmo_bilm(batch['elmo']['character_ids']) top_layer_embeddings, mask = remove_sentence_boundaries( lm_embeddings['activations'][2], lm_embeddings['mask'] ) # check the mask lengths lengths = mask.data.numpy().sum(axis=1) batch_sentences = [sentences[k][i] for k in range(3)] expected_lengths = [ len(sentence.split()) for sentence in batch_sentences ] self.assertEqual(lengths.tolist(), expected_lengths) # get the expected embeddings and compare! expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)] for k in range(3): self.assertTrue( numpy.allclose( top_layer_embeddings[k, :lengths[k], :].data.numpy(), expected_top_layer[k], atol=1.0e-6 ) )
def test_elmo_bilm(self): # get the raw data sentences, expected_lm_embeddings = self._load_sentences_embeddings() # load the test model elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file) # Deal with the data. indexer = ELMoTokenCharactersIndexer() # For each sentence, first create a TextField, then create an instance instances = [] for batch in zip(*sentences): for sentence in batch: tokens = [Token(token) for token in sentence.split()] field = TextField(tokens, {"character_ids": indexer}) instance = Instance({"elmo": field}) instances.append(instance) vocab = Vocabulary() # Now finally we can iterate through batches. iterator = BasicIterator(3) iterator.index_with(vocab) for i, batch in enumerate( iterator(instances, num_epochs=1, shuffle=False)): lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["tokens"]) top_layer_embeddings, mask = remove_sentence_boundaries( lm_embeddings["activations"][2], lm_embeddings["mask"]) # check the mask lengths lengths = mask.data.numpy().sum(axis=1) batch_sentences = [sentences[k][i] for k in range(3)] expected_lengths = [ len(sentence.split()) for sentence in batch_sentences ] self.assertEqual(lengths.tolist(), expected_lengths) # get the expected embeddings and compare! expected_top_layer = [ expected_lm_embeddings[k][i] for k in range(3) ] for k in range(3): self.assertTrue( numpy.allclose( top_layer_embeddings[k, :lengths[k], :].data.numpy(), expected_top_layer[k], atol=1.0e-6, ))
def character_ids_to_embeddings(character_ids, elmo_bilm, device): # returns (batch_size, 3, num_times, 1024) embeddings and (batch_size, num_times) mask if device >= 0: character_ids = character_ids.cuda(device=device) bilm_output = elmo_bilm(character_ids) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] without_bos_eos = [ remove_sentence_boundaries(layer, mask_with_bos_eos) for layer in layer_activations ] # without_bos_eos is a 3 element list of (batch_size, num_times, dim) arrays activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1) mask = without_bos_eos[0][1] return activations, mask
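# Sketch of driving character_ids_to_embeddings; elmo_bilm is assumed to be an
# already-constructed _ElmoBiLm, and device=-1 keeps everything on the CPU.
from allennlp.modules.elmo import batch_to_ids

character_ids = batch_to_ids([["Hello", "world"], ["Hi"]])
activations, mask = character_ids_to_embeddings(character_ids, elmo_bilm, -1)
# Under the default 1024-dim biLM: activations (2, 3, 2, 1024); mask (2, 2) with a
# trailing zero for the one-token sentence.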
def forward(self, # type: ignore inputs: torch.Tensor) -> Dict[str, torch.Tensor]: """ Parameters ---------- inputs: ``torch.Tensor`` Shape ``(batch_size, timesteps, ...)`` of token ids representing the current batch. These must have been produced using the same indexer the LM was trained on. Returns ------- The bidirectional language model representations for the input sequence, shape ``(batch_size, timesteps, embedding_dim)`` """ # pylint: disable=arguments-differ if self._bos_indices is not None: mask = get_text_field_mask({"": inputs}) inputs, mask = add_sentence_boundary_token_ids( inputs, mask, self._bos_indices, self._eos_indices ) source = {self._token_name: inputs} result_dict = self._lm(source) # shape (batch_size, timesteps, embedding_size) noncontextual_token_embeddings = result_dict["noncontextual_token_embeddings"] contextual_embeddings = result_dict["lm_embeddings"] # Typically the non-contextual embeddings are smaller than the contextualized embeddings. # Since we're averaging all the layers we need to make their dimensions match. Simply # repeating the non-contextual embeddings is a crude, but effective, way to do this. duplicated_character_embeddings = torch.cat( [noncontextual_token_embeddings] * self._character_embedding_duplication_count, -1 ) averaged_embeddings = self._scalar_mix( [duplicated_character_embeddings] + contextual_embeddings ) # Add dropout averaged_embeddings = self._dropout(averaged_embeddings) if self._remove_bos_eos: averaged_embeddings, _ = remove_sentence_boundaries( averaged_embeddings, result_dict["mask"] ) return averaged_embeddings
def forward(self, inputs, elmo_lstm_output): texts = self.inputs_to_texts(inputs) instances = self.texts_to_instances(texts) dataset = Batch(instances) dataset.index_instances(self.model.vocab) cp_inputs = util.move_to_device(dataset.as_tensor_dict(), self.cuda_device) words, pos_tags = cp_inputs['tokens'], cp_inputs['pos_tags'] mask = get_text_field_mask(words) layer_activations = elmo_lstm_output['activations'] mask_with_bos_eos = elmo_lstm_output['mask'] # compute the elmo representations representations = [] for i in range(len(self._scalar_mixes)): scalar_mix = getattr(self, 'scalar_mix_{}'.format(i)) representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos) if self._keep_sentence_boundaries: processed_representation = representation_with_bos_eos processed_mask = mask_with_bos_eos else: representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) processed_representation = representation_without_bos_eos processed_mask = mask_without_bos_eos representations.append(self._dropout(processed_representation)) # reshape if necessary mask = processed_mask elmo_representations = representations embedded_text_input = elmo_representations[0] if pos_tags is not None and self.model.pos_tag_embedding is not None: embedded_pos_tags = self.model.pos_tag_embedding(pos_tags) embedded_text_input = torch.cat( [embedded_text_input, embedded_pos_tags], -1) elif self.model.pos_tag_embedding is not None: raise ConfigurationError( "Model uses a POS embedding, but no POS tags were passed.") encoded_text = self.model.encoder(embedded_text_input, mask) return encoded_text.detach()
def forward(self, inputs): bilm_output = self.elmo(inputs) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] representations = [] for representation in layer_activations: r, mask = remove_sentence_boundaries(representation, mask_with_bos_eos) representations.append(r) repr_forward, repr_backward = representations[-1].split( self.output_dim_half, dim=2) logits_forward = self.decoder(repr_forward) logits_backward = self.decoder(repr_backward) return logits_forward, logits_backward, representations, mask
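# The split above separates the biLM's forward and backward directions along the feature
# axis. The same idea in isolation, with an assumed per-direction size of 512:
import torch

representation = torch.randn(2, 6, 1024)               # (batch, timesteps, 2 * hidden)
repr_forward, repr_backward = representation.split(512, dim=2)
assert repr_forward.shape == (2, 6, 512) and repr_backward.shape == (2, 6, 512)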
def test_remove_sentence_boundaries(self): tensor = Variable(torch.from_numpy(numpy.random.rand(3, 5, 7))) mask = Variable( torch.from_numpy( numpy.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]))).long() new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask) expected_new_tensor = Variable(torch.zeros(3, 3, 7)) expected_new_tensor[0, 0, :] = tensor[0, 1, :] expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :] expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :] assert_array_almost_equal(new_tensor.data.numpy(), expected_new_tensor.data.numpy()) expected_new_mask = Variable( torch.from_numpy(numpy.array([[1, 0, 0], [1, 1, 1], [1, 1, 0]]))).long() assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
def test_remove_sentence_boundaries(self): tensor = torch.from_numpy(numpy.random.rand(3, 5, 7)) mask = torch.from_numpy( # The mask with two elements is to test the corner case # of an empty sequence, so here we are removing boundaries # from "<S> </S>" numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])).long() new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask) expected_new_tensor = torch.zeros(3, 3, 7) expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :] expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :] assert_array_almost_equal(new_tensor.data.numpy(), expected_new_tensor.data.numpy()) expected_new_mask = torch.from_numpy( numpy.array([[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
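# A minimal sketch of remove_sentence_boundaries itself, consistent with the behavior
# these tests assert (strip the first and last unmasked position of every sequence;
# a sequence holding only <S> </S> collapses to an all-zero row). AllenNLP's
# util.remove_sentence_boundaries is equivalent in effect.
import torch

def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor):
    sequence_lengths = mask.sum(dim=1)
    batch_size, timesteps, dim = tensor.size()
    tensor_without_boundaries = tensor.new_zeros(batch_size, timesteps - 2, dim)
    new_mask = mask.new_zeros(batch_size, timesteps - 2)
    for i, length in enumerate(sequence_lengths):
        j = int(length)
        if j > 2:
            # copy everything between the boundary tokens, left-aligned
            tensor_without_boundaries[i, :(j - 2), :] = tensor[i, 1:(j - 1), :]
            new_mask[i, :(j - 2)] = 1
    return tensor_without_boundaries, new_mask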
def test_elmo_token_representation(self): # Load the test words and convert to char ids with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"), "r") as fin: words = fin.read().strip().split("\n") vocab = Vocabulary() indexer = ELMoTokenCharactersIndexer() tokens = [Token(word) for word in words] indices = indexer.tokens_to_indices(tokens, vocab) # There are 457 tokens. Reshape into 10 batches of 50 tokens. sentences = [] for k in range(10): char_indices = indices["elmo_tokens"][(k * 50):((k + 1) * 50)] sentences.append( indexer.as_padded_tensor_dict( {"elmo_tokens": char_indices}, padding_lengths={"elmo_tokens": 50})["elmo_tokens"]) batch = torch.stack(sentences) elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file) elmo_token_embedder_output = elmo_token_embedder(batch) # Reshape back to a list of words and compare with ground truth. Need to also # remove <S>, </S> actual_embeddings = remove_sentence_boundaries( elmo_token_embedder_output["token_embedding"], elmo_token_embedder_output["mask"])[0].data.numpy() actual_embeddings = actual_embeddings.reshape( -1, actual_embeddings.shape[-1]) embedding_file = os.path.join(self.elmo_fixtures_path, "elmo_token_embeddings.hdf5") with h5py.File(embedding_file, "r") as fin: expected_embeddings = fin["embedding"][...] assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
def __call__(self, docs: List[Document], infos: List[InfoPb], input_fields: List[str], output_field: str, max_tokens_count: int): from allennlp.modules.elmo import batch_to_ids from allennlp.nn.util import remove_sentence_boundaries batch = [] for doc_num, doc in enumerate(docs): sample = " ".join( [getattr(doc, input_field) for input_field in input_fields]) tokens = self.preprocess(sample)[:max_tokens_count] batch.append(tokens) character_ids = batch_to_ids(batch) if self.cuda_device >= 0: character_ids = character_ids.cuda(device=self.cuda_device) bilm_output = self.elmo_bilm(character_ids) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] without_bos_eos = [ remove_sentence_boundaries(layer, mask_with_bos_eos) for layer in layer_activations ] embeddings = torch.cat( [pair[0].unsqueeze(1) for pair in without_bos_eos], dim=1) mask = without_bos_eos[0][1] for doc_num, info in enumerate(infos): length = int(mask[doc_num, :].sum()) doc_embeddings = np.zeros((3, 0, 1024)) if length != 0: doc_embeddings = embeddings[ doc_num, :, :length, :].detach().cpu().numpy() doc_embeddings = doc_embeddings.swapaxes(0, 1).reshape( doc_embeddings.shape[0], -1) mean_embeddings = doc_embeddings.mean(axis=0) max_embeddings = doc_embeddings.max(axis=0) final_embedding = np.concatenate((mean_embeddings, max_embeddings), axis=0) getattr(info, output_field).extend(final_embedding)
def forward(self, inputs, elmo_lstm_output): texts = self.inputs_to_texts(inputs) instances = self.texts_to_instances(texts) dataset = Batch(instances) dataset.index_instances(self.model.vocab) cp_inputs = util.move_to_device(dataset.as_tensor_dict(), self.cuda_device) tokens = cp_inputs['tokens'] mask = get_text_field_mask(tokens) layer_activations = elmo_lstm_output['activations'] mask_with_bos_eos = elmo_lstm_output['mask'] # compute the elmo representations representations = [] for i in range(len(self._scalar_mixes)): scalar_mix = getattr(self, 'scalar_mix_{}'.format(i)) representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos) if self._keep_sentence_boundaries: processed_representation = representation_with_bos_eos processed_mask = mask_with_bos_eos else: representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) processed_representation = representation_without_bos_eos processed_mask = mask_without_bos_eos representations.append(self._dropout(processed_representation)) # reshape if necessary mask = processed_mask elmo_representations = representations embedded_text_input = elmo_representations[0] encoded_text = self.model.encoder(embedded_text_input, mask) return encoded_text.detach()
def forward( self, # pylint: disable=arguments-differ inputs: torch.Tensor ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: """ Parameters ---------- inputs: ``torch.autograd.Variable`` Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch. Returns ------- Dict with keys: ``'elmo_representations'``: ``List[torch.autograd.Variable]`` A ``num_output_representations`` list of ELMo representations for the input sequence. Each representation is shape ``(batch_size, timesteps, embedding_dim)`` ``'mask'``: ``torch.autograd.Variable`` Shape ``(batch_size, timesteps)`` long tensor with sequence mask. """ bilm_output = self._elmo_lstm(inputs) layer_activations = bilm_output['activations'] mask_with_bos_eos = bilm_output['mask'] elmo_representations = [] for scalar_mix in self._scalar_mixes: representation_with_bos_eos = scalar_mix.forward( layer_activations, mask_with_bos_eos) representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) elmo_representations.append(representation_without_bos_eos) return { 'elmo_representations': elmo_representations, 'mask': mask_without_bos_eos }
def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c, x2_full_mask, x3=None): """Inputs: x1 = document word indices [batch * len_d] x1_c = document char indices [batch * len_d * len_w] or [1] x1_f = document word features indices [batch * q_num * len_d * nfeat] x1_pos = document POS tags [batch * len_d] x1_ner = document entity tags [batch * len_d] x1_mask = document padding mask [batch * len_d] x2_full = question word indices [batch * q_num * len_q] x2_c = question char indices [(batch * q_num) * len_q * len_w] x2_full_mask = question padding mask [batch * q_num * len_q] x3 = answer word indices [batch * q_num * len_a] """ # precomputing ELMo is only for context (to speedup computation) if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[ 'batch_size']: # precomputing ELMo is used if x1_c.dim() != 1: # precomputation is needed precomputed_bilm_output = self.elmo._elmo_lstm(x1_c) self.precomputed_layer_activations = [ t.detach().cpu() for t in precomputed_bilm_output['activations'] ] self.precomputed_mask_with_bos_eos = precomputed_bilm_output[ 'mask'].detach().cpu() self.precomputed_cnt = 0 # get precomputed ELMo layer_activations = [ t[x1.size(0) * self.precomputed_cnt:x1.size(0) * (self.precomputed_cnt + 1), :, :] for t in self.precomputed_layer_activations ] mask_with_bos_eos = self.precomputed_mask_with_bos_eos[ x1.size(0) * self.precomputed_cnt:x1.size(0) * (self.precomputed_cnt + 1), :] if x1.is_cuda: layer_activations = [t.cuda() for t in layer_activations] mask_with_bos_eos = mask_with_bos_eos.cuda() representations = [] for i in range(len(self.elmo._scalar_mixes)): scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i)) representation_with_bos_eos = scalar_mix( layer_activations, mask_with_bos_eos) representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) representations.append( self.elmo._dropout(representation_without_bos_eos)) x1_elmo = representations[0][:, :x1.size(1), :] self.precomputed_cnt += 1 precomputed_elmo = True else: precomputed_elmo = False """ x1_full = document word indices [batch * q_num * len_d] x1_full_mask = document padding mask [batch * q_num * len_d] """ x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), x1.size(1)).contiguous() x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), x1.size(1)).contiguous() drnn_input_list, qrnn_input_list = [], [] x2 = x2_full.view(-1, x2_full.size(-1)) x2_mask = x2_full_mask.view(-1, x2_full.size(-1)) if self.opt['use_wemb']: # Word embedding for both document and question emb = self.embedding if self.training else self.eval_embed x1_emb = emb(x1) x2_emb = emb(x2) # Dropout on embeddings if self.opt['dropout_emb'] > 0: x1_emb = layers.dropout(x1_emb, p=self.opt['dropout_emb'], training=self.training) x2_emb = layers.dropout(x2_emb, p=self.opt['dropout_emb'], training=self.training) drnn_input_list.append(x1_emb) qrnn_input_list.append(x2_emb) if self.opt['CoVe_opt'] > 0: x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask) x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask) # Dropout on contexualized embeddings if self.opt['dropout_emb'] > 0: x1_cove_mid = layers.dropout(x1_cove_mid, p=self.opt['dropout_emb'], training=self.training) x1_cove_high = layers.dropout(x1_cove_high, p=self.opt['dropout_emb'], training=self.training) x2_cove_mid = layers.dropout(x2_cove_mid, p=self.opt['dropout_emb'], training=self.training) x2_cove_high = layers.dropout(x2_cove_high, 
p=self.opt['dropout_emb'], training=self.training) drnn_input_list.append(x1_cove_mid) qrnn_input_list.append(x2_cove_mid) if self.opt['use_elmo']: if not precomputed_elmo: x1_elmo = self.elmo(x1_c)['elmo_representations'][ 0] #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device) x2_elmo = self.elmo(x2_c)['elmo_representations'][ 0] #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device) # Dropout on contexualized embeddings if self.opt['dropout_emb'] > 0: x1_elmo = layers.dropout(x1_elmo, p=self.opt['dropout_emb'], training=self.training) x2_elmo = layers.dropout(x2_elmo, p=self.opt['dropout_emb'], training=self.training) drnn_input_list.append(x1_elmo) qrnn_input_list.append(x2_elmo) if self.opt['use_pos']: x1_pos_emb = self.pos_embedding(x1_pos) drnn_input_list.append(x1_pos_emb) if self.opt['use_ner']: x1_ner_emb = self.ner_embedding(x1_ner) drnn_input_list.append(x1_ner_emb) x1_input = torch.cat(drnn_input_list, dim=2) x2_input = torch.cat(qrnn_input_list, dim=2) def expansion_for_doc(z): return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1), z.size(2)).contiguous().view( -1, z.size(1), z.size(2)) x1_emb_expand = expansion_for_doc(x1_emb) x1_cove_high_expand = expansion_for_doc(x1_cove_high) #x1_elmo_expand = expansion_for_doc(x1_elmo) if self.opt['no_em']: x1_f = x1_f[:, :, :, 3:] x1_input = torch.cat([ expansion_for_doc(x1_input), x1_f.view(-1, x1_f.size(-2), x1_f.size(-1)) ], dim=2) x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1)) if self.opt['do_prealign']: x1_atten = self.pre_align(x1_emb_expand, x2_emb, x2_mask) x1_input = torch.cat([x1_input, x1_atten], dim=2) # === Start processing the dialog === # cur_h: [batch_size * max_qa_pair, context_length, hidden_state] # flow : fn (rnn) def flow_operation(cur_h, flow): flow_in = cur_h.transpose(0, 1).view(x1_full.size(2), x1_full.size(0), x1_full.size(1), -1) flow_in = flow_in.transpose(0, 2).contiguous().view( x1_full.size(1), x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1) # [bsz * context_length, max_qa_pair, hidden_state] if self.opt['residual_step']: flow_out, residual_out = flow(flow_in) else: flow_out = flow(flow_in) # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)] if self.opt['no_dialog_flow']: flow_out = flow_out * 0 flow_out = flow_out.transpose(0, 1).view(x1_full.size(1), x1_full.size(0), x1_full.size(2), -1).transpose( 0, 2).contiguous() # [bsz * max_qa_pair, context_length, flow_hidden_state_dim] flow_out = flow_out.view(x1_full.size(2), x1_full.size(0) * x1_full.size(1), -1).transpose(0, 1) if self.opt['residual_step']: residual_out = residual_out.transpose(0, 1).view( x1_full.size(1), x1_full.size(0), x1_full.size(2), -1).transpose(0, 2).contiguous() residual_out = residual_out.view( x1_full.size(2), x1_full.size(0) * x1_full.size(1), -1).transpose(0, 1) return flow_out, residual_out else: return flow_out, None # Encode document with RNN doc_abstr_ls = [] doc_hiddens = self.doc_rnn1(x1_input, x1_mask) doc_hiddens_flow, residual_flow = flow_operation( doc_hiddens, self.dialog_flow1) doc_abstr_ls.append(doc_hiddens) #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2) doc_hiddens = self.doc_rnn2( torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand), dim=2), x1_mask) doc_hiddens_flow, residual_flow = flow_operation( doc_hiddens, self.dialog_flow2) doc_abstr_ls.append(doc_hiddens) #doc_hiddens_flow = 
torch.cat((doc_hiddens_flow, residual_flow), dim=2) #with open('flow_bef_att.pkl', 'wb') as output: # pickle.dump(doc_hiddens_flow, output, pickle.HIGHEST_PROTOCOL) #while(1): # pass # Encode question with RNN _, que_abstr_ls = self.question_rnn(x2_input, x2_mask, return_list=True, additional_x=x2_cove_high) # Final question layer question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2), x2_mask) que_abstr_ls += [question_hiddens] # Main Attention Fusion Layer doc_info = self.deep_attn( [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls, [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask, x2_mask) doc_hiddens = self.deep_attn_rnn( torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask) doc_hiddens_flow, residual_flow = flow_operation( doc_hiddens, self.dialog_flow3) doc_abstr_ls += [doc_hiddens] #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2) #if self.opt['residual_step']: #doc_abstr_ls.append(residual_flow) # Self Attention Fusion Layer if self.opt['use_hoc']: # handle history of context, considering batch=1 x1_att = torch.cat(doc_abstr_ls, 2) hoc = torch.cat( (doc_hiddens[0, :, :].unsqueeze(0), doc_hiddens[:-1, :, :]), dim=0) x1_att = torch.cat((x1_att, hoc), dim=2) else: x1_att = torch.cat(doc_abstr_ls, 2) if self.opt['self_attention_opt'] > 0: highlvl_self_attn_hiddens = self.highlvl_self_att( x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True) doc_hiddens = self.high_lvl_crnn( torch.cat( [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow], dim=2), x1_mask) elif self.opt['self_attention_opt'] == 0: doc_hiddens = self.high_lvl_crnn( torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask) doc_abstr_ls += [doc_hiddens] # Merge the question hidden vectors q_merge_weights = self.self_attn(question_hiddens, x2_mask) question_avg_hidden = layers.weighted_avg(question_hiddens, q_merge_weights) if self.opt['do_hierarchical_query']: question_avg_hidden = self.hier_query_rnn( question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1)) question_avg_hidden = question_avg_hidden.contiguous().view( -1, question_avg_hidden.size(-1)) # Get Start, End span start_scores, end_scores = self.get_answer(doc_hiddens, question_avg_hidden, x1_mask) all_start_scores = start_scores.view_as( x1_full) # batch x q_num x len_d all_end_scores = end_scores.view_as(x1_full) # batch x q_num x len_d # Get whether there is an answer # doc_hiddens: [bsz * max_qa_pair, context_length, hidden_size] doc_avg_hidden = torch.cat( (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)), dim=1) class_scores = self.ans_type_prediction(doc_avg_hidden, question_avg_hidden) all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1), -1) # batch x q_num x class_num all_class_scores = all_class_scores.squeeze(-1) # when class_num = 1 return all_start_scores, all_end_scores, all_class_scores
def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c, x2_full_mask, node_id, node_mask, edge_id): """Inputs: x1 = document word indices [batch * len_d] x1_c = document char indices [batch * len_d * len_w] or [1] x1_f = document word features indices [batch * q_num * len_d * nfeat] x1_pos = document POS tags [batch * len_d] x1_ner = document entity tags [batch * len_d] x1_mask = document padding mask [batch * len_d] x2_full = question word indices [batch * q_num * len_q] x2_c = question char indices [(batch * q_num) * len_q * len_w] x2_full_mask = question padding mask [batch * q_num * len_q] node_id [batch * max_node_num * max_node_length] node__mask [batch * max_node_num * max_node_length] edge_id [batch * max_node_num * max_node_num ] """ # print('node_id{}'.format(node_id)) # print('x1{}'.format(x1)) # precomputing ELMo is only for context (to speedup computation) # print('startembeddingweight{}'.format(self.embedding.weight)) if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[ 'batch_size']: # precomputing ELMo is used if x1_c.dim() != 1: # precomputation is needed precomputed_bilm_output = self.elmo._elmo_lstm(x1_c) self.precomputed_layer_activations = [ t.detach().cpu() for t in precomputed_bilm_output['activations'] ] self.precomputed_mask_with_bos_eos = precomputed_bilm_output[ 'mask'].detach().cpu() self.precomputed_cnt = 0 # get precomputed ELMo layer_activations = [ t[x1.size(0) * self.precomputed_cnt:x1.size(0) * (self.precomputed_cnt + 1), :, :] for t in self.precomputed_layer_activations ] mask_with_bos_eos = self.precomputed_mask_with_bos_eos[ x1.size(0) * self.precomputed_cnt:x1.size(0) * (self.precomputed_cnt + 1), :] if x1.is_cuda: layer_activations = [t.cuda() for t in layer_activations] mask_with_bos_eos = mask_with_bos_eos.cuda() representations = [] for i in range(len(self.elmo._scalar_mixes)): scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i)) representation_with_bos_eos = scalar_mix( layer_activations, mask_with_bos_eos) representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( representation_with_bos_eos, mask_with_bos_eos) representations.append( self.elmo._dropout(representation_without_bos_eos)) x1_elmo = representations[0][:, :x1.size(1), :] self.precomputed_cnt += 1 precomputed_elmo = True else: precomputed_elmo = False """ x1 = document word indices [batch * len_d] x1_c = document char indices [batch * len_d * len_w] or [1] x1_f = document word features indices [batch * q_num * len_d * nfeat] x1_pos = document POS tags [batch * len_d] x1_ner = document entity tags [batch * len_d] x1_mask = document padding mask [batch * len_d] x2_full = question word indices [batch * q_num * len_q] x2_c = question char indices [(batch * q_num) * len_q * len_w] x2_full_mask = question padding mask [batch * q_num * len_q] x1_full = document word indices [batch * q_num * len_d] x1_full_mask = document padding mask [batch * q_num * len_d] x2_full = question word indices [batch * q_num * len_q] x2_full_mask = question padding mask [batch * q_num * len_q] node_id [batch * max_node_num * max_node_length] node__mask [batch * max_node_num * max_node_length] edge_id [batch * max_node_num * max_node_num ] """ # x1_full [batch * 1 * len_d] -> batch, q_num, len_d # x1_full_mask batch, q_num, len_d x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), x1.size(1)).contiguous() x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), x1.size(1)).contiguous() #[batch * max_node_num * 
max_node_length]-> batch, 1, max_node_num, max_node_length -> batch, q_num , max_node_num, max_node_length # node=node_id.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), node_id.size(1), node_id.size(2)).contiguous() # node_full_mask=node_mask.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), node_mask.size(1), node_mask.size(2)).contiguous() # edge=edge_id.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), edge_id.size(1), edge_id.size(2)).contiguous() node = node_id.view(-1, node_id.size( -1)).contiguous() #(batch*max_node_num), max_node_length # print('node{}'.format(node)) node_full_mask = node_mask.view( -1, node_mask.size(-1)) ##(batch*max_node_num), max_node_length drnn_input_list, qrnn_input_list, grnn_input_list = [], [], [] # x2 [(batch * q_num) * len_q] # x2_mask [(batch * q_num) * len_q] x2 = x2_full.view(-1, x2_full.size(-1)) #((batch*q_num),len_q) x2_mask = x2_full_mask.view(-1, x2_full.size(-1)) #((batch*q_num),len_q) # print('embeddingweight{}'.format(self.embedding.weight)) if self.opt['use_wemb']: # Word embedding for both document and question emb = self.embedding if self.training else self.eval_embed x1_emb = emb(x1) #batch, len_d, emb_size x2_emb = emb(x2) #(batch * q_num), q_length, emb_size node_emb = emb( node) #(batch*max_node_num), max_node_length, emb_size # print('node_emb{}'.format(node_emb[0, 0, :])) # Dropout on embeddings if self.opt['dropout_emb'] > 0: x1_emb = layers.dropout(x1_emb, p=self.opt['dropout_emb'], training=self.training) x2_emb = layers.dropout(x2_emb, p=self.opt['dropout_emb'], training=self.training) node_emb = layers.dropout(node_emb, p=self.opt['dropout_emb'], training=self.training) drnn_input_list.append(x1_emb) qrnn_input_list.append(x2_emb) grnn_input_list.append(node_emb) if self.opt['CoVe_opt'] > 0: x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask) #MTLSTM x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask) # node_cove_mid, node_cove_high = self.CoVe(node, node_full_mask) # Dropout on contexualized embeddings if self.opt['dropout_emb'] > 0: x1_cove_mid = layers.dropout(x1_cove_mid, p=self.opt['dropout_emb'], training=self.training) x1_cove_high = layers.dropout(x1_cove_high, p=self.opt['dropout_emb'], training=self.training) x2_cove_mid = layers.dropout(x2_cove_mid, p=self.opt['dropout_emb'], training=self.training) x2_cove_high = layers.dropout(x2_cove_high, p=self.opt['dropout_emb'], training=self.training) # node_cove_mid = layers.dropout(node_cove_mid, p=self.opt['dropout_emb'], training=self.training) # node_cove_high = layers.dropout(node_cove_high, p=self.opt['dropout_emb'], training=self.training) drnn_input_list.append(x1_cove_mid) qrnn_input_list.append(x2_cove_mid) if self.opt['use_elmo']: if not precomputed_elmo: x1_elmo = self.elmo(x1_c)['elmo_representations'][ 0] #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device) x2_elmo = self.elmo(x2_c)['elmo_representations'][ 0] #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device) # Dropout on contexualized embeddings if self.opt['dropout_emb'] > 0: x1_elmo = layers.dropout(x1_elmo, p=self.opt['dropout_emb'], training=self.training) x2_elmo = layers.dropout(x2_elmo, p=self.opt['dropout_emb'], training=self.training) drnn_input_list.append(x1_elmo) qrnn_input_list.append(x2_elmo) if self.opt['use_pos']: x1_pos_emb = self.pos_embedding(x1_pos) drnn_input_list.append(x1_pos_emb) if self.opt['use_ner']: x1_ner_emb = self.ner_embedding(x1_ner) 
drnn_input_list.append(x1_ner_emb) x1_input = torch.cat(drnn_input_list, dim=2) # batch,len_d,? x2_input = torch.cat(qrnn_input_list, dim=2) #(batch*q_num),len_q,?? node_input = torch.cat( grnn_input_list, dim=2) #(batch*max_node_num), max_node_length, emb_size def expansion_for_doc(z): #x2_full = question word indices [batch * q_num * len_q] return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1), z.size(2)).contiguous().view( -1, z.size(1), z.size(2)) # x1_emb batch, len_d, emb_size x1_emb_expand x1_emb_expand = expansion_for_doc( x1_emb) #(batch*q_num),len_d,emb_size x1_cove_high_expand = expansion_for_doc(x1_cove_high) #node_id [batch * max_node_num * max_node_length] # node_emb (batch*max_node_num), max_node_length, emb_size ->batch, max_node_num, max_node_length, emb_size #batch, 1, max_node_num, max_node_length, emb_size->batch, q_num, max_node_num, max_node_length, emb_size pre_node_emb = node_emb.view(node_id.size(0), node_id.size(1), node_emb.size(1), node_emb.size(2)).contiguous() pre_node_emb_expand = pre_node_emb.unsqueeze(1).expand( pre_node_emb.size(0), x2_full.size(1), pre_node_emb.size(1), pre_node_emb.size(2), pre_node_emb.size(3)).contiguous() #(batch*q_num), max_node_num, max_node_length, emb_size # used below node_emb_expand = pre_node_emb_expand.view(-1, pre_node_emb_expand.size(2), pre_node_emb_expand.size(3), pre_node_emb_expand.size(4)) #node_mask [batch * max_node_num * max_node_length]->[batch * q_num * max_node_num * max_node_length] pre_node_emb_expand_mask = node_mask.unsqueeze(1).expand( node_mask.size(0), x2_full.size(1), node_mask.size(1), node_mask.size(2)).contiguous() # #(batch*q_num), max_node_num, max_node_length # used below node_emb_expand_mask = pre_node_emb_expand_mask.view( -1, node_mask.size(1), node_mask.size(2)) #edge_id [batch * max_node_num * max_node_num ] [batch *q_num * max_node_num * max_node_num ] pre_edge_expand = edge_id.unsqueeze(1).expand( edge_id.size(0), x2_full.size(1), edge_id.size(1), edge_id.size(2)).contiguous() edge_expand = pre_edge_expand.view(-1, edge_id.size(1), edge_id.size(2)) #x1_elmo_expand = expansion_for_doc(x1_elmo) if self.opt['no_em']: x1_f = x1_f[:, :, :, 3:] x1_input = torch.cat([ expansion_for_doc(x1_input), x1_f.view(-1, x1_f.size(-2), x1_f.size(-1)) ], dim=2) x1_mask = x1_full_mask.view( -1, x1_full_mask.size(-1)) # (batch*q_num, len_d) if self.opt['do_prealign']: x1_atten = self.pre_align( x1_emb_expand, x2_emb, x2_mask) # # batch*q_num* lend * xq_input_size x1_input = torch.cat([x1_input, x1_atten], dim=2) # === Start processing the dialog === # cur_h: [batch_size * max_qa_pair, context_length, hidden_state] # flow : fn (rnn) # x1_full: [batch_size, max_qa_pair, context_length] x1_full = document word indices [batch * q_num * len_d] def flow_operation(cur_h, flow): # ( len_d, batch*q_num, hidden_size)-> len_d,batch, q_num , hidden_size # After a view(), no new memory is allocated for the result; the new tensor shares the same storage as the original. # After calling contiguous(), PyTorch allocates a new block of memory and actually rewrites the tensor's contents in the transformed order. flow_in = cur_h.transpose(0, 1).view(x1_full.size(2), x1_full.size(0), x1_full.size(1), -1) #q_num,batch, len_d , hidden_size q_num, batch*lend ,hidden flow_in = flow_in.transpose(0, 2).contiguous().view( x1_full.size(1), x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1) # [bsz * context_length, max_qa_pair, hidden_state] flow_out = flow(flow_in) # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)] if self.opt['no_dialog_flow']: flow_out = flow_out * 0 flow_out = flow_out.transpose(0, 1).view(x1_full.size(1), 
x1_full.size(0), x1_full.size(2), -1).transpose( 0, 2).contiguous() flow_out = flow_out.view(x1_full.size(2), x1_full.size(0) * x1_full.size(1), -1).transpose(0, 1) # [bsz * max_qa_pair, context_length, flow_hidden_state_dim] return flow_out # Encode document with RNN doc_abstr_ls = [] doc_hiddens = self.doc_rnn1( x1_input, x1_mask) # (batch*q_num, len_d, hidden_size) graph_output = self.graph_encoder( doc_hiddens, x1_mask, node_emb_expand, node_emb_expand_mask, edge_expand) # bsz', max_node_num, hidden # doc_hiddens=graph_output doc_hiddens_flow = flow_operation( doc_hiddens, self.dialog_flow1) # [bsz * q_num, len_d, flow_hidden_state_dim] doc_abstr_ls.append(graph_output) doc_hiddens = self.doc_rnn2( torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand), dim=2), x1_mask) #opt['hidden_size'] * 2 + flow_size + CoVe_size doc_hiddens_flow = flow_operation( doc_hiddens, self.dialog_flow2) # [bsz * q_num, len_d, flow_hidden_state_dim] doc_abstr_ls.append(doc_hiddens) #with open('flow_bef_att.pkl', 'wb') as output: # pickle.dump(doc_hiddens_flow, output, pickle.HIGHEST_PROTOCOL) #while(1): # pass # Encode question with RNN x2_input (batch*q_num),len_q,x2_input_size _, que_abstr_ls = self.question_rnn( x2_input, x2_mask, return_list=True, additional_x=x2_cove_high) # [((batch*q_num), len_q, hidden_size)] # Final question layer question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2), x2_mask) que_abstr_ls += [question_hiddens] # Main Attention Fusion Layer # x1_emb_expand x1_cove_high_expand (batch*q_num),len_d,emb_size doc_abstr_ls [(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size)] # x2_em (batch*q_num),len_q,embsize) que_abstr_ls [(batch*q_num), len_q, hidden_size),(batch*q_num), len_q, hidden_size)] doc_info = self.deep_attn( [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls, [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask, x2_mask) # # batch*q_num * len1 * x2_input_size #doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim] doc_hiddens = self.deep_attn_rnn( torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask) doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow3) doc_abstr_ls += [ doc_hiddens ] #[(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size)] # Self Attention Fusion Layer x1_att = torch.cat(doc_abstr_ls, 2) if self.opt['self_attention_opt'] > 0: #x1_att c1,c2,c3 highlvl_self_attn_hiddens = self.highlvl_self_att( x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True) ## highlvl_self_attn_hiddens batch * len1 * x2_input_size fully aware context on c3 doc_hiddens = self.high_lvl_crnn( torch.cat( [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow], dim=2), x1_mask) elif self.opt['self_attention_opt'] == 0: doc_hiddens = self.high_lvl_crnn( torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask) # (batch*q_num, seq_len, hidden_size) doc_abstr_ls += [doc_hiddens] # Merge the question hidden vectors q_merge_weights = self.self_attn(question_hiddens, x2_mask) question_avg_hidden = layers.weighted_avg( question_hiddens, q_merge_weights) #(batch*q_num )* hidden if self.opt['do_hierarchical_query']: #x1_full: [batch_size, q_num context_length] #question_avg_hidden (bsz, q_num, hidden) question_avg_hidden = self.hier_query_rnn( question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1)) question_avg_hidden = question_avg_hidden.contiguous().view( -1, question_avg_hidden.size(-1)) # (batch*q_num ), hidden # Get Start, End span # 
question_avg_hidden (batch*q_num ), hidden doc_hiddens doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim] start_scores, end_scores = self.get_answer(doc_hiddens, question_avg_hidden, x1_mask) all_start_scores = start_scores.view_as( x1_full) # batch x q_num x len_d all_end_scores = end_scores.view_as(x1_full) # batch x q_num x len_d # Get whether there is an answer #doc_hiddens doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim] #torch max 函数会返回两个tensor,第一个tensor是每行的最大值 第二个tensor是每行最大值的索引 doc_avg_hidden = torch.cat( (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)), dim=1) #batch, hidden class_scores = self.ans_type_prediction(doc_avg_hidden, question_avg_hidden) all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1), -1) # batch x q_num x class_num all_class_scores = all_class_scores.squeeze(-1) # when class_num = 1 return all_start_scores, all_end_scores, all_class_scores
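The reshaping inside flow_operation is the one genuinely subtle step above: it regroups the (batch * q_num, len_d, hidden) document hiddens so the flow RNN recurs along the QA-pair axis, then restores the original layout. Here is a minimal, self-contained sketch of that round trip, with toy sizes and an identity function standing in for the flow RNN (both are assumptions for illustration only):

import torch

# Toy sizes (hypothetical): batch=2 dialogs, q_num=3 QA pairs, len_d=5, hidden=4.
batch, q_num, len_d, hidden = 2, 3, 5, 4
cur_h = torch.randn(batch * q_num, len_d, hidden)

# Forward reshape, as in flow_operation: group by (batch, len_d), recur over q_num.
flow_in = cur_h.transpose(0, 1).view(len_d, batch, q_num, hidden)
flow_in = flow_in.transpose(0, 2).contiguous().view(q_num, batch * len_d, hidden).transpose(0, 1)
assert flow_in.shape == (batch * len_d, q_num, hidden)

flow_out = flow_in  # identity stands in for the flow GRU here

# Inverse reshape, restoring the (batch * q_num, len_d, hidden) layout.
flow_out = flow_out.transpose(0, 1).view(q_num, batch, len_d, hidden).transpose(0, 2).contiguous()
flow_out = flow_out.view(len_d, batch * q_num, hidden).transpose(0, 1)

# With an identity "flow", the round trip reproduces the input exactly.
assert torch.equal(flow_out, cur_h)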
def forward(self,  # pylint: disable=arguments-differ
            inputs,
            word_inputs=None):
    u"""
    Parameters
    ----------
    inputs : ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the
        current batch.
    word_inputs : ``torch.Tensor``, optional.
        If you passed a cached vocab, you can in addition pass a tensor of shape
        ``(batch_size, timesteps)``, which represent word ids which have been
        pre-cached.

    Returns
    -------
    Dict with keys:
    ``'elmo_representations'``: ``List[torch.Tensor]``
        A ``num_output_representations`` list of ELMo representations for the
        input sequence. Each representation is shape
        ``(batch_size, timesteps, embedding_dim)``
    ``'mask'``: ``torch.Tensor``
        Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
    """
    # reshape the input if needed
    original_shape = inputs.size()
    if len(original_shape) > 3:
        timesteps, num_characters = original_shape[-2:]
        reshaped_inputs = inputs.view(-1, timesteps, num_characters)
    else:
        reshaped_inputs = inputs

    if word_inputs is not None:
        original_word_size = word_inputs.size()
        if self._has_cached_vocab and len(original_word_size) > 2:
            reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1])
        elif not self._has_cached_vocab:
            logger.warning(u"Word inputs were passed to ELMo but it does not have a cached vocab.")
            reshaped_word_inputs = None
        else:
            reshaped_word_inputs = word_inputs
    else:
        reshaped_word_inputs = word_inputs

    # run the biLM
    bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
    layer_activations = bilm_output[u'activations']
    mask_with_bos_eos = bilm_output[u'mask']

    # compute the elmo representations
    representations = []
    for i in range(len(self._scalar_mixes)):
        scalar_mix = getattr(self, u'scalar_mix_{}'.format(i))
        representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
        representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos
        )
        representations.append(self._dropout(representation_without_bos_eos))

    # reshape if necessary
    if word_inputs is not None and len(original_word_size) > 2:
        mask = mask_without_bos_eos.view(original_word_size)
        elmo_representations = [representation.view(original_word_size + (-1, ))
                                for representation in representations]
    elif len(original_shape) > 3:
        mask = mask_without_bos_eos.view(original_shape[:-1])
        elmo_representations = [representation.view(original_shape[:-1] + (-1, ))
                                for representation in representations]
    else:
        mask = mask_without_bos_eos
        elmo_representations = representations

    return {u'elmo_representations': elmo_representations, u'mask': mask}
def forward(
        self,
        inputs: torch.Tensor,
        word_inputs: torch.Tensor = None
) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
    """
    # Parameters

    inputs : `torch.Tensor`, required.
        Shape `(batch_size, timesteps, 50)` of character ids representing the
        current batch.
    word_inputs : `torch.Tensor`, optional.
        If you passed a cached vocab, you can in addition pass a tensor of shape
        `(batch_size, timesteps)`, which represent word ids which have been
        pre-cached.

    # Returns

    Dict with keys:

    `'elmo_representations'` : `List[torch.Tensor]`
        A `num_output_representations` list of ELMo representations for the
        input sequence. Each representation is shape
        `(batch_size, timesteps, embedding_dim)`
    `'mask'` : `torch.Tensor`
        Shape `(batch_size, timesteps)` long tensor with sequence mask.
    """
    # reshape the input if needed
    original_shape = inputs.size()
    if len(original_shape) > 3:
        timesteps, num_characters = original_shape[-2:]
        reshaped_inputs = inputs.view(-1, timesteps, num_characters)
    else:
        reshaped_inputs = inputs

    if word_inputs is not None:
        original_word_size = word_inputs.size()
        if self._has_cached_vocab and len(original_word_size) > 2:
            reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1])
        elif not self._has_cached_vocab:
            logger.warning(
                "Word inputs were passed to ELMo but it does not have a cached vocab.")
            reshaped_word_inputs = None
        else:
            reshaped_word_inputs = word_inputs
    else:
        reshaped_word_inputs = word_inputs

    # run the biLM
    bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
    layer_activations = bilm_output["activations"]
    mask_with_bos_eos = bilm_output["mask"]

    # compute the elmo representations
    representations = []
    for i in range(len(self._scalar_mixes)):
        scalar_mix = getattr(self, "scalar_mix_{}".format(i))
        representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
        if self._keep_sentence_boundaries:
            processed_representation = representation_with_bos_eos
            processed_mask = mask_with_bos_eos
        else:
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos)
            processed_representation = representation_without_bos_eos
            processed_mask = mask_without_bos_eos
        representations.append(self._dropout(processed_representation))

    # reshape if necessary
    if word_inputs is not None and len(original_word_size) > 2:
        mask = processed_mask.view(original_word_size)
        elmo_representations = [
            representation.view(original_word_size + (-1, ))
            for representation in representations
        ]
    elif len(original_shape) > 3:
        mask = processed_mask.view(original_shape[:-1])
        elmo_representations = [
            representation.view(original_shape[:-1] + (-1, ))
            for representation in representations
        ]
    else:
        mask = processed_mask
        elmo_representations = representations

    return {"elmo_representations": elmo_representations, "mask": mask}
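For reference, a minimal usage sketch of this forward pass; the option/weight paths are placeholders, and num_output_representations=2 matches the two scalar mixes seen elsewhere in these snippets:

import torch
from allennlp.modules.elmo import Elmo, batch_to_ids

# Placeholder paths -- substitute real option/weight files.
options_file = "elmo_options.json"
weight_file = "elmo_weights.hdf5"

elmo = Elmo(options_file, weight_file, num_output_representations=2, dropout=0.0)

sentences = [['First', 'sentence', '.'], ['Another', '.']]
character_ids = batch_to_ids(sentences)  # (2, 3, 50)

output = elmo(character_ids)
# Two mixed representations, each (batch_size, timesteps, embedding_dim),
# plus a (batch_size, timesteps) mask.
assert len(output['elmo_representations']) == 2
print(output['elmo_representations'][0].shape, output['mask'].shape)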
def test_fast_elmo_with_allennlp_do_layer_norm():
    fast = FastElmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )
    allennlp = Elmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        dropout=0.0,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    character_ids = _sentences_to_ids(sentences)

    fast_out = fast(character_ids)
    allennlp_out = allennlp(character_ids)

    # Since we don't include the BOS/EOS reprs during layer normalization,
    # the result will be different from AllenNLP's implementation.
    np.testing.assert_raises(
        AssertionError,
        np.testing.assert_array_almost_equal,
        fast_out['elmo_representations'][0],
        allennlp_out['elmo_representations'][0],
    )

    # We can pack BOS/EOS to inputs manually.
    _beginning_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
    _end_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)

    mask = ((character_ids > 0).long().sum(dim=-1) > 0).long()
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        character_ids,
        mask,
        _beginning_of_sentence_characters,
        _end_of_sentence_characters,
    )

    # And disable the mock BOS/EOS actions in FastElmo.
    fast.exec_managed_lstm_bos_eos = False
    fast_out_2 = fast(character_ids_with_bos_eos)
    fast_mixed_repr_2, _ = remove_sentence_boundaries(
        fast_out_2['elmo_representations'][0],
        fast_out_2['mask'],
    )

    allennlp_out_2 = allennlp(character_ids)

    np.testing.assert_array_almost_equal(
        fast_mixed_repr_2,
        allennlp_out_2['elmo_representations'][0],
    )
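As a side note on add_sentence_boundary_token_ids: its effect is easiest to see in the simpler 2-D word-id case. A toy sketch with made-up ids, where 1 and 2 stand in for the boundary ids (in the 3-D character-id case above, the boundary "tokens" are 50-dimensional character-id vectors instead of scalars):

import torch
from allennlp.nn.util import add_sentence_boundary_token_ids

# Toy word ids; 0 is padding.
tensor = torch.tensor([[5, 7, 0],
                       [8, 0, 0]])
mask = (tensor > 0).long()

tensor_with_bounds, mask_with_bounds = add_sentence_boundary_token_ids(tensor, mask, 1, 2)
print(tensor_with_bounds)
# tensor([[1, 5, 7, 2, 0],
#         [1, 8, 2, 0, 0]])
print(mask_with_bounds)
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 0, 0]])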
def forward(self,  # type: ignore
            source: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]:
    """
    Computes the averaged forward and backward LM loss from the batch.

    By convention, the input dict is required to have at least a ``"tokens"``
    entry that's the output of a ``SingleIdTokenIndexer``, which is used
    to compute the language model targets.

    If the model was instantiated with ``remove_bos_eos=True``,
    then it is expected that each of the input sentences was augmented with
    begin-sentence and end-sentence tokens.

    Parameters
    ----------
    source : ``Dict[str, torch.LongTensor]``, required.
        The output of ``Batch.as_tensor_dict()`` for a batch of sentences.

    Returns
    -------
    Dict with keys:

    ``'loss'``: ``torch.Tensor``
        averaged forward/backward negative log likelihood
    ``'forward_loss'``: ``torch.Tensor``
        forward direction negative log likelihood
    ``'backward_loss'``: ``torch.Tensor``
        backward direction negative log likelihood
    ``'lm_embeddings'``: ``torch.Tensor``
        (batch_size, timesteps, embed_dim) tensor of top layer contextual representations
    ``'mask'``: ``torch.Tensor``
        (batch_size, timesteps) mask for the embeddings
    """
    # pylint: disable=arguments-differ
    mask = get_text_field_mask(source)

    # We must have token_ids so that we can compute targets
    token_ids = source.get("tokens")
    if token_ids is None:
        raise ConfigurationError("Your data must have a 'tokens': SingleIdTokenIndexer() "
                                 "in order to use the BidirectionalLM")

    # Use token_ids to compute targets
    forward_targets = torch.zeros_like(token_ids)
    backward_targets = torch.zeros_like(token_ids)
    forward_targets[:, 0:-1] = token_ids[:, 1:]
    backward_targets[:, 1:] = token_ids[:, 0:-1]

    # shape (batch_size, timesteps + 2, embedding_size)
    embeddings = self._text_field_embedder(source)

    contextual_embeddings = self._contextualizer(embeddings, mask)

    # add dropout
    contextual_embeddings = self._dropout(contextual_embeddings)

    # compute softmax loss
    forward_loss, backward_loss = self._compute_loss(contextual_embeddings,
                                                     embeddings,
                                                     forward_targets,
                                                     backward_targets)

    num_targets = torch.sum((forward_targets > 0).long())
    if num_targets > 0:
        average_loss = 0.5 * (forward_loss + backward_loss) / num_targets.float()
    else:
        average_loss = torch.tensor(0.0).to(forward_targets.device)  # pylint: disable=not-callable
    # this is stored to compute perplexity if needed
    self._last_average_loss[0] = average_loss.detach().item()

    if num_targets > 0:
        # loss is directly minimized
        if self._loss_scale == 'n_samples':
            scale_factor = num_targets.float()
        else:
            scale_factor = self._loss_scale

        return_dict = {
                'loss': average_loss * scale_factor,
                'forward_loss': forward_loss * scale_factor / num_targets.float(),
                'backward_loss': backward_loss * scale_factor / num_targets.float()
        }
    else:
        # average_loss is a zero tensor; return it for all three entries
        return_dict = {
                'loss': average_loss,
                'forward_loss': average_loss,
                'backward_loss': average_loss
        }

    if self._remove_bos_eos:
        contextual_embeddings, mask = remove_sentence_boundaries(contextual_embeddings, mask)

    return_dict.update({
            'lm_embeddings': contextual_embeddings,
            'mask': mask
    })

    return return_dict
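The target construction in this forward pass is a one-position shift in each direction, leaving 0 (the padding id, which is excluded from the loss) where no target exists. A toy illustration:

import torch

# Toy token ids for one sentence; 0 is padding.
token_ids = torch.tensor([[5, 7, 9, 0]])

forward_targets = torch.zeros_like(token_ids)
backward_targets = torch.zeros_like(token_ids)
forward_targets[:, 0:-1] = token_ids[:, 1:]   # each position predicts the next token
backward_targets[:, 1:] = token_ids[:, 0:-1]  # each position predicts the previous token

print(forward_targets)   # tensor([[7, 9, 0, 0]])
print(backward_targets)  # tensor([[0, 5, 7, 9]])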
def test_elmo_character_encoder_with_allennlp():
    allennlp_embedder = _ElmoCharacterEncoder(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )
    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()

    allennlp_parameters = [
        '_char_embedding_weights',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        '_projection.bias',
        '_projection.weight',
    ]
    embedder_parameters = [
        'char_embedding.weight',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        'output_proj.bias',
        'output_proj.weight',
    ]
    allennlp_parameters_diff = [
        '_highways._layers.0.bias',
        '_highways._layers.0.weight',
        '_highways._layers.1.bias',
        '_highways._layers.1.weight',
    ]
    embedder_parameters_diff = [
        'highway.layers_0.bias',
        'highway.layers_0.weight',
        'highway.layers_1.bias',
        'highway.layers_1.weight',
    ]
    assert len(allennlp_parameters) == len(embedder_parameters)
    assert len(allennlp_parameters_diff) == len(embedder_parameters_diff)

    allennlp_embedder_named_parameters = dict(allennlp_embedder.named_parameters())

    # Same.
    for allennlp_param, embedder_param in zip(allennlp_parameters, embedder_parameters):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data
        np.testing.assert_array_equal(embedder_w.numpy(), allennlp_w.numpy())
        assert embedder_w.dtype == allennlp_w.dtype

    # Diff on highway.
    for allennlp_param, embedder_param in zip(allennlp_parameters_diff, embedder_parameters_diff):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data
        assert embedder_w.dtype == allennlp_w.dtype
        np.testing.assert_raises(
            AssertionError,
            np.testing.assert_array_equal,
            embedder_w.numpy(),
            allennlp_w.numpy(),
        )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    # `(2, 7, 50)`
    character_ids = _sentences_to_ids(sentences)

    # AllenNLP.
    out = allennlp_embedder(character_ids)
    allennlp_token_embedding, _ = remove_sentence_boundaries(out['token_embedding'], out['mask'])
    assert list(allennlp_token_embedding.shape) == [2, 7, 16]

    # Ours.
    inputs = pack_padded_sequence(character_ids, [7, 3], batch_first=True)
    out = embedder(inputs.data)
    ours_token_embedding = _unpack(out, inputs.batch_sizes)
    assert list(ours_token_embedding.shape) == [2, 7, 16]

    np.testing.assert_array_almost_equal(
        ours_token_embedding.data.numpy(),
        allennlp_token_embedding.data.numpy(),
    )
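The pack_padded_sequence call at the end is what lets the factory-made embedder consume a flat, time-major stream instead of a padded batch. A small sketch of what packing does, on made-up ids:

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.tensor([[1, 2, 3],
                  [4, 0, 0]])  # 0 is padding; lengths are [3, 1]
packed = pack_padded_sequence(x, [3, 1], batch_first=True)
print(packed.data)         # tensor([1, 4, 2, 3]) -- time-major, padding dropped
print(packed.batch_sizes)  # tensor([2, 1, 1])

unpacked, lengths = pad_packed_sequence(packed, batch_first=True)
print(unpacked)            # tensor([[1, 2, 3], [4, 0, 0]])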
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
    """
    Parameters
    ----------
    inputs : ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the
        current batch.
    word_inputs : ``torch.Tensor``, optional.
        If you passed a cached vocab, you can in addition pass a tensor of shape
        ``(batch_size, timesteps)``, which represent word ids which have been
        pre-cached.

    Returns
    -------
    Dict with keys:
    ``'elmo_representations'``: ``List[torch.Tensor]``
        A ``num_output_representations`` list of ELMo representations for the
        input sequence. Each representation is shape
        ``(batch_size, timesteps, embedding_dim)``
    ``'mask'``: ``torch.Tensor``
        Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
    """
    # reshape the input if needed
    original_shape = inputs.size()
    if len(original_shape) > 3:
        timesteps, num_characters = original_shape[-2:]
        reshaped_inputs = inputs.view(-1, timesteps, num_characters)
    else:
        reshaped_inputs = inputs

    if word_inputs is not None:
        original_word_size = word_inputs.size()
        if self._has_cached_vocab and len(original_word_size) > 2:
            reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1])
        elif not self._has_cached_vocab:
            logger.warning("Word inputs were passed to ELMo but it does not have a cached vocab.")
            reshaped_word_inputs = None
        else:
            reshaped_word_inputs = word_inputs
    else:
        reshaped_word_inputs = word_inputs

    # run the biLM
    bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
    layer_activations = bilm_output['activations']
    mask_with_bos_eos = bilm_output['mask']

    # compute the elmo representations
    representations = []
    for i in range(len(self._scalar_mixes)):
        scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
        representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
        representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos
        )
        representations.append(self._dropout(representation_without_bos_eos))

    # reshape if necessary
    if word_inputs is not None and len(original_word_size) > 2:
        mask = mask_without_bos_eos.view(original_word_size)
        elmo_representations = [representation.view(original_word_size + (-1, ))
                                for representation in representations]
    elif len(original_shape) > 3:
        mask = mask_without_bos_eos.view(original_shape[:-1])
        elmo_representations = [representation.view(original_shape[:-1] + (-1, ))
                                for representation in representations]
    else:
        mask = mask_without_bos_eos
        elmo_representations = representations

    return {'elmo_representations': elmo_representations, 'mask': mask}
def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c,
            x2_full_mask):
    # Called from QA_model in model_CoQA.py. The nine inputs are:
    # context_id, context_cid, context_feature, context_tag, context_ent,
    # context_mask, question_id, question_cid, question_mask.
    """Inputs:
    x1 = document word indices             [batch * len_d]  (len_d: document length)
    x1_c = document char indices           [batch * len_d * len_w] or [1]
           (x1_c may hold several precomputed batches at once; see the ELMo
           precomputation below)
    x1_f = document word feature indices   [batch * q_num * len_d * nfeat]
    x1_pos = document POS tags             [batch * len_d]
    x1_ner = document entity tags          [batch * len_d]
    x1_mask = document padding mask        [batch * len_d]
    x2_full = question word indices        [batch * q_num * len_q]
    x2_c = question char indices           [(batch * q_num) * len_q * len_w]
    x2_full_mask = question padding mask   [batch * q_num * len_q]
    """
    '''
    In terms of the caller's variables:
    context_id, context_cid, context_feature, context_tag, context_ent,
    context_mask, question_id, x2_full = question_cid, question_mask,
    overall_mask,
    '''
    # Precomputing ELMo is only done for the context (to speed up computation).
    if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt['batch_size']:
        # precomputed ELMo is used
        if x1_c.dim() != 1:  # precomputation is needed
            # _elmo_lstm() adds <S> and </S> around each sentence, so the
            # sentence-length dimension is 2 longer than what batch_to_ids
            # produces.
            precomputed_bilm_output = self.elmo._elmo_lstm(x1_c)
            # detach() removes the tensors from the current graph; .cpu()
            # moves them to the CPU.
            self.precomputed_layer_activations = [
                t.detach().cpu()
                for t in precomputed_bilm_output['activations']
            ]
            self.precomputed_mask_with_bos_eos = \
                precomputed_bilm_output['mask'].detach().cpu()
            # Pull many batches' worth of ELMo vectors out in one shot and
            # cache them in self.precomputed_layer_activations and
            # self.precomputed_mask_with_bos_eos; each forward pass then
            # consumes one normal batch. precomputed_cnt (incremented below)
            # cycles between 0 and elmo_batch_size // batch_size.
            self.precomputed_cnt = 0

        # Get the precomputed ELMo slices for this batch, indexing with
        # precomputed_cnt * x1.size(0) so each training batch takes exactly
        # one batch's worth of data.
        layer_activations = [
            t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
              (self.precomputed_cnt + 1), :, :]
            for t in self.precomputed_layer_activations
        ]
        mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
            x1.size(0) * self.precomputed_cnt:x1.size(0) *
            (self.precomputed_cnt + 1), :]
        if x1.is_cuda:
            layer_activations = [t.cuda() for t in layer_activations]
            mask_with_bos_eos = mask_with_bos_eos.cuda()

        representations = []
        # len(self.elmo._scalar_mixes) == 2:
        '''
        elmo._scalar_mixes = [ScalarMix(
          (scalar_parameters): ParameterList(
              (0): Parameter containing: [torch.FloatTensor of size 1]
              (1): Parameter containing: [torch.FloatTensor of size 1]
              (2): Parameter containing: [torch.FloatTensor of size 1]
          )
        ), ScalarMix(
          (scalar_parameters): ParameterList(
              (0): Parameter containing: [torch.FloatTensor of size 1]
              (1): Parameter containing: [torch.FloatTensor of size 1]
              (2): Parameter containing: [torch.FloatTensor of size 1]
          )
        )]
        '''
        for i in range(len(self.elmo._scalar_mixes)):
            scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos)
            representations.append(
                self.elmo._dropout(representation_without_bos_eos))
        # The loop runs twice, so representations holds two tensors of shape
        # [num_sentences, sentence_len, 1024] (sentence_len excludes the
        # boundary symbols). In my test case the two were numerically
        # identical, which makes one wonder why the loop runs twice.
        x1_elmo = representations[0][:, :x1.size(1), :]  # x1.size(1) truncates to the maximum document length
        self.precomputed_cnt += 1
        precomputed_elmo = True
    else:
        precomputed_elmo = False

    """
    x1_full = document word indices        [batch * q_num * len_d]
    x1_full_mask = document padding mask   [batch * q_num * len_d]
    x2_full = question word indices        [batch * q_num * len_q]
    x2_full_mask = question padding mask   [batch * q_num * len_q]
    """
    # x1 [batch, len_d] --unsqueeze(1)--> [batch, 1, len_d] --expand--> [batch, num_q, len_d]
    # (the second dimension is expanded to the number of questions)
    x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1),
                                     x1.size(1)).contiguous()
    # x1_mask [batch, len_d] -> [batch, 1, len_d] -> [batch, num_q, len_d]
    x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                               x2_full.size(1),
                                               x1.size(1)).contiguous()

    # input lists for the document RNN and the question RNN
    drnn_input_list, qrnn_input_list = [], []
    x2 = x2_full.view(-1, x2_full.size(-1))  # [batch, q_num, len_q] -> [batch * q_num, len_q]
    x2_mask = x2_full_mask.view(-1, x2_full.size(-1))

    if self.opt['use_wemb']:
        # Word embedding for both document and question
        emb = self.embedding if self.training else self.eval_embed
        x1_emb = emb(x1)
        x2_emb = emb(x2)
        # Dropout on embeddings
        if self.opt['dropout_emb'] > 0:
            x1_emb = layers.dropout(x1_emb, p=self.opt['dropout_emb'],
                                    training=self.training)
            x2_emb = layers.dropout(x2_emb, p=self.opt['dropout_emb'],
                                    training=self.training)
        drnn_input_list.append(x1_emb)
        qrnn_input_list.append(x2_emb)

    if self.opt['CoVe_opt'] > 0:
        x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)
        x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
        # Dropout on contextualized embeddings
        if self.opt['dropout_emb'] > 0:
            x1_cove_mid = layers.dropout(x1_cove_mid, p=self.opt['dropout_emb'],
                                         training=self.training)
            x1_cove_high = layers.dropout(x1_cove_high, p=self.opt['dropout_emb'],
                                          training=self.training)
            x2_cove_mid = layers.dropout(x2_cove_mid, p=self.opt['dropout_emb'],
                                         training=self.training)
            x2_cove_high = layers.dropout(x2_cove_high, p=self.opt['dropout_emb'],
                                          training=self.training)
        drnn_input_list.append(x1_cove_mid)
        qrnn_input_list.append(x2_cove_mid)

    if self.opt['use_elmo']:
        if not precomputed_elmo:
            x1_elmo = self.elmo(x1_c)['elmo_representations'][0]
            # torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
        x2_elmo = self.elmo(x2_c)['elmo_representations'][0]
        # torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
        # Dropout on contextualized embeddings
        if self.opt['dropout_emb'] > 0:
            x1_elmo = layers.dropout(x1_elmo, p=self.opt['dropout_emb'],
                                     training=self.training)
            x2_elmo = layers.dropout(x2_elmo, p=self.opt['dropout_emb'],
                                     training=self.training)
        drnn_input_list.append(x1_elmo)
        qrnn_input_list.append(x2_elmo)

    if self.opt['use_pos']:
        x1_pos_emb = self.pos_embedding(x1_pos)
        drnn_input_list.append(x1_pos_emb)
    if self.opt['use_ner']:
        x1_ner_emb = self.ner_embedding(x1_ner)
        drnn_input_list.append(x1_ner_emb)

    x1_input = torch.cat(drnn_input_list, dim=2)
    x2_input = torch.cat(qrnn_input_list, dim=2)

    def expansion_for_doc(z):
        return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                     z.size(2)).contiguous().view(
                                         -1, z.size(1), z.size(2))

    # [batch * num_q, len_d, emb_dim]
    x1_emb_expand = expansion_for_doc(x1_emb)
    x1_cove_high_expand = expansion_for_doc(x1_cove_high)
    # x1_elmo_expand = expansion_for_doc(x1_elmo)

    if self.opt['no_em']:
        # x1_f = document word feature indices [batch * q_num * len_d * nfeat]
        x1_f = x1_f[:, :, :, 3:]

    x1_input = torch.cat([
        expansion_for_doc(x1_input),
        x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
    ], dim=2)
    x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1))

    # Interaction layer (1. flow, 2. integration: the two interact)
    if self.opt['do_prealign']:
        # x1_emb_expand [batch * num_q, len_d, emb_dim]; emb_dim here is the
        # plain word embedding, not ELMo or CoVe.
        # x2_emb [batch * num_q, len_q, emb_dim]
        # self.pre_align = layers.GetAttentionHiddens(embedding_dim, opt['prealign_hidden'], similarity_attention=True)
        x1_atten = self.pre_align(x1_emb_expand, x2_emb, x2_mask)
        # a passage representation weighted by question information
        x1_input = torch.cat([x1_input, x1_atten], dim=2)

    # === Start processing the dialog ===
    # cur_h:   [batch_size * max_qa_pair, context_length, hidden_state]
    # flow:    fn (rnn)
    # x1_full: [batch_size, max_qa_pair, context_length]
    def flow_operation(cur_h, flow):
        # The flow operation reshapes the input so that, before entering the
        # RNN, the recurrence runs along the qa_pairs dimension.
        # cur_h [batch * max_qa_pair, len_d, hidden * 2]
        #   -> [len_d, batch * num_q, hidden * 2] -> [len_d, batch, num_q, hidden * 2]
        flow_in = cur_h.transpose(0, 1).view(x1_full.size(2), x1_full.size(0),
                                             x1_full.size(1), -1)
        # [len_d, batch, num_q, hidden * 2] -> [num_q, batch * len_d, hidden * 2]
        #   -> [batch * len_d, num_q, hidden * 2]
        flow_in = flow_in.transpose(0, 2).contiguous().view(
            x1_full.size(1), x1_full.size(0) * x1_full.size(2),
            -1).transpose(0, 1)
        # [bsz * context_length, max_qa_pair, hidden_state]
        flow_out = flow(flow_in)
        # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state / 2)]
        if self.opt['no_dialog_flow']:
            flow_out = flow_out * 0
        flow_out = flow_out.transpose(0, 1).view(
            x1_full.size(1), x1_full.size(0), x1_full.size(2),
            -1).transpose(0, 2).contiguous()
        flow_out = flow_out.view(x1_full.size(2),
                                 x1_full.size(0) * x1_full.size(1),
                                 -1).transpose(0, 1)
        # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
        return flow_out

    # Encode document with RNN; passage and question interaction
    doc_abstr_ls = []
    doc_hiddens = self.doc_rnn1(x1_input, x1_mask)  # [batch, len_d, hidden * 2]
    doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow1)
    doc_abstr_ls.append(doc_hiddens)
    doc_hiddens = self.doc_rnn2(
        torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand), dim=2),
        x1_mask)
    doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow2)
    doc_abstr_ls.append(doc_hiddens)
    # with open('flow_bef_att.pkl', 'wb') as output:
    #     pickle.dump(doc_hiddens_flow, output, pickle.HIGHEST_PROTOCOL)
    # while(1):
    #     pass

    # Encode question with RNN
    _, que_abstr_ls = self.question_rnn(x2_input, x2_mask, return_list=True,
                                        additional_x=x2_cove_high)
    # que_abstr_ls returns both question layers, each of shape
    # [batch * q_num, len_q, hidden * 2]
    # Final question layer
    question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                          x2_mask)  # [batch * num_q, len_q, hidden * 2]
    que_abstr_ls += [question_hiddens]

    # Main attention fusion layer.
    # History-aware attention: when recomputing a question layer, all passage
    # layers are concatenated as the query and all question layers as the key;
    # the value is question_layer[i] when computing the i-th question-layer
    # embedding. The attention-weighted result matches the document along the
    # len_d dimension and is concatenated onto the document representation
    # produced by the second flow layer.
    doc_info = self.deep_attn(
        [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
        [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
        x2_mask)
    doc_hiddens = self.deep_attn_rnn(
        torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)  # result after the RNN
    doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow3)
    doc_abstr_ls += [doc_hiddens]

    # Self-attention fusion layer: self attention over the passage.
    # x1_att concatenates, along the hidden dimension, every previous passage
    # layer after fusion with the question.
    x1_att = torch.cat(doc_abstr_ls, 2)
    if self.opt['self_attention_opt'] > 0:
        highlvl_self_attn_hiddens = self.highlvl_self_att(
            x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
        # At the third flow: doc_hiddens is the passage run through an RNN
        # along len_d; doc_hiddens_flow is the passage run through an RNN
        # along max_qa_pairs (the output of the third and final flow).
        # After concatenation, run an RNN along the len_d dimension.
        doc_hiddens = self.high_lvl_crnn(
            torch.cat(
                [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                dim=2), x1_mask)
    elif self.opt['self_attention_opt'] == 0:
        doc_hiddens = self.high_lvl_crnn(
            torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask)
    doc_abstr_ls += [doc_hiddens]

    # Merge the question hidden vectors.
    # question_hiddens is the final question layer [batch * num_q, len_q, hidden * 2].
    # This computes self-attention weights -- not true self-attention, but
    # attention scores from dotting each hidden state with an extra vector z.
    q_merge_weights = self.self_attn(question_hiddens, x2_mask)
    question_avg_hidden = layers.weighted_avg(
        question_hiddens, q_merge_weights)  # weighted average; [batch * q_num, hid]
    if self.opt['do_hierarchical_query']:  # default True
        # [batch, max_qa_pair, hid] -> [batch, max_qa_pair, hid]; the RNN is
        # unidirectional, so the hidden size stays hid. (Does it take the last
        # sentence-level hidden unit, or pool with attention?)
        question_avg_hidden = self.hier_query_rnn(
            question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
        question_avg_hidden = question_avg_hidden.contiguous().view(
            -1, question_avg_hidden.size(-1))  # [batch * max_qa_pair, hid]

    # Prediction layer
    # Get start/end span scores; both are [batch * q_num, len_d].
    start_scores, end_scores = self.get_answer(doc_hiddens,
                                               question_avg_hidden, x1_mask)
    all_start_scores = start_scores.view_as(x1_full)  # batch x q_num x len_d
    all_end_scores = end_scores.view_as(x1_full)      # batch x q_num x len_d

    # Get whether there is an answer.
    # torch.cat([batch, hidden], [batch, hidden], dim=1) -> [batch, 2 * hidden]
    doc_avg_hidden = torch.cat((torch.max(doc_hiddens, dim=1)[0],
                                torch.mean(doc_hiddens, dim=1)), dim=1)
    # predict the answer type
    class_scores = self.ans_type_prediction(doc_avg_hidden,
                                            question_avg_hidden)
    all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                         -1)  # batch x q_num x class_num
    all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1
    # all_class_scores is not softmax-normalized over the class_num dimension,
    # presumably to guard against class_num = 1: with a single class, softmax
    # would map any score to 1 regardless of the true score.

    return all_start_scores, all_end_scores, all_class_scores
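The answer-type features above are a max-plus-mean pooling over the document axis; torch.max over a dimension returns a (values, indices) pair, which is why the code indexes [0]. A minimal shape check with toy sizes:

import torch

doc_hiddens = torch.randn(6, 5, 8)  # (batch * q_num, len_d, hidden)

max_pool = torch.max(doc_hiddens, dim=1)[0]  # values only; (6, 8)
mean_pool = torch.mean(doc_hiddens, dim=1)   # (6, 8)
doc_avg_hidden = torch.cat((max_pool, mean_pool), dim=1)
assert doc_avg_hidden.shape == (6, 16)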
def forward(self,  # pylint: disable=arguments-differ
            character_ids: torch.Tensor,
            mask: torch.Tensor,
            mask_with_bos_eos: torch.Tensor,
            seg_ends: torch.Tensor,
            seg_map: torch.Tensor,
            seg_starts: torch.Tensor,
            tags: torch.Tensor) -> torch.Tensor:
    """
    Parameters
    ----------
    """
    # TODO(Swabha/Matt): detach tensors??? - Matt
    args_dict = {
        "mask": mask_with_bos_eos,
        "seg_ends": seg_ends,
        "seg_map": seg_map,
        "seg_starts": seg_starts,
        "tags": tags
    }
    if isinstance(self.seglm, LanguageModel):
        args_dict["tokens"] = {"elmo": character_ids}
    else:
        args_dict["character_ids"] = character_ids

    lm_output_dict = self.seglm(**args_dict)

    sequential_embeddings = lm_output_dict["sequential"]
    segmental_embeddings = lm_output_dict["segmental"]
    projection_embeddings = lm_output_dict["projection"]

    embeddings_list = []
    if self.use_all_base_layers:
        if isinstance(self.seglm, LanguageModel):
            raise NotImplementedError
        base_layer_embeddings = [
            emb.squeeze(1) for emb in lm_output_dict["activations"]
        ]
        embeddings_list.append(base_layer_embeddings)
    else:
        embeddings_list.append(sequential_embeddings)

    # Always include segmental layer.
    embeddings_list.append(segmental_embeddings)
    if self.use_projection_layer:
        embeddings_list.append(projection_embeddings)

    if self._scalar_mix is None:
        averaged_embeddings = segmental_embeddings
    elif self.concat_segmental:
        averaged_embeddings = torch.cat(
            (sequential_embeddings, segmental_embeddings), dim=-1)
    else:
        averaged_embeddings = self._dropout(self._scalar_mix(embeddings_list))

    averaged_embeddings_no_bos_eos, _ = remove_sentence_boundaries(
        averaged_embeddings, mask_with_bos_eos)

    return averaged_embeddings_no_bos_eos
import torch
import torch.nn as nn
import torch.nn.functional as F

from allennlp.commands.elmo import ElmoEmbedder
from allennlp.modules.elmo import batch_to_ids
from allennlp.nn.util import remove_sentence_boundaries

# urls to the pre-trained model
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# the ELMo biLM
elmo_bilm = ElmoEmbedder(options_file, weight_file).elmo_bilm
elmo_bilm.cuda()

sentences = [['Today', 'is', 'sunny', '.'], ['Hello', '!']]
# obtain character ids for each word. Size: batch_size × max_sentence_len × word_len
character_ids = batch_to_ids(sentences).cuda()
# ELMo's output
bilm_output = elmo_bilm(character_ids)
# ELMo embeddings for each layer
layer_activations = bilm_output['activations']
# indicates whether there is a word at each position
mask_with_bos_eos = bilm_output['mask']
# remove the special sentence-start and sentence-end symbols added by ELMo
without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                   for layer in layer_activations]
# three layers of 1024-d ELMo embeddings. Size: 3 × batch_size × max_sentence_len × 1024
all_layers = torch.cat([ele[0].unsqueeze(0) for ele in without_bos_eos], dim=0)

# parameters for the weighted sum; create them on the GPU directly, since
# calling .cuda() on an nn.Parameter returns a plain (non-leaf) tensor
s = nn.Parameter(torch.tensor([1., 1., 1.], device='cuda'), requires_grad=True)
# normalize the weights
s = F.softmax(s, dim=0)
# the multiplier γ, initialized to 1 (torch.Tensor(1, 1) would be uninitialized memory)
gamma = nn.Parameter(torch.ones(1, 1, device='cuda'), requires_grad=True)
# ELMo embedding. Size: batch_size × max_sentence_len × 1024
res = (all_layers[0] * s[0] + all_layers[1] * s[1] + all_layers[2] * s[2]) * gamma
print(res.shape)
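The softmax-weighted sum with a γ multiplier above is the same functional form that AllenNLP packages as ScalarMix; a sketch of the equivalent call, with random tensors standing in for the three ELMo layers:

import torch
from allennlp.modules.scalar_mix import ScalarMix

# Three stand-in "layers" of shape (batch_size, max_sentence_len, 1024).
layers = [torch.randn(2, 6, 1024) for _ in range(3)]

# ScalarMix learns softmax-normalized weights s_k and a scalar gamma, and
# computes gamma * sum_k s_k * layer_k -- the mix written out manually above.
mix = ScalarMix(mixture_size=3)
res = mix(layers)
print(res.shape)  # torch.Size([2, 6, 1024])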