Example #1
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, 'vocab_test.txt'), 'r') as fin:
            tokens = fin.read().strip().split('\n')

        indexer = ELMoTokenCharactersIndexer()
        indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            sentences.append(
                    indexer.pad_token_sequence(
                            indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
                    )
            )
        batch = torch.from_numpy(numpy.array(sentences))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
                elmo_token_embedder_output['token_embedding'],
                elmo_token_embedder_output['mask']
        )[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path, 'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
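
For orientation, this is the API surface the test above exercises, reduced to a minimal sketch: `batch_to_ids` (exported from `allennlp.modules.elmo`) builds the character-id tensor that the indexer and padding code construct by hand here, and `remove_sentence_boundaries` strips the `<S>`/`</S>` positions the encoder adds. The fixture paths below are placeholders, not files shipped with this test.

from allennlp.modules.elmo import batch_to_ids, _ElmoCharacterEncoder
from allennlp.nn.util import remove_sentence_boundaries

# Placeholder paths -- point these at a real options.json / weights.hdf5 pair.
options_file = "fixtures/options.json"
weight_file = "fixtures/lm_weights.hdf5"

encoder = _ElmoCharacterEncoder(options_file, weight_file)

# (batch_size, num_tokens, 50) character ids for two tokenized sentences.
character_ids = batch_to_ids([["ELMo", "loves", "you"], ["Hi", "."]])

output = encoder(character_ids)
token_embedding = output["token_embedding"]  # (batch_size, num_tokens + 2, output_dim)
mask = output["mask"]                        # (batch_size, num_tokens + 2)

# Drop the <S>/</S> positions, as the test above does before comparing embeddings.
word_embeddings, word_mask = remove_sentence_boundaries(token_embedding, mask)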
Example #2
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.fixtures_path, 'vocab_test.txt'), 'r') as fin:
            tokens = fin.read().strip().split('\n')

        indexer = ELMoTokenCharactersIndexer()
        indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            sentences.append(
                    indexer.pad_token_sequence(
                            indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
                    )
            )
        batch = Variable(torch.from_numpy(numpy.array(sentences)))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
                elmo_token_embedder_output['token_embedding'],
                elmo_token_embedder_output['mask']
        )[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.fixtures_path, 'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
Example #3
    def __init__(self):
        from allennlp.modules.elmo import _ElmoCharacterEncoder
        if not path.isdir(self.path('elmo')):
            makedirs(self.path('elmo'))
        self.fweights = self.ensure_file(path.join('elmo', 'weights.hdf5'),
                                         url=self.settings['weights'])
        self.foptions = self.ensure_file(path.join('elmo', 'options.json'),
                                         url=self.settings['options'])
        self.embeddings = _ElmoCharacterEncoder(self.foptions, self.fweights)
Example #4
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.token_to_indices(Token(token), Vocabulary())
            indices = torch.from_numpy(numpy.array(indices)).view(1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(), embeddings[0, 1, :].data.numpy())
Example #5
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.tokens_to_indices([Token(token)], Vocabulary(), "correct")
            indices = torch.from_numpy(numpy.array(indices["correct"])).view(1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(), embeddings[0, 1, :].data.numpy())
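
Both versions of this test rely on the same behaviour: the encoder wraps every sequence in `<S>` ... `</S>`, so for a one-token input position 0 holds the BOS embedding, position 1 the token, and position 2 the EOS embedding. A quick way to see the shape without building indices by hand, assuming `batch_to_ids` is available and reusing `elmo_token_embedder` from the test above:

from allennlp.modules.elmo import batch_to_ids

char_ids = batch_to_ids([["hello"]])                    # (1, 1, 50)
out = elmo_token_embedder(char_ids)["token_embedding"]  # (1, 3, output_dim)
assert out.shape[1] == 3  # positions: 0 -> <S>, 1 -> "hello", 2 -> </S>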
Example #6
    def __init__(
        self,
        options_file: str,
        weight_file: str,
        requires_grad: bool = False,
        vocab_to_cache: List[str] = None,
    ) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad
        )

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning(
                "You are fine tuning ELMo and caching char CNN word vectors. "
                "This behaviour is not guaranteed to be well defined, particularly. "
                "if not all of your inputs will occur in the vocabulary cache."
            )
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info("Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), "r") as fin:
            options = json.load(fin)
        if not options["lstm"].get("use_skip_connections"):
            raise ConfigurationError("We only support pretrained biLMs with residual connections")

        self._elmo_lstm = ElmoLstm(
            input_size=options["lstm"]["projection_dim"],
            hidden_size=options["lstm"]["projection_dim"],
            cell_size=options["lstm"]["dim"],
            num_layers=options["lstm"]["n_layers"],
            memory_cell_clip_value=options["lstm"]["cell_clip"],
            state_projection_clip_value=options["lstm"]["proj_clip"],
            requires_grad=requires_grad,
        )
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options["lstm"]["n_layers"] + 1
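
The constructor above reads only the `lstm` block of the options file. As a reference for the keys it expects (the values below are illustrative, not the settings of any released model):

options = {
    "lstm": {
        "use_skip_connections": True,  # must be truthy, otherwise ConfigurationError
        "projection_dim": 16,          # ElmoLstm input_size and hidden_size
        "dim": 64,                     # ElmoLstm cell_size
        "n_layers": 2,                 # num_layers; num_layers + 1 representation layers
        "cell_clip": 3,                # memory_cell_clip_value
        "proj_clip": 3,                # state_projection_clip_value
    }
}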
Example #7
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"),
                  "r") as fin:
            words = fin.read().strip().split("\n")

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token(word) for word in words]

        indices = indexer.tokens_to_indices(tokens, vocab)
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            char_indices = indices["elmo_tokens"][(k * 50):((k + 1) * 50)]
            sentences.append(
                indexer.as_padded_tensor_dict(
                    {"elmo_tokens": char_indices},
                    padding_lengths={"elmo_tokens": 50})["elmo_tokens"])
        batch = torch.stack(sentences)

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
            elmo_token_embedder_output["token_embedding"],
            elmo_token_embedder_output["mask"])[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(
            -1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path,
                                      "elmo_token_embeddings.hdf5")
        with h5py.File(embedding_file, "r") as fin:
            expected_embeddings = fin["embedding"][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)],
                              expected_embeddings,
                              atol=1e-6)
Example #8
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None,
                 combine_method="weighted-sum",
                 random_init=False) -> None:
        super(ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None

        with open(cached_path(options_file), "r") as fin:
            options = json.load(fin)

        if not options["lstm"].get("use_skip_connections"):
            raise ConfigurationError(
                "We only support pretrained biLMs with residual connections")

        self._elmo_lstm = LatticeElmoLstm(
            input_size=options["lstm"]["projection_dim"],
            hidden_size=options["lstm"]["projection_dim"],
            cell_size=options["lstm"]["dim"],
            num_layers=options["lstm"]["n_layers"],
            memory_cell_clip_value=options["lstm"]["cell_clip"],
            state_projection_clip_value=options["lstm"]["proj_clip"],
            requires_grad=requires_grad,
            combine_method=combine_method)

        if not random_init:
            self._elmo_lstm.load_weights(weight_file)
        else:
            print("WARNING!!! ELMo weights will not be loaded!!!")
        # Number of representation layers including context independent layer
        self.num_layers = options["lstm"]["n_layers"] + 1
Example #9
def main(
    vocab_path: str,
    elmo_config_path: str,
    elmo_weights_path: str,
    output_dir: str,
    batch_size: int,
    device: int,
    use_custom_oov_token: bool = False,
):
    """
    Creates ELMo word representations from a vocabulary file. These
    word representations are _independent_ - they are the result of running
    the CNN and Highway layers of the ELMo model, but not the Bidirectional LSTM.
    ELMo requires 2 additional tokens: <S> and </S>. The first token
    in this file is assumed to be an unknown token.

    This script produces two artifacts: A new vocabulary file
    with the <S> and </S> tokens inserted and a glove formatted embedding
    file containing word : vector pairs, one per line, with all values
    separated by a space.
    """

    # Load the vocabulary words and convert to char ids
    with open(vocab_path, "r") as vocab_file:
        tokens = vocab_file.read().strip().split("\n")

    # Insert the sentence boundary tokens which elmo uses at positions 1 and 2.
    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError(
            "ELMo embeddings require the use of a OOV token.")

    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens],
                                        Vocabulary())["tokens"]
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(
            indexer.as_padded_tensor_dict(indices[(k * 50):((k + 1) * 50)],
                                          padding_lengths={"tokens": 50}))

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(
            elmo_config_path, elmo_weights_path).cuda(device)
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path,
                                                    elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        batch = torch.stack(sentences[i * batch_size:(i + 1) * batch_size])
        if device != -1:
            batch = batch.cuda(device)

        token_embedding = elmo_token_embedder(batch)["token_embedding"].data

        # Reshape back to a list of words of shape (batch_size * 50, encoding_dim)
        # We also need to remove the <S>, </S> tokens appended by the encoder.
        per_word_embeddings = (token_embedding[:, 1:-1, :].contiguous().view(
            -1, token_embedding.size(-1)))

        all_embeddings.append(per_word_embeddings)

    # Remove the embeddings associated with padding in the last batch.
    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    # Write out the embedding in a glove format.
    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"),
                   "wb") as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join(
                str(x) for x in list(embedding_weight[i, :]))
            embeddings_file.write(f"{word} {string_array}\n".encode("utf-8"))

    # Write out the new vocab with the <S> and </S> tokens.
    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name),
              "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")
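
A hypothetical invocation of the function above (all paths and sizes are placeholders); `device=-1` keeps everything on the CPU, any other value is treated as a CUDA device index:

main(
    vocab_path="data/vocab.txt",
    elmo_config_path="elmo_options.json",
    elmo_weights_path="elmo_weights.hdf5",
    output_dir="elmo_out",
    batch_size=64,
    device=-1,
)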
Example #10
    def __init__(self):
        super(Model, self).__init__()
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file)
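
Whatever consumes `self._token_embedder` usually needs its output size; `_ElmoCharacterEncoder` exposes it via `get_output_dim()`, which reports the char-CNN projection size read from the options file (512 for the `2x4096_512_2048cnn_2xhighway` model referenced above). A sketch, reusing the `options_file` / `weight_file` URLs defined in the constructor:

embedder = _ElmoCharacterEncoder(options_file, weight_file)
embedding_dim = embedder.get_output_dim()  # projection_dim from options.json (512 here)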
Example #11
def test_elmo_character_encoder_with_allennlp():
    allennlp_embedder = _ElmoCharacterEncoder(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )
    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()

    allennlp_parameters = [
        '_char_embedding_weights',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        '_projection.bias',
        '_projection.weight',
    ]
    embedder_parameters = [
        'char_embedding.weight',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        'output_proj.bias',
        'output_proj.weight',
    ]
    allennlp_parameters_diff = [
        '_highways._layers.0.bias',
        '_highways._layers.0.weight',
        '_highways._layers.1.bias',
        '_highways._layers.1.weight',
    ]
    embedder_parameters_diff = [
        'highway.layers_0.bias',
        'highway.layers_0.weight',
        'highway.layers_1.bias',
        'highway.layers_1.weight',
    ]
    assert len(allennlp_parameters) == len(embedder_parameters)
    assert len(allennlp_parameters_diff) == len(embedder_parameters_diff)

    allennlp_embedder_named_parameters = dict(
        allennlp_embedder.named_parameters())
    # Same.
    for allennlp_param, embedder_param in zip(allennlp_parameters,
                                              embedder_parameters):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data

        np.testing.assert_array_equal(embedder_w.numpy(), allennlp_w.numpy())
        assert embedder_w.dtype == allennlp_w.dtype
    # Diff on highway.
    for allennlp_param, embedder_param in zip(allennlp_parameters_diff,
                                              embedder_parameters_diff):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data

        assert embedder_w.dtype == allennlp_w.dtype
        np.testing.assert_raises(
            AssertionError,
            np.testing.assert_array_equal,
            embedder_w.numpy(),
            allennlp_w.numpy(),
        )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    # `(2, 7, 50)`
    character_ids = _sentences_to_ids(sentences)

    # AllenNLP.
    out = allennlp_embedder(character_ids)
    allennlp_token_embedding, _ = remove_sentence_boundaries(
        out['token_embedding'], out['mask'])
    assert list(allennlp_token_embedding.shape) == [2, 7, 16]

    # Ours.
    inputs = pack_padded_sequence(character_ids, [7, 3], batch_first=True)
    out = embedder(inputs.data)
    ours_token_embedding = _unpack(out, inputs.batch_sizes)
    assert list(ours_token_embedding.shape) == [2, 7, 16]

    np.testing.assert_array_almost_equal(
        ours_token_embedding.data.numpy(),
        allennlp_token_embedding.data.numpy(),
    )
Example #12
def main(vocab_path: str,
         elmo_config_path: str,
         elmo_weights_path: str,
         output_dir: str,
         batch_size: int,
         device: int,
         use_custom_oov_token: bool = False):
    """
    Creates ELMo word representations from a vocabulary file. These
    word representations are _independent_ - they are the result of running
    the CNN and Highway layers of the ELMo model, but not the Bidirectional LSTM.
    ELMo requires 2 additional tokens: <S> and </S>. The first token
    in this file is assumed to be an unknown token.

    This script produces two artifacts: A new vocabulary file
    with the <S> and </S> tokens inserted and a glove formatted embedding
    file containing word : vector pairs, one per line, with all values
    separated by a space.
    """

    # Load the vocabulary words and convert to char ids
    with open(vocab_path, 'r') as vocab_file:
        tokens = vocab_file.read().strip().split('\n')

    # Insert the sentence boundary tokens which elmo uses at positions 1 and 2.
    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")

    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary(), "indices")["indices"]
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(indexer.pad_token_sequence(indices[(k * 50):((k + 1) * 50)],
                                                    desired_num_tokens=50,
                                                    padding_lengths={}))

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path,
                                                    elmo_weights_path).cuda(device)
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path,
                                                    elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        array = numpy.array(sentences[i * batch_size: (i + 1) * batch_size])
        if device != -1:
            batch = torch.from_numpy(array).cuda(device)
        else:
            batch = torch.from_numpy(array)

        token_embedding = elmo_token_embedder(batch)['token_embedding'].data

        # Reshape back to a list of words of shape (batch_size * 50, encoding_dim)
        # We also need to remove the <S>, </S> tokens appended by the encoder.
        per_word_embeddings = token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))

        all_embeddings.append(per_word_embeddings)

    # Remove the embeddings associated with padding in the last batch.
    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    # Write out the embedding in a glove format.
    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), 'wb') as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join([str(x) for x in list(embedding_weight[i, :])])
            embeddings_file.write(f"{word} {string_array}\n".encode('utf-8'))

    # Write out the new vocab with the <S> and </S> tokens.
    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")
Example #13
def main(
    vocab_path: str,
    elmo_config_path: str,
    elmo_weights_path: str,
    output_dir: str,
    batch_size: int,
    device: int,
    use_custom_oov_token: bool = False,
):

    with open(vocab_path, "r") as vocab_file:
        tokens = vocab_file.read().strip().split("\n")

    if tokens[0] != DEFAULT_OOV_TOKEN and not use_custom_oov_token:
        raise ConfigurationError("ELMo embeddings require the use of an OOV token.")

    tokens = [tokens[0]] + ["<S>", "</S>"] + tokens[1:]

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary())["tokens"]
    sentences = []
    for k in range((len(indices) // 50) + 1):
        sentences.append(
            indexer.as_padded_tensor_dict(
                indices[(k * 50) : ((k + 1) * 50)], padding_lengths={"tokens": 50}
            )
        )

    last_batch_remainder = 50 - (len(indices) % 50)
    if device != -1:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path).cuda(
            device
        )
    else:
        elmo_token_embedder = _ElmoCharacterEncoder(elmo_config_path, elmo_weights_path)

    all_embeddings = []
    for i in range((len(sentences) // batch_size) + 1):
        batch = torch.stack(sentences[i * batch_size : (i + 1) * batch_size])
        if device != -1:
            batch = batch.cuda(device)

        token_embedding = elmo_token_embedder(batch)["token_embedding"].data

        per_word_embeddings = (
            token_embedding[:, 1:-1, :].contiguous().view(-1, token_embedding.size(-1))
        )

        all_embeddings.append(per_word_embeddings)

    all_embeddings[-1] = all_embeddings[-1][:-last_batch_remainder, :]

    embedding_weight = torch.cat(all_embeddings, 0).cpu().numpy()

    os.makedirs(output_dir, exist_ok=True)
    with gzip.open(os.path.join(output_dir, "elmo_embeddings.txt.gz"), "wb") as embeddings_file:
        for i, word in enumerate(tokens):
            string_array = " ".join(str(x) for x in list(embedding_weight[i, :]))
            embeddings_file.write(f"{word} {string_array}\n".encode("utf-8"))

    _, vocab_file_name = os.path.split(vocab_path)
    with open(os.path.join(output_dir, vocab_file_name), "w") as new_vocab_file:
        for word in tokens:
            new_vocab_file.write(f"{word}\n")
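
The gzipped output of the script above holds one `word v1 v2 ... vn` line per vocabulary entry. A small sketch for reading it back (the directory name is a placeholder matching the hypothetical invocation earlier):

import gzip

embeddings = {}
with gzip.open("elmo_out/elmo_embeddings.txt.gz", "rt", encoding="utf-8") as fin:
    for line in fin:
        word, *values = line.rstrip("\n").split(" ")
        embeddings[word] = [float(v) for v in values]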