Example #1
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example #2
    def test_embedding_vocab_extension_is_no_op_when_extension_should_not_happen(
            self):
        # Case 1: When the vocab is already in sync with the embeddings, it should be a no-op.
        vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
        embedding_params = Params({
            "vocab_namespace": "tokens",
            "embedding_dim": 10
        })
        embedder = Embedding.from_params(embedding_params, vocab=vocab)
        original_weight = embedder.weight
        embedder.extend_vocab(vocab, "tokens")
        assert torch.all(embedder.weight == original_weight)

        # Case 2: Shouldn't wrongly assume the "tokens" namespace for extension if no
        # information on vocab_namespace is available. Rather, log a warning and be a no-op.
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1", "tokens")
        vocab.add_token_to_namespace("word2", "tokens")
        embedding_params = Params({
            "vocab_namespace": "tokens",
            "embedding_dim": 10
        })
        embedder = Embedding.from_params(embedding_params, vocab=vocab)
        # Previous models won't have _vocab_namespace attribute. Force it to be None
        embedder._vocab_namespace = None
        embedder.weight = torch.nn.Parameter(embedder.weight[:1, :])
        assert embedder.weight.shape[0] == 1
        embedder.extend_vocab(vocab)  # Don't specify namespace
        assert embedder.weight.shape[0] == 1
Example #3
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                u"think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                u"make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                u"difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                u"àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                u'pretrained_file': unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive.zip'),
                u'embedding_dim': 5
                })
        with pytest.raises(ValueError, message=u"No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in [u'.zip', u'.tar.gz']:
            archive_path = unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, u'folder/fake_embeddings.5d.txt')
            params = Params({
                    u'pretrained_file': file_uri,
                    u'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in list(token2vec.items()):
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), u'Problem with format ' + archive_path
Example #4
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError,
                           match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
                                 "so you must select one of the files inside "
                                 "providing a uri of the type: "
                                 "\\(path_or_url_to_archive\\)#path_inside_archive\\."):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example #5
    def from_params(cls, params: Params, vocab: Vocabulary) -> 'Embedding':  # type: ignore

        cuda_device = params.pop("cuda_device", -1)
        use_glove_embedding = params.pop("use_glove_embedding", False)
        #glove_dimension_size = params.pop("glove_dimension_size", 300)
        use_elmo_embedding = params.pop("use_elmo_embedding", False)
        use_verb_index_embedding = params.pop("use_verb_index_embedding", False)
        verb_index_embedding_dimension = params.pop("verb_index_embedding_dimension", 50)
        use_visual_score_embedding = params.pop("use_visual_score_embedding", False)

        num_embeddings = vocab.get_vocab_size()  # 0 = padding, 1 = unknown, the rest is vocabulary
        embedding_dim = 0
        
        # decide whether to use the ELMo embedding
        if use_elmo_embedding:
            elmo_token_embedder = Elmo.from_params(params.pop("elmo"))
            embedding_dim = embedding_dim + elmo_token_embedder.get_output_dim() # current dimension for elmo embedding - 512*2 = 1024 
        else:
            elmo_token_embedder = None

        if use_glove_embedding:
            # glove_embedder: an Embedding with an output dimension of 300
            #glove_embedder = get_glove_embedder(num_embeddings,glove_dimension_size,vocab)
            glove_embedder = Embedding.from_params(vocab, params.pop("glove_embedder"))
            embedding_dim = embedding_dim + glove_embedder.get_output_dim()
        else:
            glove_embedder = None
			
        if use_verb_index_embedding:
            # suffix embedding: needs two entries, 0 (not a metaphor) and 1 (is a metaphor)
            verb_index_embedder = Embedding(2, verb_index_embedding_dimension)
            embedding_dim = embedding_dim + verb_index_embedder.get_output_dim() # for suffix embedding
        else:
            verb_index_embedder = None
			
        if use_visual_score_embedding:
            # use pretrained weight matrix
            visual_score_embedder = Embedding.from_params(vocab, params.pop("visual_embedder"))
            embedding_dim = embedding_dim + visual_score_embedder.get_output_dim()
        else:
            visual_score_embedder = None
			
        if cuda_device == -1:
            is_gpu = False
        else:
            is_gpu = True
	
        return cls(num_embeddings=num_embeddings, embedding_dim=embedding_dim, glove_embedder=glove_embedder,
                    elmo_embedder=elmo_token_embedder, verb_index_embedder=verb_index_embedder,
                    visual_score_embedder=visual_score_embedder, is_gpu=is_gpu)
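The from_params above assembles several optional embedders (GloVe, ELMo, a verb-index embedding, a visual-score embedding) into one module, popping its own flags from the config. A minimal sketch of a Params block that would exercise it (the key names come from the params.pop(...) calls above; the values, the GloVe path, and the owning class name are illustrative assumptions, not from the original project):

# assumes `vocab` is an already-built Vocabulary; the path and values below are placeholders
params = Params({
    "cuda_device": -1,
    "use_glove_embedding": True,
    # sub-config consumed by Embedding.from_params(vocab, params.pop("glove_embedder"))
    "glove_embedder": {"embedding_dim": 300, "pretrained_file": "/path/to/glove.300d.txt"},
    "use_elmo_embedding": False,
    "use_verb_index_embedding": True,
    "verb_index_embedding_dimension": 50,
    "use_visual_score_embedding": False,
})
embedder = MultiEmbedder.from_params(params, vocab)  # hypothetical name for the class defining this from_params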
Example #6
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
     vocab.add_token_to_namespace(unicode_space)
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, "wb") as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
         embeddings_file.write(
             f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
     params = Params({
         "pretrained_file": embeddings_filename,
         "embedding_dim": 3
     })
     embedding_layer = Embedding.from_params(params, vocab=vocab)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word")]
     assert numpy.allclose(word_vector.numpy(),
                           numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         unicode_space)]
     assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3,
                                                             5.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([1.0, 2.3, -1.0]))
Example #7
    def __init__(self, vocab, use_postags_only=True, embed_dim=100, hidden_size=200, recurrent_dropout_probability=0.3,
                 use_highway=False,
                 maxpool=True):
        super(BLSTMModel, self).__init__()

        self.embeds = Embedding.from_params(
            vocab,
            Params({'vocab_namespace': 'pos' if use_postags_only else 'tokens',
                    'embedding_dim': embed_dim,
                    'trainable': True,
                    'padding_index': 0,
                    'pretrained_file': None if use_postags_only else 'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz',
                    }))
        self.binary_feature_embedding = Embedding(2, embed_dim)

        self.fwd_lstm = PytorchSeq2SeqWrapper(AugmentedLstm(
            input_size=embed_dim * 2, hidden_size=hidden_size, go_forward=True,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False, use_highway=use_highway), stateful=False)

        self.bwd_lstm = PytorchSeq2SeqWrapper(AugmentedLstm(
            input_size=embed_dim * 2, hidden_size=hidden_size, go_forward=False,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False, use_highway=use_highway), stateful=False)

        self.maxpool = maxpool
        self.fc = nn.Linear(hidden_size * 2, 1, bias=False)
Example #8
    def test_embedding_vocab_extension_works_with_pretrained_embedding_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('word1')
        vocab.add_token_to_namespace('word2')

        embeddings_filename = str(self.TEST_DIR / "embeddings2.gz")
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word3 0.5 0.3 -6.0\n".encode('utf-8'))
            embeddings_file.write("word4 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
            embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))

        embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 3,
                                   "pretrained_file": embeddings_filename})
        embedder = Embedding.from_params(vocab, embedding_params)

        # Change weight to simulate embedding training
        embedder.weight.data += 1
        assert torch.all(embedder.weight[2:, :] == torch.Tensor([[2.0, 3.3, 0.0], [1.1, 1.4, -3.0]]))
        original_weight = embedder.weight

        assert tuple(original_weight.size()) == (4, 3)  # 4 because of padding and OOV

        vocab.add_token_to_namespace('word3')
        embedder.extend_vocab(vocab, extension_pretrained_file=embeddings_filename) # default namespace
        extended_weight = embedder.weight

        # Make sure the extension happened for the extra token in the extended vocab
        assert tuple(extended_weight.size()) == (5, 3)

        # Make sure extension doesn't change original trained weights.
        assert torch.all(original_weight[:4, :] == extended_weight[:4, :])

        # Make sure extended weight is taken from the embedding file.
        assert torch.all(extended_weight[4, :] == torch.Tensor([0.5, 0.3, -6.0]))
Example #9
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'TreeAttention':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

        premise_encoder_params = params.pop("premise_encoder", None)
        premise_encoder = Seq2SeqEncoder.from_params(premise_encoder_params)

        attention_similarity = SimilarityFunction.from_params(params.pop('attention_similarity'))
        phrase_probability = FeedForward.from_params(params.pop('phrase_probability'))
        edge_probability = FeedForward.from_params(params.pop('edge_probability'))

        edge_embedding = Embedding.from_params(vocab, params.pop('edge_embedding'))
        use_encoding_for_node = params.pop('use_encoding_for_node')
        ignore_edges = params.pop('ignore_edges', False)

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None
                       else InitializerApplicator())

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   phrase_probability=phrase_probability,
                   edge_probability=edge_probability,
                   premise_encoder=premise_encoder,
                   edge_embedding=edge_embedding,
                   use_encoding_for_node=use_encoding_for_node,
                   attention_similarity=attention_similarity,
                   ignore_edges=ignore_edges,
                   initializer=initializer)
Example #10
 def test_embedding_vocab_extension_raises_error_for_incorrect_vocab(self):
     # When the namespace of the extension vocab is smaller than the embedding's,
     # it should raise a ConfigurationError.
     vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
     embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
     embedder = Embedding.from_params(vocab, embedding_params)
     with pytest.raises(ConfigurationError):
         embedder.extend_vocab(Vocabulary(), "tokens")
Example #11
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'AfixEmbedding':  # type: ignore
     # pylint: disable=arguments-differ
     embedding_params: Params = params.pop("embedding")
     # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
     # that to be "afixes" by default.
     embedding_params.setdefault("vocab_namespace", "afixes")
     embedding = Embedding.from_params(vocab, embedding_params)
     dropout = params.pop_float("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(embedding, dropout)
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':
     embedding_params: Params = params.pop("embedding")
     # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
     # that to be "token_bpe" by default.
     embedding_params.setdefault("vocab_namespace", "token_bpe")
     embedding = Embedding.from_params(vocab, embedding_params)
     encoder_params: Params = params.pop("encoder")
     encoder = Seq2VecEncoder.from_params(encoder_params)
     dropout = params.pop("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(embedding, encoder, dropout)
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
     # pylint: disable=arguments-differ
     embedding_params: Params = params.pop("embedding")
     # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
     # that to be "token_characters" by default.
     embedding_params.setdefault("vocab_namespace", "token_characters")
     embedding = Embedding.from_params(vocab, embedding_params)
     encoder_params: Params = params.pop("encoder")
     encoder = Seq2VecEncoder.from_params(encoder_params)
     dropout = params.pop_float("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(embedding, encoder, dropout)
 def from_params(cls, vocab, params):  # type: ignore
     # pylint: disable=arguments-differ
     embedding_params = params.pop(u"embedding")
     # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
     # that to be "token_characters" by default.
     embedding_params.setdefault(u"vocab_namespace", u"token_characters")
     embedding = Embedding.from_params(vocab, embedding_params)
     encoder_params = params.pop(u"encoder")
     encoder = Seq2VecEncoder.from_params(encoder_params)
     dropout = params.pop_float(u"dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(embedding, encoder, dropout)
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     params = Params({
             'pretrained_file': embeddings_filename,
             'embedding_dim': 3,
             })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
     assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace(u"word")
     vocab.add_token_to_namespace(u"word2")
     embeddings_filename = unicode(self.TEST_DIR / u"embeddings.gz")
     with gzip.open(embeddings_filename, u'wb') as embeddings_file:
         embeddings_file.write(u"word 1.0 2.3 -1.0\n".encode(u'utf-8'))
     params = Params({
             u'pretrained_file': embeddings_filename,
             u'embedding_dim': 3,
             })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(u"word2")]
     assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u"word")
        embeddings_filename = unicode(self.TEST_DIR / u"embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, u'w') as fout:
            _ = fout.create_dataset(
                    u'embedding', embeddings.shape, dtype=u'float32', data=embeddings
            )

        params = Params({
                u'pretrained_file': embeddings_filename,
                u'embedding_dim': 5,
                })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Example #19
 def test_min_pretrained_embeddings(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace('the')
     vocab.add_token_to_namespace('a')
     params = Params({
         'pretrained_file':
         str(self.FIXTURES_ROOT / 'embeddings/glove.6B.100d.sample.txt.gz'),
         'embedding_dim':
         100,
         'min_pretrained_embeddings':
         50
     })
     # This will now update vocab
     _ = Embedding.from_params(vocab, params)
     assert vocab.get_vocab_size() >= 50
     assert vocab.get_token_index("his") > 1  # not @@UNKNOWN@@
Example #20
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     params = Params({
             'pretrained_file': embeddings_filename,
             'embedding_dim': 3,
             })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
     assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
     assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
                'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
                'embedding_dim': 300,
                'projection_dim': 20
                })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u"word")
        vocab.add_token_to_namespace(u"word2")
        embeddings_filename = unicode(self.TEST_DIR / u"embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, u'w') as fout:
            _ = fout.create_dataset(
                    u'embedding', embeddings.shape, dtype=u'float32', data=embeddings
            )

        params = Params({
                u'pretrained_file': embeddings_filename,
                u'embedding_dim': 5,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Example #24
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
            'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
            'embedding_dim': 300,
            'projection_dim': 20
        })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #25
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(
         self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, "wb") as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
     params = Params({
         "pretrained_file": embeddings_filename,
         "embedding_dim": 3
     })
     embedding_layer = Embedding.from_params(params, vocab=vocab)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([0.0, 0.0, 0.0]))
    def from_params(  # type: ignore
            cls, vocab: Vocabulary,
            params: Params) -> "TokenCharactersEncoder":

        embedding_params: Params = params.pop("embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_characters" by default. If num_embeddings is present, set the default namespace
        # to None so that a later extend_vocab call doesn't wrongly assume that some namespace was originally used.
        default_namespace = (None if embedding_params.get(
            "num_embeddings", None) else "token_characters")
        embedding_params.setdefault("vocab_namespace", default_namespace)
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params: Params = params.pop("encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop_float("dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u'the')
        vocab.add_token_to_namespace(u'a')
        params = Params({
                u'pretrained_file': unicode(self.FIXTURES_ROOT / u'embeddings/glove.6B.300d.sample.txt.gz'),
                u'embedding_dim': 300,
                u'projection_dim': 20
                })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = torch.LongTensor([[3, 2, 1, 0]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #28
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, "w") as fout:
            _ = fout.create_dataset("embedding",
                                    embeddings.shape,
                                    dtype="float32",
                                    data=embeddings)

        params = Params({
            "pretrained_file": embeddings_filename,
            "embedding_dim": 5
        })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(params, vocab=vocab)
Example #29
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, "w") as fout:
            _ = fout.create_dataset("embedding",
                                    embeddings.shape,
                                    dtype="float32",
                                    data=embeddings)

        params = Params({
            "pretrained_file": embeddings_filename,
            "embedding_dim": 5
        })
        embedding_layer = Embedding.from_params(params, vocab=vocab)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Example #30
    def test_embedding_vocab_extension_with_default_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('word1')
        vocab.add_token_to_namespace('word2')
        embedding_params = Params({"vocab_namespace": "tokens",
                                   "embedding_dim": 10})
        embedder = Embedding.from_params(vocab, embedding_params)
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab) # default namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Example #31
    def __init__(self, vocab, embed_dim=100, window_sizes=(2, 3, 4, 5), num_filters=128):
        super(CNNModel, self).__init__()

        self.embeds = Embedding.from_params(
            vocab,
            Params({'vocab_namespace': 'tokens',
                    'embedding_dim': embed_dim,
                    'trainable': True,
                    'padding_index': 0,
                    'pretrained_file':
                        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz'
                    }))
        self.binary_feature_embedding = Embedding(2, embed_dim)

        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim * 2, num_filters, kernel_size=window_size, padding=window_size - 1) for window_size in
            window_sizes
        ])
        self.fc = nn.Linear(num_filters * len(window_sizes), 1, bias=False)
Example #32
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("the")
        vocab.add_token_to_namespace("a")
        params = Params({
            "pretrained_file":
            str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"),
            "embedding_dim":
            300,
            "projection_dim":
            20,
        })
        embedding_layer = Embedding.from_params(params, vocab=vocab)
        input_tensor = torch.LongTensor([[3, 2, 1, 0]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #33
    def __init__(self, vocab, use_mean=True, embed_dim=100):
        """
        Averaged embeddings of ending -> label
        :param embed_dim: dimension to use
        """
        super(BoWModel, self).__init__()
        assert embed_dim == 100
        self.embeds = Embedding.from_params(
            vocab,
            Params({'vocab_namespace': 'tokens',
                    'embedding_dim': embed_dim,
                    'trainable': True,
                    'padding_index': 0,
                    'pretrained_file':
                        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz'
                    }))

        self.embed_dim = embed_dim
        self.use_mean = use_mean
        self.embedding_to_label = nn.Linear(self.embed_dim, 1, bias=False)
Example #34
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
     vocab.add_token_to_namespace(unicode_space)
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
     params = Params({
             'pretrained_file': embeddings_filename,
             'embedding_dim': 3,
             })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
     assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
     assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
     assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
Example #35
    def test_embedding_vocab_extension_with_specified_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1", "tokens_a")
        vocab.add_token_to_namespace("word2", "tokens_a")
        embedding_params = Params({
            "vocab_namespace": "tokens_a",
            "embedding_dim": 10
        })
        embedder = Embedding.from_params(embedding_params, vocab=vocab)
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens_a": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab, "tokens_a")  # specified namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Example #36
    def test_embedding_vocab_extension_without_stored_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('word1', "tokens_a")
        vocab.add_token_to_namespace('word2', "tokens_a")
        embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
        embedder = Embedding.from_params(vocab, embedding_params)

        # Previous models won't have _vocab_namespace attribute. Force it to be None
        embedder._vocab_namespace = None
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens_a": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab, "tokens_a") # specified namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Example #37
t = SnliReader()
### Choose datasets here
train_dataset = t.read('.data/snli/snli_1.0/snli_1.0_train.jsonl')
val_dataset = t.read('.data/snli/snli_1.0/snli_1.0_dev.jsonl')
vocab = Vocabulary.from_instances(train_dataset + val_dataset)

### Choose word embeddings. Note that the embedding is always trainable; we use a
### backward hook to zero the gradient for the rows of the embedding matrix
### that we don't want to optimize.
params = Params({
    "pretrained_file": ".vector_cache/glove.840B.300d.txt",
    # "pretrained_file": ".vector_cache/w2v.txt",
    "embedding_dim": 300,
    "trainable": True
})
glove = Embedding.from_params(vocab, params)

### NOTE: For Rocktaschel et al. only, uncomment the lines below:
# rows_not_to_optimize = re_read_embeddings_from_text_file('.vector_cache/w2v.txt', 300, vocab, glove._vocab_namespace)
# glove.weight.register_hook(lambda x: grad_zero(x, rows_not_to_optimize))
### NOTE: ENDS HERE
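### The two helpers referenced in the commented-out block above are not shown
### on this page. A minimal sketch of what grad_zero might look like (an
### assumption, not the original implementation): it zeroes the gradient rows
### of tokens whose pretrained vectors should stay fixed.
# def grad_zero(grad, rows_not_to_optimize):
#     grad = grad.clone()                     # don't modify the gradient tensor in place
#     grad[list(rows_not_to_optimize)] = 0.0  # freeze these embedding rows
#     return grad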

### Choose your hyperparameter search space here
name_csv = ['C.E. Attention']
batch_size_csv = [32]
p_drop_csv = [0, 0.1, 0.2]
lr_csv = [0.0001, 0.0003, 0.001]
l2p_csv = [0, 1e-4, 3e-4, 1e-3]

### ... or if you want particular values, just use 1-element arrays!
# p_drop_csv = [0.2]
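### One way to sweep the grid above (a sketch of the outer loop only, not the
### original script's training code; the loop body is a placeholder):
# import itertools
# for batch_size, p_drop, lr, l2p in itertools.product(batch_size_csv, p_drop_csv, lr_csv, l2p_csv):
#     print(f"run: name={name_csv[0]} batch_size={batch_size} p_drop={p_drop} lr={lr} l2={l2p}")
#     # ... build the model / trainer with these values and train ...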