def test_read_embedding_file_inside_archive(self): token2vec = { "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]), "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]), "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]), "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0]) } vocab = Vocabulary() for token in token2vec: vocab.add_token_to_namespace(token) params = Params({ 'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'), 'embedding_dim': 5 }) with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"): Embedding.from_params(vocab, params) for ext in ['.zip', '.tar.gz']: archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt') params = Params({ 'pretrained_file': file_uri, 'embedding_dim': 5 }) embeddings = Embedding.from_params(vocab, params).weight.data for tok, vec in token2vec.items(): i = vocab.get_token_index(tok) assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def test_embedding_vocab_extension_is_no_op_when_extension_should_not_happen(self):
    # Case 1: When vocab is already in sync with embeddings, extension should be a no-op.
    vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_params(embedding_params, vocab=vocab)
    original_weight = embedder.weight
    embedder.extend_vocab(vocab, "tokens")
    assert torch.all(embedder.weight == original_weight)

    # Case 2: Shouldn't wrongly assume the "tokens" namespace for extension if no
    # information on vocab_namespace is available. Rather, log a warning and be a no-op.
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1", "tokens")
    vocab.add_token_to_namespace("word2", "tokens")
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_params(embedding_params, vocab=vocab)
    # Previous models won't have the _vocab_namespace attribute. Force it to be None.
    embedder._vocab_namespace = None
    embedder.weight = torch.nn.Parameter(embedder.weight[:1, :])
    assert embedder.weight.shape[0] == 1
    embedder.extend_vocab(vocab)  # Don't specify namespace
    assert embedder.weight.shape[0] == 1

def test_read_embedding_file_inside_archive(self): token2vec = { u"think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]), u"make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]), u"difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]), u"àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0]) } vocab = Vocabulary() for token in token2vec: vocab.add_token_to_namespace(token) params = Params({ u'pretrained_file': unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive.zip'), u'embedding_dim': 5 }) with pytest.raises(ValueError, message=u"No ValueError when pretrained_file is a multi-file archive"): Embedding.from_params(vocab, params) for ext in [u'.zip', u'.tar.gz']: archive_path = unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive') + ext file_uri = format_embeddings_file_uri(archive_path, u'folder/fake_embeddings.5d.txt') params = Params({ u'pretrained_file': file_uri, u'embedding_dim': 5 }) embeddings = Embedding.from_params(vocab, params).weight.data for tok, vec in list(token2vec.items()): i = vocab.get_token_index(tok) assert torch.equal(embeddings[i], vec), u'Problem with format ' + archive_path
def test_read_embedding_file_inside_archive(self): token2vec = { "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]), "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]), "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]), "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0]) } vocab = Vocabulary() for token in token2vec: vocab.add_token_to_namespace(token) params = Params({ 'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'), 'embedding_dim': 5 }) with pytest.raises(ValueError, match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, " "so you must select one of the files inside " "providing a uri of the type: " "\\(path_or_url_to_archive\\)#path_inside_archive\\."): Embedding.from_params(vocab, params) for ext in ['.zip', '.tar.gz']: archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt') params = Params({ 'pretrained_file': file_uri, 'embedding_dim': 5 }) embeddings = Embedding.from_params(vocab, params).weight.data for tok, vec in token2vec.items(): i = vocab.get_token_index(tok) assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def from_params(cls, params: Params, vocab: Vocabulary) -> 'Embedding':  # type: ignore
    cuda_device = params.pop("cuda_device", -1)
    use_glove_embedding = params.pop("use_glove_embedding", False)
    # glove_dimension_size = params.pop("glove_dimension_size", 300)
    use_elmo_embedding = params.pop("use_elmo_embedding", False)
    use_verb_index_embedding = params.pop("use_verb_index_embedding", False)
    verb_index_embedding_dimension = params.pop("verb_index_embedding_dimension", 50)
    use_visual_score_embedding = params.pop("use_visual_score_embedding", False)

    num_embeddings = vocab.get_vocab_size()  # 0 = padding, 1 = unknown, the rest is the vocabulary
    embedding_dim = 0

    # Test whether to use the ELMo embedding.
    if use_elmo_embedding:
        elmo_token_embedder = Elmo.from_params(params.pop("elmo"))
        # Current dimension for the ELMo embedding: 512 * 2 = 1024.
        embedding_dim = embedding_dim + elmo_token_embedder.get_output_dim()
    else:
        elmo_token_embedder = None

    if use_glove_embedding:
        # glove_embedder is an Embedding with a dimension of 300.
        # glove_embedder = get_glove_embedder(num_embeddings, glove_dimension_size, vocab)
        glove_embedder = Embedding.from_params(vocab, params.pop("glove_embedder"))
        embedding_dim = embedding_dim + glove_embedder.get_output_dim()
    else:
        glove_embedder = None

    if use_verb_index_embedding:
        # Suffix embeddings: need two entries, for 0 (non-metaphor) and 1 (is metaphor).
        verb_index_embedder = Embedding(2, verb_index_embedding_dimension)
        embedding_dim = embedding_dim + verb_index_embedder.get_output_dim()
    else:
        verb_index_embedder = None

    if use_visual_score_embedding:
        # Use a pretrained weight matrix.
        visual_score_embedder = Embedding.from_params(vocab, params.pop("visual_embedder"))
        embedding_dim = embedding_dim + visual_score_embedder.get_output_dim()
    else:
        visual_score_embedder = None

    is_gpu = cuda_device != -1

    return cls(num_embeddings=num_embeddings, embedding_dim=embedding_dim,
               glove_embedder=glove_embedder, elmo_embedder=elmo_token_embedder,
               verb_index_embedder=verb_index_embedder,
               visual_score_embedder=visual_score_embedder, is_gpu=is_gpu)

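# A hypothetical Params blob that would exercise the branches above (a sketch: the
# key names are taken from the pops in from_params, but the nested "glove_embedder"
# sub-config values are assumptions, not the original project's settings):
example_params = Params({
    "cuda_device": -1,                 # is_gpu ends up False
    "use_glove_embedding": True,
    "glove_embedder": {"embedding_dim": 300,
                       "pretrained_file": "/path/to/glove.300d.txt"},
    "use_elmo_embedding": False,       # so no "elmo" sub-config is needed
    "use_verb_index_embedding": True,  # adds a 2 x 50 indicator embedding
    "verb_index_embedding_dimension": 50,
    "use_visual_score_embedding": False,
})
# With these settings embedding_dim comes out as 300 + 50 = 350.
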
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 3})
    embedding_layer = Embedding.from_params(params, vocab=vocab)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))

def __init__(self, vocab, use_postags_only=True, embed_dim=100, hidden_size=200,
             recurrent_dropout_probability=0.3, use_highway=False, maxpool=True):
    super(BLSTMModel, self).__init__()
    self.embeds = Embedding.from_params(
        vocab, Params({
            'vocab_namespace': 'pos' if use_postags_only else 'tokens',
            'embedding_dim': embed_dim,
            'trainable': True,
            'padding_index': 0,
            'pretrained_file': None if use_postags_only else
                'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz',
        }))
    self.binary_feature_embedding = Embedding(2, embed_dim)
    self.fwd_lstm = PytorchSeq2SeqWrapper(AugmentedLstm(
        input_size=embed_dim * 2, hidden_size=hidden_size, go_forward=True,
        recurrent_dropout_probability=recurrent_dropout_probability,
        use_input_projection_bias=False, use_highway=use_highway), stateful=False)
    self.bwd_lstm = PytorchSeq2SeqWrapper(AugmentedLstm(
        input_size=embed_dim * 2, hidden_size=hidden_size, go_forward=False,
        recurrent_dropout_probability=recurrent_dropout_probability,
        use_input_projection_bias=False, use_highway=use_highway), stateful=False)
    self.maxpool = maxpool
    self.fc = nn.Linear(hidden_size * 2, 1, bias=False)

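# Hypothetical shape demo (not from the original file) of why the LSTMs above take
# input_size=embed_dim * 2: the token embedding and the binary indicator embedding
# are concatenated along the feature dimension before being fed to the forward and
# backward LSTMs.
import torch

embed_dim, batch, seq_len = 100, 2, 7
token_embeds = torch.randn(batch, seq_len, embed_dim)   # like self.embeds(tokens)
binary_embeds = torch.randn(batch, seq_len, embed_dim)  # like self.binary_feature_embedding(flags)
lstm_input = torch.cat([token_embeds, binary_embeds], dim=-1)
assert lstm_input.shape == (batch, seq_len, 2 * embed_dim)
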
def test_embedding_vocab_extension_works_with_pretrained_embedding_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    embeddings_filename = str(self.TEST_DIR / "embeddings2.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word3 0.5 0.3 -6.0\n".encode('utf-8'))
        embeddings_file.write("word4 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
    embedding_params = Params({"vocab_namespace": "tokens",
                               "embedding_dim": 3,
                               "pretrained_file": embeddings_filename})
    embedder = Embedding.from_params(vocab, embedding_params)

    # Change the weight to simulate embedding training.
    embedder.weight.data += 1
    assert torch.all(embedder.weight[2:, :] == torch.Tensor([[2.0, 3.3, 0.0],
                                                             [1.1, 1.4, -3.0]]))
    original_weight = embedder.weight
    assert tuple(original_weight.size()) == (4, 3)  # 4 because of padding and OOV

    vocab.add_token_to_namespace('word3')
    embedder.extend_vocab(vocab, extension_pretrained_file=embeddings_filename)  # default namespace
    extended_weight = embedder.weight

    # Make sure extension happened for the extra token in the extended vocab.
    assert tuple(extended_weight.size()) == (5, 3)
    # Make sure extension doesn't change the original trained weights.
    assert torch.all(original_weight[:4, :] == extended_weight[:4, :])
    # Make sure the extended weight is taken from the embedding file.
    assert torch.all(extended_weight[4, :] == torch.Tensor([0.5, 0.3, -6.0]))

def from_params(cls, vocab: Vocabulary, params: Params) -> 'TreeAttention':
    embedder_params = params.pop("text_field_embedder")
    text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

    premise_encoder_params = params.pop("premise_encoder", None)
    premise_encoder = Seq2SeqEncoder.from_params(premise_encoder_params)

    attention_similarity = SimilarityFunction.from_params(params.pop('attention_similarity'))
    phrase_probability = FeedForward.from_params(params.pop('phrase_probability'))
    edge_probability = FeedForward.from_params(params.pop('edge_probability'))
    edge_embedding = Embedding.from_params(vocab, params.pop('edge_embedding'))
    use_encoding_for_node = params.pop('use_encoding_for_node')
    ignore_edges = params.pop('ignore_edges', False)

    init_params = params.pop('initializer', None)
    initializer = (InitializerApplicator.from_params(init_params)
                   if init_params is not None else InitializerApplicator())

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               phrase_probability=phrase_probability,
               edge_probability=edge_probability,
               premise_encoder=premise_encoder,
               edge_embedding=edge_embedding,
               use_encoding_for_node=use_encoding_for_node,
               attention_similarity=attention_similarity,
               ignore_edges=ignore_edges,
               initializer=initializer)

def test_embedding_vocab_extension_raises_error_for_incorrect_vocab(self):
    # When the vocab namespace of the extension vocab is smaller than the embedding's,
    # it should raise a ConfigurationError.
    vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_params(vocab, embedding_params)
    with pytest.raises(ConfigurationError):
        embedder.extend_vocab(Vocabulary(), "tokens")

def from_params(cls, vocab: Vocabulary, params: Params) -> 'AfixEmbedding':  # type: ignore
    # pylint: disable=arguments-differ
    embedding_params: Params = params.pop("embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "afixes" by default.
    embedding_params.setdefault("vocab_namespace", "afixes")
    embedding = Embedding.from_params(vocab, embedding_params)
    dropout = params.pop_float("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, dropout)

def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':
    embedding_params: Params = params.pop("embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "token_bpe" by default.
    embedding_params.setdefault("vocab_namespace", "token_bpe")
    embedding = Embedding.from_params(vocab, embedding_params)
    encoder_params: Params = params.pop("encoder")
    encoder = Seq2VecEncoder.from_params(encoder_params)
    dropout = params.pop("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)

def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
    # pylint: disable=arguments-differ
    embedding_params: Params = params.pop("embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "token_characters" by default.
    embedding_params.setdefault("vocab_namespace", "token_characters")
    embedding = Embedding.from_params(vocab, embedding_params)
    encoder_params: Params = params.pop("encoder")
    encoder = Seq2VecEncoder.from_params(encoder_params)
    dropout = params.pop_float("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)

def from_params(cls, vocab, params):  # type: ignore
    # pylint: disable=arguments-differ
    embedding_params = params.pop(u"embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "token_characters" by default.
    embedding_params.setdefault(u"vocab_namespace", u"token_characters")
    embedding = Embedding.from_params(vocab, embedding_params)
    encoder_params = params.pop(u"encoder")
    encoder = Seq2VecEncoder.from_params(encoder_params)
    dropout = params.pop_float(u"dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)

def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") vocab.add_token_to_namespace("word2") embeddings_filename = self.TEST_DIR + "embeddings.gz" with gzip.open(embeddings_filename, 'wb') as embeddings_file: embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8')) params = Params({ 'pretrained_file': embeddings_filename, 'embedding_dim': 3, }) embedding_layer = Embedding.from_params(vocab, params) word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")] assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self): vocab = Vocabulary() vocab.add_token_to_namespace(u"word") vocab.add_token_to_namespace(u"word2") embeddings_filename = unicode(self.TEST_DIR / u"embeddings.gz") with gzip.open(embeddings_filename, u'wb') as embeddings_file: embeddings_file.write(u"word 1.0 2.3 -1.0\n".encode(u'utf-8')) params = Params({ u'pretrained_file': embeddings_filename, u'embedding_dim': 3, }) embedding_layer = Embedding.from_params(vocab, params) word_vector = embedding_layer.weight.data[vocab.get_token_index(u"word2")] assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
def test_read_hdf5_raises_on_invalid_shape(self): vocab = Vocabulary() vocab.add_token_to_namespace(u"word") embeddings_filename = unicode(self.TEST_DIR / u"embeddings.hdf5") embeddings = numpy.random.rand(vocab.get_vocab_size(), 10) with h5py.File(embeddings_filename, u'w') as fout: _ = fout.create_dataset( u'embedding', embeddings.shape, dtype=u'float32', data=embeddings ) params = Params({ u'pretrained_file': embeddings_filename, u'embedding_dim': 5, }) with pytest.raises(ConfigurationError): _ = Embedding.from_params(vocab, params)
def test_read_hdf5_raises_on_invalid_shape(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") embeddings_filename = self.TEST_DIR + "embeddings.hdf5" embeddings = numpy.random.rand(vocab.get_vocab_size(), 10) with h5py.File(embeddings_filename, 'w') as fout: _ = fout.create_dataset( 'embedding', embeddings.shape, dtype='float32', data=embeddings ) params = Params({ 'pretrained_file': embeddings_filename, 'embedding_dim': 5, }) with pytest.raises(ConfigurationError): _ = Embedding.from_params(vocab, params)
def test_min_pretrained_embeddings(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('the')
    vocab.add_token_to_namespace('a')
    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/glove.6B.100d.sample.txt.gz'),
        'embedding_dim': 100,
        'min_pretrained_embeddings': 50
    })
    # This will now update the vocab.
    _ = Embedding.from_params(vocab, params)
    assert vocab.get_vocab_size() >= 50
    assert vocab.get_token_index("his") > 1  # not @@UNKNOWN@@

def test_embedding_layer_actually_initializes_word_vectors_correctly(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") vocab.add_token_to_namespace("word2") embeddings_filename = self.TEST_DIR + "embeddings.gz" with gzip.open(embeddings_filename, 'wb') as embeddings_file: embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8')) params = Params({ 'pretrained_file': embeddings_filename, 'embedding_dim': 3, }) embedding_layer = Embedding.from_params(vocab, params) word_vector = embedding_layer.weight.data[vocab.get_token_index("word")] assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0])) word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")] assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('the')
    vocab.add_token_to_namespace('a')
    params = Params({
        'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
        'embedding_dim': 300,
        'projection_dim': 20
    })
    embedding_layer = Embedding.from_params(vocab, params)
    input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)
    input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)

def test_read_hdf5_format_file(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") vocab.add_token_to_namespace("word2") embeddings_filename = self.TEST_DIR + "embeddings.hdf5" embeddings = numpy.random.rand(vocab.get_vocab_size(), 5) with h5py.File(embeddings_filename, 'w') as fout: _ = fout.create_dataset( 'embedding', embeddings.shape, dtype='float32', data=embeddings ) params = Params({ 'pretrained_file': embeddings_filename, 'embedding_dim': 5, }) embedding_layer = Embedding.from_params(vocab, params) assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
def test_read_hdf5_format_file(self): vocab = Vocabulary() vocab.add_token_to_namespace(u"word") vocab.add_token_to_namespace(u"word2") embeddings_filename = unicode(self.TEST_DIR / u"embeddings.hdf5") embeddings = numpy.random.rand(vocab.get_vocab_size(), 5) with h5py.File(embeddings_filename, u'w') as fout: _ = fout.create_dataset( u'embedding', embeddings.shape, dtype=u'float32', data=embeddings ) params = Params({ u'pretrained_file': embeddings_filename, u'embedding_dim': 5, }) embedding_layer = Embedding.from_params(vocab, params) assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 3})
    embedding_layer = Embedding.from_params(params, vocab=vocab)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))

def from_params(  # type: ignore
        cls, vocab: Vocabulary, params: Params) -> "TokenCharactersEncoder":
    embedding_params: Params = params.pop("embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "token_characters" by default. If num_embeddings is present, set the default
    # namespace to None so that the extend_vocab call doesn't misinterpret that some namespace
    # was originally used.
    default_namespace = (None if embedding_params.get("num_embeddings", None)
                         else "token_characters")
    embedding_params.setdefault("vocab_namespace", default_namespace)
    embedding = Embedding.from_params(vocab, embedding_params)
    encoder_params: Params = params.pop("encoder")
    encoder = Seq2VecEncoder.from_params(encoder_params)
    dropout = params.pop_float("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)

def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace(u'the')
    vocab.add_token_to_namespace(u'a')
    params = Params({
        u'pretrained_file': unicode(self.FIXTURES_ROOT / u'embeddings/glove.6B.300d.sample.txt.gz'),
        u'embedding_dim': 300,
        u'projection_dim': 20
    })
    embedding_layer = Embedding.from_params(vocab, params)
    input_tensor = torch.LongTensor([[3, 2, 1, 0]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)
    input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)

def test_read_hdf5_raises_on_invalid_shape(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5") embeddings = numpy.random.rand(vocab.get_vocab_size(), 10) with h5py.File(embeddings_filename, "w") as fout: _ = fout.create_dataset("embedding", embeddings.shape, dtype="float32", data=embeddings) params = Params({ "pretrained_file": embeddings_filename, "embedding_dim": 5 }) with pytest.raises(ConfigurationError): _ = Embedding.from_params(params, vocab=vocab)
def test_read_hdf5_format_file(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") vocab.add_token_to_namespace("word2") embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5") embeddings = numpy.random.rand(vocab.get_vocab_size(), 5) with h5py.File(embeddings_filename, "w") as fout: _ = fout.create_dataset("embedding", embeddings.shape, dtype="float32", data=embeddings) params = Params({ "pretrained_file": embeddings_filename, "embedding_dim": 5 }) embedding_layer = Embedding.from_params(params, vocab=vocab) assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
def test_embedding_vocab_extension_with_default_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_params(vocab, embedding_params)
    original_weight = embedder.weight
    assert original_weight.shape[0] == 4

    extension_counter = {"tokens": {"word3": 1}}
    vocab._extend(extension_counter)
    embedder.extend_vocab(vocab)  # default namespace
    extended_weight = embedder.weight

    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])

def __init__(self, vocab, embed_dim=100, window_sizes=(2, 3, 4, 5), num_filters=128):
    super(CNNModel, self).__init__()
    self.embeds = Embedding.from_params(
        vocab, Params({
            'vocab_namespace': 'tokens',
            'embedding_dim': embed_dim,
            'trainable': True,
            'padding_index': 0,
            'pretrained_file': 'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz'
        }))
    self.binary_feature_embedding = Embedding(2, embed_dim)
    self.convs = nn.ModuleList([
        nn.Conv1d(embed_dim * 2, num_filters, kernel_size=window_size,
                  padding=window_size - 1)
        for window_size in window_sizes
    ])
    self.fc = nn.Linear(num_filters * len(window_sizes), 1, bias=False)

def test_forward_works_with_projection_layer(self): vocab = Vocabulary() vocab.add_token_to_namespace("the") vocab.add_token_to_namespace("a") params = Params({ "pretrained_file": str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"), "embedding_dim": 300, "projection_dim": 20, }) embedding_layer = Embedding.from_params(params, vocab=vocab) input_tensor = torch.LongTensor([[3, 2, 1, 0]]) embedded = embedding_layer(input_tensor).data.numpy() assert embedded.shape == (1, 4, 20) input_tensor = torch.LongTensor([[[3, 2, 1, 0]]]) embedded = embedding_layer(input_tensor).data.numpy() assert embedded.shape == (1, 1, 4, 20)
def __init__(self, vocab, use_mean=True, embed_dim=100):
    """
    Averaged embeddings of ending -> label.

    :param embed_dim: dimension to use
    """
    super(BoWModel, self).__init__()
    assert embed_dim == 100
    self.embeds = Embedding.from_params(
        vocab, Params({
            'vocab_namespace': 'tokens',
            'embedding_dim': embed_dim,
            'trainable': True,
            'padding_index': 0,
            'pretrained_file': 'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz'
        }))
    self.embed_dim = embed_dim
    self.use_mean = use_mean
    self.embedding_to_label = nn.Linear(self.embed_dim, 1, bias=False)

def test_embedding_layer_actually_initializes_word_vectors_correctly(self): vocab = Vocabulary() vocab.add_token_to_namespace("word") vocab.add_token_to_namespace("word2") unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0" vocab.add_token_to_namespace(unicode_space) embeddings_filename = str(self.TEST_DIR / "embeddings.gz") with gzip.open(embeddings_filename, 'wb') as embeddings_file: embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8')) embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8')) params = Params({ 'pretrained_file': embeddings_filename, 'embedding_dim': 3, }) embedding_layer = Embedding.from_params(vocab, params) word_vector = embedding_layer.weight.data[vocab.get_token_index("word")] assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0])) word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)] assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0])) word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")] assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def test_embedding_vocab_extension_with_specified_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1", "tokens_a")
    vocab.add_token_to_namespace("word2", "tokens_a")
    embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
    embedder = Embedding.from_params(embedding_params, vocab=vocab)
    original_weight = embedder.weight
    assert original_weight.shape[0] == 4

    extension_counter = {"tokens_a": {"word3": 1}}
    vocab._extend(extension_counter)
    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace
    extended_weight = embedder.weight

    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])

def test_embedding_vocab_extension_without_stored_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1', "tokens_a")
    vocab.add_token_to_namespace('word2', "tokens_a")
    embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
    embedder = Embedding.from_params(vocab, embedding_params)

    # Previous models won't have the _vocab_namespace attribute. Force it to be None.
    embedder._vocab_namespace = None
    original_weight = embedder.weight
    assert original_weight.shape[0] == 4

    extension_counter = {"tokens_a": {"word3": 1}}
    vocab._extend(extension_counter)
    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace
    extended_weight = embedder.weight

    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])

t = SnliReader()

### Choose datasets here
train_dataset = t.read('.data/snli/snli_1.0/snli_1.0_train.jsonl')
val_dataset = t.read('.data/snli/snli_1.0/snli_1.0_dev.jsonl')
vocab = Vocabulary.from_instances(train_dataset + val_dataset)

### Choose word embeddings. Note it is always trainable - we use a
### backward hook to zero the gradient when we don't optimize
### a part of the word embeddings.
params = Params({
    "pretrained_file": ".vector_cache/glove.840B.300d.txt",
    # "pretrained_file": ".vector_cache/w2v.txt",
    "embedding_dim": 300,
    "trainable": True
})
glove = Embedding.from_params(vocab, params)

### NOTE For Rocktaschel et al only, uncomment lines below:
# rows_not_to_optimize = re_read_embeddings_from_text_file('.vector_cache/w2v.txt', 300, vocab, glove._vocab_namespace)
# glove.weight.register_hook(lambda x: grad_zero(x, rows_not_to_optimize))
### NOTE: ENDS HERE

### Choose your hyperparameter search space here
name_csv = ['C.E. Attention']
batch_size_csv = [32]
p_drop_csv = [0, 0.1, 0.2]
lr_csv = [0.0001, 0.0003, 0.001]
l2p_csv = [0, 1e-4, 3e-4, 1e-3]
### ... or if you want particular values, just use 1-element arrays!
# p_drop_csv = [0.2]
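
# A minimal sketch of the grad_zero hook referenced in the commented-out
# Rocktaschel et al. lines above (hypothetical; the original helper isn't shown
# here). Tensor.register_hook passes the gradient in and uses the returned tensor
# in its place, so zeroing the pretrained rows keeps them fixed while the rest of
# the nominally "trainable" embedding matrix still gets updates.
def grad_zero(grad, rows_not_to_optimize):
    grad = grad.clone()  # hooks must not modify the gradient in place
    grad[list(rows_not_to_optimize)] = 0.0
    return grad

# The *_csv lists above define a grid; crossing them, e.g. with
# itertools.product(batch_size_csv, p_drop_csv, lr_csv, l2p_csv), yields one
# hyperparameter setting per training run.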