Example #1
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
        }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim': 5
        })
        with pytest.raises(ValueError,
                           message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(
                self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(
                archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({'pretrained_file': file_uri, 'embedding_dim': 5})
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i],
                                   vec), 'Problem with format ' + archive_path
Example #2
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
            self.create_instance(["this", "is", "a", "sentence"]),
            self.create_instance(["this", "is", "another", "sentence"]),
            self.create_instance(["yet", "another", "sentence"]),
            self.create_instance([
                "this", "is", "a", "very", "very", "very", "very", "long",
                "sentence"
            ]),
            self.create_instance(["sentence"]),
        ]

        class LazyIterable:
            def __iter__(self):
                return (instance for instance in instances)

        self.instances = instances
        self.lazy_instances = LazyIterable()
Example #3
 def setUp(self):
     super(TestTokenCharactersEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1", "token_characters")
     self.vocab.add_token_to_namespace("2", "token_characters")
     self.vocab.add_token_to_namespace("3", "token_characters")
     self.vocab.add_token_to_namespace("4", "token_characters")
     params = Params({
         "embedding": {
             "embedding_dim": 2,
             "vocab_namespace": "token_characters"
         },
         "encoder": {
             "type": "cnn",
             "embedding_dim": 2,
             "num_filters": 4,
             "ngram_filter_sizes": [1, 2],
             "output_dim": 3
         }
     })
     self.encoder = TokenCharactersEncoder.from_params(
         vocab=self.vocab, params=deepcopy(params))
     self.embedding = Embedding.from_params(vocab=self.vocab,
                                            params=params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
Example #4
 def setUp(self):
     super(TestBasicTextFieldEmbedder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1")
     self.vocab.add_token_to_namespace("2")
     self.vocab.add_token_to_namespace("3")
     self.vocab.add_token_to_namespace("4")
     params = Params({
         "words1": {
             "type": "embedding",
             "embedding_dim": 2
         },
         "words2": {
             "type": "embedding",
             "embedding_dim": 5
         },
         "words3": {
             "type": "embedding",
             "embedding_dim": 3
         }
     })
     self.token_embedder = BasicTextFieldEmbedder.from_params(
         vocab=self.vocab, params=params)
     self.inputs = {
         "words1": torch.LongTensor([[0, 2, 3, 5]]),
         "words2": torch.LongTensor([[1, 4, 3, 2]]),
         "words3": torch.LongTensor([[1, 5, 1, 2]])
     }
Example #5
 def setUp(self):
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("this")
     self.vocab.add_token_to_namespace("is")
     self.vocab.add_token_to_namespace("a")
     self.vocab.add_token_to_namespace("sentence")
     self.vocab.add_token_to_namespace(".")
     self.token_indexer = {"tokens": SingleIdTokenIndexer()}
     self.instances = self.get_instances()
     super(TestDataset, self).setUp()
Example #6
 def setUp(self):
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("sentence", namespace='words')
     self.vocab.add_token_to_namespace("A", namespace='words')
     self.vocab.add_token_to_namespace("A", namespace='characters')
     self.vocab.add_token_to_namespace("s", namespace='characters')
     self.vocab.add_token_to_namespace("e", namespace='characters')
     self.vocab.add_token_to_namespace("n", namespace='characters')
     self.vocab.add_token_to_namespace("t", namespace='characters')
     self.vocab.add_token_to_namespace("c", namespace='characters')
     super(TestTextField, self).setUp()
Example #7
 def test_get_embedding_layer_uses_correct_embedding_dim(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace('word1')
     vocab.add_token_to_namespace('word2')
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
     embedding_weights = _read_pretrained_embeddings_file(
         embeddings_filename, 3, vocab)
     assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
     with pytest.raises(ConfigurationError):
         _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
Example #8
    def test_make_vocab_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        # If extend is False, it is the user's responsibility to make sure that the dataset
        # instances will be indexable by the provided vocabulary. At least @@UNKNOWN@@ should be
        # present in any namespace for which OOV entries could be seen in the dataset during
        # indexing. For the `tokens` namespace new words will be seen, but `tokens` has the
        # @@UNKNOWN@@ token; for the `labels` namespace there is no @@UNKNOWN@@, so 'N' and 'V'
        # must be added upfront.
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        make_vocab_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
Example #9
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #10
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5}, "text2": {"num_tokens": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
Example #11
 def test_tokens_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace('ROOT',
                                               namespace='dep_labels')
     none_index = vocab.add_token_to_namespace('NONE',
                                               namespace='dep_labels')
     indexer = DepLabelIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
         "tokens1": [root_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
         "tokens-1": [none_index]
     }
Example #12
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 attend_feedforward: FeedForward,
                 similarity_function: SimilarityFunction,
                 compare_feedforward: FeedForward,
                 aggregate_feedforward: FeedForward,
                 premise_encoder: Optional[Seq2SeqEncoder] = None,
                 hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(DecomposableAttention, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._attend_feedforward = TimeDistributed(attend_feedforward)
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._compare_feedforward = TimeDistributed(compare_feedforward)
        self._aggregate_feedforward = aggregate_feedforward
        self._premise_encoder = premise_encoder
        self._hypothesis_encoder = hypothesis_encoder or premise_encoder

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        check_dimensions_match(text_field_embedder.get_output_dim(), attend_feedforward.get_input_dim(),
                               "text field embedding dim", "attend feedforward input dim")
        check_dimensions_match(aggregate_feedforward.get_output_dim(), self._num_labels,
                               "final output dimension", "number of labels")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #13
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read(self.FIXTURES_ROOT / 'data' /
                                     'squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_embedders'][
            'token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab=vocab, params=params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #14
 def setUp(self):
     super(TestTrainer, self).setUp()
     self.instances = SequenceTaggingDatasetReader().read(
         self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     vocab = Vocabulary.from_instances(self.instances)
     self.vocab = vocab
     self.model_params = Params({
         "text_field_embedder": {
             "tokens": {
                 "type": "embedding",
                 "embedding_dim": 5
             }
         },
         "encoder": {
             "type": "lstm",
             "input_size": 5,
             "hidden_size": 7,
             "num_layers": 2
         }
     })
     self.model = SimpleTagger.from_params(vocab=self.vocab,
                                           params=self.model_params)
     self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
     self.iterator = BasicIterator(batch_size=2)
     self.iterator.index_with(vocab)
Example #15
    def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token('Second'), Token('.')]
        indices = indexer.tokens_to_indices(tokens, Vocabulary(),
                                            "test-elmo")["test-elmo"]
        padded_tokens = indexer.pad_token_sequence(
            {'test-elmo': indices},
            desired_num_tokens={'test-elmo': 3},
            padding_lengths={})
        expected_padded_tokens = [
            [
                259, 84, 102, 100, 112, 111, 101, 260, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
            ],
            [
                259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
            ],
            [
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0
            ]
        ]

        assert padded_tokens['test-elmo'] == expected_padded_tokens
Example #16
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
     # pylint: disable=arguments-differ
     params.add_file_to_archive('options_file')
     params.add_file_to_archive('weight_file')
     options_file = params.pop('options_file')
     weight_file = params.pop('weight_file')
     requires_grad = params.pop('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)
     namespace_to_cache = params.pop("namespace_to_cache", None)
     if namespace_to_cache is not None:
         vocab_to_cache = list(
             vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
     else:
         vocab_to_cache = None
     projection_dim = params.pop_int("projection_dim", None)
     params.assert_empty(cls.__name__)
     return cls(options_file=options_file,
                weight_file=weight_file,
                do_layer_norm=do_layer_norm,
                dropout=dropout,
                requires_grad=requires_grad,
                projection_dim=projection_dim,
                vocab_to_cache=vocab_to_cache)
Example #17
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
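
A quick usage sketch of the function above. The sentences are made up, and the trailing dimension of 50 is assumed to be ELMo's fixed maximum word length:

sentences = [["First", "sentence", "."], ["Another", "one"]]
character_ids = batch_to_ids(sentences)

# One row per sentence, padded to the longest sentence (3 tokens here); each token
# is represented by ELMo's fixed-width sequence of character ids.
print(character_ids.shape)  # expected: torch.Size([2, 3, 50])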
Example #18
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #19
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir,
                                                    _DEFAULT_WEIGHTS)
        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')
        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file,
                                 map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)
        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
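
A hedged sketch of using the loader above via the public Model.load wrapper (assumed to delegate to _load). The serialization directory is a placeholder and must already contain config.json, a vocabulary/ directory, and a weights file:

import os

from allennlp.common import Params
from allennlp.models import Model

serialization_dir = "output/my_experiment"  # hypothetical directory
config = Params.from_file(os.path.join(serialization_dir, "config.json"))
model = Model.load(config, serialization_dir, cuda_device=-1)
model.eval()  # switch to evaluation mode before running inference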
Example #20
 def _get_vocab_index_mapping(
         self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index,
                                                 namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(
             token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(
                 archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
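
A hypothetical illustration, not taken from the original source, of how a (new_index, archived_index) mapping like the one built above could be used: archived embedding rows are copied only for tokens the two vocabularies share, so genuinely new tokens keep their fresh initialization instead of inheriting UNK's representation:

from typing import List, Tuple

import torch

def copy_mapped_embedding_rows(new_weight: torch.Tensor,
                               archived_weight: torch.Tensor,
                               vocab_index_mapping: List[Tuple[int, int]]) -> torch.Tensor:
    # new_weight: (new_vocab_size, dim); archived_weight: (archived_vocab_size, dim)
    with torch.no_grad():
        for new_index, archived_index in vocab_index_mapping:
            new_weight[new_index].copy_(archived_weight[archived_index])
    return new_weight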
Example #21
class TestTokenCharactersEncoder(AllenNlpTestCase):
    def setUp(self):
        super(TestTokenCharactersEncoder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1", "token_characters")
        self.vocab.add_token_to_namespace("2", "token_characters")
        self.vocab.add_token_to_namespace("3", "token_characters")
        self.vocab.add_token_to_namespace("4", "token_characters")
        params = Params({
            "embedding": {
                "embedding_dim": 2,
                "vocab_namespace": "token_characters"
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 2,
                "num_filters": 4,
                "ngram_filter_sizes": [1, 2],
                "output_dim": 3
            }
        })
        self.encoder = TokenCharactersEncoder.from_params(
            vocab=self.vocab, params=deepcopy(params))
        self.embedding = Embedding.from_params(vocab=self.vocab,
                                               params=params["embedding"])
        self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
        constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
        initializer = InitializerApplicator([(".*", constant_init)])
        initializer(self.encoder)
        initializer(self.embedding)
        initializer(self.inner_encoder)

    def test_get_output_dim_uses_encoder_output_dim(self):
        assert self.encoder.get_output_dim() == 3

    def test_forward_applies_embedding_then_encoder(self):
        numpy_tensor = numpy.random.randint(6, size=(3, 4, 7))
        inputs = torch.from_numpy(numpy_tensor)
        encoder_output = self.encoder(inputs)
        reshaped_input = inputs.view(12, 7)
        embedded = self.embedding(reshaped_input)
        mask = (inputs != 0).long().view(12, 7)
        reshaped_manual_output = self.inner_encoder(embedded, mask)
        manual_output = reshaped_manual_output.view(3, 4, 3)
        assert_almost_equal(encoder_output.data.numpy(),
                            manual_output.data.numpy())
Example #22
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset('embedding',
                                    embeddings.shape,
                                    dtype='float32',
                                    data=embeddings)

        params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
        })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Example #23
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 0
Example #24
 def tokens_to_indices(self, tokens: List[Token],
                       vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
     return {
         "token_ids": ([10, 15] +
                       [vocabulary.get_token_index(token.text, 'words') for token in tokens] +
                       [25]),
         "additional_key": [22, 29]
     }
Example #25
    def test_tokens_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')
        vocab.add_token_to_namespace("s", namespace='characters')
        vocab.add_token_to_namespace("e", namespace='characters')
        vocab.add_token_to_namespace("n", namespace='characters')
        vocab.add_token_to_namespace("t", namespace='characters')
        vocab.add_token_to_namespace("c", namespace='characters')

        indexer = TokenCharactersIndexer("characters")
        indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
        assert indices == {"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]}
Example #26
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
            'question': self.utterance,
            'columns': ['Name in English', 'Location in English'],
            'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']]
        }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace(
            "paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake",
                                                            namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala",
                                                            namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace(
            "-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0",
                                                            namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1",
                                                           namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string',
                                                    namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()
Example #27
def _read_embeddings_from_hdf5(embeddings_filename: str,
                               embedding_dim: int,
                               vocab: Vocabulary,
                               namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads from an hdf5 formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size ``(num_tokens, embedding_dim)``.
    """
    with h5py.File(embeddings_filename, 'r') as fin:
        embeddings = fin['embedding'][...]

    if list(embeddings.shape) != [
            vocab.get_vocab_size(namespace), embedding_dim
    ]:
        raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                list(embeddings.shape),
                [vocab.get_vocab_size(namespace), embedding_dim]))

    return torch.FloatTensor(embeddings)
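
A minimal sketch (file name and sizes are made up) of writing an HDF5 file in the layout the reader above expects: a single 'embedding' dataset of shape (vocab size, embedding_dim):

import h5py
import numpy
from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("word")
embedding_dim = 5
matrix = numpy.random.rand(vocab.get_vocab_size(), embedding_dim).astype("float32")

with h5py.File("embeddings.hdf5", "w") as fout:
    fout.create_dataset("embedding", data=matrix)

weights = _read_embeddings_from_hdf5("embeddings.hdf5", embedding_dim, vocab)
assert tuple(weights.shape) == (vocab.get_vocab_size(), embedding_dim)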
Example #28
 def test_unicode_to_char_ids(self):
     indexer = ELMoTokenCharactersIndexer()
     indices = indexer.tokens_to_indices([Token(chr(256) + 't')],
                                         Vocabulary(), "test-unicode")
     expected_indices = [
         259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
     ]
     assert indices == {"test-unicode": [expected_indices]}
Example #29
 def test_bos_to_char_ids(self):
     indexer = ELMoTokenCharactersIndexer()
     indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(),
                                         "test-elmo")
     expected_indices = [
         259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
     ]
     assert indices == {"test-elmo": [expected_indices]}
Example #30
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()