Example #1
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
 def test_tokens_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
     none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
     indexer = DepLabelIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
def get_vocab(word2freq, max_v_sizes):
    '''Build vocabulary'''
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[:max_v_sizes['word']]:
        vocab.add_token_to_namespace(word, 'tokens')
    log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
    return vocab
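
# A minimal usage sketch for the helper above (it assumes the surrounding module defines `log`);
# the frequency dict and size cap below are made-up values.
word2freq = {"the": 120, "cat": 40, "sat": 35, "rare_word": 1}
max_v_sizes = {'word': 3}
vocab = get_vocab(word2freq, max_v_sizes)
# only the three most frequent words are added to the 'tokens' namespace
# (plus the padding/OOV entries that Vocabulary creates by default)
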
 def test_token_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
     none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
     vocab.add_token_to_namespace('ORG', namespace='ner_tags')
     indexer = NerTagIndexer()
     assert indexer.token_to_indices(tokens[1], vocab) == person_index
     assert indexer.token_to_indices(tokens[-1], vocab) == none_index
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        b_index = vocab.add_token_to_namespace("B", namespace='*labels')
        i_index = vocab.add_token_to_namespace("I", namespace='*labels')
        o_index = vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
        sequence_label_field.index(vocab)

        # pylint: disable=protected-access
        assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index, namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
 def test_token_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
     cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
     none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
     indexer = PosTagIndexer(coarse_tags=True)
     assert indexer.token_to_indices(tokens[1], vocab) == verb_index
     assert indexer.token_to_indices(tokens[-1], vocab) == none_index
     indexer._coarse_tags = False  # pylint: disable=protected-access
     assert indexer.token_to_indices(tokens[1], vocab) == cop_index
 def test_get_embedding_layer_uses_correct_embedding_dim(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace('word1')
     vocab.add_token_to_namespace('word2')
     embeddings_filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
     embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab)
     assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
     with pytest.raises(ConfigurationError):
         _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
    def test_as_tensor_produces_integer_targets(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("B", namespace='*labels')
        vocab.add_token_to_namespace("I", namespace='*labels')
        vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
        sequence_label_field.index(vocab)
        padding_lengths = sequence_label_field.get_padding_lengths()
        tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
Example #10
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Example #12
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
 def test_blank_pos_tag(self):
     tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
     for token in tokens:
         token.pos_ = ""
     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
      # spaCy uses an empty string to indicate "no POS tag"; we convert it to "NONE"
     assert counter["pos_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index('NONE', 'pos_tokens')
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
     assert {"pos": [none_index, none_index, none_index, none_index]} == indices
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
                self.create_instance(["this", "is", "a", "sentence"]),
                self.create_instance(["this", "is", "another", "sentence"]),
                self.create_instance(["yet", "another", "sentence"]),
                self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
                self.create_instance(["sentence"]),
                ]

        class LazyIterable:
            def __iter__(self):
                return (instance for instance in instances)

        self.instances = instances
        self.lazy_instances = LazyIterable()
    def test_adjacency_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("a", namespace="labels")
        vocab.add_token_to_namespace("b", namespace="labels")
        vocab.add_token_to_namespace("c", namespace="labels")

        labels = ["a", "b"]
        indices = [(0, 1), (2, 1)]
        adjacency_field = AdjacencyField(indices, self.text, labels)
        adjacency_field.index(vocab)
        tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
        numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1],
                                                                [-1, 1, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1]]))
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
                'question': self.utterance,
                'columns': ['Name in English', 'Location in English'],
                'cells': [['Paradeniz', 'Mersin'],
                          ['Lake Gala', 'Edirne']]
                }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()
 def setUp(self):
     super(TestTokenCharactersEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1", "token_characters")
     self.vocab.add_token_to_namespace("2", "token_characters")
     self.vocab.add_token_to_namespace("3", "token_characters")
     self.vocab.add_token_to_namespace("4", "token_characters")
     params = Params({
             "embedding": {
                     "embedding_dim": 2,
                     "vocab_namespace": "token_characters"
                     },
             "encoder": {
                     "type": "cnn",
                     "embedding_dim": 2,
                     "num_filters": 4,
                     "ngram_filter_sizes": [1, 2],
                     "output_dim": 3
                     }
             })
     self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
     self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
     # pylint: disable=arguments-differ
     params.add_file_to_archive('options_file')
     params.add_file_to_archive('weight_file')
     options_file = params.pop('options_file')
     weight_file = params.pop('weight_file')
     requires_grad = params.pop('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)
     namespace_to_cache = params.pop("namespace_to_cache", None)
     if namespace_to_cache is not None:
         vocab_to_cache = list(vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
     else:
         vocab_to_cache = None
     projection_dim = params.pop_int("projection_dim", None)
     scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
     params.assert_empty(cls.__name__)
     return cls(options_file=options_file,
                weight_file=weight_file,
                do_layer_norm=do_layer_norm,
                dropout=dropout,
                requires_grad=requires_grad,
                projection_dim=projection_dim,
                vocab_to_cache=vocab_to_cache,
                scalar_mix_parameters=scalar_mix_parameters)
Example #19
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
 def setUp(self):
     super(TestBasicTextFieldEmbedder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1")
     self.vocab.add_token_to_namespace("2")
     self.vocab.add_token_to_namespace("3")
     self.vocab.add_token_to_namespace("4")
     params = Params({
             "words1": {
                     "type": "embedding",
                     "embedding_dim": 2
                     },
             "words2": {
                     "type": "embedding",
                     "embedding_dim": 5
                     },
             "words3": {
                     "type": "embedding",
                     "embedding_dim": 3
                     }
             })
     self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     self.inputs = {
             "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])),
             "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])),
             "words3": Variable(torch.LongTensor([[1, 5, 1, 2]]))
             }
    def __init__(self,
                 vocab: Vocabulary,
                 question_embedder: TextFieldEmbedder,
                 action_embedding_dim: int,
                 encoder: Seq2SeqEncoder,
                 entity_encoder: Seq2VecEncoder,
                 max_decoding_steps: int,
                 use_neighbor_similarity_for_linking: bool = False,
                 dropout: float = 0.0,
                 num_linking_features: int = 10,
                 rule_namespace: str = 'rule_labels',
                 tables_directory: str = '/wikitables/') -> None:
        super(WikiTablesSemanticParser, self).__init__(vocab)
        self._question_embedder = question_embedder
        self._encoder = encoder
        self._entity_encoder = TimeDistributed(entity_encoder)
        self._max_decoding_steps = max_decoding_steps
        self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace
        self._denotation_accuracy = WikiTablesAccuracy(tables_directory)
        self._action_sequence_accuracy = Average()
        self._has_logical_form = Average()

        self._action_padding_index = -1  # the padding value used by IndexField
        num_actions = vocab.get_vocab_size(self._rule_namespace)
        self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
        self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
        self._action_biases = Embedding(num_embeddings=num_actions, embedding_dim=1)

        # This is what we pass as input in the first step of decoding, when we don't have a
        # previous action, or a previous question attention.
        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        self._first_attended_question = torch.nn.Parameter(torch.FloatTensor(encoder.get_output_dim()))
        torch.nn.init.normal_(self._first_action_embedding)
        torch.nn.init.normal_(self._first_attended_question)

        check_dimensions_match(entity_encoder.get_output_dim(), question_embedder.get_output_dim(),
                               "entity word average embedding dim", "question embedding dim")

        self._num_entity_types = 4  # TODO(mattg): get this in a more principled way somehow?
        self._num_start_types = 5  # TODO(mattg): get this in a more principled way somehow?
        self._embedding_dim = question_embedder.get_output_dim()
        self._type_params = torch.nn.Linear(self._num_entity_types, self._embedding_dim)
        self._neighbor_params = torch.nn.Linear(self._embedding_dim, self._embedding_dim)

        if num_linking_features > 0:
            self._linking_params = torch.nn.Linear(num_linking_features, 1)
        else:
            self._linking_params = None

        if self._use_neighbor_similarity_for_linking:
            self._question_entity_params = torch.nn.Linear(1, 1)
            self._question_neighbor_params = torch.nn.Linear(1, 1)
        else:
            self._question_entity_params = None
            self._question_neighbor_params = None
Example #22
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
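
# For orientation: this classmethod is typically reached through the public Model.load
# wrapper. A hedged loading sketch with hypothetical paths:
serialization_dir = '/tmp/my_run'  # hypothetical
config = Params.from_file(os.path.join(serialization_dir, 'config.json'))
model = Model.load(config, serialization_dir=serialization_dir, cuda_device=-1)
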
Example #23
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                              "characters": TokenCharactersIndexer("characters")}
        self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 attend_feedforward: FeedForward,
                 similarity_function: SimilarityFunction,
                 compare_feedforward: FeedForward,
                 aggregate_feedforward: FeedForward,
                 premise_encoder: Optional[Seq2SeqEncoder] = None,
                 hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(DecomposableAttention, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._attend_feedforward = TimeDistributed(attend_feedforward)
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._compare_feedforward = TimeDistributed(compare_feedforward)
        self._aggregate_feedforward = aggregate_feedforward
        self._premise_encoder = premise_encoder
        self._hypothesis_encoder = hypothesis_encoder or premise_encoder

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        check_dimensions_match(text_field_embedder.get_output_dim(), attend_feedforward.get_input_dim(),
                               "text field embedding dim", "attend feedforward input dim")
        check_dimensions_match(aggregate_feedforward.get_output_dim(), self._num_labels,
                               "final output dimension", "number of labels")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #25
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read('tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab, params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
                'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
                'embedding_dim': 300,
                'projection_dim': 20
                })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #27
def _read_embeddings_from_hdf5(embeddings_filename: str,
                               embedding_dim: int,
                               vocab: Vocabulary,
                               namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads from an hdf5-formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size ``(num_tokens, embedding_dim)``.
    """
    with h5py.File(embeddings_filename, 'r') as fin:
        embeddings = fin['embedding'][...]

    if list(embeddings.shape) != [vocab.get_vocab_size(namespace), embedding_dim]:
        raise ConfigurationError(
                "Read shape {0} embeddings from the file, but expected {1}".format(
                        list(embeddings.shape), [vocab.get_vocab_size(namespace), embedding_dim]))

    return torch.FloatTensor(embeddings)
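
# A minimal sketch of producing a file in the layout this reader expects; the path and
# sizes are made up, and num_tokens must equal vocab.get_vocab_size(namespace).
import h5py
import numpy

num_tokens, embedding_dim = 10, 5  # hypothetical sizes
matrix = numpy.random.rand(num_tokens, embedding_dim)
with h5py.File('/tmp/embeddings.hdf5', 'w') as fout:
    fout.create_dataset('embedding', matrix.shape, dtype='float32', data=matrix)
# _read_embeddings_from_hdf5('/tmp/embeddings.hdf5', embedding_dim, vocab) then returns a
# FloatTensor of shape (num_tokens, embedding_dim).
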
Example #28
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable with the provided vocabulary. At the very least,
        # @@UNKNOWN@@ should be present in every namespace that could contain OOV entries
        # seen in the dataset during indexing. New words will be seen in the `tokens`
        # namespace, but it already has an @@UNKNOWN@@ token; the 'labels' namespace has no
        # @@UNKNOWN@@, so 'N' and 'V' have to be added upfront.
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
Example #29
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #30
 def tokens_to_indices(self, tokens: List[Token],
                       vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[int]]: # pylint: disable=unused-argument
      return {
              "token_ids": [10, 15] +
                       [vocabulary.get_token_index(token.text, 'words') for token in tokens] +
                       [25],
              "additional_key": [22, 29]
      }
Example #31
def get_fixtures(include_gold_entities=False,
                 include_lm_labels=True,
                 include_contextual_embeddings=False):
    vocab = Vocabulary.from_params(
        Params({
            "directory_path":
            "tests/fixtures/kg_embeddings/tucker_wordnet/vocabulary",
        }))

    batch = {
        'next_sentence_label':
        torch.tensor([0, 1, 1]),
        'tokens': {
            'tokens':
            torch.tensor([[16, 16, 11, 1, 1, 1, 17, 1, 1, 1],
                          [16, 16, 1, 12, 1, 17, 1, 1, 1, 1],
                          [16, 16, 1, 1, 17, 1, 13, 17, 17, 0]])
        },
        'segment_ids':
        torch.tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
                      [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
                      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]),
        'lm_label_ids': {
            'lm_labels':
            torch.tensor([[0, 1, 0, 0, 13, 0, 1, 1, 13, 0],
                          [0, 0, 1, 0, 0, 2, 1, 1, 13, 0],
                          [0, 1, 1, 0, 1, 1, 0, 0, 0, 0]])
        },
        'candidates': {
            'wordnet': {
                'candidate_entity_priors':
                torch.tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                               [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                              [[0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
                               [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]],
                              [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                               [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]),
                'candidate_entities': {
                    'ids':
                    torch.tensor([[[67, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
                                  [[344, 349, 354, 122, 0],
                                   [101, 46, 445, 25, 28]],
                                  [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]])
                },
                'candidate_segment_ids':
                torch.tensor([[0, 1], [0, 1], [0, 0]]),
                'candidate_spans':
                torch.tensor([[[1, 1], [-1, -1]], [[1, 1], [4, 4]],
                              [[-1, -1], [-1, -1]]])
            }
        }
    }

    if include_gold_entities:
        batch['gold_entities'] = {
            'wordnet': {
                'ids': torch.tensor([[[67], [0]], [[349], [46]], [[0], [0]]])
            }
        }

    if not include_lm_labels:
        del batch['next_sentence_label']
        del batch['lm_label_ids']

    if include_contextual_embeddings:
        batch_size, timesteps = batch['tokens']['tokens'].shape
        batch['contextual_embeddings'] = torch.rand(batch_size, timesteps, 12)
        batch['tokens_mask'] = batch['tokens']['tokens'] > 0
        del batch['tokens']

    return vocab, batch
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
            "words1": {
                "type": "embedding",
                "embedding_dim": 2
            },
            "words2": {
                "type": "embedding",
                "embedding_dim": 5
            },
            "words3": {
                "type": "embedding",
                "embedding_dim": 3
            }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(
            self.vocab, params)
        self.inputs = {
            "words1": torch.LongTensor([[0, 2, 3, 5]]),
            "words2": torch.LongTensor([[1, 4, 3, 2]]),
            "words3": torch.LongTensor([[1, 5, 1, 2]])
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError):
            self.token_embedder(self.inputs)
        self.inputs['words3'] = self.inputs['words4']
        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
            "words": {
                "type": "embedding",
                "num_embeddings": 20,
                "embedding_dim": 2,
            },
            "characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 4,
                    "num_embeddings": 15,
                },
                "encoder": {
                    "type": "cnn",
                    "embedding_dim": 4,
                    "num_filters": 10,
                    "ngram_filter_sizes": [3],
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
            'words': (torch.rand(3, 4, 5, 6) * 20).long(),
            'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
        }
        assert token_embedder(inputs,
                              num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
            "words": {
                "type": "embedding",
                "num_embeddings": 20,
                "embedding_dim": 2,
            },
            "elmo": {
                "type": "elmo_token_embedder",
                "options_file": options_file,
                "weight_file": weight_file
            },
            "embedder_to_indexer_map": {
                "words": ["words"],
                "elmo": ["elmo", "words"]
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
            'words': (torch.rand(3, 6) * 20).long(),
            'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)
Example #33
 def index(self, vocab: Vocabulary):
     self._mapping_array = [
         vocab.get_token_index(x.text, self._target_namespace)
         for x in self._source_tokens
     ]
Example #34
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 att_question_to_choice: SimilarityFunction,
                 question_encoder: Optional[Seq2SeqEncoder] = None,
                 choice_encoder: Optional[Seq2SeqEncoder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 aggregate_question: Optional[str] = "max",
                 aggregate_choice: Optional[str] = "max",
                 embeddings_dropout_value: Optional[float] = 0.0) -> None:
        super(QAMultiChoiceMaxAttention, self).__init__(vocab)

        self._use_cuda = (torch.cuda.is_available()
                          and torch.cuda.current_device() >= 0)

        self._text_field_embedder = text_field_embedder
        if embeddings_dropout_value > 0.0:
            self._embeddings_dropout = torch.nn.Dropout(
                p=embeddings_dropout_value)
        else:
            self._embeddings_dropout = lambda x: x

        self._question_encoder = question_encoder

        # choices encoding
        self._choice_encoder = choice_encoder

        self._question_aggregate = aggregate_question
        self._choice_aggregate = aggregate_choice

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        question_output_dim = self._text_field_embedder.get_output_dim()
        if self._question_encoder is not None:
            question_output_dim = self._question_encoder.get_output_dim()

        choice_output_dim = self._text_field_embedder.get_output_dim()
        if self._choice_encoder is not None:
            choice_output_dim = self._choice_encoder.get_output_dim()

        if question_output_dim != choice_output_dim:
            raise ConfigurationError(
                "Output dimension of the question_encoder (dim: {}) "
                "and choice_encoder (dim: {})"
                "must match! ".format(question_output_dim, choice_output_dim))

        # Check input tensor dimensions for the question to choices attention (similarity function)
        if hasattr(att_question_to_choice, "tensor_1_dim"):
            tensor_1_dim = att_question_to_choice.tensor_1_dim
            if tensor_1_dim != question_output_dim:
                raise ConfigurationError(
                    "Output dimension of the question_encoder (dim: {}) "
                    "and tensor_1_dim (dim: {}) of att_question_to_choice"
                    "must match! ".format(question_output_dim, tensor_1_dim))

        if hasattr(att_question_to_choice, "tensor_2_dim"):
            tensor_2_dim = att_question_to_choice.tensor_2_dim
            if tensor_2_dim != question_output_dim:
                raise ConfigurationError(
                    "Output dimension of the choice_encoder (dim: {}) "
                    "and tensor_2_dim (dim: {}) of att_question_to_choice"
                    "must match! ".format(choice_output_dim, tensor_2_dim))

        self._matrix_attention_question_to_choice = LegacyMatrixAttention(
            att_question_to_choice)

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #35
    def __init__(self,
                 vocab: Vocabulary,
                 encoder: Seq2SeqEncoder,
                 entity_encoder: Seq2VecEncoder,
                 decoder_beam_search: BeamSearch,
                 question_embedder: TextFieldEmbedder,
                 schema_embedder: TextFieldEmbedder,
                 input_attention: Attention,
                 past_attention: Attention,
                 max_decoding_steps: int,
                 action_embedding_dim: int,
                 gnn: bool = True,
                 decoder_use_graph_entities: bool = True,
                 decoder_self_attend: bool = True,
                 gnn_timesteps: int = 2,
                 parse_sql_on_decoding: bool = True,
                 add_action_bias: bool = True,
                 use_neighbor_similarity_for_linking: bool = True,
                 dataset_path: str = 'dataset',
                 training_beam_size: int = None,
                 decoder_num_layers: int = 1,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels',
                 scoring_dev_params: dict = None,
                 debug_parsing: bool = False) -> None:
        super().__init__(vocab)
        self.vocab = vocab
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace
        self._question_embedder = question_embedder
        self._schema_embedder = schema_embedder
        self._add_action_bias = add_action_bias
        self._scoring_dev_params = scoring_dev_params or {}
        self.parse_sql_on_decoding = parse_sql_on_decoding
        self._entity_encoder = TimeDistributed(entity_encoder)
        self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking
        self._self_attend = decoder_self_attend
        self._decoder_use_graph_entities = decoder_use_graph_entities

        self._action_padding_index = -1  # the padding value used by IndexField

        self._exact_match = Average()
        self._sql_evaluator_match = Average()
        self._action_similarity = Average()
        self._acc_single = Average()
        self._acc_multi = Average()
        self._beam_hit = Average()

        self._action_embedding_dim = action_embedding_dim

        num_actions = vocab.get_vocab_size(self._rule_namespace)
        if self._add_action_bias:
            input_action_dim = action_embedding_dim + 1
        else:
            input_action_dim = action_embedding_dim
        self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=input_action_dim)
        self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)

        encoder_output_dim = encoder.get_output_dim()
        if gnn:
            encoder_output_dim += action_embedding_dim

        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        self._first_attended_utterance = torch.nn.Parameter(torch.FloatTensor(encoder_output_dim))
        self._first_attended_output = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding)
        torch.nn.init.normal_(self._first_attended_utterance)
        torch.nn.init.normal_(self._first_attended_output)

        self._num_entity_types = 9
        self._embedding_dim = question_embedder.get_output_dim()

        self._entity_type_encoder_embedding = Embedding(self._num_entity_types, self._embedding_dim)
        self._entity_type_decoder_embedding = Embedding(self._num_entity_types, action_embedding_dim)

        self._linking_params = torch.nn.Linear(16, 1)
        torch.nn.init.uniform_(self._linking_params.weight, 0, 1)

        num_edge_types = 3
        self._gnn = GatedGraphConv(self._embedding_dim, gnn_timesteps, num_edge_types=num_edge_types, dropout=dropout)

        self._decoder_num_layers = decoder_num_layers

        self._beam_search = decoder_beam_search
        self._decoder_trainer = MaximumMarginalLikelihood(training_beam_size)

        if decoder_self_attend:
            self._transition_function = AttendPastSchemaItemsTransitionFunction(encoder_output_dim=encoder_output_dim,
                                                                                action_embedding_dim=action_embedding_dim,
                                                                                input_attention=input_attention,
                                                                                past_attention=past_attention,
                                                                                predict_start_type_separately=False,
                                                                                add_action_bias=self._add_action_bias,
                                                                                dropout=dropout,
                                                                                num_layers=self._decoder_num_layers)
        else:
            self._transition_function = LinkingTransitionFunction(encoder_output_dim=encoder_output_dim,
                                                                  action_embedding_dim=action_embedding_dim,
                                                                  input_attention=input_attention,
                                                                  predict_start_type_separately=False,
                                                                  add_action_bias=self._add_action_bias,
                                                                  dropout=dropout,
                                                                  num_layers=self._decoder_num_layers)

        self._ent2ent_ff = FeedForward(action_embedding_dim, 1, action_embedding_dim, Activation.by_name('relu')())

        self._neighbor_params = torch.nn.Linear(self._embedding_dim, self._embedding_dim)

        # TODO: Remove hard-coded dirs
        self._evaluate_func = partial(evaluate,
                                      db_dir=os.path.join(dataset_path, 'database'),
                                      table=os.path.join(dataset_path, 'tables.json'),
                                      check_valid=False)

        self.debug_parsing = debug_parsing
Example #36
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'Embedding':  # type: ignore
        """
        We need the vocabulary here to know how many items we need to embed, and we look for a
        ``vocab_namespace`` key in the parameter dictionary to know which vocabulary to use.  If
        you know beforehand exactly how many embeddings you need, or aren't using a vocabulary
        mapping for the things getting embedded here, then you can pass in the ``num_embeddings``
        key directly, and the vocabulary will be ignored.

        In the configuration file, a file containing pretrained embeddings can be specified
        using the parameter ``"pretrained_file"``.
        It can be the path to a local file or a URL of a (cached) remote file.
        Two formats are supported:

            * hdf5 file - containing an embedding matrix in the form of a torch.Tensor;

            * text file - a UTF-8 encoded text file with space-separated fields::

                    [word] [dim 1] [dim 2] ...

              The text file can optionally be compressed with gzip, bz2, lzma or zip.
              You can even select a single file inside an archive containing multiple files
              using the URI::

                    "(archive_uri)#file_path_inside_the_archive"

              where ``archive_uri`` can be a file system path or a URL. For example::

                    "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
        """
        # pylint: disable=arguments-differ
        num_embeddings = params.pop_int('num_embeddings', None)
        vocab_namespace = params.pop("vocab_namespace", "tokens")
        if num_embeddings is None:
            num_embeddings = vocab.get_vocab_size(vocab_namespace)
        embedding_dim = params.pop_int('embedding_dim')
        pretrained_file = params.pop("pretrained_file", None)
        projection_dim = params.pop_int("projection_dim", None)
        trainable = params.pop_bool("trainable", True)
        padding_index = params.pop_int('padding_index', None)
        max_norm = params.pop_float('max_norm', None)
        norm_type = params.pop_float('norm_type', 2.)
        scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
        sparse = params.pop_bool('sparse', False)
        params.assert_empty(cls.__name__)

        if pretrained_file:
            # If we're loading a saved model, we don't want to actually read a pre-trained
            # embedding file - the embeddings will just be in our saved weights, and we might not
            # have the original embedding file anymore, anyway.
            weight = _read_pretrained_embeddings_file(pretrained_file,
                                                      embedding_dim, vocab,
                                                      vocab_namespace)
        else:
            weight = None

        return cls(num_embeddings=num_embeddings,
                   embedding_dim=embedding_dim,
                   projection_dim=projection_dim,
                   weight=weight,
                   padding_index=padding_index,
                   trainable=trainable,
                   max_norm=max_norm,
                   norm_type=norm_type,
                   scale_grad_by_freq=scale_grad_by_freq,
                   sparse=sparse)
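
# A hedged configuration sketch for the archive-URI form described in the docstring above;
# the URL and inner file name are just the ones quoted there, and `vocab` is assumed to
# have been built beforehand.
params = Params({
        'pretrained_file': '(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt',
        'embedding_dim': 200
        })
embedding = Embedding.from_params(vocab, params)
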
Example #37
def _read_embeddings_from_text_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a text file (possibly compressed, and possibly contained
    inside an archive with multiple files). The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match `embedding_dim` trigger a warning and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # field lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
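As a rough usage sketch (file name and tokens are made up, not from the source), the reader above can be driven directly on a tiny hand-written embeddings file; tokens with malformed lines or missing from the file keep their randomly initialized rows:

import os
import tempfile

import torch
from allennlp.data import Vocabulary
# Assumption: the helper is importable from this module, as in released AllenNLP versions.
from allennlp.modules.token_embedders.embedding import _read_embeddings_from_text_file

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "toy.3d.txt")           # hypothetical file name
    with open(path, "w", encoding="utf-8") as f:
        f.write("cat 0.1 0.2 0.3\n")
        f.write("dog 0.4 0.5\n")                     # wrong dimension: warned about and skipped

    vocab = Vocabulary()
    vocab.add_token_to_namespace("cat")
    vocab.add_token_to_namespace("dog")              # its line is malformed, so it keeps a random row
    vocab.add_token_to_namespace("unseen")           # not in the file at all, also random

    matrix = _read_embeddings_from_text_file(path, embedding_dim=3, vocab=vocab)
    assert torch.allclose(matrix[vocab.get_token_index("cat")],
                          torch.tensor([0.1, 0.2, 0.3]))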
Beispiel #38
0
    def _load(
        cls, config: Params, serialization_dir: str, weights_file: str = None, cuda_device: int = -1
    ) -> "Model":
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        # If the config specifies a vocabulary subclass, we need to use it.
        vocab_params = config.get("vocabulary", Params({}))
        vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
        vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
        vocab = vocab_class.from_files(
            vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
        )

        model_params = config.get("model")

        training_params = config.get("trainer", Params({}))
        opt_level = training_params.get("opt_level")

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        # If the model was trained with amp and amp is available, we should re-initialize it with
        # the opt_level that was used. If the model was trained with amp but amp is not available, log a warning
        # so this doesn't pass silently.
        if opt_level is not None:
            if amp is None:
                logger.warning(
                    (
                        f"This model was trained with amp (opt_level: {opt_level}) but amp is not available."
                        " Any further training or inference will happen at full-precision."
                    )
                )
            else:
                model = amp.initialize(model, opt_level=opt_level)

        # If vocab+embedding extension was done, the model initialized via from_params
        # and the one defined by the state dict in weights_file might not have the same embedding shapes.
        # E.g., when the model's embedder module was transferred along with a vocab extension, the
        # initialized embedding weight shape would be smaller than the one in the state_dict.
        # So the model's embedding extension must be called before load_state_dict.
        # If the vocab and model embeddings are already in sync, the following is just a no-op.
        model.extend_embedder_vocab()

        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        return model
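For context, here is a hedged sketch of how this private classmethod is normally reached, through the public `Model.load` wrapper, using a placeholder serialization directory (the paths are hypothetical):

import os

from allennlp.common import Params
from allennlp.models import Model

serialization_dir = "/path/to/serialization_dir"    # hypothetical path
config = Params.from_file(os.path.join(serialization_dir, "config.json"))
model = Model.load(config, serialization_dir=serialization_dir, cuda_device=-1)
model.eval()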
Beispiel #39
0
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace(u"sentence", namespace=u'words')
        self.vocab.add_token_to_namespace(u"A", namespace=u'words')
        self.vocab.add_token_to_namespace(u"A", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"s", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"e", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"n", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"t", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"c", namespace=u'characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"words"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"characters"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert set(namespace_token_counts.keys()) == set([u"words", u"characters"])

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace(u"sentence", namespace=u'words')
        capital_a_index = vocab.add_token_to_namespace(u"A", namespace=u'words')
        capital_a_char_index = vocab.add_token_to_namespace(u"A", namespace=u'characters')
        s_index = vocab.add_token_to_namespace(u"s", namespace=u'characters')
        e_index = vocab.add_token_to_namespace(u"e", namespace=u'characters')
        n_index = vocab.add_token_to_namespace(u"n", namespace=u'characters')
        t_index = vocab.add_token_to_namespace(u"t", namespace=u'characters')
        c_index = vocab.add_token_to_namespace(u"c", namespace=u'characters')

        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          {u"words": SingleIdTokenIndexer(namespace=u"words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens[u"words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in [u"A", u"sentence"]],
                           {u"characters": TokenCharactersIndexer(namespace=u"characters")})
        field1.index(vocab)
        assert field1._indexed_tokens[u"characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in [u"A", u"sentence"]],
                           token_indexers={u"words": SingleIdTokenIndexer(namespace=u"words"),
                                           u"characters": TokenCharactersIndexer(namespace=u"characters")})
        field2.index(vocab)
        assert field2._indexed_tokens[u"words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens[u"characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5}

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8}

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters"),
                                          u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths[u"num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                                expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in [u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths[u"num_tokens"] = 5
        padding_lengths[u"num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          {u"words": SingleIdTokenIndexer(namespace=u"words")})
        print(field)

    def test_token_embedder_returns_dict(self):
        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          token_indexers={u"field_with_dict": DictReturningTokenIndexer(),
                                          u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
                u'token_ids': 5,
                u'additional_key': 2,
                u'words': 2,
                u'characters': 2,
                u'num_token_characters': 8
        }
        padding_lengths[u'token_ids'] = 7
        padding_lengths[u'additional_key'] = 3
        padding_lengths[u'words'] = 4
        padding_lengths[u'characters'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors[u'token_ids'].shape) == [7]
        assert list(tensors[u'additional_key'].shape) == [3]
        assert list(tensors[u'words'].shape) == [4]
        assert list(tensors[u'characters'].shape) == [4, 8]
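Outside of these unit tests, a TextField is rarely tensorized by hand. A small sketch of the more typical path (assuming the pre-1.0 import layout these tests use, where `Batch` lives in `allennlp.data.dataset` and tensor dicts are flat per indexer) wraps the field in an Instance and lets a Batch compute padding and tensors:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
for word in ["A", "sentence"]:
    vocab.add_token_to_namespace(word, namespace="words")

field = TextField([Token(t) for t in ["A", "sentence"]],
                  {"words": SingleIdTokenIndexer(namespace="words")})
batch = Batch([Instance({"text": field})])
batch.index_instances(vocab)
tensors = batch.as_tensor_dict(batch.get_padding_lengths())
print(tensors["text"]["words"])  # a (1, 2) tensor of word ids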
Beispiel #40
0
## Adjust these parameters so that memory does not blow up on my little server
cf_a.datareader_lazy = True  # Force laziness for RAM optimization
cf_a.batch_size_train = 30
cf_a.batch_size_validation = 30
cf_a.force_free_batch_memory = False
max_instances_in_memory = 1000
print_conf_params(cf_a)

folder_images += "Eta_" + str(cf_a.eta_KL) + "_DOr_" + str(cf_a.spans_output_dropout) + \
    "_sigma_" + str(round(np.exp(cf_a.VB_span_end_predictor_linear_prior["log_sigma1"]), 3))
"""
##################################################################
############ INSTANTIATE DATAREADER AND LOAD DATASET ############
##################################################################
"""
vocab = Vocabulary()
"""
########################################################
################# INSTANTIATE THE MODEL ###################
"""
if (Experiments_instantiate_model):
    print("Initializing Model architecture")
    model = BidirectionalAttentionFlow_1(vocab, cf_a)
    print("Loading previous model")
    model.load_state_dict(torch.load(model_file_path))

#model.trim_model(4)


def plots_weights_layer(mu_W,
                        sigma_W,
Beispiel #41
0
# coding=utf-8
# @Author: 莫冉
# @Date: 2020-08-08
"""
Test loading an allennlp vocabulary file
"""
from pathlib import Path
from allennlp.data import Vocabulary

basename = "/home/zs261988/"
save_path = "data/vocab/bert_vocabulary"
vocab_file = "models/ptms/albert_void_tiny/vocab.txt"

# vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
# # #
# # # Load the BERT vocabulary
# vocab.set_from_file(Path(basename) / vocab_file, oov_token="[UNK]")
# # #
# vocab.save_to_files(Path(basename) / save_path)
#
# Load the previously saved vocabulary
vocab = Vocabulary.from_files(Path(basename) / save_path,
                              padding_token="[PAD]",
                              oov_token="[UNK]")

print("oov_token: ", vocab._oov_token, vocab.get_token_index(vocab._oov_token))
print("padding_token: ", vocab._padding_token,
      vocab.get_token_index(vocab._padding_token))
Beispiel #42
0
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 question_encoder: Optional[Seq2SeqEncoder] = None,
                 choice_encoder: Optional[Seq2SeqEncoder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 aggregate_question: Optional[str] = "max",
                 aggregate_choice: Optional[str] = "max",
                 embeddings_dropout_value: Optional[float] = 0.0,
                 share_encoders: Optional[bool] = False,
                 choices_init_from_question_states: Optional[bool] = False,
                 use_choice_sum_instead_of_question: Optional[bool] = False,
                 params=Params) -> None:
        super(QAMultiChoice_OneVsRest_Choices_v1, self).__init__(vocab)

        # TODO: AllenNLP does not support stateful RNNs yet.
        init_is_supported = False
        if not init_is_supported and (choices_init_from_question_states):
            raise ValueError(
                "choices_init_from_question_states=True or facts_init_from_question_states=True are not supported yet!"
            )
        else:
            self._choices_init_from_question_states = choices_init_from_question_states

        self._use_cuda = (torch.cuda.is_available()
                          and torch.cuda.current_device() >= 0)

        self._return_question_to_choices_att = False
        self._use_choice_sum_instead_of_question = use_choice_sum_instead_of_question

        self._params = params

        self._text_field_embedder = text_field_embedder
        if embeddings_dropout_value > 0.0:
            self._embeddings_dropout = torch.nn.Dropout(
                p=embeddings_dropout_value)
        else:
            self._embeddings_dropout = lambda x: x

        self._question_encoder = question_encoder

        # choices encoding
        self._choice_encoder = choice_encoder

        self._question_aggregate = aggregate_question
        self._choice_aggregate = aggregate_choice

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        question_output_dim = self._text_field_embedder.get_output_dim()
        if self._question_encoder is not None:
            question_output_dim = self._question_encoder.get_output_dim()

        choice_output_dim = self._text_field_embedder.get_output_dim()
        if self._choice_encoder is not None:
            choice_output_dim = self._choice_encoder.get_output_dim()

        if question_output_dim != choice_output_dim:
            raise ConfigurationError(
                "Output dimension of the question_encoder (dim: {}), "
                "plus choice_encoder (dim: {})"
                "must match! ".format(question_output_dim, choice_output_dim))

        # question to choice attention
        att_question_to_choice_params = params.get("att_question_to_choice")
        if "tensor_1_dim" in att_question_to_choice_params:
            att_question_to_choice_params = update_params(
                att_question_to_choice_params, {
                    "tensor_1_dim": question_output_dim,
                    "tensor_2_dim": choice_output_dim
                })
        self._matrix_attention_question_to_choice = LegacyMatrixAttention(
            SimilarityFunction.from_params(att_question_to_choice_params))

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Beispiel #43
0
    def setup_method(self):
        super().setup_method()
        self.reader = TransformerSquadReader(length_limit=50, stride=10)
        self.vocab = Vocabulary()
        self.model = TransformerQA(self.vocab)
        self.predictor = TransformerQAPredictor(self.model, self.reader)
Beispiel #44
0
    def __init__(self,
                 vocab: Vocabulary,
                 mention_feedforward: FeedForward,
                 relation_feedforward: FeedForward,
                 feature_size: int,
                 spans_per_word: float,
                 span_emb_dim: int,
                 use_biaffine_rel: bool,
                 rel_prop: int = 0,
                 rel_prop_dropout_A: float = 0.0,
                 rel_prop_dropout_f: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 positive_label_weight: float = 1.0,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(RelationExtractor, self).__init__(vocab, regularizer)

        # Need to hack this for cases where there's no relation data. It breaks Ulme's code.
        self._n_labels = max(vocab.get_vocab_size("relation_labels"), 1)

        # Span candidate scorer.
        # TODO(dwadden) make sure I've got the input dim right on this one.
        feedforward_scorer = torch.nn.Sequential(
            TimeDistributed(mention_feedforward),
            TimeDistributed(
                torch.nn.Linear(mention_feedforward.get_output_dim(), 1)))
        self._mention_pruner = Pruner(feedforward_scorer)

        # Relation scorer.
        self._use_biaffine_rel = use_biaffine_rel
        if self._use_biaffine_rel:
            self._biaffine = torch.nn.Linear(span_emb_dim, span_emb_dim)
        else:
            self._relation_feedforward = relation_feedforward
            self._relation_scorer = torch.nn.Linear(
                relation_feedforward.get_output_dim(), self._n_labels)

        self._spans_per_word = spans_per_word

        # TODO(dwadden) Add code to compute relation F1.
        # self._candidate_recall = CandidateRecall()
        self._relation_metrics = RelationMetrics1()

        class_weights = torch.cat([
            torch.tensor([1.0]),
            positive_label_weight * torch.ones(self._n_labels)
        ])
        self._loss = torch.nn.CrossEntropyLoss(reduction="sum",
                                               ignore_index=-1,
                                               weight=class_weights)
        self.rel_prop = rel_prop

        # Relation Propagation
        self._A_network = FeedForward(input_dim=self._n_labels,
                                      num_layers=1,
                                      hidden_dims=span_emb_dim,
                                      activations=lambda x: x,
                                      dropout=rel_prop_dropout_A)
        self._f_network = FeedForward(input_dim=2 * span_emb_dim,
                                      num_layers=1,
                                      hidden_dims=span_emb_dim,
                                      activations=torch.nn.Sigmoid(),
                                      dropout=rel_prop_dropout_f)

        initializer(self)
Beispiel #45
0
    def __init__(
            self,
            vocab: Vocabulary,
            trigger_feedforward: FeedForward,
            trigger_candidate_feedforward: FeedForward,
            mention_feedforward: FeedForward,  # Used if entity beam is off.
            argument_feedforward: FeedForward,
            context_attention: BilinearMatrixAttention,
            trigger_attention: Seq2SeqEncoder,
            span_prop: SpanProp,
            cls_projection: FeedForward,
            feature_size: int,
            trigger_spans_per_word: float,
            argument_spans_per_word: float,
            loss_weights,
            trigger_attention_context: bool,
            event_args_use_trigger_labels: bool,
            event_args_use_ner_labels: bool,
            event_args_label_emb: int,
            shared_attention_context: bool,
            label_embedding_method: str,
            event_args_label_predictor: str,
            event_args_gold_candidates: bool = False,  # If True, use gold argument candidates.
            context_window: int = 0,
            softmax_correction: bool = False,
            initializer: InitializerApplicator = InitializerApplicator(),
            positive_label_weight: float = 1.0,
            entity_beam: bool = False,
            regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(EventExtractor, self).__init__(vocab, regularizer)

        self._n_ner_labels = vocab.get_vocab_size("ner_labels")
        self._n_trigger_labels = vocab.get_vocab_size("trigger_labels")
        self._n_argument_labels = vocab.get_vocab_size("argument_labels")

        # Embeddings for trigger labels and ner labels, to be used by argument scorer.
        # These will be either one-hot encodings or learned embeddings, depending on "kind".
        self._ner_label_emb = make_embedder(kind=label_embedding_method,
                                            num_embeddings=self._n_ner_labels,
                                            embedding_dim=event_args_label_emb)
        self._trigger_label_emb = make_embedder(
            kind=label_embedding_method,
            num_embeddings=self._n_trigger_labels,
            embedding_dim=event_args_label_emb)
        self._label_embedding_method = label_embedding_method

        # Weight on trigger labeling and argument labeling.
        self._loss_weights = loss_weights

        # Trigger candidate scorer.
        null_label = vocab.get_token_index("", "trigger_labels")
        assert null_label == 0  # If not, the dummy class won't correspond to the null label.

        self._trigger_scorer = torch.nn.Sequential(
            TimeDistributed(trigger_feedforward),
            TimeDistributed(
                torch.nn.Linear(trigger_feedforward.get_output_dim(),
                                self._n_trigger_labels - 1)))

        self._trigger_attention_context = trigger_attention_context
        if self._trigger_attention_context:
            self._trigger_attention = trigger_attention

        # Make pruners. If `entity_beam` is true, use NER and trigger scorers to construct the beam
        # and only keep candidates that the model predicts are actual entities or triggers.
        self._mention_pruner = make_pruner(
            mention_feedforward,
            entity_beam=entity_beam,
            gold_beam=event_args_gold_candidates)
        self._trigger_pruner = make_pruner(trigger_candidate_feedforward,
                                           entity_beam=entity_beam,
                                           gold_beam=False)

        # Argument scorer.
        self._event_args_use_trigger_labels = event_args_use_trigger_labels  # If True, use trigger labels.
        self._event_args_use_ner_labels = event_args_use_ner_labels  # If True, use ner labels to predict args.
        assert event_args_label_predictor in [
            "hard", "softmax", "gold"
        ]  # Method for predicting labels at test time.
        self._event_args_label_predictor = event_args_label_predictor
        self._event_args_gold_candidates = event_args_gold_candidates
        # If set to True, then construct a context vector from a bilinear attention over the trigger
        # / argument pair embeddings and the text.
        self._context_window = context_window  # If greater than 0, concatenate context as features.
        self._argument_feedforward = argument_feedforward
        self._argument_scorer = torch.nn.Linear(
            argument_feedforward.get_output_dim(), self._n_argument_labels)

        # Distance embeddings.
        self._num_distance_buckets = 10  # Just use 10 which is the default.
        self._distance_embedding = Embedding(self._num_distance_buckets,
                                             feature_size)

        # Class token projection.
        self._cls_projection = cls_projection
        self._cls_n_triggers = torch.nn.Linear(
            self._cls_projection.get_output_dim(), 5)
        self._cls_event_types = torch.nn.Linear(
            self._cls_projection.get_output_dim(), self._n_trigger_labels - 1)

        self._trigger_spans_per_word = trigger_spans_per_word
        self._argument_spans_per_word = argument_spans_per_word

        # Context attention for event argument scorer.
        self._shared_attention_context = shared_attention_context
        if self._shared_attention_context:
            self._shared_attention_context_module = context_attention

        # Span propagation object.
        # TODO(dwadden) initialize with `from_params` instead if this ends up working.
        self._span_prop = span_prop
        self._span_prop._trig_arg_embedder = self._compute_trig_arg_embeddings
        self._span_prop._argument_scorer = self._compute_argument_scores

        # Softmax correction parameters.
        self._softmax_correction = softmax_correction
        self._softmax_log_temp = torch.nn.Parameter(
            torch.zeros([1, 1, 1, self._n_argument_labels]))
        self._softmax_log_multiplier = torch.nn.Parameter(
            torch.zeros([1, 1, 1, self._n_argument_labels]))

        # TODO(dwadden) Add metrics.
        self._metrics = EventMetrics()
        self._argument_stats = ArgumentStats()

        self._trigger_loss = torch.nn.CrossEntropyLoss(reduction="sum")
        # TODO(dwadden) add loss weights.
        self._argument_loss = torch.nn.CrossEntropyLoss(reduction="sum",
                                                        ignore_index=-1)
        initializer(self)
Beispiel #46
0
def run_evaluation(evaluation_file,
                   model_archive_file,
                   is_wordnet_and_wiki=False):
    archive = load_archive(model_archive_file)

    params = archive.config
    vocab = Vocabulary.from_params(params.pop('vocabulary'))

    model = archive.model
    #model.cuda()
    model.eval()

    if is_wordnet_and_wiki:
        reader_params = Params({
            "type": "aida_wiki_linking",
            "entity_disambiguation_only": False,
            "entity_indexer": {
                "type": "characters_tokenizer",
                "namespace": "entity_wiki",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {
                        "type": "just_spaces"
                    }
                }
            },
            "extra_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "s3://allennlp/knowbert/wordnet/entities.jsonl"
                }
            },
            "should_remap_span_indices": True,
            "token_indexers": {
                "tokens": {
                    "type": "bert-pretrained",
                    "do_lowercase": True,
                    "max_pieces": 512,
                    "pretrained_model": "bert-base-uncased",
                    "use_starting_offsets": True,
                }
            }
        })
    else:
        reader_params = Params({
            "type": "aida_wiki_linking",
            "entity_disambiguation_only": False,
            "token_indexers": {
                "tokens": {
                    "type": "bert-pretrained",
                    "pretrained_model": "bert-base-uncased",
                    "do_lowercase": True,
                    "use_starting_offsets": True,
                    "max_pieces": 512,
                },
            },
            "entity_indexer": {
                "type": "characters_tokenizer",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {
                        "type": "just_spaces"
                    },
                },
                "namespace": "entity",
            },
            "should_remap_span_indices": True,
        })

    if is_wordnet_and_wiki:
        cg_params = Params({
            "type": "bert_tokenizer_and_candidate_generator",
            "bert_model_type": "bert-base-uncased",
            "do_lower_case": True,
            "entity_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "s3://allennlp/knowbert/wordnet/entities.jsonl"
                }
            },
            "entity_indexers": {
                "wordnet": {
                    "type": "characters_tokenizer",
                    "namespace": "entity_wordnet",
                    "tokenizer": {
                        "type": "word",
                        "word_splitter": {
                            "type": "just_spaces"
                        }
                    }
                }
            }
        })
        candidate_generator = TokenizerAndCandidateGenerator.from_params(
            cg_params)

    reader = DatasetReader.from_params(Params(reader_params))

    iterator = DataIterator.from_params(
        Params({
            "type": "basic",
            "batch_size": 16
        }))
    iterator.index_with(vocab)

    instances = reader.read(evaluation_file)

    for batch_no, batch in enumerate(
            iterator(instances, shuffle=False, num_epochs=1)):
        b = move_to_device(batch, -1)

        b['candidates'] = {
            'wiki': {
                'candidate_entities': b.pop('candidate_entities'),
                'candidate_entity_priors': b.pop('candidate_entity_prior'),
                'candidate_segment_ids': b.pop('candidate_segment_ids'),
                'candidate_spans': b.pop('candidate_spans')
            }
        }
        gold_entities = b.pop('gold_entities')
        b['gold_entities'] = {'wiki': gold_entities}

        if is_wordnet_and_wiki:
            extra_candidates = b.pop('extra_candidates')
            seq_len = b['tokens']['tokens'].shape[1]
            bbb = []
            for e in extra_candidates:
                for k in e.keys():
                    e[k]['candidate_segment_ids'] = [0] * len(
                        e[k]['candidate_spans'])
                ee = {
                    'tokens': ['[CLS]'] * seq_len,
                    'segment_ids': [0] * seq_len,
                    'candidates': e
                }
                ee_fields = candidate_generator.convert_tokens_candidates_to_fields(
                    ee)
                bbb.append(Instance(ee_fields))
            eb = Batch(bbb)
            eb.index_instances(vocab)
            padding_lengths = eb.get_padding_lengths()
            tensor_dict = eb.as_tensor_dict(padding_lengths)
            b['candidates'].update(tensor_dict['candidates'])
            bb = move_to_device(b, -1)
        else:
            bb = b

        loss = model(**bb)
        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
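A hedged invocation sketch with placeholder arguments (the evaluation file and model archive below are hypothetical, not part of the original snippet):

run_evaluation(
    evaluation_file="/path/to/aida_test_split.txt",       # hypothetical path
    model_archive_file="/path/to/knowbert_model.tar.gz",   # hypothetical path
    is_wordnet_and_wiki=True,
)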
Beispiel #47
0
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })
        self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        self.inputs = {
                "words1": torch.LongTensor([[0, 2, 3, 5]]),
                "words2": torch.LongTensor([[1, 4, 3, 2]]),
                "words3": torch.LongTensor([[1, 5, 1, 2]])
                }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs['words3'] = self.inputs['words4']

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("is generating more keys")

        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "characters": {
                                "type": "character_encoding",
                                "embedding": {
                                        "embedding_dim": 4,
                                        "num_embeddings": 15,
                                        },
                                "encoder": {
                                        "type": "cnn",
                                        "embedding_dim": 4,
                                        "num_filters": 10,
                                        "ngram_filter_sizes": [3],
                                        },
                                }
                        }
                })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        inputs = {
                'words': (torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
                }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                                },
                        },
                "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]}
                })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
                }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_null(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {
                        # ignore `word_inputs` in `ElmoTokenEmbedder.forward`
                        "elmo": ["elmo", None]
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_dict(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                        },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {
                        # pass arguments to `ElmoTokenEmbedder.forward` by dict
                        "elmo": {
                                "inputs": "elmo",
                                "word_inputs": "words"
                        },
                        "words": ["words"]
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_old_from_params_new_from_params(self):
        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys()

        assert new_embedder(self.inputs).size() == (1, 4, 10)
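A compact sketch, separate from the test fixtures above, of building the same kind of embedder programmatically; it assumes a pre-1.0 AllenNLP where the constructor takes a plain dict of TokenEmbedders and forward takes a flat dict of index tensors, as in these tests:

import torch
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

embedder = BasicTextFieldEmbedder({
    "words1": Embedding(num_embeddings=10, embedding_dim=2),
    "words2": Embedding(num_embeddings=10, embedding_dim=5),
})
inputs = {
    "words1": torch.LongTensor([[0, 2, 3]]),
    "words2": torch.LongTensor([[1, 4, 3]]),
}
assert embedder.get_output_dim() == 7          # 2 + 5, per-embedder outputs are concatenated
assert embedder(inputs).size() == (1, 3, 7)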
Beispiel #48
0
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {
            "num_fields": 3,
            "list_words_length": 5,
            "list_num_tokens": 5
        }

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField(
            [self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]))

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField(
            [self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1], [1], [-1]]))

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField([
            self.sequence_label_field, self.sequence_label_field,
            self.empty_sequence_label_field
        ])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(),
            numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]]))

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][0].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][1].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][2].detach().cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0]))

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField(
            [LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField(
            [LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = ListField(
            [nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(
            tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1],
                     [5, 6, 7, 8, 9, 10]])

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words_length"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][0].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][1].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][2].detach().cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][3].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][4].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(
            words,
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [1, 1, 1, 1, 3, 1, 3, 4, 5],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 4, 1, 5, 1, 3, 1, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(
            self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField(
            [self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(
            words,
            numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 5, 0], [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0],
                                                numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [1, 1, 1, 1, 3, 1, 3, 4, 5],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)

    def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])

        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field
                ] == [self.field1, self.field2, self.field3]
Example #49
    def extend_vocab(
        self,
        extended_vocab: Vocabulary,
        vocab_namespace: str = None,
        extension_pretrained_file: str = None,
        model_path: str = None,
    ):
        """
        Extends the embedding matrix according to the extended vocabulary.
        If extension_pretrained_file is available, it will be used to initialize the embeddings of
        the new words in the extended vocabulary; otherwise we fall back to the _pretrained_file
        attribute, if it is set. If neither is available, the new embeddings are initialized with
        Xavier uniform.

        # Parameters

        extended_vocab : `Vocabulary`
            Vocabulary extended from original vocabulary used to construct
            this `Embedding`.
        vocab_namespace : `str`, (optional, default=`None`)
            If you know which vocab_namespace should be used for the extension, you can
            pass it. If not passed, the vocab_namespace recorded at the time of `Embedding`
            construction is used if available; otherwise extend_vocab is a no-op.
        extension_pretrained_file : `str`, (optional, default=`None`)
            A file containing pretrained embeddings. It can be the path to a local file
            or a URL of a (cached) remote file. See `from_params` of the `Embedding`
            class for format details.
        model_path : `str`, (optional, default=`None`)
            Path traversing the model attributes up to this embedding module,
            e.g. "_text_field_embedder.token_embedder_tokens". This is only used
            to give a helpful error message when extend_vocab is implicitly called
            by train or any other command.
        """
        # Caveat: For allennlp v0.8.1 and below, vocab_namespace was not stored as an attribute,
        # and knowing it is necessary at the time of embedding vocab extension. So models from old
        # archives currently cannot be extended.

        vocab_namespace = vocab_namespace or self._vocab_namespace
        if not vocab_namespace:
            # It's not safe to default to "tokens" or any other namespace.
            logging.info(
                "Loading a model trained before embedding extension was implemented; "
                "pass an explicit vocab namespace if you want to extend the vocabulary."
            )
            return

        extended_num_embeddings = extended_vocab.get_vocab_size(
            vocab_namespace)
        if extended_num_embeddings == self.num_embeddings:
            # It has already been extended, so there is no need to initialize or read a pretrained file here (no-op).
            return

        if extended_num_embeddings < self.num_embeddings:
            raise ConfigurationError(
                f"Size of namespace {vocab_namespace} for extended_vocab is smaller than the "
                f"embedding. You likely passed an incorrect vocab or namespace for extension."
            )

        # Case 1: user passed extension_pretrained_file and it's available.
        if extension_pretrained_file and is_url_or_existing_file(
                extension_pretrained_file):
            # Don't have to do anything here, this is the happy case.
            pass
        # Case 2: user passed extension_pretrained_file and it's not available
        elif extension_pretrained_file:
            raise ConfigurationError(
                f"You passed pretrained embedding file {extension_pretrained_file} "
                f"for model_path {model_path} but it's not available.")
        # Case 3: user didn't pass extension_pretrained_file, but pretrained_file attribute was
        # saved during training and is available.
        elif is_url_or_existing_file(self._pretrained_file):
            extension_pretrained_file = self._pretrained_file
        # Case 4: no file is available, hope that pretrained embeddings weren't used in the first place and warn
        else:
            extra_info = (f"Originally pretrained_file was at "
                          f"{self._pretrained_file}. "
                          if self._pretrained_file else "")
            # It's better to warn here than to raise an error, because there is no way to distinguish
            # whether the pretrained-file simply wasn't used during training or the user forgot to
            # pass / passed an incorrect mapping. Raising an error would prevent fine-tuning in the
            # former case.
            logging.warning(
                f"Embedding at model_path, {model_path} cannot locate the pretrained_file. "
                f"{extra_info} If you are fine-tuning and want to use a pretrained_file for "
                f"embedding extension, please pass the mapping via the --embedding-sources argument."
            )

        embedding_dim = self.weight.data.shape[-1]
        if not extension_pretrained_file:
            extra_num_embeddings = extended_num_embeddings - self.num_embeddings
            extra_weight = torch.FloatTensor(extra_num_embeddings,
                                             embedding_dim)
            torch.nn.init.xavier_uniform_(extra_weight)
        else:
            # It's easiest to just reload the embeddings for the entire vocab,
            # then only keep the ones we need.
            whole_weight = _read_pretrained_embeddings_file(
                extension_pretrained_file, embedding_dim, extended_vocab,
                vocab_namespace)
            extra_weight = whole_weight[self.num_embeddings:, :]

        device = self.weight.data.device
        extended_weight = torch.cat(
            [self.weight.data, extra_weight.to(device)], dim=0)
        self.weight = torch.nn.Parameter(
            extended_weight, requires_grad=self.weight.requires_grad)
        self.num_embeddings = extended_num_embeddings
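
A minimal usage sketch of the extension flow documented above; the 300-dimensional size, the `train_instances` / `new_instances` iterables, and the `extend_from_instances` call are illustrative assumptions rather than part of the snippet:

from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

# Hypothetical fine-tuning flow: grow the vocabulary first, then grow the embedding to match.
vocab = Vocabulary.from_instances(train_instances)       # vocabulary built at training time
embedding = Embedding(embedding_dim=300, vocab=vocab)    # sized from the "tokens" namespace
vocab.extend_from_instances(new_instances)               # unseen tokens are added here
embedding.extend_vocab(vocab, vocab_namespace="tokens")  # new rows get Xavier-uniform init
assert embedding.num_embeddings == vocab.get_vocab_size("tokens")
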
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
            "token_embedders": {
                "words1": {
                    "type": "embedding",
                    "embedding_dim": 2
                },
                "words2": {
                    "type": "embedding",
                    "embedding_dim": 5
                },
                "words3": {
                    "type": "embedding",
                    "embedding_dim": 3
                },
            }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(
            vocab=self.vocab, params=params)
        self.inputs = {
            "words1": {
                "tokens": torch.LongTensor([[0, 2, 3, 5]])
            },
            "words2": {
                "tokens": torch.LongTensor([[1, 4, 3, 2]])
            },
            "words3": {
                "tokens": torch.LongTensor([[1, 5, 1, 2]])
            },
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs["words4"] = self.inputs["words3"]
        del self.inputs["words3"]
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs["words3"] = self.inputs["words4"]

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        del self.inputs["words4"]

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2
                },
                "characters": {
                    "type": "character_encoding",
                    "embedding": {
                        "embedding_dim": 4,
                        "num_embeddings": 15
                    },
                    "encoder": {
                        "type": "cnn",
                        "embedding_dim": 4,
                        "num_filters": 10,
                        "ngram_filter_sizes": [3],
                    },
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "words": {
                "tokens": (torch.rand(3, 4, 5, 6) * 20).long()
            },
            "characters": {
                "token_characters": (torch.rand(3, 4, 5, 6, 7) * 15).long()
            },
        }
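        # The trailing output size is 12 = 2 (word embedding dim) + 10 (CNN encoder
        # num_filters); num_wrapping_dims=2 marks the two extra list dimensions
        # beyond the batch dimension.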
        assert token_embedder(inputs,
                              num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_forward_params(self):
        class FakeEmbedder(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, tokens: torch.Tensor, extra_arg: int = None):
                assert tokens is not None
                assert extra_arg is not None
                return tokens

        token_embedder = BasicTextFieldEmbedder({"elmo": FakeEmbedder()})
        inputs = {"elmo": {"tokens": (torch.rand(3, 6, 5) * 2).long()}}
        kwargs = {"extra_arg": 1}
        token_embedder(inputs, **kwargs)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
        options_file = str(elmo_fixtures_path / "options.json")
        weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2
                },
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file,
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "words": {
                "tokens": (torch.rand(3, 6) * 20).long()
            },
            "elmo": {
                "tokens": (torch.rand(3, 6, 50) * 15).long()
            },
        }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_null(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
        options_file = str(elmo_fixtures_path / "options.json")
        weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
        params = Params({
            "token_embedders": {
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {"elmo": {"tokens": (torch.rand(3, 6, 50) * 15).long()}}
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_dict(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
        options_file = str(elmo_fixtures_path / "options.json")
        weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2
                },
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file,
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "words": {
                "tokens": (torch.rand(3, 6) * 20).long()
            },
            "elmo": {
                "tokens": (torch.rand(3, 6, 50) * 15).long()
            },
        }
        token_embedder(inputs)

    def test_forward_runs_with_bijective_and_non_bijective_mapping(self):
        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased"
                },
                "token_characters": {
                    "type": "character_encoding",
                    "embedding": {
                        "embedding_dim": 5
                    },
                    "encoder": {
                        "type": "cnn",
                        "embedding_dim": 5,
                        "num_filters": 5,
                        "ngram_filter_sizes": [5],
                    },
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "bert": {
                "token_ids": (torch.rand(3, 5) * 10).long(),
                "mask": (torch.rand(3, 5) * 1).bool(),
            },
            "token_characters": {
                "token_characters": (torch.rand(3, 5, 5) * 1).long()
            },
        }
        token_embedder(inputs)
Example #51
    def __init__(
        self,
        embedding_dim: int,
        num_embeddings: int = None,
        projection_dim: int = None,
        weight: torch.FloatTensor = None,
        padding_index: int = None,
        trainable: bool = True,
        max_norm: float = None,
        norm_type: float = 2.0,
        scale_grad_by_freq: bool = False,
        sparse: bool = False,
        vocab_namespace: str = "tokens",
        pretrained_file: str = None,
        vocab: Vocabulary = None,
    ) -> None:
        super().__init__()

        if num_embeddings is None and vocab is None:
            raise ConfigurationError(
                "Embedding must be constructed with either num_embeddings or a vocabulary."
            )

        if num_embeddings is None:
            num_embeddings = vocab.get_vocab_size(vocab_namespace)
        else:
            # If num_embeddings is present, set default namespace to None so that extend_vocab
            # call doesn't misinterpret that some namespace was originally used.
            vocab_namespace = None

        self.num_embeddings = num_embeddings
        self.padding_index = padding_index
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse
        self._vocab_namespace = vocab_namespace
        self._pretrained_file = pretrained_file

        self.output_dim = projection_dim or embedding_dim

        if weight is not None and pretrained_file:
            raise ConfigurationError(
                "Embedding was constructed with both a weight and a pretrained file."
            )

        elif pretrained_file is not None:

            if vocab is None:
                raise ConfigurationError(
                    "To construct an Embedding from a pretrained file, you must also pass a vocabulary."
                )

            # If we're loading a saved model, we don't want to actually read a pre-trained
            # embedding file - the embeddings will just be in our saved weights, and we might not
            # have the original embedding file anymore, anyway.

            # TODO: having to pass tokens here is SUPER gross, but otherwise this breaks the
            # extend_vocab method, which relies on the value of vocab_namespace being None
            # to infer at what stage the embedding has been constructed. Phew.
            weight = _read_pretrained_embeddings_file(
                pretrained_file, embedding_dim, vocab, vocab_namespace
                or "tokens")
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)

        elif weight is not None:
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)

        else:
            weight = torch.FloatTensor(num_embeddings, embedding_dim)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
            torch.nn.init.xavier_uniform_(self.weight)

        # Whatever way we have constructed the embedding, it should be consistent with
        # num_embeddings and embedding_dim.
        if self.weight.size() != (num_embeddings, embedding_dim):
            raise ConfigurationError(
                "A weight matrix was passed with contradictory embedding shapes."
            )

        if self.padding_index is not None:
            self.weight.data[self.padding_index].fill_(0)

        if projection_dim:
            self._projection = torch.nn.Linear(embedding_dim, projection_dim)
        else:
            self._projection = None
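
For reference, a brief sketch of the construction paths handled by this constructor, assuming it is AllenNLP's `Embedding` module; the sizes and the commented-out pretrained path are illustrative assumptions:

import torch
from allennlp.modules.token_embedders import Embedding

# 1) Random (Xavier-uniform) initialization from an explicit size.
emb = Embedding(embedding_dim=10, num_embeddings=100)
# 2) From an explicit weight matrix; its shape must equal (num_embeddings, embedding_dim).
weight = torch.FloatTensor(100, 10).uniform_(-1.0, 1.0)
emb = Embedding(embedding_dim=10, num_embeddings=100, weight=weight)
# 3) From a pretrained file, sized by a vocabulary namespace (the path is hypothetical).
# emb = Embedding(embedding_dim=300, vocab=vocab, pretrained_file="/path/to/vectors.txt.gz")
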
Example #52
    def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("s", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("t", namespace="characters")  # 6
        vocab.add_token_to_namespace("c", namespace="characters")  # 7
        vocab.add_token_to_namespace("<", namespace="characters")  # 8
        vocab.add_token_to_namespace(">", namespace="characters")  # 9
        vocab.add_token_to_namespace("/", namespace="characters")  # 10

        indexer = TokenCharactersIndexer("characters",
                                         start_tokens=["<s>"],
                                         end_tokens=["</s>"],
                                         min_padding_length=1)
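        # "<s>" -> [<, s, >] = [8, 3, 9]; in "sentential", the characters i, a and l are
        # out of vocabulary and map to 1; "</s>" -> [<, /, s, >] = [8, 10, 3, 9].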
        indices = indexer.tokens_to_indices([Token("sentential")], vocab)
        assert indices == {
            "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],
                                 [8, 10, 3, 9]]
        }
Example #53
from allennlp.commands.train import train_model_from_args
from allennlp.data import Vocabulary
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from mtl.dataset_readers.MTLDatasetReader import MTLDatasetReader
from mtl.models.MTLSharedClassifier import MTLSharedClassifier

reader = MTLDatasetReader(token_indexers={
    'tokens': SingleIdTokenIndexer(lowercase_tokens=True),
    'elmo': ELMoTokenCharactersIndexer()
},
                          max_sequence_len=100)
books_train_dataset = reader.read('./data/mtl-dataset/books.task.train')
books_validation_dataset = reader.read('./data/mtl-dataset/books.task.test')
imdb_train_dataset = reader.read('./data/mtl-dataset/imdb.task.train')
imdb_test_dataset = reader.read('./data/mtl-dataset/imdb.task.test')

vocab = Vocabulary.from_instances(books_train_dataset +
                                  books_validation_dataset)
iterator = BucketIterator(batch_size=128,
                          sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)
print(vocab._index_to_token)
# print(vocab.__getstate__()['_token_to_index']['labels'])
# for batch in iterator(books_train_dataset, num_epochs=1, shuffle=True):
#     print(batch['tokens']['tokens'], batch['label'])

print(iterator.get_num_batches(books_train_dataset))

books_iter = iter(iterator._create_batches(books_train_dataset, shuffle=True))
print(len(books_train_dataset))

print(next(books_iter).as_tensor_dict())
Example #54
    # These five lines control all the major sources of randomness.
    np.random.seed(_C.RANDOM_SEED)
    torch.manual_seed(_C.RANDOM_SEED)
    torch.cuda.manual_seed_all(_C.RANDOM_SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Set device according to specified GPU ids.
    device = torch.device(
        f"cuda:{_A.gpu_ids[0]}" if _A.gpu_ids[0] >= 0 else "cpu")

    # --------------------------------------------------------------------------------------------
    #   INSTANTIATE VOCABULARY, DATALOADER, MODEL, OPTIMIZER
    # --------------------------------------------------------------------------------------------

    vocabulary = Vocabulary.from_files(_C.DATA.VOCABULARY)

    # If we wish to use CBS during evaluation or inference, expand the vocabulary and add
    # constraint words derived from Open Images classes.
    if _C.MODEL.USE_CBS:
        vocabulary = add_constraint_words_to_vocabulary(
            vocabulary, wordforms_tsvpath=_C.DATA.CBS.WORDFORMS)

    train_dataset = TrainingDataset.from_config(_C,
                                                vocabulary=vocabulary,
                                                in_memory=_A.in_memory)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE,
        shuffle=True,
        num_workers=_A.cpu_workers,
    )

    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: Union[str, BertModel],
        span_extractor: SpanExtractor,
        tree_mapper: TreeMapper,
        domain_utils: DomainUtils,
        is_weak_supervision: bool,
        feedforward: FeedForward = None,
        dropout: float = 0.0,
        num_labels: int = None,
        index: str = "bert",
        label_namespace: str = "labels",
        trainable: bool = True,
        initializer: InitializerApplicator = InitializerApplicator(),
        denotation_based_metric: Metric = None,
        token_based_metric: Metric = None,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        if isinstance(bert_model, str):
            self.bert_model = PretrainedBertModel.load(bert_model)
        else:
            self.bert_model = bert_model

        for param in self.bert_model.parameters():
            param.requires_grad = trainable

        in_features = self.bert_model.config.hidden_size

        self._label_namespace = label_namespace

        self.span_extractor = span_extractor
        self.feedforward_layer = TimeDistributed(feedforward) if feedforward else None
        self.num_classes = self.vocab.get_vocab_size("labels")
        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = span_extractor.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

        if num_labels:
            out_features = num_labels
        else:
            out_features = vocab.get_vocab_size(namespace=self._label_namespace)

        self._dropout = torch.nn.Dropout(p=dropout)

        self._tree_mapper = tree_mapper

        labels = self.vocab.get_index_to_token_vocabulary(self._label_namespace)
        grammar = Grammar(labels)
        self._cky = CKY(grammar, tree_mapper, domain_utils)

        use_lexicon = True
        if use_lexicon:
            self.zero_shot_extractor = ZeroShotExtractor(labels, domain_utils)
            self._sim_weight = torch.nn.Parameter(
                torch.ones([1], dtype=torch.float32, requires_grad=True))

        self._classification_layer = torch.nn.Linear(in_features, out_features)
        self._accuracy = CategoricalAccuracy()
        self._accuracy_all_no_span = CategoricalAccuracy()
        self._fmeasure = F1Measure(positive_label=1)
        self._denotation_based_metric = denotation_based_metric
        self._token_based_metric = token_based_metric
        self._loss = torch.nn.CrossEntropyLoss()
        self._index = index
        initializer(self._classification_layer)

        self._epoch_counter = 0

        self._is_weak_supervision = is_weak_supervision
        if self._is_weak_supervision:
            self._weak_supervision_acc = WeakSupervisionAccuracy()
            self._label_preparer = LabelsPreparer(self.vocab.get_index_to_token_vocabulary(self._label_namespace))

        self._sets_f1_metric = SetsF1()
        self._compute_spans_f1 = False
Example #56
def load_lm_data(fold=None, mode='train'):
    """
    Turns the sequential data into instances.

    :param fold: which cross-validation fold to load (for 'val') or hold out (for 'train').
    :param mode: 'train' or 'val'.
    :return: the instances for the requested split and the vocabulary.
    """
    # Get or make vocab
    spacy_model = get_spacy_model("en_core_web_sm",
                                  pos_tags=False,
                                  parse=False,
                                  ner=False)
    if os.path.exists('vocabulary'):
        print(
            "Loading cached vocab. caution if you're building the dataset again!!!!",
            flush=True)
        vocab = Vocabulary.from_files('vocabulary')

        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [
            data_item for s in ('train', 'val', 'test')
            for data_item in lm_data[s]
        ]
    else:
        assert fold is None
        with open(os.path.join(DATA_PATH, 'events-3.json'), 'r') as f:
            lm_data = json.load(f)
        lm_data = [
            data_item for s in ('train', 'val', 'test')
            for data_item in lm_data[s]
        ]
        # Manually doing this because I don't want to double count things
        vocab = Vocabulary.from_instances([
            Instance({
                'story':
                TextField(
                    [
                        Token(x) for x in ['@@bos@@'] +
                        [x.orth_ for x in spacy_model(sent)] + ['@@eos@@']
                    ],
                    token_indexers={
                        'tokens':
                        SingleIdTokenIndexer(namespace='tokens',
                                             lowercase_tokens=True)
                    })
            }) for data_item in lm_data for sent in data_item['sentences']
        ],
                                          min_count={'tokens': 3})

        vocab.get_index_to_token_vocabulary('tokens')
        vocab.save_to_files('vocabulary')
        print("VOCABULARY HAS {} ITEMS".format(
            vocab.get_vocab_size(namespace='tokens')))

    if all([
            os.path.exists('lm-{}-of-{}.pkl'.format(i, NUM_FOLDS))
            for i in range(NUM_FOLDS)
    ]):
        print("LOADING CACHED DATASET", flush=True)
        if mode == 'val':
            with open('lm-{}-of-{}.pkl'.format(fold, NUM_FOLDS), 'rb') as f:
                print("Loading split{} for {}".format(fold, mode))
                instances = pkl.load(f)
        else:
            instances = []
            for other_fold in range(NUM_FOLDS):
                if other_fold != fold:
                    with open('lm-{}-of-{}.pkl'.format(other_fold, NUM_FOLDS),
                              'rb') as f:
                        print("Loading split{} for {}".format(
                            other_fold, mode))
                        instances += pkl.load(f)
        return instances, vocab

    print("MAKING THE DATASET", flush=True)
    assert fold is None
    for item in tqdm(lm_data):
        item['sentences_tokenized'] = [[st.orth_ for st in spacy_model(sent)]
                                       for sent in item['sentences']]

    def _to_instances(data):
        # flatten this
        instances = []
        for item in data:
            for s1, s2 in pairwise(item['sentences_tokenized']):
                instances.append((
                    Instance({
                        'story':
                        TextField(
                            [
                                Token(x)
                                for x in ['@@bos@@'] + s1 + s2 + ['@@eos@@']
                            ],
                            token_indexers={
                                'tokens':
                                SingleIdTokenIndexer(namespace='tokens',
                                                     lowercase_tokens=True)
                            })
                    }),
                    s1,
                    s2,
                    item,
                ))
        return instances

    random.seed(123456)
    random.shuffle(lm_data)
    all_sets = []
    for fold_ in range(NUM_FOLDS):
        val_set = _to_instances(
            lm_data[len(lm_data) * fold_ // NUM_FOLDS:len(lm_data) *
                    (fold_ + 1) // NUM_FOLDS])
        with open('lm-{}-of-{}.pkl'.format(fold_, NUM_FOLDS), 'wb') as f:
            pkl.dump(val_set, f)
        all_sets.extend(val_set)
    return all_sets, vocab
Example #57
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super().setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {
            "text1": {
                "num_tokens": 5,
                "tokens_length": 5
            },
            "text2": {
                "num_tokens": 6,
                "tokens_length": 6
            }
        }

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()
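        # Token indices: this=2, is=3, a=4, sentence=5, .=6; out-of-vocabulary words
        # ("here", "different", "short") map to 1 and padding positions to 0.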

        numpy.testing.assert_array_almost_equal(
            text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(
            text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]],
            self.token_indexer)
        field2 = TextField([
            Token(t)
            for t in ["this", "is", "a", "different", "sentence", "."]
        ], self.token_indexer)
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [
            Instance({
                "text1": field1,
                "text2": field2
            }),
            Instance({
                "text1": field3,
                "text2": field4
            })
        ]
        return instances
def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print("Building the vocabulary")
    return Vocabulary.from_instances(instances)
Example #59
def _read_pretrained_word2vec_format_embedding_file(
        embeddings_filename: str,  # pylint: disable=invalid-name
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file.  The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s ...",
                    embedding_dim,
                    len(fields) - 1, line[:15])
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
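
A minimal sketch of the expected input format (one gzipped, space-delimited vector per line); the file name and token are illustrative assumptions:

import gzip

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("hello")
with gzip.open("tiny.vec.gz", "wt", encoding="utf-8") as vec_file:
    vec_file.write("hello 0.1 0.2 0.3\n")  # [word] [dim 1] [dim 2] ...
weight = _read_pretrained_word2vec_format_embedding_file("tiny.vec.gz", embedding_dim=3, vocab=vocab)
assert weight.shape == (vocab.get_vocab_size("tokens"), 3)
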
Example #60
    def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
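        # min_padding_length=10 pads every token's character sequence to at least 10,
        # even though the longest token here ("AllenNLP") is only 8 characters long.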
        indices = indexer.tokens_to_indices(tokens, vocab)
        padded = indexer.as_padded_tensor_dict(
            indices, indexer.get_padding_lengths(indices))
        assert padded["token_characters"].tolist() == [
            [2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
            [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
            [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
            [15, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]