Example #1
File: testing.py Project: zxlzr/kb
def get_bert_test_fixture():
    embedder_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/bert_test_fixture.tar.gz",
        "requires_grad": True,
        "top_layer_only": True,
    }
    embedder_params_copy = dict(embedder_params)
    embedder = TokenEmbedder.from_params(Params(embedder_params))

    indexer_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/vocab.txt",
        "do_lowercase": True,
        "use_starting_offsets": True,
        "max_pieces": 512,
    }
    indexer_params_copy = dict(indexer_params)
    indexer = TokenIndexer.from_params(Params(indexer_params))

    return {
        'embedder': embedder,
        'embedder_params': embedder_params_copy,
        'indexer': indexer,
        'indexer_params': indexer_params_copy
    }
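
A minimal usage sketch for the fixture above (hypothetical, not part of the project code): the returned dict exposes both the constructed objects and untouched copies of the raw parameters, so a test can rebuild an identical indexer from them.

# Hypothetical usage of get_bert_test_fixture(); only names from the example above are used.
fixture = get_bert_test_fixture()
embedder = fixture['embedder']  # TokenEmbedder built from the "bert-pretrained" params
indexer = fixture['indexer']    # TokenIndexer built from the "bert-pretrained" params

# The parameter copies can be reused, e.g. to build a second, identical indexer.
# A fresh dict is wrapped because Params.pop consumes keys from the underlying dict.
indexer_again = TokenIndexer.from_params(Params(dict(fixture['indexer_params'])))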
Example #2
    def compile_featurizer(self, tokenizer: Tokenizer) -> InputFeaturizer:
        """Creates the featurizer based on the configured input features

        :::tip
        If you are creating configurations programmatically
        use this method to check that you provided a valid configuration.
        :::

        Parameters
        ----------
        tokenizer
            Tokenizer used for this featurizer

        Returns
        -------
        featurizer
            The configured `InputFeaturizer`
        """
        configuration = self._make_allennlp_config()

        indexer = {
            feature_namespace: TokenIndexer.from_params(Params(config["indexer"]))
            for feature_namespace, config in configuration.items()
        }

        return InputFeaturizer(tokenizer, indexer=indexer)
Example #3
    @classmethod
    def from_params(cls, params: Params):
        """
        Parameters
        ----------
        squad_filename : ``str``
        negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
        tokenizer : ``Params``, optional
        token_indexers : ``Dict[str, Params]``, optional
        """
        negative_sentence_selection = params.pop('negative_sentence_selection',
                                                 'paragraph')
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        token_indexers = {}
        token_indexer_params = params.pop('token_indexers', Params({}))
        for name, indexer_params in token_indexer_params.items():
            token_indexers[name] = TokenIndexer.from_params(indexer_params)
        # The default parameters are contained within the class,
        # so if no parameters are given we must pass None.
        if token_indexers == {}:
            token_indexers = None
        params.assert_empty(cls.__name__)
        return SquadSentenceSelectionReader(
            negative_sentence_selection=negative_sentence_selection,
            tokenizer=tokenizer,
            token_indexers=token_indexers)
Example #4
def _get_entity_indexers():
    indexer_params = Params({
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {
                "type": "just_spaces"
            },
        },
        "namespace": "entity"
    })
    return {'wordnet': TokenIndexer.from_params(indexer_params)}
Example #5
def _get_indexer(namespace):
    return TokenIndexer.from_params(
        Params({
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {
                    "type": "just_spaces"
                },
            },
            "namespace": namespace
        }))
Example #6
    @classmethod
    def from_params(cls, params: Params):
        """
        Parameters
        ----------
        token_indexers : ``Dict[str, Params]``, optional
        """
        token_indexers = {}
        token_indexer_params = params.pop('token_indexers', Params({}))
        for name, indexer_params in token_indexer_params.items():
            token_indexers[name] = TokenIndexer.from_params(indexer_params)
        # The default parameters are contained within the class,
        # so if no parameters are given we must pass None.
        if token_indexers == {}:
            token_indexers = None
        params.assert_empty(cls.__name__)
        return SequenceTaggingDatasetReader(token_indexers=token_indexers)
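
A short sketch of how such a from_params method might be invoked; the "single_id" indexer type is an assumption based on AllenNLP's standard registry and does not appear in the example above.

# Sketch: building the reader from a Params object.
params = Params({
    "token_indexers": {
        "tokens": {"type": "single_id"}  # assumed registered TokenIndexer type
    }
})
reader = SequenceTaggingDatasetReader.from_params(params)

# With no "token_indexers" key, the method passes None and the reader falls back
# to its own default indexers.
reader_with_defaults = SequenceTaggingDatasetReader.from_params(Params({}))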
Example #7
def token_indexer_dict_from_params(
        params: Params) -> 'Dict[str, TokenIndexer]':  # type: ignore
    """
    We typically use ``TokenIndexers`` in a dictionary, with each ``TokenIndexer`` getting a
    name.  The specification for this in a ``Params`` object is typically ``{"name" ->
    {indexer_params}}``.  This method reads that whole set of parameters and returns a
    dictionary suitable for use in a ``TextField``.

    Because default values for token indexers are typically handled in the class that calls
    this method, and are based on checking for ``None``, we return ``None`` instead of an
    empty dictionary if the given ``params`` specify no token indexers.
    """
    token_indexers = {}
    for name, indexer_params in params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    if token_indexers == {}:
        token_indexers = None
    return token_indexers
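
A brief sketch of the {"name": {indexer_params}} pattern described in the docstring above; the "single_id" and "characters" indexer types are assumptions based on AllenNLP's standard registry.

# Sketch: a {"name": {indexer_params}} specification, as described above.
params = Params({
    "tokens": {"type": "single_id"},
    "token_characters": {"type": "characters"}
})
token_indexers = token_indexer_dict_from_params(params)
# token_indexers now maps "tokens" and "token_characters" to the corresponding
# TokenIndexer instances, ready to be passed to a TextField.

# An empty specification yields None, so callers can fall back on their defaults.
assert token_indexer_dict_from_params(Params({})) is None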
Example #8
    @classmethod
    def from_params(cls, params: Params):
        """
        Parameters
        ----------
        filename : ``str``
        tokenizer : ``Params``, optional
        token_indexers : ``Dict[str, Params]``, optional
        """
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        token_indexers = {}
        token_indexer_params = params.pop('token_indexers', Params({}))
        for name, indexer_params in token_indexer_params.items():
            token_indexers[name] = TokenIndexer.from_params(indexer_params)
        # The default parameters are contained within the class,
        # so if no parameters are given we must pass None.
        if token_indexers == {}:
            token_indexers = None
        params.assert_empty(cls.__name__)
        return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
Example #9
    def test_token_characters_indexer_tokenizer(self):
        params = Params({
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {
                    "type": "just_spaces"
                },
            },
            "namespace": "tok"
        })

        indexer = TokenIndexer.from_params(params)

        vocab = Vocabulary()
        vocab.add_token_to_namespace("the", namespace="tok")
        vocab.add_token_to_namespace("2", namespace="tok")

        indices = indexer.tokens_to_indices(
            [Token(t) for t in "the 2 .".split()], vocab, 'a')

        self.assertListEqual(indices['a'], [[2], [3], [1]])
Example #10
    def __init__(
        self,
        archive_file: str,
        dropout: float = None,
        bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
        remove_bos_eos: bool = True,
        requires_grad: bool = False,
    ) -> None:
        super().__init__()

        overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive

        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: LanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        text_field_embedder = TextFieldEmbedder.from_params(
            Params(text_field_embedder))
        if not isinstance(text_field_embedder, BasicTextFieldEmbedder):
            raise ConfigurationError(
                f"Language model from {archive_file} uses a non-standard TextFieldEmbedder!"
            )
        non_empty_embedders = [
            name for name, token_embedder in
            text_field_embedder._token_embedders.items()
            if not isinstance(token_embedder, EmptyEmbedder)
        ]

        if len(non_empty_embedders) == 0:
            # Only empty embedders were contained in the language model
            # We need at least one non-empty embedder in the language model
            raise ConfigurationError(
                f"Language model from {archive_file} trained with only empty embedders!"
            )
        elif len(non_empty_embedders) > 1:
            raise ConfigurationError(
                f"Language model from {archive_file} trained with multiple non-empty embedders!"
            )

        self._token_name = non_empty_embedders[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            token_indexer_config = dataset_reader_config.get(
                "token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(
                token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(
                token_list, self._lm.vocab)["elmo_tokens"]
            self._bos_indices = torch.LongTensor(bos_eos_indices[0])
            self._eos_indices = torch.LongTensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers,
                                     do_layer_norm=False,
                                     trainable=True)

        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                "The output dimensions for the text_field_embedder " +
                f"({character_dim}) and the contextualizer ({contextual_dim})"
                + f" from the language model loaded from {archive_file} are " +
                "not compatible. Please check the config used to train that " +
                "model and ensure that the output dimension of the " +
                "text_field_embedder divides the output dimension of the " +
                "contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad
    def __init__(self,
                 archive_file: str,
                 dropout: float = None,
                 bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
                 remove_bos_eos: bool = True,
                 requires_grad: bool = False) -> None:
        super().__init__()

        overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive
        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: BidirectionalLanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        token_names = list(text_field_embedder["token_embedders"].keys())
        if len(token_names) != 1:
            # We don't currently support embedding with language models trained with multiple
            # embedded indices.
            #
            # Note: We only care about embedded indices. This does not include "tokens" which
            # is just used to compute the loss in BidirectionalLanguageModel.
            raise ConfigurationError(
                f"LM from {archive_file} trained with multiple embedders!")
        if "embedder_to_indexer_map" in text_field_embedder:
            # Similarly we don't support multiple indexers per embedder.
            raise ConfigurationError(
                f"LM from {archive_file} trained with embedder_to_indexer_map!"
            )
        self._token_name = token_names[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            if dataset_reader_config.get("type") == "multiprocess":
                dataset_reader_config = dataset_reader_config.get(
                    "base_reader")
            token_indexer_config = dataset_reader_config.get(
                "token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(
                token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(
                token_list, self._lm.vocab, "key")["key"]
            self._bos_indices = torch.Tensor(bos_eos_indices[0])
            self._eos_indices = torch.Tensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers,
                                     do_layer_norm=False,
                                     trainable=True)

        # pylint: disable=protected-access
        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                "The output dimensions for the text_field_embedder " +
                f"({character_dim}) and the contextualizer ({contextual_dim})"
                + f" from the language model loaded from {archive_file} are " +
                "not compatible. Please check the config used to train that " +
                "model and ensure that the output dimension of the " +
                "text_field_embedder divides the output dimension of the " +
                "contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad
    def __init__(self,
                 archive_file: str,
                 dropout: float = None,
                 bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
                 remove_bos_eos: bool = True,
                 requires_grad: bool = False) -> None:
        super().__init__()

        overrides = {
                "model": {
                        "contextualizer": {
                                "return_all_layers": True
                        }
                }
        }

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive
        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: LanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        token_names = list(text_field_embedder["token_embedders"].keys())
        if len(token_names) != 1:
            # We don't currently support embedding with language models trained with multiple
            # embedded indices.
            #
            # Note: We only care about embedded indices. This does not include "tokens" which
            # is just used to compute the loss in LanguageModel.
            raise ConfigurationError(f"LM from {archive_file} trained with multiple embedders!")
        if "embedder_to_indexer_map" in text_field_embedder:
            # Similarly we don't support multiple indexers per embedder.
            raise ConfigurationError(f"LM from {archive_file} trained with embedder_to_indexer_map!")
        self._token_name = token_names[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            if dataset_reader_config.get("type") == "multiprocess":
                dataset_reader_config = dataset_reader_config.get("base_reader")
            token_indexer_config = dataset_reader_config.get("token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(token_list, self._lm.vocab, "key")["key"]
            self._bos_indices = torch.Tensor(bos_eos_indices[0])
            self._eos_indices = torch.Tensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)

        # pylint: disable=protected-access
        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                    "The output dimensions for the text_field_embedder " +
                    f"({character_dim}) and the contextualizer ({contextual_dim})" +
                    f" from the language model loaded from {archive_file} are " +
                    "not compatible. Please check the config used to train that " +
                    "model and ensure that the output dimension of the " +
                    "text_field_embedder divides the output dimension of the " +
                    "contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad