Example #1
    @classmethod
    def from_params(cls, params: Params) -> 'CharacterTokenizer':
        byte_encoding = params.pop('byte_encoding', None)
        lowercase_characters = params.pop('lowercase_characters', False)
        start_tokens = params.pop('start_tokens', None)
        end_tokens = params.pop('end_tokens', None)
        params.assert_empty(cls.__name__)
        return cls(byte_encoding=byte_encoding,
                   lowercase_characters=lowercase_characters,
                   start_tokens=start_tokens,
                   end_tokens=end_tokens)
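
A minimal usage sketch for the snippet above (assuming AllenNLP 0.x import paths; the keys mirror the ``pop`` calls, and any leftover key would make ``assert_empty`` raise):

from allennlp.common import Params
from allennlp.data.tokenizers import CharacterTokenizer

# Only keys that from_params pops are allowed here.
params = Params({"lowercase_characters": True, "start_tokens": ["@start@"]})
tokenizer = CharacterTokenizer.from_params(params)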
Example #2
def pop_max_vocab_size(params: Params) -> Optional[Union[int, Dict[str, int]]]:
    """
    ``max_vocab_size`` is allowed to be either an int or a Dict[str, int] (or absent).
    But it could also be a string representing an int (in the case of environment
    variable substitution), so we need a little extra logic to handle all these cases.
    """
    size = params.pop("max_vocab_size", None)

    if isinstance(size, Params):
        # This is the Dict[str, int] case.
        return size.as_dict()
    elif size is not None:
        # This is the int / str case.
        return int(size)
    else:
        return None
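
To make the three branches concrete, a small sketch (assuming the helper's 0.x location in ``allennlp.data.vocabulary``; note that ``Params.pop`` wraps nested dicts in ``Params``, which is what the ``isinstance`` check relies on):

from allennlp.common import Params
from allennlp.data.vocabulary import pop_max_vocab_size

pop_max_vocab_size(Params({"max_vocab_size": 10000}))             # int case -> 10000
pop_max_vocab_size(Params({"max_vocab_size": "10000"}))           # str case (env-var substitution) -> 10000
pop_max_vocab_size(Params({"max_vocab_size": {"tokens": 5000}}))  # dict case -> {"tokens": 5000}
pop_max_vocab_size(Params({}))                                    # absent -> None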
Example #3
    @classmethod
    def dict_from_params(
            cls, params: Params) -> 'Dict[str, TokenIndexer]':  # type: ignore
        """
        We typically use ``TokenIndexers`` in a dictionary, with each ``TokenIndexer`` getting a
        name.  The specification for this in a ``Params`` object is typically ``{"name" ->
        {indexer_params}}``.  This method reads that whole set of parameters and returns a
        dictionary suitable for use in a ``TextField``.

        Because default values for token indexers are typically handled by the calling class,
        which checks for ``None``, we return ``None`` instead of an empty dictionary when the
        given ``params`` specify no token indexers.
        """
        token_indexers = {}
        for name, indexer_params in params.items():
            token_indexers[name] = cls.from_params(indexer_params)
        if not token_indexers:
            token_indexers = None
        return token_indexers
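
For example, a typical ``Params`` block with two named indexers (the registered names ``single_id`` and ``characters`` are the usual AllenNLP 0.x ones; treat the exact names as an assumption):

from allennlp.common import Params
from allennlp.data.token_indexers import TokenIndexer

params = Params({
    "tokens": {"type": "single_id", "lowercase_tokens": True},
    "token_characters": {"type": "characters"},
})
# Returns a dict of constructed indexers keyed by name,
# ready to hand to a TextField.
indexers = TokenIndexer.dict_from_params(params)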
Example #4
    @classmethod
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'Embedding':  # type: ignore
        """
        We need the vocabulary here to know how many items we need to embed, and we look for a
        ``vocab_namespace`` key in the parameter dictionary to know which vocabulary to use.  If
        you know beforehand exactly how many embeddings you need, or aren't using a vocabulary
        mapping for the things getting embedded here, then you can pass in the ``num_embeddings``
        key directly, and the vocabulary will be ignored.

        In the configuration file, a file containing pretrained embeddings can be specified
        using the parameter ``"pretrained_file"``.
        It can be the path to a local file or a URL of a (cached) remote file.
        Two formats are supported:

            * hdf5 file - containing an embedding matrix in the form of a torch.Tensor;

            * text file - a UTF-8 encoded text file with space-separated fields::

                    [word] [dim 1] [dim 2] ...

              The text file may optionally be compressed with gzip, bz2, lzma or zip.
              You can even select a single file inside an archive containing multiple files
              using the URI::

                    "(archive_uri)#file_path_inside_the_archive"

              where ``archive_uri`` can be a file system path or a URL. For example::

                    "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
        """
        # pylint: disable=arguments-differ
        num_embeddings = params.pop_int('num_embeddings', None)
        vocab_namespace = params.pop("vocab_namespace", "tokens")
        if num_embeddings is None:
            num_embeddings = vocab.get_vocab_size(vocab_namespace)
        embedding_dim = params.pop_int('embedding_dim')
        pretrained_file = params.pop("pretrained_file", None)
        projection_dim = params.pop_int("projection_dim", None)
        trainable = params.pop_bool("trainable", True)
        padding_index = params.pop_int('padding_index', None)
        max_norm = params.pop_float('max_norm', None)
        norm_type = params.pop_float('norm_type', 2.)
        scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
        sparse = params.pop_bool('sparse', False)
        params.assert_empty(cls.__name__)

        if pretrained_file:
            # If we're loading a saved model, we don't want to actually read a pre-trained
            # embedding file - the embeddings will just be in our saved weights, and we might not
            # have the original embedding file anymore, anyway.
            weight = _read_pretrained_embeddings_file(pretrained_file,
                                                      embedding_dim, vocab,
                                                      vocab_namespace)
        else:
            weight = None

        return cls(num_embeddings=num_embeddings,
                   embedding_dim=embedding_dim,
                   projection_dim=projection_dim,
                   weight=weight,
                   padding_index=padding_index,
                   trainable=trainable,
                   max_norm=max_norm,
                   norm_type=norm_type,
                   scale_grad_by_freq=scale_grad_by_freq,
                   sparse=sparse)
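
A hedged configuration sketch for the archive-URI format described in the docstring (the GloVe URL comes from the docstring itself; the vocabulary is assumed to be already populated, and running this would actually download the archive):

from allennlp.common import Params
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()  # assumed to be built from your dataset in real use
params = Params({
    "embedding_dim": 200,
    "trainable": False,
    # Zip member selected with the "(archive_uri)#inner_path" syntax.
    "pretrained_file": "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt",
})
embedding = Embedding.from_params(vocab, params)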
Example #5
    @classmethod
    def from_params(cls, params: Params) -> 'Tokenizer':
        choice = params.pop_choice('type',
                                   cls.list_available(),
                                   default_to_first_choice=True)
        return cls.by_name(choice).from_params(params)
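
This is the standard ``Registrable`` dispatch idiom: the ``type`` key picks a registered subclass, and ``default_to_first_choice=True`` falls back to the default tokenizer when the key is absent. A sketch (the registered names ``word`` and ``character`` are the usual 0.x ones; treat them as an assumption):

from allennlp.common import Params
from allennlp.data.tokenizers import Tokenizer

default_tokenizer = Tokenizer.from_params(Params({}))                  # first registered choice
char_tokenizer = Tokenizer.from_params(Params({"type": "character"}))  # explicit subclass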
Example #6
    @classmethod
    def from_params(
            cls,
            params: Params,
            instances: Iterable['adi.Instance'] = None):  # type: ignore
        """
        There are two possible ways to build a vocabulary: from a
        collection of instances, using :func:`Vocabulary.from_instances`, or
        from a pre-saved vocabulary, using :func:`Vocabulary.from_files`.
        You can also extend a pre-saved vocabulary with a collection of
        instances using this method. This method wraps these options, allowing
        their specification from a ``Params`` object, generated from a JSON
        configuration file.

        Parameters
        ----------
        params: Params, required.
        instances: Iterable['adi.Instance'], optional
            If ``params`` doesn't contain a ``directory_path`` key,
            the ``Vocabulary`` can be built directly from a collection of
            instances (i.e. a dataset). If the ``extend`` key is set to
            ``False``, dataset instances will be ignored and the final
            vocabulary will be the one loaded from ``directory_path``. If the
            ``extend`` key is set to ``True``, dataset instances will be used
            to extend the vocabulary loaded from ``directory_path``, and the
            result will be the final vocabulary used.

        Returns
        -------
        A ``Vocabulary``.
        """
        # pylint: disable=arguments-differ

        # Vocabulary is ``Registrable`` so that you can configure a custom subclass,
        # but (unlike most of our registrables) almost everyone will want to use the
        # base implementation. So instead of having an abstract ``VocabularyBase`` or
        # such, we just add the logic for instantiating a registered subclass here,
        # so that most users can continue doing what they were doing.
        vocab_type = params.pop("type", None)
        if vocab_type is not None:
            return cls.by_name(vocab_type).from_params(params=params,
                                                       instances=instances)

        extend = params.pop("extend", False)
        vocabulary_directory = params.pop("directory_path", None)
        if not vocabulary_directory and not instances:
            raise ConfigurationError(
                "You must provide either a Params object containing a "
                "vocab_directory key or a Dataset to build a vocabulary from.")
        if extend and not instances:
            raise ConfigurationError(
                "'extend' is true but there are no instances passed to extend."
            )
        if extend and not vocabulary_directory:
            raise ConfigurationError(
                "'extend' is true but there is no 'directory_path' to extend from."
            )

        if vocabulary_directory and instances:
            if extend:
                logger.info("Loading Vocab from files and extending it with dataset.")
            else:
                logger.info("Loading Vocab from files instead of dataset.")

        if vocabulary_directory:
            vocab = Vocabulary.from_files(vocabulary_directory)
            if not extend:
                params.assert_empty("Vocabulary - from files")
                return vocab
        if extend:
            vocab.extend_from_instances(params, instances=instances)
            return vocab
        min_count = params.pop("min_count", None)
        max_vocab_size = pop_max_vocab_size(params)
        non_padded_namespaces = params.pop("non_padded_namespaces",
                                           DEFAULT_NON_PADDED_NAMESPACES)
        pretrained_files = params.pop("pretrained_files", {})
        only_include_pretrained_words = params.pop_bool(
            "only_include_pretrained_words", False)
        tokens_to_add = params.pop("tokens_to_add", None)
        params.assert_empty("Vocabulary - from dataset")
        return Vocabulary.from_instances(
            instances=instances,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add)
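
The three construction modes described in the docstring above can be exercised like this (a sketch; ``instances`` is assumed to be an iterable of ``Instance`` objects built elsewhere, and ``vocab/`` a directory produced by ``Vocabulary.save_to_files``):

from allennlp.common import Params
from allennlp.data import Vocabulary

# 1. Build from a dataset.
vocab = Vocabulary.from_params(Params({}), instances)

# 2. Load a pre-saved vocabulary; instances are ignored.
vocab = Vocabulary.from_params(Params({"directory_path": "vocab/"}))

# 3. Load, then extend with the dataset.
vocab = Vocabulary.from_params(Params({"directory_path": "vocab/", "extend": True}), instances)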
Example #7
    @classmethod
    def from_params(cls, params: Params) -> 'SingleIdTokenIndexer':
        namespace = params.pop('namespace', 'tokens')
        lowercase_tokens = params.pop_bool('lowercase_tokens', False)
        params.assert_empty(cls.__name__)
        return cls(namespace=namespace, lowercase_tokens=lowercase_tokens)
Example #8
    @classmethod
    def from_params(cls, params: Params) -> 'Elmo':
        # Record the options and weight files so they are included in saved model archives.
        params.add_file_to_archive('options_file')
        params.add_file_to_archive('weight_file')

        options_file = params.pop('options_file')
        weight_file = params.pop('weight_file')
        requires_grad = params.pop('requires_grad', False)
        num_output_representations = params.pop('num_output_representations')
        do_layer_norm = params.pop_bool('do_layer_norm', False)
        keep_sentence_boundaries = params.pop_bool('keep_sentence_boundaries',
                                                   False)
        dropout = params.pop_float('dropout', 0.5)
        params.assert_empty(cls.__name__)

        return cls(options_file=options_file,
                   weight_file=weight_file,
                   num_output_representations=num_output_representations,
                   requires_grad=requires_grad,
                   do_layer_norm=do_layer_norm,
                   keep_sentence_boundaries=keep_sentence_boundaries,
                   dropout=dropout)
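
Finally, a hedged parameter sketch for the Elmo snippet (the file paths are placeholders for the released ELMo options/weights; ``add_file_to_archive`` just records them for model archiving):

from allennlp.common import Params
from allennlp.modules.elmo import Elmo

params = Params({
    "options_file": "elmo_options.json",   # placeholder path
    "weight_file": "elmo_weights.hdf5",    # placeholder path
    "num_output_representations": 2,
    "dropout": 0.5,
})
elmo = Elmo.from_params(params)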