Exemple #1
0
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
     """
     Build an ``ElmoTokenEmbedder`` from a ``Params`` configuration.

     Keys read from ``params``:

     * ``options_file`` and ``weight_file`` (required) - each is first registered with
       ``params.add_file_to_archive`` and then popped.
     * ``requires_grad`` (boolean, default ``False``).
     * ``do_layer_norm`` (boolean, default ``False``).
     * ``dropout`` (float, default ``0.5``).
     * ``namespace_to_cache`` (default ``None``) - when given, the tokens of that
       namespace in ``vocab`` are collected into a list and passed to the constructor
       as ``vocab_to_cache``; otherwise ``vocab_to_cache`` is ``None``.
     * ``projection_dim`` (int, default ``None``).

     ``params.assert_empty(cls.__name__)`` is called once every key above has been popped.
     """
     # pylint: disable=arguments-differ
     params.add_file_to_archive('options_file')
     params.add_file_to_archive('weight_file')
     options_file = params.pop('options_file')
     weight_file = params.pop('weight_file')
     # Read with ``pop_bool`` so this flag is parsed the same way as ``do_layer_norm``
     # instead of being forwarded as whatever raw value the configuration holds.
     requires_grad = params.pop_bool('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)
     namespace_to_cache = params.pop("namespace_to_cache", None)
     if namespace_to_cache is not None:
         # Only the token strings of the requested namespace are needed.
         vocab_to_cache = list(
             vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
     else:
         vocab_to_cache = None
     projection_dim = params.pop_int("projection_dim", None)
     params.assert_empty(cls.__name__)
     return cls(options_file=options_file,
                weight_file=weight_file,
                do_layer_norm=do_layer_norm,
                dropout=dropout,
                requires_grad=requires_grad,
                projection_dim=projection_dim,
                vocab_to_cache=vocab_to_cache)
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
     """
     Build a ``TokenCharactersEncoder`` from a ``Params`` configuration.

     Keys read from ``params``:

     * ``embedding`` (required) - sub-configuration handed to ``Embedding.from_params``
       together with ``vocab``.  Its ``vocab_namespace`` is set to ``"token_characters"``
       when the configuration does not provide one.
     * ``encoder`` (required) - sub-configuration handed to ``Seq2VecEncoder.from_params``.
     * ``dropout`` (float, default ``0.0``).

     ``params.assert_empty(cls.__name__)`` is called once these keys have been popped.
     """
     # pylint: disable=arguments-differ
     char_embedding_config: Params = params.pop("embedding")
     # ``Embedding.from_params`` falls back to the "tokens" namespace; this encoder
     # wants "token_characters" unless the configuration says otherwise.
     char_embedding_config.setdefault("vocab_namespace", "token_characters")
     char_embedding = Embedding.from_params(vocab, char_embedding_config)

     char_encoder = Seq2VecEncoder.from_params(params.pop("encoder"))

     dropout_rate = params.pop_float("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(char_embedding, char_encoder, dropout_rate)
Exemple #3
0
    def from_params(cls, params: Params) -> 'Elmo':
        """
        Build an ``Elmo`` instance from a ``Params`` configuration.

        Keys read from ``params``:

        * ``options_file`` and ``weight_file`` (required) - each is first registered with
          ``params.add_file_to_archive`` and then popped.
        * ``requires_grad`` (boolean, default ``False``).
        * ``num_output_representations`` (required) - forwarded to the constructor as popped.
        * ``do_layer_norm`` (boolean, default ``False``).
        * ``dropout`` (float, default ``0.5``).

        ``params.assert_empty(cls.__name__)`` is called once every key above has been popped.
        """
        # Add files to archive
        params.add_file_to_archive('options_file')
        params.add_file_to_archive('weight_file')

        options_file = params.pop('options_file')
        weight_file = params.pop('weight_file')
        # Read with ``pop_bool`` so this flag is parsed the same way as ``do_layer_norm``
        # instead of being forwarded as whatever raw value the configuration holds.
        requires_grad = params.pop_bool('requires_grad', False)
        num_output_representations = params.pop('num_output_representations')
        do_layer_norm = params.pop_bool('do_layer_norm', False)
        dropout = params.pop_float('dropout', 0.5)
        params.assert_empty(cls.__name__)

        return cls(options_file=options_file,
                   weight_file=weight_file,
                   num_output_representations=num_output_representations,
                   requires_grad=requires_grad,
                   do_layer_norm=do_layer_norm,
                   dropout=dropout)
Exemple #4
0
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'Embedding':  # type: ignore
        """
        Build an ``Embedding`` from a ``Params`` configuration.

        The vocabulary is needed to know how many items have to be embedded; the
        ``vocab_namespace`` key (default ``"tokens"``) selects which namespace of the
        vocabulary is consulted.  If the number of embeddings is known beforehand, or the
        embedded items are not mapped through a vocabulary, pass ``num_embeddings``
        directly and the vocabulary size is not looked up.

        A file with pretrained embeddings can be given in the configuration through the
        ``"pretrained_file"`` parameter.  It can be the path to a local file or a URL of a
        (cached) remote file.  Two formats are supported:

            * hdf5 file - containing an embedding matrix in the form of a torch.Tensor;

            * text file - a UTF-8 encoded text file with space separated fields::

                    [word] [dim 1] [dim 2] ...

              The text file can optionally be compressed with gzip, bz2, lzma or zip.
              A single file inside an archive containing multiple files can be selected
              using the URI::

                    "(archive_uri)#file_path_inside_the_archive"

              where ``archive_uri`` can be a file system path or a URL. For example::

                    "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
        """
        # pylint: disable=arguments-differ
        num_embeddings = params.pop_int('num_embeddings', None)
        vocab_namespace = params.pop("vocab_namespace", "tokens")
        if num_embeddings is None:
            # No explicit size configured: size the table from the chosen namespace.
            num_embeddings = vocab.get_vocab_size(vocab_namespace)

        embedding_dim = params.pop_int('embedding_dim')
        pretrained_file = params.pop("pretrained_file", None)
        projection_dim = params.pop_int("projection_dim", None)
        trainable = params.pop_bool("trainable", True)
        padding_index = params.pop_int('padding_index', None)
        max_norm = params.pop_float('max_norm', None)
        norm_type = params.pop_float('norm_type', 2.)
        scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
        sparse = params.pop_bool('sparse', False)
        params.assert_empty(cls.__name__)

        # The pretrained file is read only when one is configured; otherwise no initial
        # weight is handed to the constructor.
        # NOTE(review): nothing in this method detects the "loading a saved model" case;
        # presumably callers leave ``pretrained_file`` out of the config then - confirm.
        weight = (
            _read_pretrained_embeddings_file(pretrained_file,
                                             embedding_dim,
                                             vocab,
                                             vocab_namespace)
            if pretrained_file
            else None
        )

        constructor_kwargs = dict(num_embeddings=num_embeddings,
                                  embedding_dim=embedding_dim,
                                  projection_dim=projection_dim,
                                  weight=weight,
                                  padding_index=padding_index,
                                  trainable=trainable,
                                  max_norm=max_norm,
                                  norm_type=norm_type,
                                  scale_grad_by_freq=scale_grad_by_freq,
                                  sparse=sparse)
        return cls(**constructor_kwargs)