def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':  # type: ignore
    """
    We need the vocabulary here to know how many items we need to embed, and we look
    for a ``vocab_namespace`` key in the parameter dictionary to know which vocabulary to
    use. If you know beforehand exactly how many embeddings you need, or aren't using
    a vocabulary mapping for the things getting embedded here, then you can pass in the
    ``num_embeddings`` key directly, and the vocabulary will be ignored.

    In the configuration file, a file containing pretrained embeddings can be specified
    using the parameter ``"pretrained_file"``.
    It can be the path to a local file or a URL of a (cached) remote file.
    Two formats are supported:

        * hdf5 file - containing an embedding matrix in the form of a torch.Tensor;

        * text file - a utf-8 encoded text file with space separated fields::

                [word] [dim 1] [dim 2] ...

          The text file can optionally be compressed with gzip, bz2, lzma or zip.
          You can even select a single file inside an archive containing multiple files
          using the URI::

                "(archive_uri)#file_path_inside_the_archive"

          where ``archive_uri`` can be a file system path or a URL. For example::

                "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
    """
    # pylint: disable=arguments-differ
    num_embeddings = params.pop_int('num_embeddings', None)
    vocab_namespace = params.pop("vocab_namespace", "tokens")
    if num_embeddings is None:
        num_embeddings = vocab.get_vocab_size(vocab_namespace)
    embedding_dim = params.pop_int('embedding_dim')
    pretrained_file = params.pop("pretrained_file", None)
    projection_dim = params.pop_int("projection_dim", None)
    trainable = params.pop_bool("trainable", True)
    padding_index = params.pop_int('padding_index', None)
    max_norm = params.pop_float('max_norm', None)
    norm_type = params.pop_float('norm_type', 2.)
    scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
    sparse = params.pop_bool('sparse', False)
    params.assert_empty(cls.__name__)

    if pretrained_file:
        # If we're loading a saved model, we don't want to actually read a pre-trained
        # embedding file - the embeddings will just be in our saved weights, and we might not
        # have the original embedding file anymore, anyway.
        weight = _read_pretrained_embeddings_file(pretrained_file,
                                                  embedding_dim,
                                                  vocab,
                                                  vocab_namespace)
    else:
        weight = None

    return cls(num_embeddings=num_embeddings,
               embedding_dim=embedding_dim,
               projection_dim=projection_dim,
               weight=weight,
               padding_index=padding_index,
               trainable=trainable,
               max_norm=max_norm,
               norm_type=norm_type,
               scale_grad_by_freq=scale_grad_by_freq,
               sparse=sparse)
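# Illustrative sketch (not part of the original source): a ``Params`` blob that exercises the
# archive-URI syntax documented in the docstring above. The dimension, namespace, and
# ``trainable`` flag are example values, the imports assume the usual allennlp package layout,
# and the vocabulary is a toy one built by hand so the call is self-contained.
from allennlp.common import Params
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()
vocab.add_token_to_namespace("cat", namespace="tokens")

embedding_params = Params({
    "embedding_dim": 200,
    "vocab_namespace": "tokens",
    "trainable": False,
    "pretrained_file":
        "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt",
})
# ``num_embeddings`` is omitted, so it is taken from vocab.get_vocab_size("tokens").
token_embedder = Embedding.from_params(vocab, embedding_params)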
def from_params(cls, params: Params) -> 'CharacterTokenizer':
    byte_encoding = params.pop('byte_encoding', None)
    lowercase_characters = params.pop('lowercase_characters', False)
    start_tokens = params.pop('start_tokens', None)
    end_tokens = params.pop('end_tokens', None)
    params.assert_empty(cls.__name__)
    return cls(byte_encoding=byte_encoding,
               lowercase_characters=lowercase_characters,
               start_tokens=start_tokens,
               end_tokens=end_tokens)
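# Illustrative sketch (not part of the original source): building a CharacterTokenizer from a
# ``Params`` dict. The option values are made up for the example and the import path assumes
# the usual allennlp layout.
from allennlp.common import Params
from allennlp.data.tokenizers import CharacterTokenizer

char_tokenizer = CharacterTokenizer.from_params(Params({
    "byte_encoding": "utf-8",
    "lowercase_characters": True,
}))
tokens = char_tokenizer.tokenize("Hello")  # roughly one Token per utf-8 byte of the lowercased text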
def from_params(cls, params: Params) -> 'Elmo':
    # Add files to archive
    params.add_file_to_archive('options_file')
    params.add_file_to_archive('weight_file')

    options_file = params.pop('options_file')
    weight_file = params.pop('weight_file')
    requires_grad = params.pop('requires_grad', False)
    num_output_representations = params.pop('num_output_representations')
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    keep_sentence_boundaries = params.pop_bool('keep_sentence_boundaries', False)
    dropout = params.pop_float('dropout', 0.5)
    params.assert_empty(cls.__name__)

    return cls(options_file=options_file,
               weight_file=weight_file,
               num_output_representations=num_output_representations,
               requires_grad=requires_grad,
               do_layer_norm=do_layer_norm,
               keep_sentence_boundaries=keep_sentence_boundaries,
               dropout=dropout)
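# Illustrative sketch (not part of the original source): an ELMo configuration as it might be
# passed to this ``from_params``. The file paths are placeholders for wherever your ELMo
# options and weights live, the other values are examples, and the imports assume the usual
# allennlp layout.
from allennlp.common import Params
from allennlp.modules.elmo import Elmo

elmo_params = Params({
    "options_file": "/path/to/elmo_options.json",
    "weight_file": "/path/to/elmo_weights.hdf5",
    "num_output_representations": 2,  # e.g. one learned mixture for input, one for output
    "do_layer_norm": False,
    "dropout": 0.5,
})
elmo = Elmo.from_params(elmo_params)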
def from_params(cls, params: Params, instances: Iterable['adi.Instance'] = None):  # type: ignore
    """
    There are two possible ways to build a vocabulary: from a collection of instances,
    using :func:`Vocabulary.from_instances`, or from a pre-saved vocabulary, using
    :func:`Vocabulary.from_files`. You can also extend a pre-saved vocabulary with a
    collection of instances using this method. This method wraps these options, allowing
    their specification from a ``Params`` object, generated from a JSON configuration file.

    Parameters
    ----------
    params: Params, required.
    instances: Iterable['adi.Instance'], optional
        If ``params`` doesn't contain a ``directory_path`` key, the ``Vocabulary`` can be
        built directly from a collection of instances (i.e. a dataset). If the ``extend``
        key is set to False, dataset instances will be ignored and the final vocabulary
        will be the one loaded from ``directory_path``. If the ``extend`` key is set to
        True, dataset instances will be used to extend the vocabulary loaded from
        ``directory_path``, and the extended vocabulary will be the final one used.

    Returns
    -------
    A ``Vocabulary``.
    """
    # pylint: disable=arguments-differ
    # Vocabulary is ``Registrable`` so that you can configure a custom subclass,
    # but (unlike most of our registrables) almost everyone will want to use the
    # base implementation. So instead of having an abstract ``VocabularyBase`` or
    # such, we just add the logic for instantiating a registered subclass here,
    # so that most users can continue doing what they were doing.
    vocab_type = params.pop("type", None)
    if vocab_type is not None:
        return cls.by_name(vocab_type).from_params(params=params, instances=instances)

    extend = params.pop("extend", False)
    vocabulary_directory = params.pop("directory_path", None)
    if not vocabulary_directory and not instances:
        raise ConfigurationError("You must provide either a Params object containing a "
                                 "'directory_path' key or a Dataset to build a vocabulary from.")
    if extend and not instances:
        raise ConfigurationError("'extend' is true but there are no instances passed to extend.")
    if extend and not vocabulary_directory:
        raise ConfigurationError("'extend' is true but there is no 'directory_path' to extend from.")

    if vocabulary_directory and instances:
        if extend:
            logger.info("Loading Vocab from files and extending it with dataset.")
        else:
            logger.info("Loading Vocab from files instead of dataset.")
    if vocabulary_directory:
        vocab = Vocabulary.from_files(vocabulary_directory)
        if not extend:
            params.assert_empty("Vocabulary - from files")
            return vocab
    if extend:
        vocab.extend_from_instances(params, instances=instances)
        return vocab

    min_count = params.pop("min_count", None)
    max_vocab_size = pop_max_vocab_size(params)
    non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES)
    pretrained_files = params.pop("pretrained_files", {})
    only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False)
    tokens_to_add = params.pop("tokens_to_add", None)
    params.assert_empty("Vocabulary - from dataset")
    return Vocabulary.from_instances(instances=instances,
                                     min_count=min_count,
                                     max_vocab_size=max_vocab_size,
                                     non_padded_namespaces=non_padded_namespaces,
                                     pretrained_files=pretrained_files,
                                     only_include_pretrained_words=only_include_pretrained_words,
                                     tokens_to_add=tokens_to_add)
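# Illustrative sketch (not part of the original source) of the three configurations this method
# distinguishes. ``train_instances`` stands in for an iterable of Instances produced by a dataset
# reader, "/path/to/saved/vocabulary" is a placeholder directory, and the imports assume the
# usual allennlp layout.
from allennlp.common import Params
from allennlp.data import Vocabulary

train_instances = ...  # iterable of Instances from a dataset reader

# 1) Build from scratch out of a dataset (no "directory_path" key); vocabulary options
#    such as min_count or max_vocab_size could go in the Params here.
vocab = Vocabulary.from_params(Params({}), instances=train_instances)

# 2) Load a pre-saved vocabulary and ignore any instances.
vocab = Vocabulary.from_params(Params({"directory_path": "/path/to/saved/vocabulary"}))

# 3) Load a pre-saved vocabulary and extend it with tokens seen in the instances.
vocab = Vocabulary.from_params(
    Params({"directory_path": "/path/to/saved/vocabulary", "extend": True}),
    instances=train_instances)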
def from_params(cls, params: Params) -> 'SingleIdTokenIndexer':
    namespace = params.pop('namespace', 'tokens')
    lowercase_tokens = params.pop_bool('lowercase_tokens', False)
    params.assert_empty(cls.__name__)
    return cls(namespace=namespace, lowercase_tokens=lowercase_tokens)
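# Illustrative sketch (not part of the original source): a single-id indexer whose ``namespace``
# lines up with the ``vocab_namespace`` the Embedding above reads from, so the ids produced at
# indexing time index into the same embedding matrix. The values are examples and the import
# path assumes the usual allennlp layout.
from allennlp.common import Params
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexer = SingleIdTokenIndexer.from_params(Params({
    "namespace": "tokens",
    "lowercase_tokens": True,
}))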