def test_config_optional_sections():
    """An optional config section left unfilled must become {}, not None."""
    cfg = Config().from_str(nlp_config_string)
    cfg = DEFAULT_CONFIG.merge(cfg)
    assert "pretraining" not in cfg
    filled_cfg = registry.fill(cfg, schema=ConfigSchema, validate=False)
    # A None default for the optional "pretraining" block would (rightly)
    # error out, since every top-level key has to be a section (dict).
    # Serializing and re-parsing here mirrors what Config.interpolate
    # does internally, so the roundtrip must survive it.
    reloaded = Config().from_str(filled_cfg.to_str())
    assert reloaded["pretraining"] == {}
def __init__(self, udpipe_model: UDPipeModel, meta: Optional[Dict] = None, **kwargs):
    """Initialize the Language class.

    The language code is prefixed, e.g. "udpipe_en" rather than "en", so it
    can never collide with spaCy's built-in languages. Registered via entry
    points, this also lets the class be (de)serialized: a meta.json with
    "lang": "udpipe_en" will automatically instantiate it when this package
    is installed.

    udpipe_model: The loaded UDPipe model.
    meta: spaCy model metadata.
    kwargs: Optional config parameters ("ignore_tag_map", "max_length").
    """
    self.udpipe = udpipe_model
    self.Defaults = get_defaults(lang=udpipe_model._lang)
    self.lang = f"udpipe_{udpipe_model._lang}"
    if kwargs.get("ignore_tag_map", False):
        self.Defaults.tag_map = {}  # workaround for ValueError: [E167]
    if SPACY_V3:
        from spacy.vocab import create_vocab
        from spacy.language import DEFAULT_CONFIG

        self.vocab = create_vocab(udpipe_model._lang, self.Defaults)
        self.batch_size = 1000
        self._components = []
        self._disabled = set()
        self._config = DEFAULT_CONFIG.merge(self.default_config)
    else:
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    # The tokenizer needs the vocab built in either branch above.
    self.tokenizer = UDPipeTokenizer(model=self.udpipe, vocab=self.vocab)
    self.max_length = kwargs.get("max_length", 10**6)
    self._meta = self.udpipe._meta if meta is None else dict(meta)
    self._path = None
    self._optimizer = None