def init_tok2vec(nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]) -> bool: # Load pretrained tok2vec weights - cf. CLI command 'pretrain' P = pretrain_config I = init_config weights_data = None init_tok2vec = ensure_path(I["init_tok2vec"]) if init_tok2vec is not None: if P["objective"].get("type") == "vectors" and not I["vectors"]: err = 'need initialize.vectors if pretraining.objective.type is "vectors"' errors = [{"loc": ["initialize"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) if not init_tok2vec.exists(): err = f"can't find pretrained tok2vec: {init_tok2vec}" errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) with init_tok2vec.open("rb") as file_: weights_data = file_.read() if weights_data is not None: tok2vec_component = P["component"] if tok2vec_component is None: desc = ( f"To use pretrained tok2vec weights, [pretraining.component] " f"needs to specify the component that should load them.") err = "component can't be null" errors = [{"loc": ["pretraining", "component"], "msg": err}] raise ConfigValidationError(config=nlp.config["pretraining"], errors=errors, desc=desc) layer = nlp.get_pipe(tok2vec_component).model if P["layer"]: layer = layer.get_ref(P["layer"]) layer.from_bytes(weights_data) return True return False
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    raw_config = config
    config = raw_config.interpolate()
    if "seed" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.train_corpus", type=type(T["train_corpus"])
            )
        )
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.dev_corpus", type=type(T["dev_corpus"])
            )
        )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
            for listener in proc.listening_components:
                if listener in frozen_components and name not in frozen_components:
                    logger.warning(Warnings.W087.format(name=name, listener=listener))
                # We always check this regardless, in case user freezes tok2vec
                if listener not in frozen_components and name in frozen_components:
                    logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
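# Usage sketch (illustrative, not part of the original module): drive
# init_nlp() from a config file on disk. The "config.cfg" path and the
# output directory are assumptions for the example; Config comes from
# thinc.api (imported above).
def _example_init_nlp() -> None:
    config = Config().from_disk("config.cfg")
    nlp = init_nlp(config, use_gpu=-1)  # -1 runs on CPU
    # Persist the initialized pipeline, analogous to the setup step of
    # the `spacy train` CLI
    nlp.to_disk("output/model-initial")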
def validate_init_settings(
    func: Callable,
    settings: Dict[str, Any],
    *,
    section: Optional[str] = None,
    name: str = "",
    exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
    """Validate initialization settings against the expected arguments in
    the method signature. Will parse values if possible (e.g. int to string)
    and return the updated settings dict. Will raise a ConfigValidationError
    if types don't match or required values are missing.

    func (Callable): The initialize method of a given component etc.
    settings (Dict[str, Any]): The settings from the respective [initialize] block.
    section (str): Initialize section, for error message.
    name (str): Name of the block in the section.
    exclude (Iterable[str]): Parameter names to exclude from schema.
    RETURNS (Dict[str, Any]): The validated settings.
    """
    schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
    try:
        return schema(**settings).dict()
    except ValidationError as e:
        block = "initialize" if not section else f"initialize.{section}"
        title = f"Error validating initialization settings in [{block}]"
        raise ConfigValidationError(
            title=title, errors=e.errors(), config=settings, parent=name
        ) from None
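# Usage sketch (illustrative): validate an [initialize.components] block
# against a component's initialize() signature before applying it. The
# "textcat" pipe and its "labels" setting are assumptions for the example.
def _example_validate_init_settings(nlp: "Language") -> None:
    textcat = nlp.get_pipe("textcat")
    settings = {"labels": ["POSITIVE", "NEGATIVE"]}
    # Raises ConfigValidationError if the settings don't match the signature;
    # otherwise returns the parsed settings, ready to pass to initialize()
    validated = validate_init_settings(
        textcat.initialize, settings, section="components", name="textcat"
    )
    assert validated["labels"] == ["POSITIVE", "NEGATIVE"]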
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    try:
        vectors_nlp = load_model(name)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        err = ConfigValidationError.from_error(e, title=title, desc=desc)
        raise err from None
    if len(vectors_nlp.vocab.vectors.keys()) == 0:
        logger.warning(Warnings.W112.format(name=name))
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    for lex in nlp.vocab:
        lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
    if add_strings:
        # Make sure the strings for the vector keys are present in the target
        # vocab, e.g. so a similarity query can resolve a key back to its text
        for key in nlp.vocab.vectors.key2row:
            if key in vectors_nlp.vocab.strings:
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    try:
        # Load with the same vocab, which automatically adds the vectors to
        # the current nlp object. Exclude lookups so they are not modified.
        exclude = ["lookups"]
        if not add_strings:
            exclude.append("strings")
        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        err = ConfigValidationError.from_error(e, title=title, desc=desc)
        raise err from None
    if (
        len(vectors_nlp.vocab.vectors.keys()) == 0
        and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
    ) or (
        vectors_nlp.vocab.vectors.shape[0] == 0
        and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
    ):
        logger.warning(Warnings.W112.format(name=name))
    for lex in nlp.vocab:
        lex.rank = nlp.vocab.vectors.key2row.get(
            lex.orth, OOV_RANK
        )  # type: ignore[attr-defined]
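# Usage sketch (illustrative): pull packaged vectors into a fresh pipeline.
# Assumes a vectors package such as "en_core_web_md" is installed; any
# pipeline package or path that ships vectors would work the same way.
def _example_load_vectors() -> None:
    import spacy

    nlp = spacy.blank("en")
    load_vectors_into_model(nlp, "en_core_web_md")
    assert nlp.vocab.vectors.shape[0] > 0  # vectors are now shared with nlp.vocab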
def init_tok2vec(
    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
) -> bool:
    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
    P = pretrain_config
    I = init_config
    weights_data = None
    init_tok2vec = ensure_path(I["init_tok2vec"])
    if init_tok2vec is not None:
        if not init_tok2vec.exists():
            err = f"can't find pretrained tok2vec: {init_tok2vec}"
            errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
            raise ConfigValidationError(config=nlp.config, errors=errors)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()
    if weights_data is not None:
        layer = get_tok2vec_ref(nlp, P)
        layer.from_bytes(weights_data)
        logger.info(f"Loaded pretrained weights from {init_tok2vec}")
        return True
    return False
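# Usage sketch (illustrative): the shape of the two config blocks that
# init_tok2vec() consumes. The literal values are assumptions; in practice
# they come from the resolved [pretraining] and [initialize] sections, and
# get_tok2vec_ref() resolves the component/layer named in [pretraining].
def _example_init_tok2vec(nlp: "Language") -> None:
    pretrain_config = {"component": "tok2vec", "layer": ""}
    init_config = {"init_tok2vec": "pretrain/model-last.bin"}
    if init_tok2vec(nlp, pretrain_config, init_config):
        logger.info("Pretrained tok2vec weights were loaded")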