def test_resolve_dot_names():
    """Check dot-name resolution for a valid path and for an unknown path.

    A valid dotted path has to resolve to an Optimizer instance. A request
    that mixes one unknown path with a valid one has to raise a
    ConfigValidationError that reports only the unknown location.
    """
    training_section = {"optimizer": {"@optimizers": "Adam.v1"}}
    config = {
        "training": training_section,
        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
    }

    # Happy path: a single existing dotted name.
    resolved = util.resolve_dot_names(config, ["training.optimizer"])
    assert isinstance(resolved[0], Optimizer)

    # Failure path: "training.xyz" does not exist in the config.
    requested = ["training.xyz", "training.optimizer"]
    with pytest.raises(ConfigValidationError) as exc_info:
        util.resolve_dot_names(config, requested)
    reported = exc_info.value.errors
    assert len(reported) == 1
    assert reported[0]["loc"] == ["training", "xyz"]
def __init__(
    self,
    config: Config,
    *,
    rank: int = 0,
    num_workers: int = 1,
    use_gpu: int = 0,
    ray=None,
):
    """Build the pipeline, resolve the training settings and reset counters.

    config (Config): Configuration for the pipeline; it is wrapped in a new
        Config before being passed to init_nlp.
    rank (int): Stored on the instance as `rank`.
    num_workers (int): Stored on the instance as `num_workers`.
    use_gpu (int): Passed to `_resolve_gpu` to obtain `gpu_id`. The
        configured GPU allocator is only set when this value is >= 0.
    ray: Optional ray module or stand-in. When omitted, ray is imported
        lazily here.
    """
    if ray is None:
        # Avoid importing ray in the module. This allows a test-ray to
        # be passed in, and speeds up the CLI.
        import ray  # type: ignore
    # Keep a handle on whichever ray object is in use (injected or imported).
    self.ray = ray
    self.rank = rank
    self.num_workers = num_workers
    self.gpu_id = self._resolve_gpu(use_gpu)
    self.nlp = init_nlp(Config(config), use_gpu=self.gpu_id)

    # Work on the interpolated config of the freshly built pipeline.
    interpolated = self.nlp.config.interpolate()
    self.T = registry.resolve(
        interpolated["training"], schema=ConfigSchemaTraining
    )
    corpus_names = [self.T["train_corpus"], self.T["dev_corpus"]]
    corpora = resolve_dot_names(interpolated, corpus_names)
    self.train_corpus, self.dev_corpus = corpora
    self.before_to_disk = create_before_to_disk_callback(
        self.T["before_to_disk"]
    )

    gpu_allocator = self.T["gpu_allocator"]
    if use_gpu >= 0 and gpu_allocator:
        set_gpu_allocator(gpu_allocator)

    # Evaluation state: a no-op callback until a real one is installed.
    self._evaluation_callback = lambda: {}
    self._results = []
    self._has_evaluation_callback = False

    # Runtime state that starts out empty.
    self.thread = None
    self.proxy = None
    self.n_grads_used = 0
    self.n_grads_discarded = 0
def test_readers():
    """Run a tiny textcat pipeline end to end using a custom corpus reader.

    A reader named "myreader.v1" is registered that exposes four corpora
    ("train", "dev", "extra", "something"), each yielding one fixed example.
    The test resolves the train/dev corpora via dot names, simulates a
    training loop, evaluates on the dev corpus, and finally checks that the
    additional "extra" corpus can be resolved from the config as well.
    """
    config_string = """
    [training]

    [corpora]
    @readers = "myreader.v1"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat]
    factory = "textcat"
    """

    @registry.readers.register("myreader.v1")
    def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
        annots = {"cats": {"POS": 1.0, "NEG": 0.0}}

        def reader(nlp: Language):
            # Plain literal: there is nothing to interpolate here.
            doc = nlp.make_doc("This is an example")
            return [Example.from_dict(doc, annots)]

        return {
            "train": reader,
            "dev": reader,
            "extra": reader,
            "something": reader,
        }

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(
        nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
    )
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    assert callable(train_corpus)
    optimizer = T["optimizer"]

    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        nlp.update([example], sgd=optimizer)
    scores = nlp.evaluate(list(dev_corpus(nlp)))
    assert scores["cats_macro_auc"] == 0.0

    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats

    # Corpora beyond train/dev must be resolvable from the config too.
    corpora = {"corpora": nlp.config.interpolate()["corpora"]}
    extra_corpus = registry.resolve(corpora)["corpora"]["extra"]
    assert callable(extra_corpus)
def test_cat_readers(reader, additional_config):
    """Train and evaluate a multilabel textcat pipeline on a given reader.

    reader: Registered reader name written into the "@readers" slot of the
        corpora section.
    additional_config: Extra settings merged into the corpora section.

    Every training and dev example is expected to carry exactly the label
    values 0.0 and 1.0, i.e. at least one negative and one positive label.
    """
    nlp_config_string = """
    [training]
    seed = 0

    [training.score_weights]
    cats_macro_auc = 1.0

    [corpora]
    @readers = "PLACEHOLDER"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat_multilabel"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat_multilabel]
    factory = "textcat_multilabel"
    """
    config = Config().from_str(nlp_config_string)
    fix_random_seed(config["training"]["seed"])
    # Swap the placeholder for the reader under test and add its settings.
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    optimizer = T["optimizer"]

    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # this only holds if each training example has both a positive (1.0)
        # and a negative (0.0) label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)

    # simulate performance benchmark on dev corpus
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # this only holds if each dev example has both a positive (1.0)
        # and a negative (0.0) label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
    scores = nlp.evaluate(dev_examples)
    assert scores["cats_score"]

    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
def debug_model(
    config,
    resolved_train_config,
    nlp,
    model: Model,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Initialize, briefly update and run a Thinc model, printing each stage.

    config: Config passed to resolve_dot_names to look up the train corpus.
    resolved_train_config: Mapping read for its "train_corpus" entry.
    nlp: Pipeline used for initialization and to fetch the "tok2vec" pipe.
    model (Model): The Thinc model to analyse. Anything else is reported
        through msg.fail with exits=1.
    print_settings (Optional[Dict[str, Any]]): Flags selecting which steps
        print ("print_before_training", "print_after_init",
        "print_after_training", "print_prediction"); the dict is also
        forwarded to _print_model. Defaults to an empty dict.
    """
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    X = _get_docs()
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            # Fall back to dummy data when the training corpus can't be used.
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    nlp.initialize(
                        lambda: [Example.from_dict(x, {}) for x in X]
                    )
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                    exits=1,
                )

    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec listeners
    tok2vec = None
    if (
        model.has_ref("tok2vec")
        and model.get_ref("tok2vec").name == "tok2vec-listener"
    ):
        tok2vec = nlp.get_pipe("tok2vec")
    goldY = None
    for _ in range(3):
        if tok2vec is not None:
            # Update the upstream pipe first, before the model's forward pass.
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        if goldY is None:
            # The simulated gold is derived once, from the first output.
            goldY = _simulate_gold(Y)
        dY = get_gradient(goldY, Y, model.ops)
        get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Successfully ended analysis - model looks good.")
def debug_model(
    config,
    resolved_train_config,
    nlp,
    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Initialize, briefly update and run a component's model, with printing.

    config: Config passed to resolve_dot_names to look up the train corpus.
    resolved_train_config: Mapping read for its "train_corpus" entry.
    nlp: Pipeline used for initialization and to fetch upstream pipes.
    pipe: Component whose `model` attribute is analysed. A missing attribute
        or a non-Model value is reported through msg.fail with exits=1.
    print_settings (Optional[Dict[str, Any]]): Flags selecting which steps
        print ("print_before_training", "print_after_init",
        "print_after_training", "print_prediction"); the dict is also
        forwarded to _print_model. Defaults to an empty dict.
    """
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{pipe}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
            # Only the first five examples are used for the later steps.
            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            # Fall back to dummy data when the training corpus can't be used.
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )

    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec/Transformer listeners
    upstream_component = None
    if model.has_ref("tok2vec"):
        listener_name = model.get_ref("tok2vec").name
        if "tok2vec-listener" in listener_name:
            upstream_component = nlp.get_pipe("tok2vec")
        # Checked second on purpose: a transformer listener takes precedence.
        if "transformer-listener" in listener_name:
            upstream_component = nlp.get_pipe("transformer")
    for _ in range(3):
        if upstream_component is not None:
            # Update the upstream pipe first, before the component itself.
            upstream_component.update(examples)
        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict([ex.predicted for ex in examples])
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Successfully ended analysis - model looks good.")