def util_batch_unbatch_docs_array(
    model: Model[List[Doc], Array2d], in_data: List[Doc], out_data: Array2d
):
    """Verify that predicting a batch of docs gives (almost) the same array
    rows as predicting each doc individually."""
    with data_validation(True):
        model.initialize(in_data, out_data)
        batched = model.predict(in_data).tolist()
        one_by_one = []
        for doc in in_data:
            one_by_one.append(model.predict([doc])[0])
        assert_almost_equal(batched, one_by_one, decimal=4)
def util_batch_unbatch_docs_list(
    model: Model[List[Doc], List[Array2d]],
    in_data: List[Doc],
    out_data: List[Array2d],
):
    """Verify that predicting a batch of docs gives (almost) the same per-doc
    arrays as predicting each doc individually.

    Fix: replaced the `for i in range(len(...))` index loop over two parallel
    lists with idiomatic `zip` pairing.
    """
    with data_validation(True):
        model.initialize(in_data, out_data)
        Y_batched = model.predict(in_data)
        Y_not_batched = [model.predict([u])[0] for u in in_data]
        # One prediction per input doc, so the two sequences pair up 1:1.
        for y_batch, y_single in zip(Y_batched, Y_not_batched):
            assert_almost_equal(y_batch, y_single, decimal=4)
def util_batch_unbatch_docs_ragged(
    model: Model[List[Doc], Ragged], in_data: List[Doc], out_data: Ragged
):
    """Verify that predicting a batch of docs yields (almost) the same ragged
    data rows as predicting each doc individually and concatenating."""
    with data_validation(True):
        model.initialize(in_data, out_data)
        batched = model.predict(in_data)
        # Flatten the per-doc predictions into one row list, mirroring the
        # concatenated layout of the batched Ragged output.
        one_by_one = [
            row for doc in in_data for row in model.predict([doc]).data.tolist()
        ]
        assert_almost_equal(batched.data, one_by_one, decimal=4)
def debug_model(
    config,
    resolved_train_config,
    nlp,
    model: Model,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Analyse a Thinc Model: optionally print its internals before training,
    after initialization, after a few update steps, and after a prediction.

    config / resolved_train_config: config objects; `train_corpus` is resolved
        from `resolved_train_config` as a dot-name.
    nlp: pipeline object used for initialization and pipe lookup.
    model: the Thinc Model to analyse; exits via `msg.fail` if it is not one.
    print_settings: flags selecting which steps to print
        ("print_before_training", "print_after_init",
        "print_after_training", "print_prediction").

    Fixes: typo in the final message ("Succesfully" -> "Successfully"),
    f-strings with no placeholders turned into plain strings, unused loop
    variable `e` renamed to `_`.
    """
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    X = _get_docs()
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            # No usable train corpus: fall back to dummy docs with a fixed
            # output dimension.
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                    exits=1,
                )
    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec listeners: update the upstream tok2vec
    # pipe before each model update.
    tok2vec = None
    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
        tok2vec = nlp.get_pipe("tok2vec")
    goldY = None
    for _ in range(3):
        if tok2vec:
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        if goldY is None:
            # Derive a fixed gold target from the first prediction so the
            # gradient stays consistent across the three updates.
            goldY = _simulate_gold(Y)
        dY = get_gradient(goldY, Y, model.ops)
        get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Successfully ended analysis - model looks good.")
def debug_model(
    config,
    resolved_train_config,
    nlp,
    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Analyse the Thinc Model held by a pipeline component: optionally print
    its internals before training, after initialization, after a few update
    steps, and after a prediction.

    config / resolved_train_config: config objects; `train_corpus` is resolved
        from `resolved_train_config` as a dot-name.
    nlp: pipeline object used for initialization and pipe lookup.
    pipe: component exposing a `.model` attribute; exits via `msg.fail`
        otherwise, or if `.model` is not a Thinc Model.
    print_settings: flags selecting which steps to print
        ("print_before_training", "print_after_init",
        "print_after_training", "print_prediction").

    Fixes: typo in the final message ("Succesfully" -> "Successfully"),
    f-strings with no placeholders turned into plain strings, unused loop
    variable `e` renamed to `_`, hoisted the repeated
    `model.get_ref("tok2vec").name` lookup.
    """
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{pipe}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
            # Keep a small sample of examples for the update/predict steps.
            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            # No usable train corpus: fall back to dummy docs with a fixed
            # output dimension.
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )
    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec/Transformer listeners: update the
    # upstream component before each pipe update.
    upstream_component = None
    if model.has_ref("tok2vec"):
        listener_name = model.get_ref("tok2vec").name
        if "tok2vec-listener" in listener_name:
            upstream_component = nlp.get_pipe("tok2vec")
        # A transformer listener takes precedence if both substrings match.
        if "transformer-listener" in listener_name:
            upstream_component = nlp.get_pipe("transformer")
    for _ in range(3):
        if upstream_component:
            upstream_component.update(examples)
        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict([ex.predicted for ex in examples])
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Successfully ended analysis - model looks good.")