def make_update(
    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
) -> float:
    """Perform an update over a single batch of documents.

    model (Model): The model to update.
    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    objective_func (callable): Computes the loss and gradients from the
        model's predictions for the batch.
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # Don't want to return a cupy object here.
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss.
    return float(loss)
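# Usage sketch (not part of the original code): one way `make_update` might be
# driven from a minibatch loop. `model`, `docs`, and `objective_func` are
# assumed to come from the surrounding pretraining setup; `run_epoch` is a
# hypothetical helper shown only for illustration.
from spacy.util import minibatch
from thinc.api import Adam


def run_epoch(model, docs, objective_func, optimizer, batch_size=32):
    """Run `make_update` over `docs` in batches and return the mean batch loss."""
    losses = []
    for batch in minibatch(docs, size=batch_size):
        # One forward/backward pass plus an optimizer step per batch.
        losses.append(make_update(model, batch, optimizer, objective_func))
    return sum(losses) / max(len(losses), 1)


# e.g. run_epoch(model, docs, objective_func, Adam(0.001))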
def debug_model(
    config,
    resolved_train_config,
    nlp,
    model: Model,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    X = _get_docs()
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                    exits=1,
                )
    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    # Ugly hack to deal with Tok2Vec listeners
    tok2vec = None
    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
        tok2vec = nlp.get_pipe("tok2vec")
    goldY = None
    for e in range(3):
        if tok2vec:
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        if goldY is None:
            goldY = _simulate_gold(Y)
        dY = get_gradient(goldY, Y, model.ops)
        get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Successfully ended analysis - model looks good.")
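# Usage sketch (not part of the original code): how `debug_model` might be
# invoked on one trainable component of a pipeline built from a config file.
# The config path and the "textcat" component name are placeholders; the
# `print_settings` keys mirror the flags checked above.
import spacy.util


def debug_textcat_model(config_path="config.cfg"):
    config = spacy.util.load_config(config_path)
    nlp = spacy.util.load_model_from_config(config, auto_fill=True)
    pipe = nlp.get_pipe("textcat")  # any trainable component with a Thinc model
    print_settings = {
        "print_before_training": True,
        "print_after_init": True,
        "print_after_training": True,
        "print_prediction": True,
    }
    debug_model(
        config,
        config["training"],
        nlp,
        pipe.model,
        print_settings=print_settings,
    )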
def train_model(
    model: Model,
    *,
    train: Sequence[Tuple[str, str]],
    test: Sequence[Tuple[str, str]],
    n_iter: int,
    batch_size: int | thinc.types.Generator = 32,
    learn_rate: float | List[float] | thinc.types.Generator = 0.001,
) -> Model:
    """
    Args:
        model: Thinc model to train in-place.
        train: Training data as a sequence of (text, language) pairs.
        test: Held-out data as a sequence of (text, language) pairs.
        n_iter: Number of epochs to train for.
        batch_size: Fixed batch size, or a generator yielding batch sizes.
        learn_rate: Fixed learning rate, or a list/generator of learning rates.

    Returns:
        The trained model.
    """
    # Binarize language labels.
    # NOTE: thinc seems to require type "float32" arrays for training labels;
    # it errors otherwise... :/
    lb = sklearn.preprocessing.LabelBinarizer()
    lb.fit([lang for _, lang in train])
    # THIS NEXT LINE IS CRITICAL: we need to save the training class labels,
    # but we don't want to keep this label binarizer around; so, add them to the model.
    model.layers[-1].attrs["classes"] = lb.classes_
    Y_train = lb.transform([lang for _, lang in train]).astype("float32")
    Y_test = lb.transform([lang for _, lang in test])
    # make sure data is on the right device?
    # Y_train = model.ops.asarray(Y_train)
    # Y_test = model.ops.asarray(Y_test)
    X_train = [text for text, _ in train]
    X_test = [text for text, _ in test]

    losser = thinc.api.CategoricalCrossentropy(normalize=True)
    optimizer = thinc.api.Adam(learn_rate)
    model.initialize(X=X_train[:10], Y=Y_train[:10])
    print(f"{'epoch':>5} {'loss':>8} {'score':>8}")
    # Iterate over epochs.
    for n in range(n_iter):
        loss = 0.0
        # Iterate over batches.
        batches = model.ops.multibatch(batch_size, X_train, Y_train, shuffle=True)
        for X, Y in tqdm(batches, leave=False):
            Yh, backprop = model.begin_update(X)
            dYh, loss_batch = losser(Yh, Y)
            loss += loss_batch
            backprop(dYh)
            model.finish_update(optimizer)
            optimizer.step_schedules()
        if optimizer.averages:
            with model.use_params(optimizer.averages):
                score = evaluate_model(
                    model, X_test=X_test, Y_test=Y_test, batch_size=1000
                )
        else:
            score = evaluate_model(model, X_test=X_test, Y_test=Y_test, batch_size=1000)
        print(f"{n:>5} {loss:>8.3f} {score:>8.3f}")

    if optimizer.averages:
        with model.use_params(optimizer.averages):
            pred_langs = models.get_model_preds(
                model, X_test, model.layers[-1].attrs["classes"]
            )
    else:
        pred_langs = models.get_model_preds(
            model, X_test, model.layers[-1].attrs["classes"]
        )
    true_langs = list(lb.inverse_transform(Y_test))
    print(sklearn.metrics.classification_report(true_langs, pred_langs))
    return model
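# Usage sketch (not part of the original code): calling `train_model` on toy
# (text, language) pairs. `build_lang_id_model()` is a hypothetical builder
# standing in for whatever model-construction helper the surrounding module
# provides; real training would use a much larger corpus.
train_data = [
    ("This is an English sentence.", "en"),
    ("Ceci est une phrase en français.", "fr"),
    ("Dies ist ein deutscher Satz.", "de"),
] * 100
test_data = [
    ("Another English sentence.", "en"),
    ("Une autre phrase en français.", "fr"),
    ("Noch ein deutscher Satz.", "de"),
] * 10

model = build_lang_id_model()  # hypothetical; not defined in this snippet
model = train_model(
    model,
    train=train_data,
    test=test_data,
    n_iter=3,
    batch_size=32,
    learn_rate=0.001,
)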