def create_pretraining_model(nlp, pretrain_config):
    """Assemble the model that is optimized during pretraining.

    The network is the configured tok2vec layer with an objective-specific
    output layer stacked on top. The tok2vec layer has to map a batch of Doc
    objects (a list) to a list of arrays holding one row per token. It is kept
    as a reference, so only that part is serialized to file and read back in
    by the 'train' command.

    nlp: The pipeline object. It is initialized here while
        `select_pipes(enable=[])` is active.
    pretrain_config: Mapping with the pretraining settings. "objective" and
        "dropout" are read here; "component" and "layer" are only read to
        build the error message when the tok2vec layer cannot be initialized.
    RETURNS: The initialized pretraining model with its dropout rate set.
    """
    sample_text = "Give it a doc to infer shapes"
    with nlp.select_pipes(enable=[]):
        nlp.initialize()
    tok2vec = get_tok2vec_ref(nlp, pretrain_config)
    # If the config pointed at a Tok2VecListener, use the model of the
    # component it listens to instead ("*" falls back to "tok2vec").
    if type(tok2vec).__name__ == "Tok2VecListener":
        if tok2vec.upstream_name != "*":
            upstream = tok2vec.upstream_name
        else:
            upstream = "tok2vec"
        tok2vec = nlp.get_pipe(upstream).model
    try:
        tok2vec.initialize(X=[nlp.make_doc(sample_text)])
    except ValueError:
        component = pretrain_config["component"]
        layer = pretrain_config["layer"]
        raise ValueError(Errors.E874.format(component=component, layer=layer))
    build_objective = pretrain_config["objective"]
    pretrain_model = build_objective(nlp.vocab, tok2vec)
    pretrain_model.initialize(X=[nlp.make_doc(sample_text)])
    set_dropout_rate(pretrain_model, pretrain_config["dropout"])
    return pretrain_model
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Learn from a batch of documents and gold-standard information,
    updating the pipe's model. Delegates to get_loss.

    The entities of the predicted docs are replaced temporarily (by the
    aligned gold entities if `use_gold_ents` is set, otherwise by the
    matching entities) and are always put back before this method returns
    or raises.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/entitylinker#update
    """
    self.validate_kb()
    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    if not examples:
        return losses
    validate_examples(examples, "EntityLinker.update")

    set_dropout_rate(self.model, drop)
    docs = [eg.predicted for eg in examples]
    # save to restore later
    old_ents = [doc.ents for doc in docs]
    try:
        for doc, ex in zip(docs, examples):
            if self.use_gold_ents:
                ents, _ = ex.get_aligned_ents_and_ner()
                doc.ents = ents
            else:
                # only keep matching ents
                doc.ents = ex.get_matching_ents()

        # make sure we have something to learn from, if not, short-circuit
        if not self.batch_has_learnable_example(examples):
            return losses

        sentence_encodings, bp_context = self.model.begin_update(docs)
    finally:
        # Restore the ents on every exit path, including the short-circuit
        # above and errors, so the predicted docs are never left modified.
        for doc, old in zip(docs, old_ents):
            doc.ents = old

    loss, d_scores = self.get_loss(
        sentence_encodings=sentence_encodings, examples=examples
    )
    bp_context(d_scores)
    if sgd is not None:
        self.finish_update(sgd)
    losses[self.name] += loss
    return losses
def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
    """Run the model on a batch and hand output and callbacks to listeners.

    All listeners but the last receive a callback that only accumulates
    their gradients; the last listener receives the callback that runs the
    backward pass and, if given, the optimizer.

    examples (iterable): A batch of examples; only `predicted` is used.
    drop (float): The dropout rate.
    sgd (callable): An optimizer.
    losses (dict): Optional record of the loss, keyed by component name.
    set_annotations (bool): Whether to call `set_annotations` with the
        model output after the listeners were notified.
    RETURNS (dict): The updated losses dictionary.
    """
    if losses is None:
        losses = {}
    docs = [eg.predicted for eg in examples]
    set_dropout_rate(self.model, drop)
    trf_full, bp_trf_full = self.model.begin_update(docs)
    d_tensors = []
    losses.setdefault(self.name, 0.0)

    def accumulate_gradient(d_trf_datas: List[TransformerData]):
        """Accumulate tok2vec loss and gradient. This is passed as a callback
        to all but the last listener. Only the last one does the backprop.
        """
        for i, d_trf_data in enumerate(d_trf_datas):
            for d_tensor in d_trf_data.tensors:
                losses[self.name] += float((d_tensor**2).sum())  # type: ignore
            if i >= len(d_tensors):
                # Copy into a list of our own: the slots are reassigned
                # below, which must neither alias the listener's container
                # nor fail when `tensors` is a tuple.
                d_tensors.append(list(d_trf_data.tensors))
            else:
                for j, d_tensor in enumerate(d_trf_data.tensors):
                    d_tensors[i][j] += d_tensor

    def backprop(d_trf_datas: List[TransformerData]):
        """Callback to actually do the backprop. Passed to last listener."""
        nonlocal d_tensors
        accumulate_gradient(d_trf_datas)
        d_trf_full = trf_full.unsplit_by_doc(d_tensors)
        d_docs = bp_trf_full(d_trf_full)
        if sgd is not None:
            self.model.finish_update(sgd)
        # Reset the accumulator once the gradients have been consumed.
        d_tensors = []
        return d_docs

    batch_id = TransformerListener.get_batch_id(docs)
    for listener in self.listeners[:-1]:
        listener.receive(batch_id, trf_full.doc_data, accumulate_gradient)
    # Without listeners there is nobody to hand the backprop callback to.
    if self.listeners:
        self.listeners[-1].receive(batch_id, trf_full.doc_data, backprop)
    if set_annotations:
        self.set_annotations(docs, trf_full)
    return losses
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
):
    """Run the model on a batch and pass output and callbacks to listeners.

    Every listener except the last one gets a callback that only adds its
    gradient to a shared buffer; the last listener gets the callback that
    also runs the backward pass and, if given, the optimizer.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/tok2vec#update
    """
    if losses is None:
        losses = {}
    validate_examples(examples, "Tok2Vec.update")
    docs = [eg.predicted for eg in examples]
    set_dropout_rate(self.model, drop)
    tokvecs, bp_tokvecs = self.model.begin_update(docs)

    def alloc_like_tokvecs():
        """Allocate one 2d array per output array, with matching shapes."""
        return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

    d_tokvecs = alloc_like_tokvecs()
    losses.setdefault(self.name, 0.0)

    def accumulate_gradient(one_d_tokvecs):
        """Add one listener's gradient to the buffer and track the loss.
        Used as the callback for all listeners but the last one.
        """
        for idx, d_one in enumerate(one_d_tokvecs):
            d_tokvecs[idx] += d_one
            losses[self.name] += float((d_one ** 2).sum())
        return alloc_like_tokvecs()

    def backprop(one_d_tokvecs):
        """Callback for the last listener: accumulate, then backprop."""
        accumulate_gradient(one_d_tokvecs)
        d_docs = bp_tokvecs(d_tokvecs)
        if sgd is not None:
            self.finish_update(sgd)
        return d_docs

    batch_id = Tok2VecListener.get_batch_id(docs)
    for listener in self.listeners[:-1]:
        listener.receive(batch_id, tokvecs, accumulate_gradient)
    if self.listeners:
        self.listeners[-1].receive(batch_id, tokvecs, backprop)
    return losses
def get_updated_model():
    """Build a model, train it for five steps and return it.

    Reads `seed`, `model_func`, `kwargs`, `dropout` and `get_X` from the
    enclosing scope, which is not visible from here. Also checks that the
    parameters actually changed during the updates.
    """
    fix_random_seed(seed)
    optimizer = Adam(0.001)
    model = model_func(**kwargs).initialize()
    params_before = get_all_params(model)
    set_dropout_rate(model, dropout)
    for _ in range(5):
        outputs, backprop = model.begin_update(get_X())
        d_outputs = get_gradient(model, outputs)
        backprop(d_outputs)
        model.finish_update(optimizer)
    params_after = get_all_params(model)
    # The comparison has to fail: equal arrays would mean nothing was learned.
    with pytest.raises(AssertionError):
        assert_array_equal(params_before, params_after)
    return model
def rehearse(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Perform a "rehearsal" update from a batch of data. Rehearsal updates
    teach the current model to make predictions similar to an initial model,
    to try to address the "catastrophic forgetting" problem. This feature is
    experimental.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary. A new
        dictionary is created when `losses` is not provided.

    DOCS: https://spacy.io/api/textcategorizer#rehearse
    """
    # Always work on a dict so that every return path honours the annotated
    # return type, also when the caller did not pass one.
    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    if self._rehearsal_model is None:
        return losses
    validate_examples(examples, "TextCategorizer.rehearse")
    self._validate_categories(examples)
    docs = [eg.predicted for eg in examples]
    if not any(len(doc) for doc in docs):
        # Handle cases where there are no tokens in any docs.
        return losses
    set_dropout_rate(self.model, drop)
    scores, bp_scores = self.model.begin_update(docs)
    # The rehearsal model gets the same docs as the current model. Calling
    # the model object directly would need an explicit `is_train` argument,
    # and no backward pass is needed for the target.
    target = self._rehearsal_model.predict(docs)
    gradient = scores - target
    bp_scores(gradient)
    if sgd is not None:
        self.finish_update(sgd)
    losses[self.name] += float((gradient**2).sum())
    return losses
def create_pretraining_model(nlp, pretrain_config):
    """Assemble the model that is optimized during pretraining.

    The network is the tok2vec layer of the configured component with an
    objective-specific output layer stacked on top. The tok2vec layer has to
    map a batch of Doc objects (a list) to a list of arrays holding one row
    per token. It is kept as a reference, so only that part is serialized to
    file and read back in by the 'train' command.

    nlp: The pipeline object; it is initialized here.
    pretrain_config: Mapping with the pretraining settings. Reads
        "component", "objective" and "dropout", plus the optional "layer"
        naming a reference inside the component's model.
    RETURNS: The initialized pretraining model with its dropout rate set.
    """
    nlp.initialize()
    component = nlp.get_pipe(pretrain_config["component"])
    # Without a (non-empty) layer name the whole component model is used.
    if not pretrain_config.get("layer"):
        tok2vec = component.model
    else:
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    build_objective = pretrain_config["objective"]
    pretrain_model = build_objective(nlp.vocab, tok2vec)
    sample_doc = nlp.make_doc("Give it a doc to infer shapes")
    pretrain_model.initialize(X=[sample_doc])
    set_dropout_rate(pretrain_model, pretrain_config["dropout"])
    return pretrain_model
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Learn from a batch of documents and gold-standard information,
    updating the pipe's model. Delegates to the suggester and get_loss.

    Returns early, without touching the model, if no predicted doc has any
    tokens or if the suggester proposes no spans at all.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/spancategorizer#update
    """
    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    validate_examples(examples, "SpanCategorizer.update")
    self._validate_categories(examples)
    has_tokens = any(
        len(eg.predicted) if eg.predicted else 0 for eg in examples
    )
    if not has_tokens:
        # Nothing to learn from when every doc is empty.
        return losses
    docs = [eg.predicted for eg in examples]
    spans = self.suggester(docs, ops=self.model.ops)
    if spans.lengths.sum() == 0:
        return losses
    set_dropout_rate(self.model, drop)
    model_input = (docs, spans)
    scores, backprop_scores = self.model.begin_update(model_input)
    loss, d_scores = self.get_loss(examples, (spans, scores))
    backprop_scores(d_scores)  # type: ignore
    if sgd is not None:
        self.finish_update(sgd)
    losses[self.name] += loss
    return losses
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Learn from a batch of documents and gold-standard information,
    updating the pipe's model. Delegates to get_loss.

    For every reference entity with a KB ID, a window of sentences around
    the entity is cut out of the predicted doc and used as model input.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/entitylinker#update
    """
    self.validate_kb()
    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    if not examples:
        return losses
    validate_examples(examples, "EntityLinker.update")
    sentence_docs = []
    for eg in examples:
        sentences = list(eg.reference.sents)
        kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
        for ent in eg.reference.ents:
            # The KB ID of the first token stands for the whole span;
            # entities without one give nothing to train on.
            if not kb_ids[ent.start]:
                continue
            try:
                # locate the entity's sentence among the doc's sentences
                sent_index = sentences.index(ent.sent)
            except AttributeError:
                # ent.sent is None: raise a user-friendly error instead
                raise RuntimeError(Errors.E030) from None
            # up to n_sents sentences before and after, clipped to the doc
            first_sent = max(0, sent_index - self.n_sents)
            last_sent = min(len(sentences) - 1, sent_index + self.n_sents)
            # token offsets of the window
            start_token = sentences[first_sent].start
            end_token = sentences[last_sent].end
            # the window becomes a training doc of its own
            sentence_docs.append(eg.predicted[start_token:end_token].as_doc())
    set_dropout_rate(self.model, drop)
    if not sentence_docs:
        warnings.warn(Warnings.W093.format(name="Entity Linker"))
        return losses
    sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
    loss, d_scores = self.get_loss(
        sentence_encodings=sentence_encodings, examples=examples
    )
    bp_context(d_scores)
    if sgd is not None:
        self.finish_update(sgd)
    losses[self.name] += loss
    return losses
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Prepare for an update to the transformer.

    Like the `Tok2Vec` component, the `Transformer` component has no
    "gold standard" annotations to compute a weight update from: the best
    transformer output is unknown, since it is a hidden layer that is
    updated by backpropagating from output layers.

    This method therefore does not update any weights itself. It runs the
    transformer model and passes the output plus a callback to the
    downstream components connected through the TransformerListener
    sublayer. With several listeners, only the last one backpropagates to
    the transformer and calls the optimizer; the others just add to the
    gradients.

    examples (Iterable[Example]): A batch of Example objects. Only the
        `predicted` doc object is used, the reference doc is ignored.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/transformer#update
    """
    validate_examples(examples, "Transformer.update")
    if losses is None:
        losses = {}
    docs = [eg.predicted for eg in examples]
    if isinstance(docs, Doc):
        docs = [docs]
    if not any(len(doc) for doc in docs):
        # No doc has any tokens, so there is nothing to run the model on.
        return losses
    set_dropout_rate(self.model, drop)
    trf_full, bp_trf_full = self.model.begin_update(docs)
    d_tensors = []
    losses.setdefault(self.name, 0.0)

    def accumulate_gradient(d_trf_datas: List[TransformerData]):
        """Add one listener's gradients to the buffer and track the loss.
        Used as the callback for all listeners but the last one.
        """
        nonlocal d_tensors
        for doc_idx, d_trf_data in enumerate(d_trf_datas):
            grads = d_trf_data.tensors
            for grad in grads:
                # type: ignore
                losses[self.name] += float((grad ** 2).sum())
            if doc_idx >= len(d_tensors):
                # First gradient for this position: start a list of our own.
                d_tensors.append(list(grads))
                continue
            for slot, grad in enumerate(grads):
                d_tensors[doc_idx][slot] += grad

    def backprop(d_trf_datas: List[TransformerData]):
        """Callback for the last listener: accumulate, then backprop."""
        nonlocal d_tensors
        accumulate_gradient(d_trf_datas)
        d_trf_full = trf_full.unsplit_by_doc(d_tensors)
        d_docs = bp_trf_full(d_trf_full)
        if sgd is not None:
            self.model.finish_update(sgd)
        d_tensors = []
        return d_docs

    batch_id = TransformerListener.get_batch_id(docs)
    doc_data = trf_full.doc_data
    for listener in self.listeners[:-1]:
        listener.receive(batch_id, doc_data, accumulate_gradient)
    if self.listeners:
        self.listeners[-1].receive(batch_id, doc_data, backprop)
    return losses
def debug_model(
    config,
    resolved_train_config,
    nlp,
    model: Model,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Initialize a model, train it for a few steps and print its state.

    config: The config the "train_corpus" dot name is resolved against.
    resolved_train_config: Mapping whose "train_corpus" entry is read.
    nlp: The pipeline object that gets initialized.
    model (Model): The Thinc model to analyse.
    print_settings (Optional[Dict[str, Any]]): Flags selecting what to print
        ("print_before_training", "print_after_init",
        "print_after_training", "print_prediction"); also forwarded to the
        model printer.
    """
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: report the untouched model
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: initialize, preferably with the training corpus
    X = _get_docs()
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            # Fall back to dummy docs and a made-up output dimension.
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                    exits=1,
                )
    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: a few update steps against simulated gold output
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec listeners
    tok2vec = None
    listens_to_tok2vec = (
        model.has_ref("tok2vec")
        and model.get_ref("tok2vec").name == "tok2vec-listener"
    )
    if listens_to_tok2vec:
        tok2vec = nlp.get_pipe("tok2vec")
    goldY = None
    for _ in range(3):
        if tok2vec:
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        # The gold output is simulated once, from the first forward pass.
        if goldY is None:
            goldY = _simulate_gold(Y)
        get_dX(get_gradient(goldY, Y, model.ops))
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Succesfully ended analysis - model looks good.")
def test_set_dropout_2(model_with_no_args):
    """A model named "dropout" gets its "dropout_rate" attr overwritten."""
    new_rate = 0.2
    model = model_with_no_args
    model.name = "dropout"
    model.attrs["dropout_rate"] = 0.0
    set_dropout_rate(model, new_rate)
    assert model.attrs["dropout_rate"] == new_rate
def test_set_dropout():
    """A fresh Dropout layer starts at rate 0.0 and takes the new rate."""
    new_rate = 0.2
    layer = Dropout()
    assert layer.attrs["dropout_rate"] == 0.0
    set_dropout_rate(layer, new_rate)
    assert layer.attrs["dropout_rate"] == new_rate
def debug_model(
    config,
    resolved_train_config,
    nlp,
    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Initialize a component's model, update it a few times and print it.

    config: The config the "train_corpus" dot name is resolved against.
    resolved_train_config: Mapping whose "train_corpus" entry is read.
    nlp: The pipeline object that gets initialized.
    pipe: The component to analyse; it has to expose a Thinc Model as
        `pipe.model`.
    print_settings (Optional[Dict[str, Any]]): Flags selecting what to print
        ("print_before_training", "print_after_init",
        "print_after_training", "print_prediction"); also forwarded to the
        model printer.
    """
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{pipe}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: report the untouched model
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: initialize, preferably with the training corpus
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
            # Only the first five corpus examples are used for the updates.
            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            # Fall back to dummy docs and a made-up output dimension.
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )
    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: a few update steps
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec/Transformer listeners
    upstream_component = None
    if model.has_ref("tok2vec"):
        listener_name = model.get_ref("tok2vec").name
        if "tok2vec-listener" in listener_name:
            upstream_component = nlp.get_pipe("tok2vec")
        if "transformer-listener" in listener_name:
            upstream_component = nlp.get_pipe("transformer")
    for _ in range(3):
        # The upstream component is updated first, then the pipe itself.
        if upstream_component:
            upstream_component.update(examples)
        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    predicted_docs = [ex.predicted for ex in examples]
    prediction = model.predict(predicted_docs)
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good("Succesfully ended analysis - model looks good.")