def main(pytorch: bool = False, gpu_id: int = -1):
    """Prepare the benchmark model and data for each enabled backend.

    pytorch (bool): Whether to run the "pytorch" backend.
    gpu_id (int): GPU to require; negative values skip the GPU set-up.
    """
    global CONFIG
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id)
        print("Set GPU", gpu_id)
    for name, enabled in {"pytorch": pytorch}.items():
        if not enabled:
            print(f"Skipping {name}")
            continue
        set_backend(name, gpu_id)
        resolved = registry.resolve(Config().from_str(CONFIG))
        model = resolved["model"]
        X, Y = get_dummy_data(**resolved["data"])
        print("Copy to device")
        X = [model.ops.asarray(inputs) for inputs in X]
        Y = [model.ops.asarray(targets) for targets in Y]
        print("Begin init", len(X))
        model.initialize(X=X[:5])
        print("Pre-batch")
        n_words = sum(map(len, X))
        # Apply the first layer up front, then remove it from the model.
        first_layer = model.layers[0]
        X = [first_layer.predict(batch) for batch in model.ops.minibatch(16, X)]
        model.layers.pop(0)
        print("Start")
        # No work happens between the two timer reads.
        start_time = timer()
        end_time = timer()
        print(name, n_words, end_time - start_time)
def test_issue5551(textcat_config):
    """Check that three identically seeded textcat runs predict the same scores."""
    component = "textcat"
    pipe_cfg = Config().from_str(textcat_config)
    text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
    annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}

    def run_once():
        # Build, train for one step and predict with a freshly seeded pipeline.
        fix_random_seed(0)
        nlp = English()
        pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
        for label in set(annots["cats"]):
            pipe.add_label(label)
        nlp.initialize()
        doc = nlp.make_doc(text)
        nlp.update([Example.from_dict(doc, annots)])
        return pipe.model.predict([doc])[0]

    results = [run_once() for _ in range(3)]
    # With a fixed seed every run has to produce the same prediction
    assert len(results) == 3
    ops = get_current_ops()
    for other in results[1:]:
        assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(other))
def test_LSTM_learns():
    """Check that one SGD step lowers the squared error of a padded LSTM."""
    fix_random_seed(0)
    nO = 2
    nI = 2

    def sgd(key, weights, gradient):
        # In-place update of the weights; the gradient is zeroed afterwards.
        weights -= 0.001 * gradient
        return weights, gradient * 0

    model = with_padded(LSTM(nO, nI))

    def as_row(values):
        return model.ops.asarray(values, dtype="f").reshape((1, -1))

    X = [as_row(x) for x in ([0.1, 0.1], [0.2, 0.2], [0.3, 0.3])]
    Y = [as_row(y) for y in ([0.2, 0.2], [0.3, 0.3], [0.4, 0.4])]

    def squared_error(predictions):
        return sum([((yh - y) ** 2).sum() for yh, y in zip(predictions, Y)])

    def residuals(predictions):
        return [yh - y for yh, y in zip(predictions, Y)]

    model = model.initialize(X, Y)
    # Loss before any update
    Yhs, bp_Yhs = model.begin_update(X)
    loss1 = squared_error(Yhs)
    # One forward/backward pass followed by the optimizer step
    Yhs, bp_Yhs = model.begin_update(X)
    dXs = bp_Yhs(residuals(Yhs))
    model.finish_update(sgd)
    # Loss after the update
    Yhs, bp_Yhs = model.begin_update(X)
    dXs = bp_Yhs(residuals(Yhs))  # noqa: F841
    loss2 = squared_error(Yhs)
    assert loss1 > loss2, (loss1, loss2)
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Build an nlp object from a config and initialize it for training.

    config (Config): The config; it must contain "seed" and "gpu_allocator"
        keys in its [training] section.
    use_gpu (int): The GPU allocator is only set when this is >= 0 and the
        config names an allocator.
    RETURNS (Language): The initialized nlp object.
    RAISES (ValueError): If "seed" or "gpu_allocator" is missing from
        [training].
    RAISES (ConfigValidationError): If the resolved train_corpus or
        dev_corpus setting is not a string.
    """
    raw_config = config
    config = raw_config.interpolate()
    # Both keys are required, even if their value is None.
    if "seed" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    # The corpus settings are passed to resolve_dot_names below, so they
    # have to be strings.
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.train_corpus", type=type(T["train_corpus"])
            )
        )
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.dev_corpus", type=type(T["dev_corpus"])
            )
        )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    # Frozen and resumed components are excluded from initialization.
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.initialize(lambda: train_corpus(nlp), 
                       sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
            for listener in proc.listening_components:
                # Listener frozen, but the component it listens to is not
                if listener in frozen_components and name not in frozen_components:
                    logger.warning(Warnings.W087.format(name=name, listener=listener))
                # We always check this regardless, in case user freezes tok2vec
                if listener not in frozen_components and name in frozen_components:
                    logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
def test_overfitting_IO():
    # Quickly overfit the single-label textcat component to verify the ML
    # model trains, survives a round trip to disk and predicts consistently
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat")
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_SINGLE_LABEL
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 2

    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat"] < 0.01

    # predictions of the trained model
    test_text = "I am happy."
    cats = nlp(test_text).cats
    assert cats["POSITIVE"] > 0.9
    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)

    # predictions have to be unchanged after saving and loading
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        cats2 = nlp2(test_text).cats
        assert cats2["POSITIVE"] > 0.9
        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)

    # scoring on the training data
    scores = nlp.evaluate(train_examples)
    for score_name in ("cats_micro_f", "cats_macro_f", "cats_macro_auc", "cats_score"):
        assert scores[score_name] == 1.0
    assert "cats_score_desc" in scores

    # nlp.pipe run twice, and nlp() called per text, must agree
    texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_cats = [nlp(text).cats for text in texts]
    for other_run in (batch_cats_2, no_batch_cats):
        for cats_1, cats_2 in zip(batch_cats_1, other_run):
            for cat in cats_1:
                assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
def test_models_initialize_consistently(seed, model_func, kwargs):
    """Check that two models built under the same seed get equal parameters."""

    def build_seeded_model():
        # Re-seed right before constructing and initializing the model.
        fix_random_seed(seed)
        model = model_func(**kwargs)
        model.initialize()
        return model

    first = build_seeded_model()
    second = build_seeded_model()
    first_params = get_all_params(first)
    second_params = get_all_params(second)
    assert_array_equal(first_params, second_params)
def test_cat_readers(reader, additional_config):
    """Run a small train/evaluate loop over the corpora produced by `reader`.

    reader: Value written to the "@readers" key of the [corpora] section.
    additional_config: Extra settings merged into the [corpora] section.
    """
    # One section header or key per line: the config format is line-based,
    # so a header and its keys cannot share a single line.
    nlp_config_string = """
    [training]
    seed = 0

    [training.score_weights]
    cats_macro_auc = 1.0

    [corpora]
    @readers = "PLACEHOLDER"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat_multilabel"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat_multilabel]
    factory = "textcat_multilabel"
    """
    config = Config().from_str(nlp_config_string)
    fix_random_seed(config["training"]["seed"])
    # Swap the placeholder for the reader under test
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # this shouldn't fail if each training example has at least one positive label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)
    # simulate performance benchmark on dev corpus
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # this shouldn't fail if each dev example has at least one positive label
        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
    scores = nlp.evaluate(dev_examples)
    assert scores["cats_score"]
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
def test_overfitting_IO_multi():
    # Quickly overfit the multi-label textcat component to verify the ML
    # model trains, survives a round trip to disk and predicts consistently
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat_multilabel")
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_MULTI_LABEL
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 3

    for _ in range(100):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat_multilabel"] < 0.01

    # predictions of the trained model
    test_text = "I am confused but happy."
    cats = nlp(test_text).cats
    assert cats["HAPPY"] > 0.9
    assert cats["CONFUSED"] > 0.9

    # predictions have to be unchanged after saving and loading
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        cats2 = nlp2(test_text).cats
        assert cats2["HAPPY"] > 0.9
        assert cats2["CONFUSED"] > 0.9

    # scoring on the training data
    scores = nlp.evaluate(train_examples)
    for score_name in ("cats_micro_f", "cats_macro_f"):
        assert scores[score_name] == 1.0
    assert "cats_score_desc" in scores

    # nlp.pipe run twice, and nlp() called per text, must agree
    texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
    batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_deps = [nlp(text).cats for text in texts]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
def get_updated_model():
    """Build a seeded model, run five Adam updates and return it.

    Asserts along the way that the updates changed the parameters.
    """
    fix_random_seed(seed)
    optimizer = Adam(0.001)
    model = model_func(**kwargs).initialize()
    params_before = get_all_params(model)
    set_dropout_rate(model, dropout)
    for _step in range(5):
        outputs, backprop = model.begin_update(get_X())
        backprop(get_gradient(model, outputs))
        model.finish_update(optimizer)
    params_after = get_all_params(model)
    # The comparison is expected to fail: training must move the parameters.
    with pytest.raises(AssertionError):
        assert_array_equal(params_before, params_after)
    return model
def _train_parser(parser):
    """Initialize `parser` with the "left" label, update it five times on one
    four-word example and return it."""
    fix_random_seed(1)
    parser.add_label("left")
    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)
    for _ in range(5):
        losses = {}
        # Fresh Doc and annotations for every update step
        example = Example.from_dict(
            Doc(parser.vocab, words=["a", "b", "c", "d"]),
            {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]},
        )
        parser.update([example], sgd=sgd, losses=losses)
    return parser
def test_multibatch():
    """Exercise ops.multibatch with arrays, lists, shuffling and bad inputs."""
    fix_random_seed(0)
    ops = get_current_ops()
    arr1 = numpy.asarray([1, 2, 3, 4])
    arr2 = numpy.asarray([5, 6, 7, 8])

    # arrays, in order
    ordered = list(ops.multibatch(2, arr1, arr2))
    expected_rows = [[1, 2], [5, 6], [3, 4], [7, 8]]
    assert numpy.concatenate(ordered).tolist() == expected_rows

    # arrays, shuffled: only the batch structure is checked
    shuffled = list(ops.multibatch(2, arr1, arr2, shuffle=True))
    assert len(shuffled) == 2
    for batch in shuffled:
        assert len(batch) == 2

    # plain lists
    from_lists = list(ops.multibatch(2, [1, 2, 3, 4], [5, 6, 7, 8]))
    assert from_lists == [[[1, 2], [5, 6]], [[3, 4], [7, 8]]]

    # generators are rejected, alone or mixed with arrays
    with pytest.raises(ValueError):
        ops.multibatch(10, (i for i in range(100)), (i for i in range(100)))
    with pytest.raises(ValueError):
        ops.multibatch(10, arr1, (i for i in range(100)), arr2)
def test_resize_same_results(name, textcat_config):
    # Adding a label to a trained textcat must leave the scores of the
    # existing labels untouched until the model is trained further
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe(name, config={"model": textcat_config})
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_SINGLE_LABEL
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.maybe_get_dim("nO") in [2, None]

    def train_five_steps():
        for _ in range(5):
            nlp.update(train_examples, sgd=optimizer, losses={})

    train_five_steps()

    # scores before resizing
    test_text = "I am happy."
    before = nlp(test_text)
    assert len(before.cats) == 2
    pos_pred = before.cats["POSITIVE"]
    neg_pred = before.cats["NEGATIVE"]

    # scores right after resizing
    textcat.add_label("NEUTRAL")
    resized = nlp(test_text)
    assert len(resized.cats) == 3
    assert resized.cats["POSITIVE"] == pos_pred
    assert resized.cats["NEGATIVE"] == neg_pred
    assert resized.cats["NEUTRAL"] <= 1

    train_five_steps()

    # scores after training further with the new label
    retrained = nlp(test_text)
    assert len(retrained.cats) == 3
    assert retrained.cats["POSITIVE"] != pos_pred
    assert retrained.cats["NEGATIVE"] != neg_pred
    for cat in retrained.cats:
        assert retrained.cats[cat] <= 1
def test_minibatch():
    """Exercise ops.minibatch with fixed and variable sizes and bad inputs."""
    fix_random_seed(0)
    ops = get_current_ops()
    items = [1, 2, 3, 4, 5, 6]

    def shrinking_sizes():
        # A fresh generator of batch sizes for every call
        return (i for i in (3, 2, 1))

    # fixed batch size
    assert list(ops.minibatch(3, items)) == [[1, 2, 3], [4, 5, 6]]

    # batch sizes drawn from a generator
    assert list(ops.minibatch(shrinking_sizes(), items)) == [[1, 2, 3], [4, 5], [6]]

    # array input yields array batches
    array_batches = list(ops.minibatch(3, numpy.asarray(items)))
    assert isinstance(array_batches[0], numpy.ndarray)
    assert numpy.array_equal(array_batches[0], numpy.asarray([1, 2, 3]))
    assert numpy.array_equal(array_batches[1], numpy.asarray([4, 5, 6]))

    # shuffling changes the contents but keeps the batch sizes
    shuffled = list(ops.minibatch(shrinking_sizes(), items, shuffle=True))
    assert shuffled != [[1, 2, 3], [4, 5], [6]]
    assert len(shuffled[0]) == 3
    assert len(shuffled[1]) == 2
    assert len(shuffled[2]) == 1

    # inputs that are rejected
    with pytest.raises(ValueError):
        ops.minibatch(10, (i for i in range(100)))
    with pytest.raises(ValueError):
        ops.minibatch(10, True)
def main(numpy: bool = False,
         pytorch: bool = False,
         generic: bool = False,
         gpu_id: int = -1):
    """Time the forward and forward/backward passes for each enabled backend.

    numpy (bool): Whether to run the "numpy" backend.
    pytorch (bool): Whether to run the "pytorch" backend.
    generic (bool): Whether to run the "generic" backend.
    gpu_id (int): GPU to require; negative values skip the GPU set-up.
    """
    global CONFIG
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id)
        print("Set GPU", gpu_id)
    # Backends run in this order: pytorch, numpy, generic.
    selected = {"pytorch": pytorch, "numpy": numpy, "generic": generic}
    for name, enabled in selected.items():
        if not enabled:
            print(f"Skipping {name}")
            continue
        set_backend(name, gpu_id)
        print("Getting data")
        resolved = registry.resolve(Config().from_str(CONFIG))
        model = resolved["model"]
        X, Y = get_dummy_data(**resolved["data"])
        print("Copy to device")
        X = [model.ops.asarray(inputs) for inputs in X]
        Y = [model.ops.asarray(targets) for targets in Y]
        print("Begin init", len(X))
        model.initialize(X=X[:5])
        print("Pre-batch")
        n_words = sum(map(len, X))
        # Apply the first layer up front, then remove it from the model.
        batches = [
            (model.layers[0].predict(inputs), targets)
            for inputs, targets in model.ops.multibatch(16, X, Y)
        ]
        model.layers.pop(0)
        print("Start")
        # Forward pass only
        start_time = timer()
        total = run_forward(model, [inputs for inputs, _ in batches])
        end_time = timer()
        print(name, n_words, total, end_time - start_time)
        # Forward and backward pass
        start_time = timer()
        total = run_forward_backward(model, batches)
        end_time = timer()
        print(name, n_words, total, end_time - start_time)
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    """Check that two models built under the same seed predict identically."""

    def seeded_prediction():
        # Re-seed, build and initialize a model, and predict once.
        fix_random_seed(seed)
        model = model_func(**kwargs).initialize()
        return model, model.predict(get_X())

    model1, Y1 = seeded_prediction()
    model2, Y2 = seeded_prediction()

    if model1.has_ref("tok2vec"):
        tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
        tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
        for i, rows in enumerate(tok2vec1):
            for j, row in enumerate(rows):
                assert_array_equal(
                    numpy.asarray(model1.ops.to_numpy(row)),
                    numpy.asarray(model2.ops.to_numpy(tok2vec2[i][j])),
                )

    # Best-effort conversion; outputs that can't be converted stay as they are.
    try:
        Y1 = model1.ops.to_numpy(Y1)
        Y2 = model2.ops.to_numpy(Y2)
    except Exception:
        pass

    if isinstance(Y1, numpy.ndarray):
        assert_array_equal(Y1, Y2)
        return
    if not isinstance(Y1, List):
        raise ValueError(f"Could not compare type {type(Y1)}")

    assert len(Y1) == len(Y2)
    for y1, y2 in zip(Y1, Y2):
        try:
            y1 = model1.ops.to_numpy(y1)
            y2 = model2.ops.to_numpy(y2)
        except Exception:
            pass
        assert_array_equal(y1, y2)
def evaluate(
    model: str,
    data_path: Path,
    output: Optional[Path] = None,
    use_gpu: int = -1,
    gold_preproc: bool = False,
    displacy_path: Optional[Path] = None,
    displacy_limit: int = 25,
    silent: bool = True,
    spans_key: str = "sc",
) -> Dict[str, Any]:
    """Score a pipeline on a corpus, print a results table and return the data.

    model (str): Passed to util.load_model to load the pipeline.
    data_path (Path): Location of the evaluation data; must exist.
    output (Optional[Path]): If set, the collected data is written here as JSON.
    use_gpu (int): Forwarded to setup_gpu.
    gold_preproc (bool): Forwarded to the Corpus reader.
    displacy_path (Optional[Path]): If set, an existing directory that
        rendered parses are written to.
    displacy_limit (int): Number of examples taken from the corpus for rendering.
    silent (bool): Disables printing of messages.
    spans_key (str): Key used in the names of the span score entries.
    RETURNS (Dict[str, Any]): The collected scores.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    fix_random_seed()
    setup_gpu(use_gpu, silent=silent)
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)

    corpus = Corpus(data_path, gold_preproc=gold_preproc)
    nlp = util.load_model(model)
    dev_dataset = list(corpus(nlp))
    scores = nlp.evaluate(dev_dataset)

    # Table label -> key in the scores dict, in display order
    metrics = {
        "TOK": "token_acc",
        "TAG": "tag_acc",
        "POS": "pos_acc",
        "MORPH": "morph_acc",
        "LEMMA": "lemma_acc",
        "UAS": "dep_uas",
        "LAS": "dep_las",
        "NER P": "ents_p",
        "NER R": "ents_r",
        "NER F": "ents_f",
        "TEXTCAT": "cats_score",
        "SENT P": "sents_p",
        "SENT R": "sents_r",
        "SENT F": "sents_f",
        "SPAN P": f"spans_{spans_key}_p",
        "SPAN R": f"spans_{spans_key}_r",
        "SPAN F": f"spans_{spans_key}_f",
        "SPEED": "speed",
    }
    results = {}
    data = {}
    for label, key in metrics.items():
        if key not in scores:
            continue
        value = scores[key]
        if key == "cats_score":
            label = label + " (" + scores.get("cats_score_desc", "unk") + ")"
        if not isinstance(value, (int, float)):
            formatted = "-"
        elif key == "speed":
            formatted = f"{value:.0f}"
        else:
            # All other numeric scores are shown scaled by 100
            formatted = f"{value*100:.2f}"
        results[label] = formatted
        data[re.sub(r"[\s/]", "_", key.lower())] = value
    msg.table(results, title="Results")
    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
        limited = dev_dataset[:displacy_limit]
        docs = list(nlp.pipe(ex.reference.text for ex in limited))
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps="parser" in factory_names,
            ents="ner" in factory_names,
        )
        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)

    if output_path is not None:
        srsly.write_json(output_path, data)
        msg.good(f"Saved results to {output_path}")
    return data
def train(
    nlp: "Language",
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
    stdout: IO = sys.stdout,
    stderr: IO = sys.stderr,
) -> Tuple["Language", Optional[Path]]:
    """Train a pipeline.

    nlp (Language): The initialized nlp object with the full config.
    output_path (Path): Optional output path to save trained model to.
    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
        before calling this function.
    stdout (file): A file-like object to write output messages. To disable
        printing, set to io.StringIO.
    stderr (file): A second file-like object to write output messages. To
        disable printing, set to io.StringIO.

    RETURNS (tuple): The final nlp object and the path to the exported model.
        The path is None when no output_path was given.
    """
    # We use no_print here so we can respect the stdout/stderr options.
    msg = Printer(no_print=True)
    # Apply the seed and GPU allocator from the interpolated config, then
    # resolve the [training] section.
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])

    # Helper function to save checkpoints. This is a closure for convenience,
    # to avoid passing in all the args all the time.
    def save_checkpoint(is_best):
        # Only called when output_path is not None (see the call sites below).
        with nlp.use_params(optimizer.averages):
            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
        if is_best:
            # Avoid saving twice (saving will be more expensive than
            # the dir copy)
            if (output_path / DIR_MODEL_BEST).exists():
                shutil.rmtree(output_path / DIR_MODEL_BEST)
            shutil.copytree(output_path / DIR_MODEL_LAST,
                            output_path / DIR_MODEL_BEST)

    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Components that should set annotations on update
    annotating_components = T["annotating_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(nlp, train_corpus, batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
        annotating_components=annotating_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(
            msg.info(f"Frozen components: {frozen_components}") + "\n")
    if annotating_components:
        stdout.write(
            msg.info(f"Set annotations on update for: {annotating_components}")
            + "\n")
    stdout.write(
        msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
    with nlp.select_pipes(disable=frozen_components):
        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
    try:
        for batch, info, is_best_checkpoint in training_step_iterator:
            # A None value means this step produced no checkpoint; the
            # logger is then called with None instead of the step info.
            if is_best_checkpoint is not None:
                with nlp.select_pipes(disable=frozen_components):
                    update_meta(T, nlp, info)
                if output_path is not None:
                    save_checkpoint(is_best_checkpoint)
                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
            log_step(info if is_best_checkpoint is not None else None)
    except Exception as e:
        if output_path is not None:
            stdout.write(
                msg.warn(f"Aborting and saving the final best model. "
                         f"Encountered exception: {repr(e)}") + "\n")
        raise e
    finally:
        # Runs on success and on failure: close the logger and write the
        # last model state.
        finalize_logger()
        if output_path is not None:
            save_checkpoint(False)
    # This will only run if we didn't hit an error
    # NOTE(review): save_checkpoint uses nlp.use_params as a context manager;
    # here the call's result is discarded - confirm this has any effect.
    if optimizer.averages:
        nlp.use_params(optimizer.averages)
    if output_path is not None:
        stdout.write(
            msg.good("Saved pipeline to output directory",
                     output_path / DIR_MODEL_LAST) + "\n")
        return (nlp, output_path / DIR_MODEL_LAST)
    else:
        return (nlp, None)
def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    silent: bool = True,
):
    """Run the pretraining loop and write model weights and a log per epoch.

    config (Config): Config with a [training] section (seed, gpu_allocator)
        and a [pretraining] section.
    output_dir (Path): Directory that receives the "model{epoch}.bin" files
        and "log.jsonl".
    resume_path (Optional[Path]): Weights to resume from, if any.
    epoch_resume (Optional[int]): Epoch to start from; reset to 0 when no
        resume_path is given.
    use_gpu (int): The GPU allocator is only set when this is >= 0 and the
        config names an allocator.
    silent (bool): Disables printing of messages.
    """
    msg = Printer(no_print=silent)
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    _config = nlp.config.interpolate()
    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
    # P["corpus"] is a dot name pointing into the config; look it up and
    # resolve it into the corpus callable.
    corpus = dot_to_object(_config, P["corpus"])
    corpus = registry.resolve({"corpus": corpus})["corpus"]
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    # NOTE(review): if resume_path is given but epoch_resume is None, the
    # range() call below receives None - confirm callers always pass both.
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume, silent=silent)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0
    objective = model.attrs["loss"]
    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(
        f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r")
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Write the tok2vec weights for this epoch (".temp" marks
        # intermediate saves) and append one entry to log.jsonl.
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir /
                  f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            # Intermediate save every n_save_every batches (including batch 0)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        # End of epoch: save the model and reset the per-epoch loss
        _save_model(epoch)
        tracker.epoch_loss = 0.0
def debug_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    component: str = Arg(
        ...,
        help=
        "Name of the pipeline component of which the model should be analysed"
    ),
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
    attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
    P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
    P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Analyze a Thinc model implementation. Includes checks for internal structure
    and activations during training.

    DOCS: https://spacy.io/api/cli#debug-model
    """
    setup_gpu(use_gpu)
    layers = string_to_list(layers, intify=True)
    # Collect the display flags into one dict that is handed to debug_model.
    print_settings = {
        "dimensions": dimensions,
        "parameters": parameters,
        "gradients": gradients,
        "attributes": attributes,
        "layers": layers,
        "print_before_training": P0,
        "print_after_init": P1,
        "print_after_training": P2,
        "print_prediction": P3,
    }
    # Extra command-line arguments are treated as config overrides.
    config_overrides = parse_config_overrides(ctx.args)
    with show_validation_error(config_path):
        raw_config = util.load_config(config_path,
                                      overrides=config_overrides,
                                      interpolate=False)
    config = raw_config.interpolate()
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    with show_validation_error(config_path):
        nlp = util.load_model_from_config(raw_config)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    seed = T["seed"]
    if seed is not None:
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
    pipe = nlp.get_pipe(component)
    # Only components that hold a model can be analysed.
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{component}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    debug_model(config, T, nlp, model, print_settings=print_settings)
loaded_config = api.registry.make_from_config(config) batch_size = loaded_config['training']['batch_size'] n_iter = loaded_config['training']['n_iter'] # dataset (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() cowsay.cow(f"Training size={len(train_X)}, dev size={len(dev_X)}") # model model = api.Softmax() model.initialize(X=train_X, Y=train_Y) cowsay.cow( f"Initialized model with input dimension " f"nI={model.get_dim('nI')} and output dimension nO={model.get_dim('nO')}") api.fix_random_seed(0) optimizer = loaded_config['optimizer'] print("Training") for _ in range(n_iter): for X, Y in model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True): Yh, backprop = model.begin_update(X) backprop(Yh - Y) model.finish_update(optimizer) print("Testing") n_correct = 0 n_total = 0 for X, Y in model.ops.multibatch(batch_size, dev_X, dev_Y, shuffle=True):
# Training configuration. Each section header and each key sits on its own
# line: the format is line-based, so headers and keys cannot share a line.
CONFIG = """
[hyper_params]
width = 32
vector_width = 16
learn_rate = 0.001

[training]
n_iter = 10
batch_size = 128

[model]
@call = cnn_tagger
width = ${hyper_params.width}
vector_width = ${hyper_params.vector_width}
nr_classes = 17

[optimizer]
@call = thinc.api:Adam
learn_rate = ${hyper_params.learn_rate}
"""

# Seed first so that model creation and training are reproducible.
fix_random_seed(0)
# Load the config, build the model and optimizer from it, then train.
oh.config.load_str(CONFIG)
model = oh.config.model()
optimizer = oh.config.optimizer()
n_iter = oh.config.training["n_iter"]
batch_size = oh.config.training["batch_size"]
train_model(model, optimizer, n_iter, batch_size)