def create_optimizer(config_path):
    msg.info(f"Loading config from: {config_path}")
    # Load the config twice: first without creating objects, so the random
    # seed can be fixed *before* any model objects are instantiated.
    config = util.load_config(config_path, create_objects=False)
    util.fix_random_seed(config["training"]["seed"])
    config = util.load_config(config_path, create_objects=True)
    training = config["training"]
    return training["optimizer"]

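# A minimal usage sketch for create_optimizer above. The config path is
# illustrative, and the surrounding script is assumed to import `msg` (wasabi)
# and `util` (spacy.util); this is not part of the original file.
optimizer = create_optimizer("config.cfg")  # hypothetical config path
print("learn rate:", optimizer.learn_rate)  # thinc's Optimizer exposes learn_rate
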
def main(
    width: int,
    embed_size: int,
    vectors: str,
    init_tok2vec=None,
    train_iters=30,
    train_examples=1000,
    eval_examples=5000,
    batch_size=4,
    dropout=0.0,
    learn_rate=0.15,
    b1=0.0,
    b2_ratio=0.0,
    adam_eps=0.0,
    L2=0.0,
    grad_norm_clip=1.0,
):
    opt_params = get_opt_params(locals())
    random.seed(0)
    fix_random_seed(0)
    use_gpu = prefer_gpu()
    print("Using GPU?", use_gpu)
    nlp = create_pipeline("en", width, embed_size, vectors)
    train_textcat(
        nlp,
        train_examples,
        eval_examples,
        opt_params,
        dropout=dropout,
        batch_size=batch_size,
        n_iter=train_iters,
        init_tok2vec=init_tok2vec,
    )

def test_zero_suggestions():
    # Test with a suggester that returns 0 suggestions
    @registry.misc("test_zero_suggester")
    def make_zero_suggester():
        def zero_suggester(docs, *, ops=None):
            if ops is None:
                ops = get_current_ops()
            return Ragged(
                ops.xp.zeros((0, 0), dtype="i"),
                ops.xp.zeros((len(docs),), dtype="i"),
            )

        return zero_suggester

    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe(
        "spancat",
        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
    )
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}
    nlp.update(train_examples, sgd=optimizer)

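# The helpers make_examples / make_get_examples and the SPAN_KEY constant used
# by the spancat tests in this section are defined elsewhere in the test
# module. A plausible minimal reconstruction, inferred from the assertions
# (two labels, LOC and PERSON, spans stored under SPAN_KEY); the SPAN_KEY
# value and training sentences here are illustrative assumptions.
from spacy.training import Example

SPAN_KEY = "labeled_spans"  # assumed value for illustration

TRAIN_DATA = [
    ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
    ("I like London and Berlin.", {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}}),
]

def make_examples(nlp, data=TRAIN_DATA):
    # Build Example objects with gold spans stored under SPAN_KEY
    return [Example.from_dict(nlp.make_doc(text), annots) for text, annots in data]

def make_get_examples(nlp):
    examples = make_examples(nlp)
    return lambda: examples
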
def train_ner(
    output_dir: str,
    train_data_path: str,
    dev_data_path: str,
    test_data_path: str,
    run_test: bool = None,
    model: str = None,
    n_iter: int = 10,
    meta_overrides: str = None,
):
    util.fix_random_seed(util.env_opt("seed", 0))
    train_data = read_ner_from_tsv(train_data_path)
    dev_data = read_ner_from_tsv(dev_data_path)
    test_data = read_ner_from_tsv(test_data_path)
    os.makedirs(output_dir, exist_ok=True)

    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides)

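# read_ner_from_tsv comes from the surrounding project's data utilities. The
# real function may return spaCy-format examples; this sketch only shows the
# assumed input shape (CoNLL-style "token<TAB>tag" lines, blank lines between
# sentences) and is not the original implementation.
def read_ner_from_tsv_sketch(path):
    examples = []
    tokens, tags = [], []
    with open(path, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # blank line ends the current sentence
                if tokens:
                    examples.append((tokens, tags))
                    tokens, tags = [], []
                continue
            token, tag = line.split("\t")
            tokens.append(token)
            tags.append(tag)
    if tokens:
        examples.append((tokens, tags))
    return examples
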
def test_issue6177():
    """Test that after fixing the random seed, the results of the pipeline are truly identical"""
    # NOTE: no need to transform this code to v3 when 'master' is merged into 'develop'.
    # A similar test exists already for v3: test_issue5551
    # This is just a backport
    results = []
    for i in range(3):
        fix_random_seed(0)
        nlp = English()
        example = (
            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
            {"cats": {"Label1": 1.0, "Label2": 0.0, "Label3": 0.0}},
        )
        textcat = nlp.create_pipe("textcat")
        nlp.add_pipe(textcat)
        for label in set(example[1]["cats"]):
            textcat.add_label(label)
        nlp.begin_training()
        # Store the result of each iteration
        result = textcat.model.predict([nlp.make_doc(example[0])])
        results.append(list(result[0]))
    # All results should be the same because of the fixed seed
    assert len(results) == 3
    assert results[0] == results[1]
    assert results[0] == results[2]

def _train_parser(parser):
    fix_random_seed(1)
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(5):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser

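# A plausible pytest fixture for the `parser` argument consumed by
# _train_parser above, mirroring spaCy v2's test setup; the fixture itself is
# an assumption and not part of the original file.
import pytest
from spacy.vocab import Vocab
from spacy.pipeline import DependencyParser

@pytest.fixture
def parser():
    # a blank v2-style dependency parser with a fresh vocab
    return DependencyParser(Vocab())
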
# (max_positive, nr_results) are supplied by a pytest parametrize decorator not shown here.
def test_make_spangroup(max_positive, nr_results):
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe(
        "spancat",
        config={"spans_key": SPAN_KEY, "threshold": 0.5, "max_positive": max_positive},
    )
    doc = nlp.make_doc("Greater London")
    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    labels = ["Thing", "City", "Person", "GreatCity"]
    scores = numpy.asarray(
        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
    )
    spangroup = spancat._make_span_group(doc, indices, scores, labels)
    assert len(spangroup) == nr_results

    # first span is always the second token "London"
    assert spangroup[0].text == "London"
    assert spangroup[0].label_ == "City"
    assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)

    # second span depends on the number of positives that were allowed
    assert spangroup[1].text == "Greater London"
    if max_positive == 1:
        assert spangroup[1].label_ == "GreatCity"
        assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
    else:
        assert spangroup[1].label_ == "Thing"
        assert_almost_equal(0.8, spangroup.attrs["scores"][1], 5)

    if nr_results > 2:
        assert spangroup[2].text == "Greater London"
        if max_positive == 2:
            assert spangroup[2].label_ == "GreatCity"
            assert_almost_equal(0.9, spangroup.attrs["scores"][2], 5)
        else:
            assert spangroup[2].label_ == "City"
            assert_almost_equal(0.7, spangroup.attrs["scores"][2], 5)

    assert spangroup[-1].text == "Greater London"
    assert spangroup[-1].label_ == "GreatCity"
    assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)

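# A minimal sketch (not spaCy's actual implementation) of the threshold /
# max_positive selection the test above exercises: for each candidate span,
# keep labels scoring above the threshold, capped at max_positive per span.
# The real component may order surviving labels differently; only the
# selected set is the point here. All names are illustrative.
import numpy

def select_spans(scores, labels, threshold=0.5, max_positive=None):
    selected = []
    for row, row_scores in enumerate(scores):
        # label indices above the threshold, best score first
        order = numpy.argsort(row_scores)[::-1]
        kept = [i for i in order if row_scores[i] > threshold]
        if max_positive is not None:
            kept = kept[:max_positive]
        selected.extend((row, labels[i], float(row_scores[i])) for i in kept)
    return selected

scores = numpy.asarray(
    [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
)
labels = ["Thing", "City", "Person", "GreatCity"]
# Row 0 ("Greater") has no score above 0.5, so it yields no spans; with
# max_positive=None, row 2 ("Greater London") keeps three labels.
print(select_spans(scores, labels, threshold=0.5, max_positive=None))
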
def test_overfitting_IO():
    # Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 2
    assert len(spans.attrs["scores"]) == 2
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {"London", "Berlin"}
    assert set([span.label_ for span in spans]) == {"LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 2
        assert len(spans2.attrs["scores"]) == 2
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {"London", "Berlin"}
        assert set([span.label_ for span in spans2]) == {"LOC"}

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_p"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_r"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0

    # also test that the spancat works for just a single entity in a sentence
    doc = nlp("London")
    assert len(doc.spans[spancat.key]) == 1

def test_overfitting_IO_overlapping():
    # Test for overfitting on overlapping entities
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 3
    assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 3
    assert len(spans.attrs["scores"]) == 3
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {
        "London",
        "Berlin",
        "London and Berlin",
    }
    assert set([span.label_ for span in spans]) == {"LOC", "DOUBLE_LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 3
        assert len(spans2.attrs["scores"]) == 3
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {
            "London",
            "Berlin",
            "London and Berlin",
        }
        assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}

def test_simple_train():
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    get_examples = make_get_examples(nlp)
    nlp.initialize(get_examples)
    sgd = nlp.create_optimizer()
    assert len(spancat.labels) != 0
    for i in range(40):
        losses = {}
        nlp.update(list(get_examples()), losses=losses, drop=0.1, sgd=sgd)
    doc = nlp("I like London and Berlin.")
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    assert len(doc.spans[spancat.key]) == 2
    assert doc.spans[spancat.key][0].text == "London"
    scores = nlp.evaluate(get_examples())
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0

def main(
    pretrain_epoch: int,
    train_iters=30,
    eval_examples=5000,
    dropout=0.2,
    learn_rate=0.15,
    b1=0.0,
    b2_ratio=0.0,
    adam_eps=0.0,
    L2=0.0,
    grad_norm_clip=1.0,
):
    opt_params = get_opt_params(locals())
    random.seed(0)
    fix_random_seed(0)
    use_gpu = prefer_gpu()
    print("Using GPU?", use_gpu)
    nlp = create_pipeline()
    train_textcat(nlp, eval_examples, opt_params, train_iters, dropout, pretrain_epoch)
    save_model(nlp, pretrain_epoch)

def train_ner(
    output_dir: str,
    train_data_path: str,
    dev_data_path: str,
    test_data_path: str,
    run_test: bool = None,
    model: str = None,
    n_iter: int = 10,
    meta_overrides: str = None,
):
    # NOTE: the caller's `model` argument is overridden here and ignored
    model = "en_core_sci_sm"
    util.fix_random_seed(util.env_opt("seed", 0))
    with Path(train_data_path).open('rb') as file:
        train_data = pickle.load(file)
    with Path(dev_data_path).open('rb') as file:
        dev_data = pickle.load(file)
    with Path(test_data_path).open('rb') as file:
        test_data = pickle.load(file)
    # train_data = read_ner_from_tsv(train_data_path)
    # dev_data = read_ner_from_tsv(dev_data_path)
    # test_data = read_ner_from_tsv(test_data_path)
    os.makedirs(output_dir, exist_ok=True)

    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides)

def main(gpu_id=0, nb_epoch=100):
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id=gpu_id)
    train, test = datasets.imdb(limit=0)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))
    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)
    register_vectors(Model.ops, nlp.vocab.vectors.name, nlp.vocab.vectors.data)
    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    n_sent = sum([len(list(sents)) for sents in train_X])
    print("%d sentences" % n_sent)
    model = build_model(
        2,
        vectors_name=nlp.vocab.vectors.name,
        width=128,
        conv_depth=2,
        depth=2,
        train_X=train_X,
        train_y=train_y,
    )
    with model.begin_training(train_X[:100], train_y[:100]) as (trainer, optimizer):
        # track per-epoch loss and loss variance for report_progress
        epoch_loss = [0.0]
        epoch_var = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(
                    epoch_loss[-1],
                    epoch_var[-1],
                    model.evaluate(dev_X, dev_y),
                    trainer.dropout,
                )
            epoch_loss.append(0.0)
            epoch_var.append(0.0)

        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        # optimizer.alpha = 0.1
        # optimizer.max_grad_norm = 10.0
        # optimizer.b1 = 0.0
        # optimizer.b2 = 0.0
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            losses = ((yh - y) ** 2.0).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop((yh - y) / yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
    with model.use_params(optimizer.averages):
        print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))

def custom_train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)
    # Make sure all files and paths exist if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does "
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        ### Here are our modifications:
        lang_cls.Defaults.tag_map = custom_tag_map
        nlp = lang_cls()
        assert nlp.vocab.morphology.n_tags == 36
        ###
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    if vectors:
        msg.text("Loading vectors from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)
    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)
    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()
    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )
    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                )
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)
                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)
                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
            # Early stopping
            if n_early_stopping is not None:
                current_score = _score_for_model(meta)
                if current_score < best_score:
                    iter_since_best += 1
                else:
                    iter_since_best = 0
                    best_score = current_score
                if iter_since_best >= n_early_stopping:
                    msg.text(
                        "Early stopping, best iteration "
                        "is: {}".format(i - iter_since_best)
                    )
                    msg.text(
                        "Best score = {}; Final iteration "
                        "score = {}".format(best_score, current_score)
                    )
                    break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)

def __call__(self, train_data, dev_data, test_data, n_iter, dropout, patience=10):
    if "ner" in self.nlp.pipe_names:
        logging.warning("Pipeline already has NER, removing...")
        self.nlp.remove_pipe("ner")
    ner = self.nlp.create_pipe("ner")
    # ner.add_multitask_objective(get_position_label)
    self.nlp.add_pipe(ner, last=True)
    for sent, ann in train_data + dev_data + test_data:
        for _, _, label in ann:
            ner.add_label(label)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "ner"]
    logging.info("Starting the training with {} iterations".format(n_iter))
    fix_random_seed(42)
    best_dev_score = 0
    best_dev_itn = -1
    with self.nlp.disable_pipes(*other_pipes):  # only train NER
        # if not self.model:  # FIXME: pre-train the model
        mlflow.log_param("base_model", Path(self.model).name)
        mlflow.log_param("n_iter", n_iter)
        mlflow.log_param("dropout", dropout)
        self.nlp.begin_training()
        for itn in tqdm(list(range(n_iter)), desc="iterations"):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = list(minibatch(train_data, size=compounding(4.0, 32.0, 1.001)))
            for batch in tqdm(batches, desc="batches"):
                texts, annotations = zip(*batch)
                self.nlp.update(
                    docs=texts,  # batch of texts
                    golds=[self._sent_to_goldparse(t, a) for t, a in batch],  # batch of annotations
                    drop=dropout,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            # logging.info("Losses: {}".format(losses))
            mlflow.log_metric("loss", losses["ner"], step=itn)
            train_scores = self.evaluate(train_data)
            mlflow.log_metrics({"train_" + k: v for k, v in train_scores.items()}, step=itn)
            dev_scores = self.evaluate(dev_data)
            mlflow.log_metrics({"dev_" + k: v for k, v in dev_scores.items()}, step=itn)
            self.store_model(Path(self.output_dir) / "model_{}".format(itn))
            act_score = dev_scores["f1"]
            if best_dev_score < act_score:
                best_dev_score = act_score
                best_dev_itn = itn
            else:
                if itn - best_dev_itn >= patience:
                    break
    load_model_from_path(Path(self.output_dir) / "model_{}".format(best_dev_itn))
    scores = self.evaluate(test_data)
    logging.info("Test scores {}".format(scores))
    mlflow.log_metrics(scores, step=itn)
    self.store_model(Path(self.output_dir) / "model-final")

def train(pretrained, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          parser_multitasks='', entity_multitasks='', use_gpu=-1,
          no_tagger=False, no_parser=False, no_entities=False,
          gold_preproc=False, version="0.0.0", meta_path=None, verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON format.
    This code is based on
    https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)
    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)), title=Messages.M052, exits=1)
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()
    # Load pre-trained model. Remove components that we are not re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')
    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)
    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)
    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None
    print(nlp.pipe_names)
    print(nlp.pipeline)
    print("Itn.  Dep Loss  NER Loss  UAS    NER P.  NER R.  NER F.  "
          "Tag %  Token %  CPU WPS  GPU WPS")
    try:
        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
                                       gold_preproc=gold_preproc, max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                        dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps, 'gpu': gpu_wps}
                meta['vectors'] = {'width': nlp.vocab.vectors_length,
                                   'vectors': len(nlp.vocab.vectors),
                                   'keys': nlp.vocab.vectors.n_keys}
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)
                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)