def test_load_model_blank_shortcut():
    """Test that using a model name like "blank:en" works as a shortcut for
    spacy.blank("en").
    """
    nlp = util.load_model("blank:en")
    assert nlp.lang == "en"
    assert nlp.pipeline == []
    with pytest.raises(ImportError):
        util.load_model("blank:fjsfijsdof")
def _load_vectors(nlp, vectors):
    util.load_model(vectors, vocab=nlp.vocab)
    for lex in nlp.vocab:
        values = {}
        for attr, func in nlp.vocab.lex_attr_getters.items():
            # These attrs are expected to be set by data. Others should
            # be set by calling the language functions.
            if attr not in (CLUSTER, PROB, IS_OOV, LANG):
                values[lex.vocab.strings[attr]] = func(lex.orth_)
        lex.set_attrs(**values)
        lex.is_oov = False
def test_load_model_blank_shortcut():
    """Test that using a model name like "blank:en" works as a shortcut for
    spacy.blank("en").
    """
    nlp = util.load_model("blank:en")
    assert nlp.lang == "en"
    assert nlp.pipeline == []
    # ImportError for loading an unsupported language
    with pytest.raises(ImportError):
        util.load_model("blank:zxx")
    # ImportError for requesting an invalid language code that isn't registered
    with pytest.raises(ImportError):
        util.load_model("blank:fjsfijsdof")
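
# The "blank:" prefix exercised above is just a convenience wrapper; per the
# test's own docstring it delegates to spacy.blank(). A minimal sketch of that
# equivalence (assumes a spaCy v3-style install):
import spacy
from spacy import util

nlp_a = util.load_model("blank:en")  # shortcut form
nlp_b = spacy.blank("en")            # explicit form
assert nlp_a.lang == nlp_b.lang == "en"
assert nlp_a.pipe_names == nlp_b.pipe_names == []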
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()
    # add ner pipe
    ner = nlp.add_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.initialize()
    # Add entity ruler
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    # add the ruler before the NER component (works with "after" as well)
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(patterns)
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"
    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"
def test_load_model_version_compat():
    """Test warnings for various spacy_version specifications in meta. Since
    this is more of a hack for v2, manually specify the current major.minor
    version to simplify test creation."""
    nlp = util.load_model("blank:en")
    assert nlp.meta["spacy_version"].startswith(">=2.3")
    with make_tempdir() as d:
        # no change: compatible
        nlp.to_disk(d)
        meta_path = Path(d / "meta.json")
        util.get_model_meta(d)
        # additional compatible upper pin
        nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0"
        srsly.write_json(meta_path, nlp.meta)
        util.get_model_meta(d)
        # incompatible older version
        nlp.meta["spacy_version"] = ">=2.2.5"
        srsly.write_json(meta_path, nlp.meta)
        with pytest.warns(UserWarning):
            util.get_model_meta(d)
        # invalid version specification
        nlp.meta["spacy_version"] = ">@#$%_invalid_version"
        srsly.write_json(meta_path, nlp.meta)
        with pytest.warns(UserWarning):
            util.get_model_meta(d)
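
# A rough sketch of the mechanism the test above exercises: get_model_meta()
# reads meta.json and warns when its "spacy_version" pin does not cover the
# running spaCy version. This uses the stdlib tempfile module instead of the
# test fixture; the pin below is assumed to be incompatible, and the exact
# warning text is spaCy's own.
import tempfile
from pathlib import Path

import srsly
from spacy import util

nlp = util.load_model("blank:en")
with tempfile.TemporaryDirectory() as d:
    nlp.to_disk(d)
    meta_path = Path(d) / "meta.json"
    meta = srsly.read_json(meta_path)
    meta["spacy_version"] = ">=2.2.5,<2.3.0"  # assumed-incompatible pin
    srsly.write_json(meta_path, meta)
    util.get_model_meta(d)  # expected to emit a UserWarning about the mismatch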
def load(
    name: Union[str, Path] = __DEFAULT_MODEL,
    vocab: Union[Vocab, bool] = True,
    disable: Iterable[str] = util.SimpleFrozenList(),
    exclude: Iterable[str] = util.SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
    """Loads a HuSpaCy model.

    Args:
        name (str): Model name.
        vocab (Vocab): A Vocab object. If True, a vocab is created.
        disable (Iterable[str]): Names of pipeline components to disable.
            Disabled pipes will be loaded but they won't be run unless you
            explicitly enable them by calling nlp.enable_pipe.
        exclude (Iterable[str]): Names of pipeline components to exclude.
            Excluded components won't be loaded.
        config (Dict[str, Any] / Config): Config overrides as nested dict or
            dict keyed by section values in dot notation.

    Returns:
        Language: The loaded nlp object.
    """
    return util.load_model(
        name, vocab=vocab, disable=disable, exclude=exclude, config=config
    )
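
# A possible invocation of the wrapper above. The module name, the component
# names and the config key are illustrative assumptions, not pinned to any
# particular HuSpaCy release:
import huspacy

nlp = huspacy.load(
    exclude=["ner"],                      # don't load the NER component at all
    disable=["parser"],                   # load the parser but keep it turned off
    config={"nlp": {"batch_size": 128}},  # override via nested dict
)
doc = nlp("Ez egy példamondat.")  # "This is an example sentence."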
def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = util.load_model(model_dir)
        # This should be the modified tokenizer
        doc_2 = nlp_2(test_string)
        result_2 = [token.text for token in doc_2]
    assert result_1b == result_2
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"
def test_issue4190():
    def customize_tokenizer(nlp):
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {
            k: v
            for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
            if not (len(k) == 2 and k[1] == ".")
        }
        new_tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
            faster_heuristics=False,
        )
        nlp.tokenizer = new_tokenizer

    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
        # This should be the modified tokenizer
        doc_2 = nlp_2(test_string)
        result_2 = [token.text for token in doc_2]
    assert result_1b == result_2
    assert nlp_2.tokenizer.faster_heuristics is False
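
# If customize_tokenizer() above were lifted to module scope, its effect could
# be inspected directly. A sketch that relies on the test file's imports;
# whether the period is actually split off depends on "c." having been in the
# default exception table, which the test itself does not assert:
nlp = English()
customize_tokenizer(nlp)
print([t.text for t in nlp("Test c.")])  # e.g. ['Test', 'c', '.']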
from spacy import util as spc_util
import pt_core_news_sm

nlp = pt_core_news_sm.load()
doc = nlp('justiça tribunal')
tk = [k for k in doc]
print('Similarity between justica and tribunal = ', tk[0].similarity(tk[1]))

pathw2v = './vectors_spacy'
spc_util.load_model(pathw2v, vocab=nlp.vocab)
doc = nlp('justiça tribunal')
tk = [k for k in doc]
print('Similarity between justica and tribunal = ', tk[0].similarity(tk[1]))
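
# The script above presumes a vectors-only model directory at ./vectors_spacy.
# One plausible way to build such a directory with the spaCy v2 API; the input
# file name and its plain-text word2vec format are assumptions, and spaCy v2's
# `init-model` CLI achieves the same result:
import numpy
import spacy

nlp_vec = spacy.blank("pt")
with open("word2vec.txt", encoding="utf8") as f:
    n_rows, n_dims = map(int, f.readline().split())  # standard word2vec header
    for line in f:
        pieces = line.rstrip().split(" ")
        vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
        nlp_vec.vocab.set_vector(pieces[0], vector)
nlp_vec.to_disk("./vectors_spacy")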
# stdlib (added; the excerpt begins after the original import block)
import sys
import time

start = time.time()
sys.path.insert(0, '../')

# third party
import pandas as pd
from spacy.util import load_model

# local
from src.config import PATHS, FILENAMES, LEXISNEXIS
from src.spacy_helpers import serialize_batch, fetch_docs
from src.doc_analysis import basic_stats, attribute_counter, most_common

### Serialize LexisNexis documents
print("[1] serialize batches")
nlp = load_model(PATHS.model)
for batch in LEXISNEXIS.batches:
    serialize_batch(nlp, batch)

### Store some general stats
print("[2] store stats")
def get_stats(batch):
    data = list()
    for doc in fetch_docs(PATHS.data_prc / batch, nlp.vocab):
        stats = basic_stats(doc)
        data.append(stats)
    df = pd.DataFrame(data)
    df.columns = [col.lower() for col in df.columns]
    return df
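
# Presumed continuation of the script: materialize the stats per batch.
# PATHS.results and the file naming are hypothetical; only PATHS.model and
# PATHS.data_prc appear in the original.
for batch in LEXISNEXIS.batches:
    get_stats(batch).to_csv(PATHS.results / "{}_stats.csv".format(batch), index=False)
print("done in {:.1f}s".format(time.time() - start))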
def custom_train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)
    # Make sure all files and paths exist if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does "
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        ### Here are our modifications:
        lang_cls.Defaults.tag_map = custom_tag_map
        nlp = lang_cls()
        assert nlp.vocab.morphology.n_tags == 36
        ###
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    if vectors:
        msg.text("Loading vectors from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)
    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)
    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()
    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )
    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                )
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)
                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)
                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
            # Early stopping
            if n_early_stopping is not None:
                current_score = _score_for_model(meta)
                if current_score < best_score:
                    iter_since_best += 1
                else:
                    iter_since_best = 0
                    best_score = current_score
                if iter_since_best >= n_early_stopping:
                    msg.text(
                        "Early stopping, best iteration "
                        "is: {}".format(i - iter_since_best)
                    )
                    msg.text(
                        "Best score = {}; Final iteration "
                        "score = {}".format(best_score, current_score)
                    )
                    break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)