def evaluate( model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None, displacy_limit=25, return_scores=False, ): """ Evaluate a model. To render a sample of parses in a HTML file, set an output directory as the displacy_path argument. """ msg = Printer() util.fix_random_seed() if gpu_id >= 0: util.use_gpu(gpu_id) util.set_env_log(False) data_path = util.ensure_path(data_path) displacy_path = util.ensure_path(displacy_path) if not data_path.exists(): msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) begin = timer() scorer = nlp.evaluate(dev_docs, verbose=False) end = timer() nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) results = { "Time": "%.2f s" % (end - begin), "Words": nwords, "Words/s": "%.0f" % (nwords / (end - begin)), "TOK": "%.2f" % scorer.token_acc, "POS": "%.2f" % scorer.tags_acc, "UAS": "%.2f" % scorer.uas, "LAS": "%.2f" % scorer.las, "NER P": "%.2f" % scorer.ents_p, "NER R": "%.2f" % scorer.ents_r, "NER F": "%.2f" % scorer.ents_f, } msg.table(results, title="Results") if displacy_path: docs, golds = zip(*dev_docs) render_deps = "parser" in nlp.meta.get("pipeline", []) render_ents = "ner" in nlp.meta.get("pipeline", []) render_parses( docs, displacy_path, model_name=model, limit=displacy_limit, deps=render_deps, ents=render_ents, ) msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path) if return_scores: return scorer.scores
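
# Illustrative usage sketch for the evaluate() helper above (spaCy v2-style).
# The model name and data path below are placeholders, not files that ship
# with spaCy; adjust them to your own installed model and evaluation corpus.
def _example_evaluate_v2():
    scores = evaluate(
        "en_core_web_sm",          # installed model package or path
        "dev.json",                # evaluation data in spaCy's JSON format
        displacy_path=None,        # set to a directory to render sample parses
        return_scores=True,
    )
    print(scores)
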
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
    """Generate info about a specific model.

    model (str): Model name or path.
    silent (bool): Don't print anything, just return.
    RETURNS (dict): The model meta.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    if util.is_package(model):
        model_path = util.get_package_path(model)
    else:
        model_path = Path(model)
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if model_path.resolve() != model_path:
        meta["source"] = str(model_path.resolve())
    else:
        meta["source"] = str(model_path)
    return {
        k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
    }
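
# Illustrative usage sketch for info_model(); "en_core_web_sm" is a placeholder
# for any installed pipeline package or a path to a pipeline directory.
def _example_info_model():
    meta = info_model("en_core_web_sm", silent=True)
    print(meta.get("lang"), meta.get("version"), meta.get("source"))
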
def spinner(text='Loading...', clean=False):
    # Shows an animated spinner while the wrapped block runs. Written as a
    # generator, so it is presumably used via contextlib.contextmanager.
    printer = Printer()
    spinchars = '⠙⠹⠸⠼⠴⠦⠧⠇⠏'

    def spin(s):
        # Runs in a background process and redraws the spinner every 100 ms.
        for char in itertools.cycle(spinchars):
            sys.stdout.write("\r\033[96m{} {}".format(char, s))
            sys.stdout.flush()
            time.sleep(0.1)

    stime = time.time()
    t = Process(target=spin, args=(text,))
    t.start()
    try:
        yield
    except Exception:
        t.terminate()
        printer.fail(text + ' failed.')
        raise
    t.terminate()
    sys.stdout.write("\r")
    if clean:
        # Clear the spinner line instead of printing a success message.
        for _ in range(len(text) // 4 + 1):
            sys.stdout.write("\x1b[2K")
    else:
        time_used = strfsec(int(time.time() - stime))
        printer.good(f'{text} succeeded in {time_used}.')
    sys.stdout.flush()
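
# Illustrative usage sketch: spinner() is written as a generator, so the
# original project presumably wraps it with contextlib.contextmanager (an
# assumption here, not shown in the snippet above). Used that way:
def _example_spinner():
    import time
    from contextlib import contextmanager

    spinner_cm = contextmanager(spinner)
    with spinner_cm("Downloading data"):
        time.sleep(1)  # stand-in for the real work being timed
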
def extract_tar(filename: str, destination_dir: str, mode="r"):
    """ Extracts tar, tar.gz and other tar archives

    Parameters
    ----------
    filename : str
        The tar archive to extract
    destination_dir : str
        The destination directory in which the files should be placed
    mode : str
        A valid tar read mode. Refer to
        https://docs.python.org/3/library/tarfile.html for the different modes.
    """
    msg_printer = Printer()
    try:
        with msg_printer.loading(f"Unzipping file {filename} to {destination_dir}"):
            stdout.flush()
            with tarfile.open(filename, mode) as t:
                t.extractall(destination_dir)

        msg_printer.good(f"Finished extracting {filename} to {destination_dir}")
    except tarfile.ExtractError:
        msg_printer.fail(f"Could not extract {filename} to {destination_dir}")
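
# Illustrative usage sketch; the archive path and destination are placeholders.
# For a gzip-compressed tarball, a read mode such as "r:gz" can be passed
# (see the tarfile documentation for the full list of modes).
def _example_extract_tar():
    extract_tar("dataset.tar.gz", "/tmp/dataset", mode="r:gz")
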
def convert( input_file, output_dir="-", file_type="jsonl", n_sents=1, morphology=False, converter="auto", lang=None, ): """ Convert files into JSON format for use with train command and other experiment management functions. If no output_dir is specified, the data is written to stdout, so you can pipe them forward to a JSONL file: $ spacy convert some_file.conllu > some_file.jsonl """ msg = Printer() input_path = Path(input_file) if file_type not in FILE_TYPES: msg.fail( "Unknown file type: '{}'".format(file_type), "Supported file types: '{}'".format(", ".join(FILE_TYPES)), exits=1, ) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? msg.fail( "Can't write .{} data to stdout.".format(file_type), "Please specify an output directory.", exits=1, ) if not input_path.exists(): msg.fail("Input file not found", input_path, exits=1) if output_dir != "-" and not Path(output_dir).exists(): msg.fail("Output directory not found", output_dir, exits=1) if converter == "auto": converter = input_path.suffix[1:] if converter not in CONVERTERS: msg.fail("Can't find converter for {}".format(converter), exits=1) # Use converter function to convert data func = CONVERTERS[converter] input_data = input_path.open("r", encoding="utf-8").read() data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang) if output_dir != "-": # Export data to a file suffix = ".{}".format(file_type) output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) if file_type == "json": srsly.write_json(output_file, data) elif file_type == "jsonl": srsly.write_jsonl(output_file, data) elif file_type == "msg": srsly.write_msgpack(output_file, data) msg.good("Generated output file ({} documents)".format(len(data)), output_file) else: # Print to stdout if file_type == "json": srsly.write_json("-", data) elif file_type == "jsonl": srsly.write_jsonl("-", data)
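
# Illustrative usage sketch for convert(); "train.conllu" and "corpus" are
# placeholders. With converter="auto" the converter is chosen from the file
# extension, and output_dir="-" would write to stdout instead.
def _example_convert():
    convert("train.conllu", output_dir="corpus", file_type="jsonl", n_sents=10)
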
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. If --create-meta is set and a meta.json already exists in the output directory, the existing values will be used as the defaults in the command-line prompt. """ msg = Printer() input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) if not input_path or not input_path.exists(): msg.fail("Can't locate model data", input_path, exits=1) if not output_path or not output_path.exists(): msg.fail("Output directory not found", output_path, exits=1) if meta_path and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta_path = meta_path or input_path / "meta.json" if meta_path.is_file(): meta = srsly.read_json(meta_path) if not create_meta: # only print if user doesn't want to overwrite msg.good("Loaded meta.json from file", meta_path) else: meta = generate_meta(input_dir, meta, msg) for key in ("lang", "name", "version"): if key not in meta or meta[key] == "": msg.fail( "No '{}' setting found in meta.json".format(key), "This setting is required to build your package.", exits=1, ) model_name = meta["lang"] + "_" + meta["name"] model_name_v = model_name + "-" + meta["version"] main_path = output_path / model_name_v package_path = main_path / model_name if package_path.exists(): if force: shutil.rmtree(path2str(package_path)) else: msg.fail( "Package directory already exists", "Please delete the directory and try again, or use the " "`--force` flag to overwrite existing " "directories.".format(path=path2str(package_path)), exits=1, ) Path.mkdir(package_path, parents=True) shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) msg.good("Successfully created package '{}'".format(model_name_v), main_path) msg.text("To build the package, run `python setup.py sdist` in this directory.")
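
# The function above backs the v2.x "spacy package" command. A typical
# invocation (paths and the package directory name are placeholders), followed
# by the build step the final message points to:
#
#     python -m spacy package /path/to/model /path/to/output
#     cd /path/to/output/en_model-1.0.0
#     python setup.py sdist
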
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]: supported = ["sdist", "wheel", "none"] for form in formats: if form not in supported: msg = Printer() err = f"Unknown build format: {form}. Supported: {', '.join(supported)}" msg.fail(err, exits=1) if not formats or "none" in formats: return (False, False) return ("sdist" in formats, "wheel" in formats)
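
# Small usage sketch: the helper maps the requested build formats to a
# (build_sdist, build_wheel) pair of flags.
def _example_get_build_formats():
    assert get_build_formats(["sdist", "wheel"]) == (True, True)
    assert get_build_formats(["none"]) == (False, False)
    assert get_build_formats([]) == (False, False)
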
def info(model=None, markdown=False, silent=False):
    """
    Print info about spaCy installation. If a model shortcut link is
    specified as an argument, print model information. Flag --markdown
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    msg = Printer()
    if model:
        if util.is_package(model):
            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail("Can't find model meta.json", meta_path, exits=1)
        meta = srsly.read_json(meta_path)
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
        else:
            meta["source"] = path2str(model_path)
        if not silent:
            title = "Info about model '{}'".format(model)
            model_meta = {
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    data = {
        "spaCy version": about.__version__,
        "Location": path2str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": list_models(),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
def corpus_trainer(cb, cpt, custom):
    from chatterbot.trainers import ChatterBotCorpusTrainer

    trainer = ChatterBotCorpusTrainer(cb)
    if custom:
        # Train only the corpus modules the user asked for,
        # e.g. custom="greetings conversations"
        for mode in custom.split():
            try:
                trainer.train("chatterbot.corpus.english.{}".format(mode))
            except FileNotFoundError:
                from wasabi import Printer

                msg = Printer()
                msg.fail("That corpus doesn't exist!")
                return -1
        print("All done training!")
    elif cpt:
        # Train on the full English corpus
        trainer.train("chatterbot.corpus.english")
        print("All done training!")
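
# Illustrative usage sketch; assumes a ChatBot instance from the chatterbot
# package and that the named English corpus modules are available locally.
def _example_corpus_trainer():
    from chatterbot import ChatBot

    bot = ChatBot("ExampleBot")
    corpus_trainer(bot, cpt=False, custom="greetings conversations")
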
def extract_zip(filename: str, destination_dir: str):
    """ Extracts a zipped file

    Parameters
    ----------
    filename : str
        The zipped filename
    destination_dir : str
        The directory where the extracted files will be placed
    """
    msg_printer = Printer()
    try:
        with msg_printer.loading(f"Unzipping file {filename} to {destination_dir}"):
            stdout.flush()
            with zipfile.ZipFile(filename, "r") as z:
                z.extractall(destination_dir)

        msg_printer.good(f"Finished extracting {filename} to {destination_dir}")
    except zipfile.BadZipFile:
        msg_printer.fail(f"Could not extract {filename} to {destination_dir}")
def init_config_cli( # fmt: off output_file: Path = Arg( ..., help= "File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True), lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), pipeline: str = Opt( "tagger,parser,ner", "--pipeline", "-p", help= "Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')" ), optimize: Optimizations = Opt( Optimizations.efficiency.value, "--optimize", "-o", help= "Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters." ), gpu: bool = Opt( False, "--gpu", "-G", help= "Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters." ), pretraining: bool = Opt( False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"), # fmt: on ): """ Generate a starter config.cfg for training. Based on your requirements specified via the CLI arguments, this command generates a config with the optimal settings for your use case. This includes the choice of architecture, pretrained weights and related hyperparameters. DOCS: https://spacy.io/api/cli#init-config """ pipeline = string_to_list(pipeline) is_stdout = str(output_file) == "-" if not is_stdout and output_file.exists() and not force_overwrite: msg = Printer() msg.fail( "The provided output file already exists. To force overwriting the config file, set the --force or -F flag.", exits=1, ) config = init_config( lang=lang, pipeline=pipeline, optimize=optimize.value, gpu=gpu, pretraining=pretraining, silent=is_stdout, ) save_config(config, output_file, is_stdout=is_stdout)
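
# Illustrative CLI usage for the command defined above (spaCy v3 "init config");
# the output filename and option values are placeholders:
#
#     python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency
#
# Passing "-" as the output file writes the generated config to stdout instead.
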
def validate(): """ Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. """ msg = Printer() with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( "Server error ({})".format(r.status_code), "Couldn't fetch compatibility table.", exits=1, ) msg.good("Loaded compatibility table") compat = r.json()["spacy"] version = about.__version__ version = version.rsplit(".dev", 1)[0] current_compat = compat.get(version) if not current_compat: msg.fail( "Can't find spaCy v{} in compatibility table".format(version), about.__compatibility__, exits=1, ) all_models = set() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): compat[spacy_v][model] = [reformat_version(v) for v in model_vs] model_links = get_model_links(current_compat) model_pkgs = get_model_pkgs(current_compat, all_models) incompat_links = {l for l, d in model_links.items() if not d["compat"]} incompat_models = { d["name"] for _, d in model_pkgs.items() if not d["compat"] } incompat_models.update( [d["name"] for _, d in model_links.items() if not d["compat"]]) na_models = [m for m in incompat_models if m not in current_compat] update_models = [m for m in incompat_models if m in current_compat] spacy_dir = Path(__file__).parent.parent msg.divider("Installed models (spaCy v{})".format(about.__version__)) msg.info("spaCy installation: {}".format(path2str(spacy_dir))) if model_links or model_pkgs: header = ("TYPE", "NAME", "MODEL", "VERSION", "") rows = [] for name, data in model_pkgs.items(): rows.append(get_model_row(current_compat, name, data, msg)) for name, data in model_links.items(): rows.append(get_model_row(current_compat, name, data, msg, "link")) msg.table(rows, header=header) else: msg.text("No models found in your current environment.", exits=0) if update_models: msg.divider("Install updates") msg.text("Use the following commands to update the model packages:") cmd = "python -m spacy download {}" print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: msg.text("The following models are not available for spaCy " "v{}: {}".format(about.__version__, ", ".join(na_models))) if incompat_links: msg.text( "You may also want to overwrite the incompatible links using the " "`python -m spacy link` command with `--force`, or remove them " "from the data directory. " "Data path: {path}".format(path=path2str(get_data_path()))) if incompat_models or incompat_links: sys.exit(1)
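
# The function above backs the "spacy validate" command and is typically run as
#
#     python -m spacy validate
#
# right after upgrading spaCy (e.g. `pip install -U spacy`), to list installed
# models and flag any that are incompatible with the new version.
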
class SimpleClassifier(nn.Module, ClassNursery): def __init__( self, encoder: nn.Module, encoding_dim: int, num_classes: int, classification_layer_bias: bool = True, label_namespace: str = "label", datasets_manager: DatasetsManager = None, device: Union[torch.device, str] = torch.device("cpu"), ): """ SimpleClassifier is a linear classifier head on top of any encoder Parameters ---------- encoder : nn.Module Any encoder that takes in lines and produces a single vector for every line. encoding_dim : int The encoding dimension num_classes : int The number of classes classification_layer_bias : bool Whether to add classification layer bias or no This is set to false only for debugging purposes ff label_namespace : str The namespace used for labels in the dataset datasets_manager: DatasetsManager The datasets manager for the model device: torch.device The device on which the model is run """ super(SimpleClassifier, self).__init__() self.encoder = encoder self.encoding_dim = encoding_dim self.num_classes = num_classes self.classification_layer_bias = classification_layer_bias self.classification_layer = nn.Linear( self.encoding_dim, num_classes, bias=self.classification_layer_bias) self._loss = CrossEntropyLoss() self.label_namespace = label_namespace self.datasets_manager = datasets_manager self.label_numericalizer = self.datasets_manager.namespace_to_numericalizer[ self.label_namespace] self.device = torch.device(device) if isinstance(device, str) else device self.msg_printer = Printer() def forward( self, lines: List[Line], labels: List[Label] = None, is_training: bool = False, is_validation: bool = False, is_test: bool = False, ) -> Dict[str, Any]: """ Parameters ---------- lines : List[Line] ``iter_dict`` from any dataset that will be passed on to the encoder labels: List[Label] A list of labels for every instance is_training : bool running forward on training dataset? is_validation : bool running forward on validation dataset? is_test : bool running forward on test dataset? Returns ------- Dict[str, Any] logits: torch.FloatTensor Un-normalized probabilities over all the classes of the shape ``[batch_size, num_classes]`` normalized_probs: torch.FloatTensor Normalized probabilities over all the classes of the shape ``[batch_size, num_classes]`` loss: float Loss value if this is a training forward pass or validation loss. There will be no loss if this is the test dataset """ encoding = self.encoder(lines) # N * C # N - batch size # C - number of classes logits = self.classification_layer(encoding) # N * C # N - batch size # C - number of classes # The normalized probabilities of classification normalized_probs = softmax(logits, dim=1) output_dict = {"logits": logits, "normalized_probs": normalized_probs} if is_training or is_validation: label_indices = [] for label in labels: label_ = label.tokens[self.label_namespace] label_ = [tok.text for tok in label_] label_ = self.label_numericalizer.numericalize_instance( instance=label_) label_indices.append( label_[0]) # taking only the first label here labels_tensor = torch.tensor(label_indices, device=self.device, dtype=torch.long) assert labels_tensor.ndimension() == 1, self.msg_printer.fail( "the labels should have 1 dimension " "your input has shape {0}".format(labels_tensor.size())) loss = self._loss(logits, labels_tensor) output_dict["loss"] = loss return output_dict
def verify_cli_args( msg: Printer, input_path: Union[str, Path], output_dir: Union[str, Path], file_type: FileTypes, converter: str, ner_map: Optional[Path], ): input_path = Path(input_path) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": msg.fail( f"Can't write .{file_type} data to stdout. Please specify an output directory.", exits=1, ) if not input_path.exists(): msg.fail("Input file not found", input_path, exits=1) if output_dir != "-" and not Path(output_dir).exists(): msg.fail("Output directory not found", output_dir, exits=1) if ner_map is not None and not Path(ner_map).exists(): msg.fail("NER map not found", ner_map, exits=1) if input_path.is_dir(): input_locs = walk_directory(input_path, converter) if len(input_locs) == 0: msg.fail("No input files in directory", input_path, exits=1) file_types = list(set([loc.suffix[1:] for loc in input_locs])) if converter == "auto" and len(file_types) >= 2: file_types = ",".join(file_types) msg.fail("All input files must be same type", file_types, exits=1) if converter != "auto" and converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1)
def debug_data( lang, train_path, dev_path, base_model=None, pipeline="tagger,parser,ner", ignore_warnings=False, ignore_validation=False, verbose=False, no_format=False, ): msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) # Make sure all files and paths exists if they are needed if not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: nlp = load_model(base_model) else: lang_cls = get_lang_class(lang) nlp = lang_cls() msg.divider("Data format validation") # Load the data in one – might take a while but okay in this case train_data = _load_file(train_path, msg) dev_data = _load_file(dev_path, msg) # Validate data format using the JSON schema # TODO: update once the new format is ready train_data_errors = [] # TODO: validate_json dev_data_errors = [] # TODO: validate_json if not train_data_errors: msg.good("Training data JSON format is valid") if not dev_data_errors: msg.good("Development data JSON format is valid") for error in train_data_errors: msg.fail("Training data: {}".format(error)) for error in dev_data_errors: msg.fail("Develoment data: {}".format(error)) if (train_data_errors or dev_data_errors) and not ignore_validation: sys.exit(1) # Create the gold corpus to be able to better analyze data with msg.loading("Analyzing corpus..."): train_data = read_json_object(train_data) dev_data = read_json_object(dev_data) corpus = GoldCorpus(train_data, dev_data) train_docs = list(corpus.train_docs(nlp)) dev_docs = list(corpus.dev_docs(nlp)) msg.good("Corpus is loadable") # Create all gold data here to avoid iterating over the train_docs constantly gold_data = _compile_gold(train_docs, pipeline) train_texts = gold_data["texts"] dev_texts = set([doc.text for doc, gold in dev_docs]) msg.divider("Training stats") msg.text("Training pipeline: {}".format(", ".join(pipeline))) for pipe in [p for p in pipeline if p not in nlp.factories]: msg.fail("Pipeline component '{}' not available in factories".format(pipe)) if base_model: msg.text("Starting with base model '{}'".format(base_model)) else: msg.text("Starting with blank model '{}'".format(lang)) msg.text("{} training docs".format(len(train_docs))) msg.text("{} evaluation docs".format(len(dev_docs))) overlap = len(train_texts.intersection(dev_texts)) if overlap: msg.warn("{} training examples also in evaluation data".format(overlap)) else: msg.good("No overlap between training and evaluation data") if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD: text = "Low number of examples to train from a blank model ({})".format( len(train_docs) ) if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD: msg.fail(text) else: msg.warn(text) msg.text( "It's recommended to use at least {} examples (minimum {})".format( BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD ), show=verbose, ) msg.divider("Vocab & Vectors") n_words = gold_data["n_words"] msg.info( "{} total {} in the data ({} unique)".format( n_words, "word" if n_words == 1 else "words", len(gold_data["words"]) ) ) most_common_words = gold_data["words"].most_common(10) msg.text( "10 most common words: {}".format( _format_labels(most_common_words, counts=True) ), show=verbose, ) if len(nlp.vocab.vectors): msg.info( "{} vectors ({} unique keys, {} dimensions)".format( len(nlp.vocab.vectors), nlp.vocab.vectors.n_keys, nlp.vocab.vectors_length, ) ) else: msg.info("No word 
vectors present in the model") if "ner" in pipeline: # Get all unique NER labels present in the data labels = set(label for label in gold_data["ner"] if label not in ("O", "-")) label_counts = gold_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] has_low_data_warning = False has_no_neg_warning = False has_ws_ents_error = False msg.divider("Named Entity Recognition") msg.info( "{} new {}, {} existing {}".format( len(new_labels), "label" if len(new_labels) == 1 else "labels", len(existing_labels), "label" if len(existing_labels) == 1 else "labels", ) ) missing_values = label_counts["-"] msg.text( "{} missing {} (tokens with '-' label)".format( missing_values, "value" if missing_values == 1 else "values" ) ) if new_labels: labels_with_counts = [ (label, count) for label, count in label_counts.most_common() if label != "-" ] labels_with_counts = _format_labels(labels_with_counts, counts=True) msg.text("New: {}".format(labels_with_counts), show=verbose) if existing_labels: msg.text( "Existing: {}".format(_format_labels(existing_labels)), show=verbose ) if gold_data["ws_ents"]: msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"])) has_ws_ents_error = True for label in new_labels: if label_counts[label] <= NEW_LABEL_THRESHOLD: msg.warn( "Low number of examples for new label '{}' ({})".format( label, label_counts[label] ) ) has_low_data_warning = True with msg.loading("Analyzing label distribution..."): neg_docs = _get_examples_without_label(train_docs, label) if neg_docs == 0: msg.warn( "No examples for texts WITHOUT new label '{}'".format(label) ) has_no_neg_warning = True if not has_low_data_warning: msg.good("Good amount of examples for all labels") if not has_no_neg_warning: msg.good("Examples without occurences available for all labels") if not has_ws_ents_error: msg.good("No entities consisting of or starting/ending with whitespace") if has_low_data_warning: msg.text( "To train a new entity type, your data should include at " "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD), show=verbose, ) if has_no_neg_warning: msg.text( "Training data should always include examples of entities " "in context, as well as examples without a given entity " "type.", show=verbose, ) if has_ws_ents_error: msg.text( "As of spaCy v2.1.0, entity spans consisting of or starting/ending " "with whitespace characters are considered invalid." 
) if "textcat" in pipeline: msg.divider("Text Classification") labels = [label for label in gold_data["textcat"]] model_labels = _get_labels_from_model(nlp, "textcat") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] msg.info( "Text Classification: {} new label(s), {} existing label(s)".format( len(new_labels), len(existing_labels) ) ) if new_labels: labels_with_counts = _format_labels( gold_data["textcat"].most_common(), counts=True ) msg.text("New: {}".format(labels_with_counts), show=verbose) if existing_labels: msg.text( "Existing: {}".format(_format_labels(existing_labels)), show=verbose ) if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_data["tags"]] tag_map = nlp.Defaults.tag_map msg.info( "{} {} in data ({} {} in tag map)".format( len(labels), "label" if len(labels) == 1 else "labels", len(tag_map), "label" if len(tag_map) == 1 else "labels", ) ) labels_with_counts = _format_labels( gold_data["tags"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) non_tagmap = [l for l in labels if l not in tag_map] if not non_tagmap: msg.good("All labels present in tag map for language '{}'".format(nlp.lang)) for label in non_tagmap: msg.fail( "Label '{}' not found in tag map for language '{}'".format( label, nlp.lang ) ) if "parser" in pipeline: msg.divider("Dependency Parsing") labels = [label for label in gold_data["deps"]] msg.info( "{} {} in data".format( len(labels), "label" if len(labels) == 1 else "labels" ) ) labels_with_counts = _format_labels( gold_data["deps"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) msg.divider("Summary") good_counts = msg.counts[MESSAGES.GOOD] warn_counts = msg.counts[MESSAGES.WARN] fail_counts = msg.counts[MESSAGES.FAIL] if good_counts: msg.good( "{} {} passed".format( good_counts, "check" if good_counts == 1 else "checks" ) ) if warn_counts: msg.warn( "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings") ) if fail_counts: msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors")) if fail_counts: sys.exit(1)
def package( input_dir: Path, output_dir: Path, meta_path: Optional[Path] = None, code_paths: List[Path] = [], name: Optional[str] = None, version: Optional[str] = None, create_meta: bool = False, create_sdist: bool = True, create_wheel: bool = False, force: bool = False, silent: bool = True, ) -> None: msg = Printer(no_print=silent, pretty=not silent) input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) if create_wheel and not has_wheel(): err = "Generating a binary .whl file requires wheel to be installed" msg.fail(err, "pip install wheel", exits=1) if not input_path or not input_path.exists(): msg.fail("Can't locate pipeline data", input_path, exits=1) if not output_path or not output_path.exists(): msg.fail("Output directory not found", output_path, exits=1) if create_sdist or create_wheel: opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""] msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}") for code_path in code_paths: if not code_path.exists(): msg.fail("Can't find code file", code_path, exits=1) # Import the code here so it's available when model is loaded (via # get_meta helper). Also verifies that everything works util.import_file(code_path.stem, code_path) if code_paths: msg.good(f"Including {len(code_paths)} Python module(s) with custom code") if meta_path and not meta_path.exists(): msg.fail("Can't find pipeline meta.json", meta_path, exits=1) meta_path = meta_path or input_dir / "meta.json" if not meta_path.exists() or not meta_path.is_file(): msg.fail("Can't load pipeline meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) meta = get_meta(input_dir, meta) if meta["requirements"]: msg.good( f"Including {len(meta['requirements'])} package requirement(s) from " f"meta and config", ", ".join(meta["requirements"]), ) if name is not None: if not name.isidentifier(): msg.fail( f"Model name ('{name}') is not a valid module name. " "This is required so it can be imported as a module.", "We recommend names that use ASCII A-Z, a-z, _ (underscore), " "and 0-9. " "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers", exits=1, ) if not _is_permitted_package_name(name): msg.fail( f"Model name ('{name}') is not a permitted package name. " "This is required to correctly load the model with spacy.load.", "We recommend names that use ASCII A-Z, a-z, _ (underscore), " "and 0-9. 
" "For specific details see: https://www.python.org/dev/peps/pep-0426/#name", exits=1, ) meta["name"] = name if version is not None: meta["version"] = version if not create_meta: # only print if user doesn't want to overwrite msg.good("Loaded meta.json from file", meta_path) else: meta = generate_meta(meta, msg) errors = validate(ModelMetaSchema, meta) if errors: msg.fail("Invalid pipeline meta.json") print("\n".join(errors)) sys.exit(1) model_name = meta["name"] if not model_name.startswith(meta["lang"] + "_"): model_name = f"{meta['lang']}_{model_name}" model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v package_path = main_path / model_name if package_path.exists(): if force: shutil.rmtree(str(package_path)) else: msg.fail( "Package directory already exists", "Please delete the directory and try again, or use the " "`--force` flag to overwrite existing directories.", exits=1, ) Path.mkdir(package_path, parents=True) shutil.copytree(str(input_dir), str(package_path / model_name_v)) for file_name in FILENAMES_DOCS: file_path = package_path / model_name_v / file_name if file_path.exists(): shutil.copy(str(file_path), str(main_path)) readme_path = main_path / "README.md" if not readme_path.exists(): readme = generate_readme(meta) create_file(readme_path, readme) create_file(package_path / model_name_v / "README.md", readme) msg.good("Generated README.md from meta.json") else: msg.info("Using existing README.md from pipeline directory") imports = [] for code_path in code_paths: imports.append(code_path.stem) shutil.copy(str(code_path), str(package_path)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) init_py = TEMPLATE_INIT.format( imports="\n".join(f"from . import {m}" for m in imports) ) create_file(package_path / "__init__.py", init_py) msg.good(f"Successfully created package directory '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "sdist"], capture=False) zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}" msg.good(f"Successfully created zipped Python package", zip_file) if create_wheel: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) wheel_name_squashed = re.sub("_+", "_", model_name_v) wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}" msg.good(f"Successfully created binary wheel", wheel) if "__" in model_name: msg.warn( f"Model name ('{model_name}') contains a run of underscores. " "Runs of underscores are not significant in installed package names.", )
class PrecisionRecallFMeasure(BaseMetric, ClassNursery): def __init__(self, datasets_manager: DatasetsManager): """ Parameters ---------- datasets_manager : DatasetsManager The dataset manager managing the labels and other information """ super(PrecisionRecallFMeasure, self).__init__(datasets_manager=datasets_manager) self.datasets_manager = datasets_manager self.idx2labelname_mapping = None self.msg_printer = Printer() self.classification_metrics_utils = ClassificationMetricsUtils() self.label_namespace = self.datasets_manager.label_namespaces[0] self.normalized_probs_namespace = "normalized_probs" self.label_numericalizer = self.datasets_manager.namespace_to_numericalizer[ self.label_namespace] # setup counters to calculate true positives, false positives, # false negatives and true negatives # The keys are the different class indices in the dataset and the # values are the number of true positives, false positives, false negative # true negatvies for the dataset self.tp_counter = {} self.fp_counter = {} self.fn_counter = {} self.tn_counter = {} def print_confusion_metrics( self, predicted_probs: torch.FloatTensor, labels: torch.LongTensor, labels_mask: Optional[torch.ByteTensor] = None, ) -> None: """ Prints confusion matrix Parameters ---------- predicted_probs : torch.FloatTensor Predicted Probabilities ``[batch_size, num_classes]`` labels : torch.FloatTensor True labels of the size ``[batch_size, 1]`` labels_mask : Optional[torch.ByteTensor] Labels mask indicating 1 in thos places where the true label is ignored Otherwise 0. It should be of same size as labels """ assert predicted_probs.ndimension() == 2, self.msg_printer.fail( "The predicted probs should " "have 2 dimensions. The probs " "that you passed have shape " "{0}".format(predicted_probs.size())) assert labels.ndimension() == 2, self.msg_printer.fail( "The labels should have 2 dimension." "The labels that you passed have shape " "{0}".format(labels.size())) if labels_mask is None: labels_mask = torch.zeros_like(labels, dtype=torch.bool) # TODO: for now k=1, change it to different number of ks top_probs, top_indices = predicted_probs.topk(k=1, dim=1) # convert to 1d numpy top_indices_numpy = top_indices.cpu().numpy().tolist() # convert labels to 1 dimension true_labels_numpy = labels.cpu().numpy().tolist() ( confusion_mtrx, classes, ) = self.classification_metrics_utils.get_confusion_matrix_and_labels( predicted_tag_indices=top_indices_numpy, true_tag_indices=true_labels_numpy, true_masked_label_indices=labels_mask, ) if self.idx2labelname_mapping is not None: classes_with_names = [ f"cls_{class_}({self.idx2labelname_mapping[class_]})" for class_ in classes ] else: classes_with_names = classes assert ( len(classes) == confusion_mtrx.shape[1] ), f"len(classes) = {len(classes)} confusion matrix shape {confusion_mtrx.shape}" header = [f"{class_}" for class_ in classes] header.insert(0, "pred(cols)/true(rows)") confusion_mtrx = pd.DataFrame(confusion_mtrx) confusion_mtrx.insert(0, "class_name", classes_with_names) self.msg_printer.table(data=confusion_mtrx.values.tolist(), header=header, divider=True) def calc_metric(self, lines: List[Line], labels: List[Label], model_forward_dict: Dict[str, Any]) -> None: """ Updates the values being tracked for calculating the metric For Precision Recall FMeasure we update the true positive, false positive and false negative of the different classes being tracked Parameters ---------- lines : List[Line] A list of lines labels: List[Label] A list of labels. 
This has to be the label used for classification Refer to the documentation of Label for more information model_forward_dict : Dict[str, Any] The dictionary obtained after a forward pass The model_forward_pass is expected to have ``normalized_probs`` that usually is of the size ``[batch_size, num_classes]`` """ normalized_probs = model_forward_dict[self.normalized_probs_namespace] labels_tensor = [] for label in labels: tokens = label.tokens[self.label_namespace] tokens = [tok.text for tok in tokens] numericalized_instance = self.label_numericalizer.numericalize_instance( instance=tokens) labels_tensor.extend(numericalized_instance) labels_tensor = torch.LongTensor(labels_tensor) labels_tensor = labels_tensor.view(-1, 1) labels_mask = torch.zeros_like(labels_tensor).type(torch.ByteTensor) normalized_probs = normalized_probs.cpu() assert normalized_probs.ndimension() == 2, self.msg_printer.fail( "The predicted probs should " "have 2 dimensions. The probs " "that you passed have shape " "{0}".format(normalized_probs.size())) assert labels_tensor.ndimension() == 2, self.msg_printer.fail( "The labels should have 2 dimension." "The labels that you passed have shape " "{0}".format(labels_tensor.size())) # TODO: for now k=1, change it to different number of ks top_probs, top_indices = normalized_probs.topk(k=1, dim=1) # convert to 1d numpy top_indices_numpy = top_indices.cpu().numpy().tolist() # convert labels to 1 dimension true_labels_numpy = labels_tensor.cpu().numpy().tolist() labels_mask = labels_mask.tolist() ( confusion_mtrx, classes, ) = self.classification_metrics_utils.get_confusion_matrix_and_labels( true_tag_indices=true_labels_numpy, predicted_tag_indices=top_indices_numpy, true_masked_label_indices=labels_mask, ) # For further confirmation on how I calculated this I searched for stackoverflow on # 18th of July 2019. 
This seems to be the correct way to calculate tps, fps, fns # You can refer to https://stackoverflow.com/a/43331484/2704763 # calculate tps tps = np.around(np.diag(confusion_mtrx), decimals=4) # calculate fps fps = np.around(np.sum(confusion_mtrx, axis=0) - tps, decimals=4) # calculate fns fns = np.around(np.sum(confusion_mtrx, axis=1) - tps, decimals=4) tps = tps.tolist() fps = fps.tolist() fns = fns.tolist() class_tps_mapping = dict(zip(classes, tps)) class_fps_mapping = dict(zip(classes, fps)) class_fns_mapping = dict(zip(classes, fns)) self.tp_counter = merge_dictionaries_with_sum(self.tp_counter, class_tps_mapping) self.fp_counter = merge_dictionaries_with_sum(self.fp_counter, class_fps_mapping) self.fn_counter = merge_dictionaries_with_sum(self.fn_counter, class_fns_mapping) def get_metric(self) -> Dict[str, Any]: """ Returns different values being tracked to calculate Precision Recall FMeasure Returns ------- Dict[str, Any] Returns a dictionary with the following key value pairs for every namespace precision: Dict[str, float] The precision for different classes recall: Dict[str, float] The recall values for different classes fscore: Dict[str, float] The fscore values for different classes, num_tp: Dict[str, int] The number of true positives for different classes, num_fp: Dict[str, int] The number of false positives for different classes, num_fn: Dict[str, int] The number of false negatives for different classes "macro_precision": float The macro precision value considering all different classes, macro_recall: float The macro recall value considering all different classes macro_fscore: float The macro fscore value considering all different classes micro_precision: float The micro precision value considering all different classes, micro_recall: float The micro recall value considering all different classes. 
micro_fscore: float The micro fscore value considering all different classes """ ( precision_dict, recall_dict, fscore_dict, ) = self.classification_metrics_utils.get_prf_from_counters( tp_counter=self.tp_counter, fp_counter=self.fp_counter, fn_counter=self.fn_counter, ) # macro scores # for a detailed discussion on micro and macro scores please follow the discussion @ # https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin # micro scores ( micro_precision, micro_recall, micro_fscore, ) = self.classification_metrics_utils.get_micro_prf_from_counters( tp_counter=self.tp_counter, fp_counter=self.fp_counter, fn_counter=self.fn_counter, ) # macro scores ( macro_precision, macro_recall, macro_fscore, ) = self.classification_metrics_utils.get_macro_prf_from_prf_dicts( precision_dict=precision_dict, recall_dict=recall_dict, fscore_dict=fscore_dict, ) metric = { self.label_namespace: { "precision": precision_dict, "recall": recall_dict, "fscore": fscore_dict, "num_tp": self.tp_counter, "num_fp": self.fp_counter, "num_fn": self.fn_counter, "macro_precision": macro_precision, "macro_recall": macro_recall, "macro_fscore": macro_fscore, "micro_precision": micro_precision, "micro_recall": micro_recall, "micro_fscore": micro_fscore, } } return metric def reset(self) -> None: """ Resets all the counters Resets the ``tp_counter`` which is the true positive counter Resets the ``fp_counter`` which is the false positive counter Resets the ``fn_counter`` - which is the false negative counter Resets the ``tn_counter`` - which is the true nagative counter """ self.tp_counter = {} self.fp_counter = {} self.fn_counter = {} self.tn_counter = {} def report_metrics(self, report_type="wasabi"): """ Reports metrics in a printable format Parameters ---------- report_type : type Select one of ``[wasabi, paper]`` If wasabi, then we return a printable table that represents the precision recall and fmeasures for different classes """ if report_type == "wasabi": table = self.classification_metrics_utils.generate_table_report_from_counters( tp_counter=self.tp_counter, fp_counter=self.fp_counter, fn_counter=self.fn_counter, ) return {self.label_namespace: table}
def train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", vectors=None, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, orth_variant_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm msg = Printer() util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [ p for p in output_path.iterdir() if p.is_dir() ]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
pipeline = [p.strip() for p in pipeline.split(",")] msg.text("Training pipeline: {}".format(pipeline)) if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( "Model language ('{}') doesn't match language specified as " "`lang` argument ('{}') ".format(nlp.lang, lang), exits=1, ) nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) for pipe in pipeline: if pipe not in nlp.pipe_names: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} elif pipe == "textcat": pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) else: if pipe == "textcat": textcat_cfg = nlp.get_pipe("textcat").cfg base_cfg = { "exclusive_classes": textcat_cfg["exclusive_classes"], "architecture": textcat_cfg["architecture"], "positive_label": textcat_cfg["positive_label"], } pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } if base_cfg != pipe_cfg: msg.fail( "The base textcat model configuration does" "not match the provided training options. " "Existing cfg: {}, provided cfg: {}".format( base_cfg, pipe_cfg), exits=1, ) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} elif pipe == "textcat": pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: if multitasks: if pipe_name not in pipeline: msg.fail("Can't use multitask objective without '{}' in the " "pipeline".format(pipe_name)) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus msg.text("Counting training words (limit={})".format(n_examples)) corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model: # Start with an existing model, use default optimizer optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) nlp._optimizer = None # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config if "textcat" in pipeline: textcat_labels = nlp.get_pipe("textcat").cfg["labels"] if textcat_positive_label and textcat_positive_label not in textcat_labels: msg.fail( "The textcat_positive_label (tpl) '{}' does not match any " "label in the training data.".format(textcat_positive_label), exits=1, ) if textcat_positive_label and len(textcat_labels) != 2: msg.fail( "A textcat_positive_label (tpl) '{}' was provided for training " "data that does not appear to be a binary classification " "problem with two labels.".format(textcat_positive_label), exits=1, ) train_docs = corpus.train_docs(nlp, 
noise_level=noise_level, gold_preproc=gold_preproc, max_length=0) train_labels = set() if textcat_multilabel: multilabel_found = False for text, gold in train_docs: train_labels.update(gold.cats.keys()) if list(gold.cats.values()).count(1.0) != 1: multilabel_found = True if not multilabel_found and not base_model: msg.warn("The textcat training instances look like they have " "mutually-exclusive classes. Remove the flag " "'--textcat-multilabel' to train a classifier with " "mutually-exclusive classes.") if not textcat_multilabel: for text, gold in train_docs: train_labels.update(gold.cats.keys()) if list(gold.cats.values()).count(1.0) != 1 and not base_model: msg.warn( "Some textcat training instances do not have exactly " "one positive label. Modifying training options to " "include the flag '--textcat-multilabel' for classes " "that are not mutually exclusive.") nlp.get_pipe("textcat").cfg["exclusive_classes"] = False textcat_multilabel = True break if base_model and set(textcat_labels) != train_labels: msg.fail( "Cannot extend textcat model using data with different " "labels. Base model labels: {}, training data labels: " "{}.".format(textcat_labels, list(train_labels)), exits=1, ) if textcat_multilabel: msg.text( "Textcat evaluation score: ROC AUC score macro-averaged across " "the labels '{}'".format(", ".join(textcat_labels))) elif textcat_positive_label and len(textcat_labels) == 2: msg.text("Textcat evaluation score: F1-score for the " "label '{}'".format(textcat_positive_label)) elif len(textcat_labels) > 1: if len(textcat_labels) == 2: msg.warn( "If the textcat component is a binary classifier with " "exclusive classes, provide '--textcat_positive_label' for " "an evaluation on the positive class.") msg.text( "Textcat evaluation score: F1-score macro-averaged across " "the labels '{}'".format(", ".join(textcat_labels))) else: msg.fail( "Unsupported textcat configuration. Use `spacy debug-data` " "for more information.") # fmt: off row_head, output_stats = _configure_training_output( pipeline, use_gpu, has_beam_widths) row_widths = [len(w) for w in row_head] row_settings = { "widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2 } # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0, ) if raw_text: random.shuffle(raw_text) raw_batches = util.minibatch( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. 
raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path( epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["accuracy"] = scorer.scores else: meta.setdefault("beam_accuracy", {}) meta.setdefault("beam_speed", {}) meta["beam_accuracy"][beam_width] = scorer.scores meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta["labels"] = nlp.meta["labels"] meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) progress = _get_progress( i, losses, scorer.scores, output_stats, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) if i == 0 and "textcat" in pipeline: textcats_per_cat = scorer.scores.get( "textcats_per_cat", {}) for cat, cat_score in textcats_per_cat.items(): if cat_score.get("roc_auc_score", 0) < 0: msg.warn( "Textcat ROC AUC score is undefined due to " "only one value in label '{}'.".format( cat)) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score if iter_since_best >= n_early_stopping: msg.text("Early stopping, best iteration " "is: {}".format(i - iter_since_best)) msg.text("Best score = {}; Final iteration " "score = {}".format(best_score, current_score)) break finally: with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) msg.good("Created best model", best_model_path)
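
# The function above backs the v2.x "spacy train" command. A typical CLI
# invocation (paths are placeholders; the data must be in spaCy's JSON format,
# e.g. produced by "spacy convert"):
#
#     python -m spacy train en ./output train.json dev.json --pipeline tagger,parser,ner
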
def evaluate( model: str, data_path: Path, output: Optional[Path] = None, use_gpu: int = -1, gold_preproc: bool = False, displacy_path: Optional[Path] = None, displacy_limit: int = 25, silent: bool = True, spans_key: str = "sc", ) -> Dict[str, Any]: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() setup_gpu(use_gpu, silent=silent) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) if not data_path.exists(): msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = Corpus(data_path, gold_preproc=gold_preproc) nlp = util.load_model(model) dev_dataset = list(corpus(nlp)) scores = nlp.evaluate(dev_dataset) metrics = { "TOK": "token_acc", "TAG": "tag_acc", "POS": "pos_acc", "MORPH": "morph_acc", "LEMMA": "lemma_acc", "UAS": "dep_uas", "LAS": "dep_las", "NER P": "ents_p", "NER R": "ents_r", "NER F": "ents_f", "TEXTCAT": "cats_score", "SENT P": "sents_p", "SENT R": "sents_r", "SENT F": "sents_f", "SPAN P": f"spans_{spans_key}_p", "SPAN R": f"spans_{spans_key}_r", "SPAN F": f"spans_{spans_key}_f", "SPEED": "speed", } results = {} data = {} for metric, key in metrics.items(): if key in scores: if key == "cats_score": metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" if isinstance(scores[key], (int, float)): if key == "speed": results[metric] = f"{scores[key]:.0f}" else: results[metric] = f"{scores[key]*100:.2f}" else: results[metric] = "-" data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) render_deps = "parser" in factory_names render_ents = "ner" in factory_names render_parses( docs, displacy_path, model_name=model, limit=displacy_limit, deps=render_deps, ents=render_ents, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) if output_path is not None: srsly.write_json(output_path, data) msg.good(f"Saved results to {output_path}") return data
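# Illustrative sketch of the metric-formatting rule in the evaluate() command
# above: scalar scores are shown as percentages with two decimals, except the
# "speed" entry, which is words per second and is rounded to an integer. The
# scores and metric mapping below are invented for the example; real values
# come from nlp.evaluate().
def format_results(scores, metrics):
    results = {}
    for display_name, key in metrics.items():
        if key not in scores:
            continue
        value = scores[key]
        if isinstance(value, (int, float)):
            results[display_name] = (
                f"{value:.0f}" if key == "speed" else f"{value * 100:.2f}"
            )
        else:
            results[display_name] = "-"
    return results


example_scores = {"token_acc": 0.997, "dep_uas": 0.912, "speed": 14382.7}
example_metrics = {"TOK": "token_acc", "UAS": "dep_uas", "SPEED": "speed"}
print(format_results(example_scores, example_metrics))
# {'TOK': '99.70', 'UAS': '91.20', 'SPEED': '14383'}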
def train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", vectors=None, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ msg = Printer() util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [ p for p in output_path.iterdir() if p.is_dir() ]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
pipeline = [p.strip() for p in pipeline.split(",")] msg.text("Training pipeline: {}".format(pipeline)) if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( "Model language ('{}') doesn't match language specified as " "`lang` argument ('{}') ".format(nlp.lang, lang), exits=1, ) other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline] nlp.disable_pipes(*other_pipes) for pipe in pipeline: if pipe not in nlp.pipe_names: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: if multitasks: if pipe_name not in pipeline: msg.fail("Can't use multitask objective without '{}' in the " "pipeline".format(pipe_name)) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus msg.text("Counting training words (limit={})".format(n_examples)) corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model: # Start with an existing model, use default optimizer optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) nlp._optimizer = None # Load in pre-trained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # fmt: off row_head = [ "Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS" ] row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7] if has_beam_widths: row_head.insert(1, "Beam W.") row_widths.insert(1, 7) row_settings = { "widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2 } # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs(nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0) if raw_text: random.shuffle(raw_text) raw_batches = util.minibatch( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. 
raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, debug) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path( epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["accuracy"] = scorer.scores else: meta.setdefault("beam_accuracy", {}) meta.setdefault("beam_speed", {}) meta["beam_accuracy"][beam_width] = scorer.scores meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) progress = _get_progress( i, losses, scorer.scores, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score if iter_since_best >= n_early_stopping: msg.text("Early stopping, best iteration " "is: {}".format(i - iter_since_best)) msg.text("Best score = {}; Final iteration " "score = {}".format(best_score, current_score)) break finally: with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) msg.good("Created best model", best_model_path)
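# A small sketch of the two hyperparameter schedules the train() command reads
# through util.decaying() and util.compounding(): dropout decays linearly down
# to a floor, while the batch size compounds geometrically up to a ceiling.
# These are stand-in generators for illustration, not the spaCy utilities
# themselves.
from itertools import islice


def decaying(start, stop, decay):
    """Linearly decay from start towards stop, never going below stop."""
    value = start
    while True:
        yield max(value, stop)
        value -= decay


def compounding(start, stop, compound):
    """Grow geometrically from start towards stop, never exceeding stop."""
    value = start
    while True:
        yield min(value, stop)
        value *= compound


dropout_rates = decaying(0.6, 0.2, 0.05)
batch_sizes = compounding(100.0, 1000.0, 1.001)
print(list(islice(dropout_rates, 5)))        # ~[0.6, 0.55, 0.5, 0.45, 0.4]
print(next(batch_sizes), next(batch_sizes))  # 100.0, then ~100.1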
def convert( input_file, output_dir="-", file_type="json", n_sents=1, seg_sents=False, model=None, morphology=False, converter="auto", lang=None, ): """ Convert files into JSON format for use with train command and other experiment management functions. If no output_dir is specified, the data is written to stdout, so you can pipe them forward to a JSON file: $ spacy convert some_file.conllu > some_file.json """ no_print = output_dir == "-" msg = Printer(no_print=no_print) input_path = Path(input_file) if file_type not in FILE_TYPES: msg.fail( "Unknown file type: '{}'".format(file_type), "Supported file types: '{}'".format(", ".join(FILE_TYPES)), exits=1, ) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? msg.fail( "Can't write .{} data to stdout.".format(file_type), "Please specify an output directory.", exits=1, ) if not input_path.exists(): msg.fail("Input file not found", input_path, exits=1) if output_dir != "-" and not Path(output_dir).exists(): msg.fail("Output directory not found", output_dir, exits=1) input_data = input_path.open("r", encoding="utf-8").read() if converter == "auto": converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": converter_autodetect = autodetect_ner_format(input_data) if converter_autodetect == "ner": msg.info("Auto-detected token-per-line NER format") converter = converter_autodetect elif converter_autodetect == "iob": msg.info("Auto-detected sentence-per-line NER format") converter = converter_autodetect else: msg.warn( "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert" ) if converter not in CONVERTERS: msg.fail("Can't find converter for {}".format(converter), exits=1) # Use converter function to convert data func = CONVERTERS[converter] data = func( input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model, no_print=no_print, ) if output_dir != "-": # Export data to a file suffix = ".{}".format(file_type) output_file = Path(output_dir) / Path( input_path.parts[-1]).with_suffix(suffix) if file_type == "json": srsly.write_json(output_file, data) elif file_type == "jsonl": srsly.write_jsonl(output_file, data) elif file_type == "msg": srsly.write_msgpack(output_file, data) msg.good("Generated output file ({} documents): {}".format( len(data), output_file)) else: # Print to stdout if file_type == "json": srsly.write_json("-", data) elif file_type == "jsonl": srsly.write_jsonl("-", data)
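# Sketch of the converter dispatch used by convert() above: the converter is
# looked up in a registry keyed by name, and "auto" falls back to the input
# file's suffix. Both the registry and the conll_ner_placeholder function
# below are hypothetical stand-ins; the real CONVERTERS mapping lives in
# spacy.cli.converters.
from pathlib import Path


def conll_ner_placeholder(input_data, **kwargs):
    # Placeholder converter: a real converter returns data in spaCy's JSON
    # training format.
    return [{"text": line} for line in input_data.splitlines() if line]


CONVERTERS_EXAMPLE = {"ner": conll_ner_placeholder}


def pick_converter(input_path, converter="auto"):
    if converter == "auto":
        converter = Path(input_path).suffix[1:]  # "some_file.ner" -> "ner"
    if converter not in CONVERTERS_EXAMPLE:
        raise ValueError(f"Can't find converter for {converter}")
    return CONVERTERS_EXAMPLE[converter]


func = pick_converter("some_file.ner")
print(func("Paris B-LOC\nis O"))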
class SimpleClassifier(nn.Module, ClassNursery):
    def __init__(
        self,
        encoder: nn.Module,
        encoding_dim: int,
        num_classes: int,
        classification_layer_bias: bool,
    ):
        """SimpleClassifier is a linear classifier head on top of any encoder

        Parameters
        ----------
        encoder : nn.Module
            Any encoder that takes in instances
        encoding_dim : int
            The encoding dimension
        num_classes : int
            The number of classes
        classification_layer_bias : bool
            Whether to add a bias to the classification layer.
            This is set to False only for debugging purposes.
        """
        super(SimpleClassifier, self).__init__()
        self.encoder = encoder
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes
        print(self.num_classes)
        self.classification_layer_bias = classification_layer_bias
        self.classification_layer = nn.Linear(
            encoding_dim, num_classes, bias=self.classification_layer_bias)
        self._loss = CrossEntropyLoss()
        self.msg_printer = Printer()

    def forward(
        self,
        iter_dict: Dict[str, Any],
        is_training: bool,
        is_validation: bool,
        is_test: bool,
    ) -> Dict[str, Any]:
        """
        Parameters
        ----------
        iter_dict : Dict[str, Any]
            ``iter_dict`` from any dataset that will be passed on to the encoder
        is_training : bool
            running forward on the training dataset?
        is_validation : bool
            running forward on the validation dataset?
        is_test : bool
            running forward on the test dataset?

        Returns
        -------
        Dict[str, Any]
            logits: torch.FloatTensor
                Un-normalized probabilities over all the classes
                of the shape ``[batch_size, num_classes]``
            normalized_probs: torch.FloatTensor
                Normalized probabilities over all the classes
                of the shape ``[batch_size, num_classes]``
            loss: float
                Loss value if this is a training or validation forward pass.
                There is no loss for the test dataset.
        """
        encoding = self.encoder(iter_dict=iter_dict)

        # N * C
        # N - batch size
        # C - number of classes
        logits = self.classification_layer(encoding)

        # N * C
        # N - batch size
        # C - number of classes
        # The normalized probabilities of classification
        normalized_probs = softmax(logits, dim=1)

        output_dict = {"logits": logits, "normalized_probs": normalized_probs}

        if is_training or is_validation:
            labels = iter_dict["label"]
            labels = labels.squeeze(1)
            assert labels.ndimension() == 1, self.msg_printer.fail(
                "The labels should have 1 dimension; "
                "your input has shape {0}".format(labels.size()))
            loss = self._loss(logits, labels)
            output_dict["loss"] = loss

        return output_dict
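# Hedged usage sketch for the SimpleClassifier defined above. The toy
# bag-of-embeddings encoder and the "tokens"/"label" keys of the batch are
# assumptions made for this example only; the real encoders and batch layout
# live elsewhere in the library. Requires torch.
import torch
import torch.nn as nn


class ToyBagOfTokensEncoder(nn.Module):
    def __init__(self, vocab_size=100, embedding_dim=16):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, iter_dict):
        tokens = iter_dict["tokens"]                 # [batch_size, seq_len]
        return self.embedding(tokens).mean(dim=1)    # [batch_size, embedding_dim]


encoder = ToyBagOfTokensEncoder()
classifier = SimpleClassifier(
    encoder=encoder,
    encoding_dim=16,
    num_classes=3,
    classification_layer_bias=True,
)
batch = {
    "tokens": torch.randint(0, 100, (4, 7)),
    "label": torch.randint(0, 3, (4, 1)),
}
output = classifier(
    iter_dict=batch, is_training=True, is_validation=False, is_test=False
)
print(output["logits"].shape, output["loss"].item())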
def train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", vectors=None, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ msg = Printer() util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
pipeline = [p.strip() for p in pipeline.split(",")] msg.text("Training pipeline: {}".format(pipeline)) if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( "Model language ('{}') doesn't match language specified as " "`lang` argument ('{}') ".format(nlp.lang, lang), exits=1, ) other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline] nlp.disable_pipes(*other_pipes) for pipe in pipeline: if pipe not in nlp.pipe_names: nlp.add_pipe(nlp.create_pipe(pipe)) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: nlp.add_pipe(nlp.create_pipe(pipe)) if learn_tokens: nlp.add_pipe(nlp.create_pipe("merge_subtokens")) if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: if multitasks: if pipe_name not in pipeline: msg.fail( "Can't use multitask objective without '{}' in the " "pipeline".format(pipe_name) ) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus msg.text("Counting training words (limit={})".format(n_examples)) corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model: # Start with an existing model, use default optimizer optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) nlp._optimizer = None # Load in pre-trained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # fmt: off row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"] row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7] if has_beam_widths: row_head.insert(1, "Beam W.") row_widths.insert(1, 7) row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 ) if raw_text: random.shuffle(raw_text) raw_batches = util.minibatch( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 ) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. 
raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) ) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, debug) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) ) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["accuracy"] = scorer.scores else: meta.setdefault("beam_accuracy", {}) meta.setdefault("beam_speed", {}) meta["beam_accuracy"][beam_width] = scorer.scores meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) progress = _get_progress( i, losses, scorer.scores, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score if iter_since_best >= n_early_stopping: msg.text( "Early stopping, best iteration " "is: {}".format(i - iter_since_best) ) msg.text( "Best score = {}; Final iteration " "score = {}".format(best_score, current_score) ) break finally: with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) msg.good("Created best model", best_model_path)
class PrecisionRecallFMeasure(BaseMetric, ClassNursery): def __init__(self, idx2labelname_mapping: Optional[Dict[int, str]] = None): """ Parameters ---------- idx2labelname_mapping : Dict[int, str] Mapping from index to label. If this is not provided then we are going to use the class indices in all the reports """ super(PrecisionRecallFMeasure, self).__init__() self.idx2labelname_mapping = idx2labelname_mapping self.msg_printer = Printer() self.classification_metrics_utils = ClassificationMetricsUtils( idx2labelname_mapping=idx2labelname_mapping ) # setup counters to calculate true positives, false positives, # false negatives and true negatives # The keys are the different class indices in the dataset and the # values are the number of true positives, false positives, false negative # true negatvies for the dataset self.tp_counter = {} self.fp_counter = {} self.fn_counter = {} self.tn_counter = {} def print_confusion_metrics( self, predicted_probs: torch.FloatTensor, labels: torch.LongTensor, labels_mask: Optional[torch.ByteTensor] = None, ) -> None: """ Prints confusion matrix Parameters ---------- predicted_probs : torch.FloatTensor Predicted Probabilities ``[batch_size, num_classes]`` labels : torch.FloatTensor True labels of the size ``[batch_size, 1]`` labels_mask : Optional[torch.ByteTensor] Labels mask indicating 1 in thos places where the true label is ignored Otherwise 0. It should be of same size as labels """ assert predicted_probs.ndimension() == 2, self.msg_printer.fail( "The predicted probs should " "have 2 dimensions. The probs " "that you passed have shape " "{0}".format(predicted_probs.size()) ) assert labels.ndimension() == 2, self.msg_printer.fail( "The labels should have 2 dimension." "The labels that you passed have shape " "{0}".format(labels.size()) ) if labels_mask is None: labels_mask = torch.zeros_like(labels).type(torch.ByteTensor) # TODO: for now k=1, change it to different number of ks top_probs, top_indices = predicted_probs.topk(k=1, dim=1) # convert to 1d numpy top_indices_numpy = top_indices.cpu().numpy().tolist() # convert labels to 1 dimension true_labels_numpy = labels.cpu().numpy().tolist() confusion_mtrx, classes = self.classification_metrics_utils.get_confusion_matrix_and_labels( predicted_tag_indices=top_indices_numpy, true_tag_indices=true_labels_numpy, masked_label_indices=labels_mask, ) if self.idx2labelname_mapping is not None: classes_with_names = [ f"cls_{class_}({self.idx2labelname_mapping[class_]})" for class_ in classes ] else: classes_with_names = classes assert ( len(classes) == confusion_mtrx.shape[1] ), f"len(classes) = {len(classes)} confusion matrix shape {confusion_mtrx.shape}" header = [f"{class_}" for class_ in classes] header.insert(0, "pred(cols)/true(rows)") confusion_mtrx = pd.DataFrame(confusion_mtrx) confusion_mtrx.insert(0, "class_name", classes_with_names) self.msg_printer.table( data=confusion_mtrx.values.tolist(), header=header, divider=True ) def calc_metric( self, iter_dict: Dict[str, Any], model_forward_dict: Dict[str, Any] ) -> None: """ Updates the values being tracked for calculating the metric For Precision Recall FMeasure we update the true positive, false positive and false negative of the different classes being tracked Parameters ---------- iter_dict : Dict[str, Any] The ``iter_dict`` from the dataset is expected to have ``label`` which are labels for instances. 
They are usually of the size ``[batch_size]`` Optionally there can be a ``label_mask`` of the size ``[batch_size]`` The ``label_mask`` is 1 where the label should be masked otherwise if the label is not masked then it is 0 model_forward_dict : Dict[str, Any] The dictionary obtained after a forward pass The model_forward_pass is expected to have ``normalized_probs`` that usually is of the size ``[batch_size, num_classes]`` """ normalized_probs = model_forward_dict["normalized_probs"] labels = iter_dict["label"] labels_mask = iter_dict.get("label_mask") if labels_mask is None: labels_mask = torch.zeros_like(labels).type(torch.ByteTensor) normalized_probs = normalized_probs.cpu() labels = labels.cpu() assert normalized_probs.ndimension() == 2, self.msg_printer.fail( "The predicted probs should " "have 2 dimensions. The probs " "that you passed have shape " "{0}".format(normalized_probs.size()) ) assert labels.ndimension() == 2, self.msg_printer.fail( "The labels should have 2 dimension." "The labels that you passed have shape " "{0}".format(labels.size()) ) # TODO: for now k=1, change it to different number of ks top_probs, top_indices = normalized_probs.topk(k=1, dim=1) # convert to 1d numpy top_indices_numpy = top_indices.cpu().numpy().tolist() # convert labels to 1 dimension true_labels_numpy = labels.cpu().numpy().tolist() labels_mask = labels_mask.tolist() confusion_mtrx, classes = self.classification_metrics_utils.get_confusion_matrix_and_labels( true_tag_indices=true_labels_numpy, predicted_tag_indices=top_indices_numpy, masked_label_indices=labels_mask, ) # For further confirmation on how I calculated this I searched for stackoverflow on # 18th of July 2019. This seems to be the correct way to calculate tps, fps, fns # You can refer to https://stackoverflow.com/a/43331484/2704763 # calculate tps tps = np.around(np.diag(confusion_mtrx), decimals=4) # calculate fps fps = np.around(np.sum(confusion_mtrx, axis=0) - tps, decimals=4) # calculate fns fns = np.around(np.sum(confusion_mtrx, axis=1) - tps, decimals=4) tps = tps.tolist() fps = fps.tolist() fns = fns.tolist() class_tps_mapping = dict(zip(classes, tps)) class_fps_mapping = dict(zip(classes, fps)) class_fns_mapping = dict(zip(classes, fns)) self.tp_counter = merge_dictionaries_with_sum( self.tp_counter, class_tps_mapping ) self.fp_counter = merge_dictionaries_with_sum( self.fp_counter, class_fps_mapping ) self.fn_counter = merge_dictionaries_with_sum( self.fn_counter, class_fns_mapping ) def get_metric(self) -> Dict[str, Any]: """ Returns different values being tracked to calculate Precision Recall FMeasure Returns ------- Dict[str, Any] Returns a dictionary with the following key value pairs precision: Dict[str, float] The precision for different classes recall: Dict[str, float] The recall values for different classes fscore: Dict[str, float] The fscore values for different classes, num_tp: Dict[str, int] The number of true positives for different classes, num_fp: Dict[str, int] The number of false positives for different classes, num_fn: Dict[str, int] The number of false negatives for different classes "macro_precision": float The macro precision value considering all different classes, macro_recall: float The macro recall value considering all different classes macro_fscore: float The macro fscore value considering all different classes micro_precision: float The micro precision value considering all different classes, micro_recall: float The micro recall value considering all different classes. 
micro_fscore: float The micro fscore value considering all different classes """ precision_dict, recall_dict, fscore_dict = self.classification_metrics_utils.get_prf_from_counters( tp_counter=self.tp_counter, fp_counter=self.fp_counter, fn_counter=self.fn_counter, ) # macro scores # for a detailed discussion on micro and macro scores please follow the discussion @ # https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin # micro scores micro_precision, micro_recall, micro_fscore = self.classification_metrics_utils.get_micro_prf_from_counters( tp_counter=self.tp_counter, fp_counter=self.fp_counter, fn_counter=self.fn_counter, ) # macro scores macro_precision, macro_recall, macro_fscore = self.classification_metrics_utils.get_macro_prf_from_prf_dicts( precision_dict=precision_dict, recall_dict=recall_dict, fscore_dict=fscore_dict, ) return { "precision": precision_dict, "recall": recall_dict, "fscore": fscore_dict, "num_tp": self.tp_counter, "num_fp": self.fp_counter, "num_fn": self.fn_counter, "macro_precision": macro_precision, "macro_recall": macro_recall, "macro_fscore": macro_fscore, "micro_precision": micro_precision, "micro_recall": micro_recall, "micro_fscore": micro_fscore, } def reset(self) -> None: """ Resets all the counters Resets the ``tp_counter`` which is the true positive counter Resets the ``fp_counter`` which is the false positive counter Resets the ``fn_counter`` - which is the false negative counter Resets the ``tn_counter`` - which is the true nagative counter """ self.tp_counter = {} self.fp_counter = {} self.fn_counter = {} self.tn_counter = {} def report_metrics(self, report_type="wasabi"): """ Reports metrics in a printable format Parameters ---------- report_type : type Select one of ``[wasabi, paper]`` If wasabi, then we return a printable table that represents the precision recall and fmeasures for different classes """ accuracy_metrics = self.get_metric() precision = accuracy_metrics["precision"] recall = accuracy_metrics["recall"] fscore = accuracy_metrics["fscore"] macro_precision = accuracy_metrics["macro_precision"] macro_recall = accuracy_metrics["macro_recall"] macro_fscore = accuracy_metrics["macro_fscore"] micro_precision = accuracy_metrics["micro_precision"] micro_recall = accuracy_metrics["micro_recall"] micro_fscore = accuracy_metrics["micro_fscore"] if report_type == "wasabi": table = self.classification_metrics_utils.generate_table_report_from_counters( tp_counter=self.tp_counter, fp_counter=self.fp_counter, fn_counter=self.fn_counter, ) return table elif report_type == "paper": "Refer to the paper Logical Structure Recovery in Scholarly Articles with " "Rich Document Features Table 2. It generates just fscores and returns" class_nums = fscore.keys() class_nums = sorted(class_nums, reverse=False) fscores = [fscore[class_num] for class_num in class_nums] fscores.extend([micro_fscore, macro_fscore]) return fscores
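# Worked example of the counter update in calc_metric() above: with true
# classes as rows and predicted classes as columns, true positives are the
# diagonal of the confusion matrix, false positives are the column sums minus
# the diagonal, and false negatives are the row sums minus the diagonal. The
# matrix below is invented for illustration.
import numpy as np

confusion = np.array(
    [
        [5, 1, 0],  # true class 0
        [2, 7, 1],  # true class 1
        [0, 1, 4],  # true class 2
    ]
)
tps = np.diag(confusion)            # [5, 7, 4]
fps = confusion.sum(axis=0) - tps   # [2, 2, 1]
fns = confusion.sum(axis=1) - tps   # [1, 3, 1]
precision = tps / (tps + fps)       # per-class precision
recall = tps / (tps + fns)          # per-class recall
print(precision.round(4), recall.round(4))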
import sys

from wasabi import Printer

# Read the solution file and the test file passed on the command line.
with open(sys.argv[1]) as fd:
    thesolution = fd.read()
with open(sys.argv[2]) as fd:
    thetest = fd.read()

__msg__ = Printer()
__solution__ = thesolution

# Execute the solution first so the test code can use the names it defines,
# then run the test's entry point.
exec(thesolution)
exec(thetest)
print(globals().keys())

try:
    test()
except AssertionError as e:
    __msg__.fail(e)
def pretrain( texts_loc, vectors_model, output_dir, width=96, depth=4, embed_rows=2000, loss_func="cosine", use_vectors=False, dropout=0.2, n_iter=1000, batch_size=3000, max_length=500, min_length=5, seed=0, n_save_every=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Specifically, we load pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pre-trained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pre-trained weights files to the 'spacy train' command. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure all settings are the same between pretraining and training. The API and errors around this need some improvement. """ config = dict(locals()) msg = Printer() util.fix_random_seed(seed) has_gpu = prefer_gpu() msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory") srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin if texts_loc != "-": # reading from a file texts_loc = Path(texts_loc) if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): texts = list(srsly.read_jsonl(texts_loc)) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) msg.good("Loaded model '{}'".format(vectors_model)) pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name model = create_pretraining_model( nlp, Tok2Vec( width, embed_rows, conv_depth=depth, pretrained_vectors=pretrained_vectors, bilstm_depth=0, # Requires PyTorch. Experimental. 
cnn_maxout_pieces=3, # You can try setting this higher subword_features=True, # Set to False for Chinese etc ), ) optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) msg.divider("Pre-training tok2vec layer") row_settings = { "widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r") } msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open("wb") as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, "epoch_loss": tracker.epoch_loss, "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: file_.write(srsly.json_dumps(log) + "\n") for epoch in range(n_iter): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size)): docs = make_docs( nlp, [text for (text, _) in batch], max_length=max_length, min_length=min_length, ) loss = make_update(model, docs, optimizer, objective=loss_func, drop=dropout) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10**7: break if n_save_every and (batch_id % n_save_every == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts)
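# Sketch of the "cosine" pretraining objective described in the pretrain()
# docstring: the tok2vec layer predicts one vector per token and is penalised
# by its cosine distance to the corresponding pre-trained vector. This is a
# standalone numpy illustration of the idea, not the exact loss computed by
# make_update above.
import numpy as np


def cosine_loss(predicted, target, eps=1e-8):
    """Mean (1 - cosine similarity) over a batch of token vectors."""
    pred_norm = predicted / (np.linalg.norm(predicted, axis=1, keepdims=True) + eps)
    targ_norm = target / (np.linalg.norm(target, axis=1, keepdims=True) + eps)
    similarity = (pred_norm * targ_norm).sum(axis=1)
    return float((1.0 - similarity).mean())


rng = np.random.default_rng(0)
predicted = rng.normal(size=(5, 300))   # tok2vec output for 5 tokens
target = rng.normal(size=(5, 300))      # pre-trained vectors for the same tokens
print(cosine_loss(predicted, target))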
def package( input_dir: Path, output_dir: Path, meta_path: Optional[Path] = None, code_paths: List[Path] = [], name: Optional[str] = None, version: Optional[str] = None, create_meta: bool = False, create_sdist: bool = True, create_wheel: bool = False, force: bool = False, silent: bool = True, ) -> None: msg = Printer(no_print=silent, pretty=not silent) input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) if create_wheel and not has_wheel(): err = "Generating a binary .whl file requires wheel to be installed" msg.fail(err, "pip install wheel", exits=1) if not input_path or not input_path.exists(): msg.fail("Can't locate pipeline data", input_path, exits=1) if not output_path or not output_path.exists(): msg.fail("Output directory not found", output_path, exits=1) if create_sdist or create_wheel: opts = [ "sdist" if create_sdist else "", "wheel" if create_wheel else "" ] msg.info( f"Building package artifacts: {', '.join(opt for opt in opts if opt)}" ) for code_path in code_paths: if not code_path.exists(): msg.fail("Can't find code file", code_path, exits=1) # Import the code here so it's available when model is loaded (via # get_meta helper). Also verifies that everything works util.import_file(code_path.stem, code_path) if code_paths: msg.good( f"Including {len(code_paths)} Python module(s) with custom code") if meta_path and not meta_path.exists(): msg.fail("Can't find pipeline meta.json", meta_path, exits=1) meta_path = meta_path or input_dir / "meta.json" if not meta_path.exists() or not meta_path.is_file(): msg.fail("Can't load pipeline meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) meta = get_meta(input_dir, meta) if name is not None: meta["name"] = name if version is not None: meta["version"] = version if not create_meta: # only print if user doesn't want to overwrite msg.good("Loaded meta.json from file", meta_path) else: meta = generate_meta(meta, msg) errors = validate(ModelMetaSchema, meta) if errors: msg.fail("Invalid pipeline meta.json") print("\n".join(errors)) sys.exit(1) model_name = meta["lang"] + "_" + meta["name"] model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v package_path = main_path / model_name if package_path.exists(): if force: shutil.rmtree(str(package_path)) else: msg.fail( "Package directory already exists", "Please delete the directory and try again, or use the " "`--force` flag to overwrite existing directories.", exits=1, ) Path.mkdir(package_path, parents=True) shutil.copytree(str(input_dir), str(package_path / model_name_v)) license_path = package_path / model_name_v / "LICENSE" if license_path.exists(): shutil.move(str(license_path), str(main_path)) imports = [] for code_path in code_paths: imports.append(code_path.stem) shutil.copy(str(code_path), str(package_path)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) init_py = TEMPLATE_INIT.format(imports="\n".join(f"from . 
import {m}" for m in imports)) create_file(package_path / "__init__.py", init_py) msg.good(f"Successfully created package '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "sdist"], capture=False) zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}" msg.good(f"Successfully created zipped Python package", zip_file) if create_wheel: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}" msg.good(f"Successfully created binary wheel", wheel)
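# Small sketch of the naming and directory layout convention used by package()
# above: the package name is "<lang>_<name>", the versioned directory is
# "<package name>-<version>", and the pipeline data is copied one level below
# the importable package. The meta values here are invented for the example.
from pathlib import Path

meta = {"lang": "en", "name": "core_demo", "version": "1.0.0"}
model_name = meta["lang"] + "_" + meta["name"]        # en_core_demo
model_name_v = model_name + "-" + meta["version"]     # en_core_demo-1.0.0
output_dir = Path("packages")
main_path = output_dir / model_name_v                 # packages/en_core_demo-1.0.0
package_path = main_path / model_name                 # .../en_core_demo
data_path = package_path / model_name_v               # .../en_core_demo/en_core_demo-1.0.0
print(main_path, package_path, data_path, sep="\n")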
def pretrain( texts_loc, vectors_model, output_dir, width=96, depth=4, embed_rows=2000, loss_func="cosine", use_vectors=False, dropout=0.2, n_iter=1000, batch_size=3000, max_length=500, min_length=5, seed=0, n_save_every=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Specifically, we load pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pre-trained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pre-trained weights files to the 'spacy train' command. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure all settings are the same between pretraining and training. The API and errors around this need some improvement. """ config = dict(locals()) msg = Printer() util.fix_random_seed(seed) has_gpu = prefer_gpu() msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory") srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin if texts_loc != "-": # reading from a file texts_loc = Path(texts_loc) if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): texts = list(srsly.read_jsonl(texts_loc)) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) msg.good("Loaded model '{}'".format(vectors_model)) pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name model = create_pretraining_model( nlp, Tok2Vec( width, embed_rows, conv_depth=depth, pretrained_vectors=pretrained_vectors, bilstm_depth=0, # Requires PyTorch. Experimental. 
cnn_maxout_pieces=3, # You can try setting this higher subword_features=True, # Set to False for Chinese etc ), ) optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) msg.divider("Pre-training tok2vec layer") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open( "wb" ) as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, "epoch_loss": tracker.epoch_loss, "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: file_.write(srsly.json_dumps(log) + "\n") for epoch in range(n_iter): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size) ): docs = make_docs( nlp, [text for (text, _) in batch], max_length=max_length, min_length=min_length, ) loss = make_update( model, docs, optimizer, objective=loss_func, drop=dropout ) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break if n_save_every and (batch_id % n_save_every == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts)
def debug_data( lang, train_path, dev_path, base_model=None, pipeline="tagger,parser,ner", ignore_warnings=False, ignore_validation=False, verbose=False, no_format=False, ): msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) # Make sure all files and paths exists if they are needed if not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: nlp = load_model(base_model) else: lang_cls = get_lang_class(lang) nlp = lang_cls() msg.divider("Data format validation") # Validate data format using the JSON schema # TODO: update once the new format is ready # TODO: move validation to GoldCorpus in order to be able to load from dir train_data_errors = [] # TODO: validate_json dev_data_errors = [] # TODO: validate_json if not train_data_errors: msg.good("Training data JSON format is valid") if not dev_data_errors: msg.good("Development data JSON format is valid") for error in train_data_errors: msg.fail("Training data: {}".format(error)) for error in dev_data_errors: msg.fail("Develoment data: {}".format(error)) if (train_data_errors or dev_data_errors) and not ignore_validation: sys.exit(1) # Create the gold corpus to be able to better analyze data loading_train_error_message = "" loading_dev_error_message = "" with msg.loading("Loading corpus..."): corpus = GoldCorpus(train_path, dev_path) try: train_docs = list(corpus.train_docs(nlp)) train_docs_unpreprocessed = list( corpus.train_docs_without_preprocessing(nlp)) except ValueError as e: loading_train_error_message = "Training data cannot be loaded: {}".format( str(e)) try: dev_docs = list(corpus.dev_docs(nlp)) except ValueError as e: loading_dev_error_message = "Development data cannot be loaded: {}".format( str(e)) if loading_train_error_message or loading_dev_error_message: if loading_train_error_message: msg.fail(loading_train_error_message) if loading_dev_error_message: msg.fail(loading_dev_error_message) sys.exit(1) msg.good("Corpus is loadable") # Create all gold data here to avoid iterating over the train_docs constantly gold_train_data = _compile_gold(train_docs, pipeline) gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline) gold_dev_data = _compile_gold(dev_docs, pipeline) train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] msg.divider("Training stats") msg.text("Training pipeline: {}".format(", ".join(pipeline))) for pipe in [p for p in pipeline if p not in nlp.factories]: msg.fail( "Pipeline component '{}' not available in factories".format(pipe)) if base_model: msg.text("Starting with base model '{}'".format(base_model)) else: msg.text("Starting with blank model '{}'".format(lang)) msg.text("{} training docs".format(len(train_docs))) msg.text("{} evaluation docs".format(len(dev_docs))) overlap = len(train_texts.intersection(dev_texts)) if overlap: msg.warn( "{} training examples also in evaluation data".format(overlap)) else: msg.good("No overlap between training and evaluation data") if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD: text = "Low number of examples to train from a blank model ({})".format( len(train_docs)) if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD: msg.fail(text) else: msg.warn(text) msg.text( "It's recommended to use at least {} examples (minimum {})".format( BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD), show=verbose, ) 
msg.divider("Vocab & Vectors") n_words = gold_train_data["n_words"] msg.info("{} total {} in the data ({} unique)".format( n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"]))) if gold_train_data["n_misaligned_words"] > 0: msg.warn("{} misaligned tokens in the training data".format( gold_train_data["n_misaligned_words"])) if gold_dev_data["n_misaligned_words"] > 0: msg.warn("{} misaligned tokens in the dev data".format( gold_dev_data["n_misaligned_words"])) most_common_words = gold_train_data["words"].most_common(10) msg.text( "10 most common words: {}".format( _format_labels(most_common_words, counts=True)), show=verbose, ) if len(nlp.vocab.vectors): msg.info("{} vectors ({} unique keys, {} dimensions)".format( len(nlp.vocab.vectors), nlp.vocab.vectors.n_keys, nlp.vocab.vectors_length, )) else: msg.info("No word vectors present in the model") if "ner" in pipeline: # Get all unique NER labels present in the data labels = set(label for label in gold_train_data["ner"] if label not in ("O", "-")) label_counts = gold_train_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] has_low_data_warning = False has_no_neg_warning = False has_ws_ents_error = False msg.divider("Named Entity Recognition") msg.info("{} new {}, {} existing {}".format( len(new_labels), "label" if len(new_labels) == 1 else "labels", len(existing_labels), "label" if len(existing_labels) == 1 else "labels", )) missing_values = label_counts["-"] msg.text("{} missing {} (tokens with '-' label)".format( missing_values, "value" if missing_values == 1 else "values")) if new_labels: labels_with_counts = [ (label, count) for label, count in label_counts.most_common() if label != "-" ] labels_with_counts = _format_labels(labels_with_counts, counts=True) msg.text("New: {}".format(labels_with_counts), show=verbose) if existing_labels: msg.text("Existing: {}".format(_format_labels(existing_labels)), show=verbose) if gold_train_data["ws_ents"]: msg.fail("{} invalid whitespace entity spans".format( gold_train_data["ws_ents"])) has_ws_ents_error = True for label in new_labels: if label_counts[label] <= NEW_LABEL_THRESHOLD: msg.warn( "Low number of examples for new label '{}' ({})".format( label, label_counts[label])) has_low_data_warning = True with msg.loading("Analyzing label distribution..."): neg_docs = _get_examples_without_label(train_docs, label) if neg_docs == 0: msg.warn( "No examples for texts WITHOUT new label '{}'".format( label)) has_no_neg_warning = True if not has_low_data_warning: msg.good("Good amount of examples for all labels") if not has_no_neg_warning: msg.good("Examples without occurrences available for all labels") if not has_ws_ents_error: msg.good( "No entities consisting of or starting/ending with whitespace") if has_low_data_warning: msg.text( "To train a new entity type, your data should include at " "least {} instances of the new label".format( NEW_LABEL_THRESHOLD), show=verbose, ) if has_no_neg_warning: msg.text( "Training data should always include examples of entities " "in context, as well as examples without a given entity " "type.", show=verbose, ) if has_ws_ents_error: msg.text( "As of spaCy v2.1.0, entity spans consisting of or starting/ending " "with whitespace characters are considered invalid.") if "textcat" in pipeline: msg.divider("Text Classification") labels = [label for label in gold_train_data["textcat"]] model_labels = _get_labels_from_model(nlp, 
"textcat") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] msg.info("Text Classification: {} new label(s), {} existing label(s)". format(len(new_labels), len(existing_labels))) if new_labels: labels_with_counts = _format_labels( gold_train_data["textcat"].most_common(), counts=True) msg.text("New: {}".format(labels_with_counts), show=verbose) if existing_labels: msg.text("Existing: {}".format(_format_labels(existing_labels)), show=verbose) if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] tag_map = nlp.Defaults.tag_map msg.info("{} {} in data ({} {} in tag map)".format( len(labels), "label" if len(labels) == 1 else "labels", len(tag_map), "label" if len(tag_map) == 1 else "labels", )) labels_with_counts = _format_labels( gold_train_data["tags"].most_common(), counts=True) msg.text(labels_with_counts, show=verbose) non_tagmap = [l for l in labels if l not in tag_map] if not non_tagmap: msg.good("All labels present in tag map for language '{}'".format( nlp.lang)) for label in non_tagmap: msg.fail( "Label '{}' not found in tag map for language '{}'".format( label, nlp.lang)) if "parser" in pipeline: msg.divider("Dependency Parsing") # profile sentence length msg.info("Found {} sentence{} with an average length of {:.1f} words.". format( gold_train_data["n_sents"], "s" if len(train_docs) > 1 else "", gold_train_data["n_words"] / gold_train_data["n_sents"])) # profile labels labels_train = [label for label in gold_train_data["deps"]] labels_train_unpreprocessed = [ label for label in gold_train_unpreprocessed_data["deps"] ] labels_dev = [label for label in gold_dev_data["deps"]] if gold_train_unpreprocessed_data["n_nonproj"] > 0: msg.info("Found {} nonprojective train sentence{}".format( gold_train_unpreprocessed_data["n_nonproj"], "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "")) if gold_dev_data["n_nonproj"] > 0: msg.info("Found {} nonprojective dev sentence{}".format( gold_dev_data["n_nonproj"], "s" if gold_dev_data["n_nonproj"] > 1 else "")) msg.info("{} {} in train data".format( len(labels_train_unpreprocessed), "label" if len(labels_train) == 1 else "labels")) msg.info("{} {} in projectivized train data".format( len(labels_train), "label" if len(labels_train) == 1 else "labels")) labels_with_counts = _format_labels( gold_train_unpreprocessed_data["deps"].most_common(), counts=True) msg.text(labels_with_counts, show=verbose) # rare labels in train for label in gold_train_unpreprocessed_data["deps"]: if gold_train_unpreprocessed_data["deps"][ label] <= DEP_LABEL_THRESHOLD: msg.warn("Low number of examples for label '{}' ({})".format( label, gold_train_unpreprocessed_data["deps"][label])) has_low_data_warning = True # rare labels in projectivized train rare_projectivized_labels = [] for label in gold_train_data["deps"]: if gold_train_data["deps"][ label] <= DEP_LABEL_THRESHOLD and "||" in label: rare_projectivized_labels.append("{}: {}".format( label, str(gold_train_data["deps"][label]))) if len(rare_projectivized_labels) > 0: msg.warn( "Low number of examples for {} label{} in the " "projectivized dependency trees used for training. 
You may " "want to projectivize labels such as punct before " "training in order to improve parser performance.".format( len(rare_projectivized_labels), "s" if len(rare_projectivized_labels) > 1 else "")) msg.warn("Projectivized labels with low numbers of examples: " "{}".format("\n".join(rare_projectivized_labels)), show=verbose) has_low_data_warning = True # labels only in train if set(labels_train) - set(labels_dev): msg.warn("The following labels were found only in the train data: " "{}".format( ", ".join(set(labels_train) - set(labels_dev))), show=verbose) # labels only in dev if set(labels_dev) - set(labels_train): msg.warn("The following labels were found only in the dev data: " + ", ".join(set(labels_dev) - set(labels_train)), show=verbose) if has_low_data_warning: msg.text( "To train a parser, your data should include at " "least {} instances of each label.".format( DEP_LABEL_THRESHOLD), show=verbose, ) # multiple root labels if len(gold_train_unpreprocessed_data["roots"]) > 1: msg.warn( "Multiple root labels ({}) ".format(", ".join( gold_train_unpreprocessed_data["roots"])) + "found in training data. spaCy's parser uses a single root " "label ROOT so this distinction will not be available.") # these should not happen, but just in case if gold_train_data["n_nonproj"] > 0: msg.fail( "Found {} nonprojective projectivized train sentence{}".format( gold_train_data["n_nonproj"], "s" if gold_train_data["n_nonproj"] > 1 else "")) if gold_train_data["n_cycles"] > 0: msg.fail( "Found {} projectivized train sentence{} with cycles".format( gold_train_data["n_cycles"], "s" if gold_train_data["n_cycles"] > 1 else "")) msg.divider("Summary") good_counts = msg.counts[MESSAGES.GOOD] warn_counts = msg.counts[MESSAGES.WARN] fail_counts = msg.counts[MESSAGES.FAIL] if good_counts: msg.good("{} {} passed".format( good_counts, "check" if good_counts == 1 else "checks")) if warn_counts: msg.warn("{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")) if fail_counts: msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors")) if fail_counts: sys.exit(1)
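debug_data leans on a few small helpers that are not shown in this excerpt (_format_labels, _get_labels_from_model, _get_examples_without_label). The sketch below shows one plausible shape for them, assuming gold.ner carries BILUO-style tags; treat it as an approximation for readability, not the exact implementation.

def _format_labels(labels, counts=False):
    # Render labels (or (label, count) pairs) as "'A' (3), 'B' (1)" or "'A', 'B'".
    if counts:
        return ", ".join("'{}' ({})".format(label, count) for label, count in labels)
    return ", ".join("'{}'".format(label) for label in labels)


def _get_labels_from_model(nlp, pipe_name):
    # Labels already known to a pipeline component; empty if the component is absent.
    if pipe_name not in nlp.pipe_names:
        return set()
    return set(nlp.get_pipe(pipe_name).labels)


def _get_examples_without_label(data, label):
    # Count (doc, gold) pairs that contain no entity of the given type, used to
    # warn when negative examples for a new label are missing.
    count = 0
    for doc, gold in data:
        entities = {tag.split("-")[1] for tag in gold.ner if tag not in (None, "O", "-")}
        if label not in entities:
            count += 1
    return count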
def link(origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ msg = Printer() if util.is_package(origin): model_path = util.get_package_path(origin) else: model_path = Path(origin) if model_path is None else Path(model_path) if not model_path.exists(): msg.fail( "Can't locate model data", "The data should be located in {}".format(path2str(model_path)), exits=1, ) data_path = util.get_data_path() if not data_path or not data_path.exists(): spacy_loc = Path(__file__).parent.parent msg.fail( "Can't find the spaCy data path to create model symlink", "Make sure a directory `/data` exists within your spaCy " "installation and try again. The data directory should be located " "here: {path}".format(path=spacy_loc), exits=1, ) link_path = util.get_data_path() / link_name if link_path.is_symlink() and not force: msg.fail( "Link '{}' already exists".format(link_name), "To overwrite an existing link, use the --force flag", exits=1, ) elif link_path.is_symlink(): # does a symlink exist? # NB: It's important to check for is_symlink here and not for exists, # because invalid/outdated symlinks would return False otherwise. link_path.unlink() elif link_path.exists(): # does it exist otherwise? # NB: Check this last because valid symlinks also "exist". msg.fail( "Can't overwrite symlink '{}'".format(link_name), "This can happen if your data directory contains a directory or " "file of the same name.", exits=1, ) details = "%s --> %s" % (path2str(model_path), path2str(link_path)) try: symlink_to(link_path, model_path) except: # noqa: E722 # This is quite dirty, but just making sure other errors are caught. msg.fail( "Couldn't link model to '{}'".format(link_name), "Creating a symlink in spacy/data failed. Make sure you have the " "required permissions and try re-running the command as admin, or " "use a virtualenv. You can still import the model as a module and " "call its load() method, or create the symlink manually.", ) msg.text(details) raise msg.good("Linking successful", details) msg.text( "You can now load the model via spacy.load('{}')".format(link_name))
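For context, a typical call looks like the hedged sketch below; the package name en_core_web_sm is only an example, and the command needs write access to the spacy/data directory.

# Usage sketch: create a shortcut link "en" for an installed package,
# then load the model through that link. The package name is illustrative.
link("en_core_web_sm", "en", force=True)

import spacy
nlp = spacy.load("en")  # resolves to en_core_web_sm via the symlink created above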
def pretrain( texts_loc, vectors_model, output_dir, width=96, depth=4, bilstm_depth=2, embed_rows=2000, loss_func="cosine", use_vectors=False, dropout=0.2, n_iter=1000, batch_size=3000, max_length=500, min_length=5, seed=0, n_save_every=None, init_tok2vec=None, epoch_start=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pretrained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pretrained weights files to the 'spacy train' command. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure all settings are the same between pretraining and training. The API and errors around this need some improvement. """ config = dict(locals()) for key in config: if isinstance(config[key], Path): config[key] = str(config[key]) msg = Printer() util.fix_random_seed(seed) has_gpu = prefer_gpu() if has_gpu: import torch torch.set_default_tensor_type("torch.cuda.FloatTensor") msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory") srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin if texts_loc != "-": # reading from a file texts_loc = Path(texts_loc) if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): texts = list(srsly.read_jsonl(texts_loc)) if not texts: msg.fail("Input file is empty", texts_loc, exits=1) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) msg.good("Loaded model '{}'".format(vectors_model)) pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name model = create_pretraining_model( nlp, Tok2Vec( width, embed_rows, conv_depth=depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. cnn_maxout_pieces=3, # You can try setting this higher subword_features=True, # Set to False for Chinese etc ), ) # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Parse the epoch number from the given weight file model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) if model_name: # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' epoch_start = int(model_name.group(0)[5:][:-4]) + 1 else: if not epoch_start: msg.fail( "You have to use the '--epoch-start' argument when using a renamed weight file for " "'--init-tok2vec'", exits=True, ) elif epoch_start < 0: msg.fail( "The argument '--epoch-start' has to be greater or equal to 0. 
'%d' is invalid" % epoch_start, exits=True, ) else: # Without '--init-tok2vec' the '--epoch-start' argument is ignored epoch_start = 0 optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start) row_settings = { "widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r") } msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open("wb") as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, "epoch_loss": tracker.epoch_loss, "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size)): docs, count = make_docs( nlp, [text for (text, _) in batch], max_length=max_length, min_length=min_length, ) skip_counter += count loss = make_update(model, docs, optimizer, objective=loss_func, drop=dropout) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10**7: break if n_save_every and (batch_id % n_save_every == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts) if skip_counter > 0: msg.warn( "Skipped {count} empty values".format(count=str(skip_counter))) msg.good("Successfully finished pretrain")
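The resume behaviour is easy to miss inside the larger function: epoch_start is only inferred when the weights file kept its default model<N>.bin name, otherwise --epoch-start must be given. A small standalone restatement of that logic, with illustrative paths:

import re


def infer_epoch_start(init_tok2vec_path):
    # Restates the resume logic above: if the weights file still has its default
    # "model<N>.bin" name, continue from epoch N + 1; otherwise the caller must
    # pass --epoch-start explicitly.
    match = re.search(r"model(\d+)\.bin", str(init_tok2vec_path))
    if match:
        return int(match.group(1)) + 1
    return None


assert infer_epoch_start("output/model37.bin") == 38       # resume at epoch 38
assert infer_epoch_start("output/my_weights.bin") is None  # needs --epoch-start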
import sys import plac from wasabi import Printer from spacy.cli import download, link, info, package, train, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data msg = Printer() commands = { "download": download, "link": link, "info": info, "train": train, "pretrain": pretrain, "debug-data": debug_data, "evaluate": evaluate, "convert": convert, "package": package, "init-model": init_model, "profile": profile, "validate": validate, } if len(sys.argv) == 1: msg.info("Available commands", ", ".join(commands), exits=1) command = sys.argv.pop(1) sys.argv[0] = "spacy %s" % command if command in commands: plac.call(commands[command], sys.argv[1:]) else: available = "Available: {}".format(", ".join(commands)) msg.fail("Unknown command: {}".format(command), available, exits=1)
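As a hedged usage sketch, dispatching "spacy evaluate" through this wrapper amounts to a plac call against the evaluate command; the paths below are placeholders, not real files.

import plac
from spacy.cli import evaluate

# plac maps the positional arguments onto evaluate's signature (model, data_path, ...).
plac.call(evaluate, ["/path/to/model", "/path/to/dev.json"])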
class Engine(ClassNursery): def __init__( self, model: nn.Module, datasets_manager: DatasetsManager, optimizer: optim, batch_size: int, save_dir: str, num_epochs: int, save_every: int, log_train_metrics_every: int, train_metric: BaseMetric, validation_metric: BaseMetric, test_metric: BaseMetric, experiment_name: Optional[str] = None, experiment_hyperparams: Optional[Dict[str, Any]] = None, tensorboard_logdir: str = None, track_for_best: str = "loss", collate_fn=list, device: Union[torch.device, str] = torch.device("cpu"), gradient_norm_clip_value: Optional[float] = 5.0, lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, use_wandb: bool = False, sample_proportion: float = 1.0, seeds: Dict[str, int] = None, ): """ Engine runs the models end to end. It iterates through the train dataset and passes it through the model. During training it helps in tracking a lot of parameters for the run and saving the parameters. It also reports validation and test parameters from time to time. Many utilities required for end-to-end running of the model are here. Parameters ---------- model : nn.Module A pytorch module defining a model to be run datasets_manager : DatasetsManager A datasets manager that handles all the different datasets optimizer : torch.optim Any Optimizer object instantiated using ``torch.optim`` batch_size : int Batch size for the dataset. The same batch size is used for the ``train``, ``valid`` and ``test`` datasets save_dir : str The experiments are saved in ``save_dir``. We save checkpoints, the best model, logs and other information into the save dir num_epochs : int The number of epochs to run the training save_every : int The model will be checkpointed every ``save_every`` epochs log_train_metrics_every : int The train metrics will be reported every ``log_train_metrics_every`` iterations during training train_metric : BaseMetric Anything that is an instance of ``BaseMetric`` for calculating training metrics validation_metric : BaseMetric Anything that is an instance of ``BaseMetric`` for calculating validation metrics test_metric : BaseMetric Anything that is an instance of ``BaseMetric`` for calculating test metrics experiment_name : str The experiment should be given a name for ease of tracking. If an experiment name is not given, we generate a unique 10-digit sha for the experiment. experiment_hyperparams : Dict[str, Any] This is mostly used for tracking the different hyper-params of the experiment being run. This may be used by ``wandb`` to save the hyper-params tensorboard_logdir : str The directory where all the tensorboard runs are stored. If ``None`` is passed then it defaults to the tensorboard default of storing the log in the current directory. track_for_best : str Which metric should be tracked for deciding the best model? Anything that the metric emits and is a single value can be used for tracking. The default value is ``loss``. If it is ``loss``, then the best value will be the lowest one. For some other metrics like ``macro_fscore``, the best metric might be the one that has the highest value collate_fn : Callable[[List[Any]], List[Any]] Collates the different examples into a single batch of examples. This is the same terminology adopted from ``pytorch``. The same collate function is used for the train, validation and test datasets device : torch.device The device on which the model will be placed. If this is "cpu", then the model and the tensors will all be on cpu. If this is "cuda:0", then the model and the tensors will be placed on cuda device 0. 
You can mention any other cuda device that is suitable for your environment gradient_norm_clip_value : float To avoid gradient explosion, the gradients of the norm will be clipped if the gradient norm exceeds this value lr_scheduler : torch.optim.lr_scheduler Any pytorch ``lr_scheduler`` can be used for reducing the learning rate if the performance on the validation set reduces. use_wandb : bool wandb or weights and biases is a tool that is used to track experiments online. Sciwing comes with inbuilt functionality to track experiments on weights and biases seeds: Dict[str, int] The dict of seeds to be set. Set the random_seed, pytorch_seed and numpy_seed Found in https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py """ if isinstance(device, str): device = torch.device(device) if seeds is None: seeds = {} self.seeds = seeds self._set_seeds() self.model = model self.datasets_manager = datasets_manager self.train_dataset = self.datasets_manager.train_dataset self.validation_dataset = self.datasets_manager.dev_dataset self.test_dataset = self.datasets_manager.test_dataset self.optimizer = optimizer self.batch_size = batch_size self.save_dir = pathlib.Path(save_dir) self.num_epochs = num_epochs self.msg_printer = Printer() self.save_every = save_every self.log_train_metrics_every = log_train_metrics_every self.tensorboard_logdir = tensorboard_logdir self.train_metric_calc = train_metric self.validation_metric_calc = validation_metric self.test_metric_calc = test_metric self.summaryWriter = SummaryWriter(log_dir=tensorboard_logdir) self.track_for_best = track_for_best self.collate_fn = collate_fn self.device = device self.best_track_value = None self.set_best_track_value(self.best_track_value) self.gradient_norm_clip_value = gradient_norm_clip_value self.lr_scheduler = lr_scheduler self.lr_scheduler_is_plateau = isinstance( self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) self.use_wandb = wandb and use_wandb self.sample_proportion = sample_proportion self.label_namespaces = self.datasets_manager.label_namespaces self.datasets_manager.print_stats() if experiment_name is None: hash_ = hashlib.sha1() hash_.update(str(time.time()).encode("utf-8")) digest = hash_.hexdigest() experiment_name = digest[:10] self.experiment_name = experiment_name self.experiment_hyperparams = experiment_hyperparams or {} if self.use_wandb: wandb.init( project="project-scwing", name=self.experiment_name, config=self.experiment_hyperparams, ) if not self.save_dir.is_dir(): self.save_dir.mkdir(parents=True) with open(self.save_dir.joinpath("hyperparams.json"), "w") as fp: json.dump(self.experiment_hyperparams, fp) self.num_workers = 1 self.model.to(self.device) self.train_loader = self.get_loader(self.train_dataset) self.validation_loader = self.get_loader(self.validation_dataset) self.test_loader = self.get_loader(self.test_dataset) # refresh the iters at the beginning of every epoch self.train_iter = None self.validation_iter = None self.test_iter = None # initializing loss meters self.train_loss_meter = LossMeter() self.validation_loss_meter = LossMeter() self.msg_printer.divider("ENGINE STARTING") time.sleep(3) # get the loggers ready self.train_log_filename = self.save_dir.joinpath("train.log") self.validation_log_filename = self.save_dir.joinpath("validation.log") self.test_log_filename = self.save_dir.joinpath("test.log") self.train_logger = logzero.setup_logger( name="train-logger", logfile=self.train_log_filename, level=logging.INFO) self.validation_logger = logzero.setup_logger( 
name="valid-logger", logfile=self.validation_log_filename, level=logging.INFO, ) self.test_logger = logzero.setup_logger(name="test-logger", logfile=self.test_log_filename, level=logging.INFO) if self.lr_scheduler_is_plateau: if self.best_track_value == "loss" and self.lr_scheduler.mode == "max": self.msg_printer.warn( "You are optimizing loss and lr schedule mode is max instead of min" ) if (self.best_track_value == "macro_fscore" or self.best_track_value == "fscore" and self.lr_scheduler.mode == "min"): self.msg_printer.warn( f"You are optimizing for macro_fscore and lr scheduler mode is min instead of max" ) if (self.best_track_value == "micro_fscore" and self.lr_scheduler.mode == "min"): self.msg_printer.warn( f"You are optimizing for micro_fscore and lr scheduler mode is min instead of max" ) def get_loader(self, dataset: Dataset) -> DataLoader: """ Returns the DataLoader for the Dataset Parameters ---------- dataset : Dataset Returns ------- DataLoader A pytorch DataLoader """ dataset_size = len(dataset) sample_size = int(np.floor(dataset_size * self.sample_proportion)) indices = np.random.choice(range(dataset_size), size=sample_size, replace=False) sampler = SubsetRandomSampler(indices=indices) loader = DataLoader( dataset=dataset, batch_size=self.batch_size, num_workers=self.num_workers, collate_fn=self.collate_fn, pin_memory=True, sampler=sampler, ) return loader def is_best_lower(self, current_best=None): """ Returns True if the current value of the metric is lower than the best metric. This is useful for tracking metrics like loss where, lower the value, the better it is Parameters ---------- current_best : float The current value for the metric that is being tracked Returns ------- bool """ return True if current_best < self.best_track_value else False def is_best_higher(self, current_best=None): """ Returns ``True`` if the current value of the metric is HIGHER than the best metric. 
This is useful for tracking metrics like FSCORE where, higher the value, the better it is Parameters ---------- current_best : float The current value for the metric that is being tracked Returns ------- bool """ return True if current_best >= self.best_track_value else False def set_best_track_value(self, current_best=None): """ Set the best value of the value being tracked Parameters ---------- current_best : float The current value that is best Returns ------- """ if self.track_for_best == "loss": self.best_track_value = np.inf if current_best is None else current_best elif self.track_for_best == "macro_fscore" or self.track_for_best == "fscore": self.best_track_value = 0 if current_best is None else current_best elif self.track_for_best == "micro_fscore": self.best_track_value = 0 if current_best is None else current_best def run(self): """ Run the engine :return: """ for epoch_num in range(self.num_epochs): self.train_epoch(epoch_num) self.validation_epoch(epoch_num) self.test_epoch(epoch_num) def train_epoch(self, epoch_num: int): """ Run the training for one epoch :param epoch_num: type: int The current epoch number """ # refresh everything necessary before training begins num_iterations = 0 train_iter = self.get_iter(self.train_loader) self.train_loss_meter.reset() self.train_metric_calc.reset() self.model.train() self.msg_printer.info( f"Starting Training Epoch: {epoch_num+1}/{self.num_epochs}") while True: try: # N*T, N * 1, N * 1 lines_labels = next(train_iter) lines_labels = list(zip(*lines_labels)) lines = lines_labels[0] labels = lines_labels[1] batch_size = len(lines) model_forward_out = self.model( lines=lines, labels=labels, is_training=True, is_validation=False, is_test=False, ) self.train_metric_calc.calc_metric( lines=lines, labels=labels, model_forward_dict=model_forward_out) try: self.optimizer.zero_grad() loss = model_forward_out["loss"] loss.backward() torch.nn.utils.clip_grad_norm_( self.model.parameters(), max_norm=self.gradient_norm_clip_value) self.optimizer.step() self.train_loss_meter.add_loss(loss.item(), batch_size) except KeyError: self.msg_printer.fail( "The model output dictionary does not have " "a key called loss. Please check to have " "loss in the model output") num_iterations += 1 if (num_iterations + 1) % self.log_train_metrics_every == 0: metrics = self.train_metric_calc.report_metrics() for label_namespace, table in metrics.items(): self.msg_printer.divider( text=f"Train Metrics for {label_namespace.upper()}" ) print(table) except StopIteration: self.train_epoch_end(epoch_num) break def train_epoch_end(self, epoch_num: int): """ Performs house-keeping at the end of a training epoch At the end of the training epoch, it does some house-keeping. It reports the average loss, the average metric and other information. 
Parameters ---------- epoch_num : int The current epoch number (0 based) """ self.msg_printer.divider(f"Training end @ Epoch {epoch_num + 1}") average_loss = self.train_loss_meter.get_average() self.msg_printer.text("Average Loss: {0}".format(average_loss)) self.train_logger.info( f"Average loss @ Epoch {epoch_num+1} - {average_loss}") metric = self.train_metric_calc.get_metric() if self.use_wandb: wandb.log({"train_loss": average_loss}, step=epoch_num + 1) if self.track_for_best != "loss": for label_namespace in self.label_namespaces: wandb.log( { f"train_{self.track_for_best}_{label_namespace}": metric[label_namespace][self.track_for_best] }, step=epoch_num + 1, ) # save the model after every `self.save_every` epochs if (epoch_num + 1) % self.save_every == 0: torch.save( { "epoch_num": epoch_num, "optimizer_state": self.optimizer.state_dict(), "model_state": self.model.state_dict(), "loss": average_loss, }, self.save_dir.joinpath(f"model_epoch_{epoch_num+1}.pt"), ) # log loss to tensor board self.summaryWriter.add_scalars( "train_validation_loss", {"train_loss": average_loss or np.inf}, epoch_num + 1, ) def validation_epoch(self, epoch_num: int): """ Runs one validation epoch on the validation dataset Parameters ---------- epoch_num : int 0-based epoch number """ self.model.eval() valid_iter = iter(self.validation_loader) self.validation_loss_meter.reset() self.validation_metric_calc.reset() self.msg_printer.info( f"Starting Validation Epoch: {epoch_num + 1}/{self.num_epochs}") while True: try: lines_labels = next(valid_iter) lines_labels = list(zip(*lines_labels)) lines = lines_labels[0] labels = lines_labels[1] batch_size = len(lines) with torch.no_grad(): model_forward_out = self.model( lines=lines, labels=labels, is_training=False, is_validation=True, is_test=False, ) loss = model_forward_out["loss"] self.validation_loss_meter.add_loss(loss, batch_size) self.validation_metric_calc.calc_metric( lines=lines, labels=labels, model_forward_dict=model_forward_out) except StopIteration: self.validation_epoch_end(epoch_num) break def validation_epoch_end(self, epoch_num: int): """Performs house-keeping at the end of validation epoch Parameters ---------- epoch_num : int The current epoch number """ self.msg_printer.divider(f"Validation @ Epoch {epoch_num+1}") metric_report = self.validation_metric_calc.report_metrics() average_loss = self.validation_loss_meter.get_average() for label_namespace, table in metric_report.items(): self.msg_printer.divider( text=f"Validation Metrics for {label_namespace.upper()}") print(table) self.msg_printer.text(f"Average Loss: {average_loss}") self.validation_logger.info( f"Validation Loss @ Epoch {epoch_num+1} - {average_loss}") if self.use_wandb: wandb.log({"validation_loss": average_loss}, step=epoch_num + 1) metric = self.validation_metric_calc.get_metric() if self.track_for_best != "loss": for label_namespace in self.label_namespaces: wandb.log( { f"validation_{self.track_for_best}_{label_namespace}": metric[label_namespace][self.track_for_best] }, step=epoch_num + 1, ) self.summaryWriter.add_scalars( "train_validation_loss", {"validation_loss": average_loss or np.inf}, epoch_num + 1, ) is_best: bool = None value_tracked: str = None if self.track_for_best == "loss": value_tracked = average_loss is_best = self.is_best_lower(average_loss) elif (self.track_for_best == "micro_fscore" or self.track_for_best == "macro_fscore" or self.track_for_best == "fscore"): # If there are multiple namespaces for the metric # we decide the best model based on the average 
score values_tracked = [] metrics = self.validation_metric_calc.get_metric() for label_namespace in self.label_namespaces: value_tracked = metrics[label_namespace][self.track_for_best] values_tracked.append(value_tracked) value_tracked = sum(values_tracked) / len(values_tracked) is_best = self.is_best_higher(current_best=value_tracked) if self.lr_scheduler is not None: self.lr_scheduler.step(value_tracked) if is_best: self.set_best_track_value(current_best=value_tracked) self.msg_printer.good(f"Found Best Model @ epoch {epoch_num + 1}") torch.save( { "epoch_num": epoch_num, "optimizer_state": self.optimizer.state_dict(), "model_state": self.model.state_dict(), "loss": average_loss, }, self.save_dir.joinpath("best_model.pt"), ) def test_epoch(self, epoch_num: int): """Runs the test epoch for ``epoch_num`` Loads the best model that is saved during the training and runs the test dataset. Parameters ---------- epoch_num : int zero based epoch number for which the test dataset is run This is after the last training epoch. """ self.msg_printer.divider("Running on Test Batch") self.load_model_from_file(self.save_dir.joinpath("best_model.pt")) self.model.eval() test_iter = iter(self.test_loader) while True: try: lines_labels = next(test_iter) lines_labels = list(zip(*lines_labels)) lines = lines_labels[0] labels = lines_labels[1] with torch.no_grad(): model_forward_out = self.model( lines=lines, labels=labels, is_training=False, is_validation=False, is_test=True, ) self.test_metric_calc.calc_metric( lines=lines, labels=labels, model_forward_dict=model_forward_out) except StopIteration: self.test_epoch_end(epoch_num) break def test_epoch_end(self, epoch_num: int): """ Performs house-keeping at the end of the test epoch It reports the metric that is being traced at the end of the test epoch Parameters ---------- epoch_num : int Epoch num after which the test dataset is run """ metric_report = self.test_metric_calc.report_metrics() for label_namespace, table in metric_report.items(): self.msg_printer.divider( text=f"Test Metrics for {label_namespace.upper()}") print(table) precision_recall_fmeasure = self.test_metric_calc.get_metric() self.msg_printer.divider(f"Test @ Epoch {epoch_num+1}") self.test_logger.info( f"Test Metrics @ Epoch {epoch_num+1} - {precision_recall_fmeasure}" ) if self.use_wandb: wandb.log({"test_metrics": str(precision_recall_fmeasure)}) self.summaryWriter.close() def get_train_dataset(self): """ Returns the train dataset of the experiment Returns ------- Dataset Anything that conforms to the pytorch style dataset. """ return self.train_dataset def get_validation_dataset(self): """ Returns the validation dataset of the experiment Returns ------- Dataset Anything that conforms to the pytorch style dataset. """ return self.validation_dataset def get_test_dataset(self): """ Returns the test dataset of the experiment Returns ------- Dataset Anything that conforms to the pytorch style dataset. """ return self.test_dataset @staticmethod def get_iter(loader: DataLoader) -> Iterator: """ Returns the iterator for a pytorch data loader. The ``loader`` is a pytorch DataLoader that iterates over the dataset in batches and employs many strategies to do so. We want an iterator that returns the dataset in batches. The end of the iterator would signify the end of an epoch and then we can use that information to perform house-keeping. 
Parameters ---------- loader : DataLoader a pytorch data loader Returns ------- Iterator An iterator over the data loader """ iterator = iter(loader) return iterator def load_model_from_file(self, filename: str): self.msg_printer.divider("LOADING MODEL FROM FILE") with self.msg_printer.loading( f"Loading Pytorch Model from file {filename}"): model_chkpoint = torch.load(filename) self.msg_printer.good("Finished Loading the Model") model_state = model_chkpoint["model_state"] self.model.load_state_dict(model_state) def _set_seeds(self): seed = self.seeds.get("random_seed", 17290) numpy_seed = self.seeds.get("numpy_seed", 1729) torch_seed = self.seeds.get("pytorch_seed", 172) if seed is not None: random.seed(seed) if numpy_seed is not None: np.random.seed(numpy_seed) if torch_seed is not None: torch.manual_seed(torch_seed) # Seed all GPUs with the same seed if available. if torch.cuda.is_available(): torch.cuda.manual_seed_all(torch_seed)
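To show how the constructor arguments fit together, here is a hedged wiring sketch; the model, datasets manager and metric classes are placeholders standing in for real SciWING components, and the directory name is illustrative.

import torch
import torch.optim as optim

# Placeholders: any nn.Module, DatasetsManager and BaseMetric implementations
# from the surrounding project would slot in here.
model = MyTaggerModel()
datasets_manager = MyDatasetsManager()

engine = Engine(
    model=model,
    datasets_manager=datasets_manager,
    optimizer=optim.Adam(model.parameters(), lr=1e-3),
    batch_size=32,
    save_dir="experiments/run_001",
    num_epochs=10,
    save_every=5,                    # checkpoint every 5 epochs
    log_train_metrics_every=50,      # report train metrics every 50 iterations
    train_metric=MyFscoreMetric(datasets_manager),
    validation_metric=MyFscoreMetric(datasets_manager),
    test_metric=MyFscoreMetric(datasets_manager),
    track_for_best="macro_fscore",   # best model = highest averaged macro F1
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
# Each epoch runs train and validation; the test pass reloads best_model.pt.
engine.run()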