def print_pipe_analysis(
    analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
    *,
    keys: List[str] = DEFAULT_KEYS,
) -> None:
    """Render the result of analyze_pipes as a table plus a problem report.

    analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
        Its "summary" entry supplies one table row per component and its
        "problems" entry maps component names to their unmet requirements.
    keys (List[str]): The meta keys used as (capitalized) column headers.
    """
    msg.divider("Pipeline Overview")
    header = ["#", "Component"]
    header.extend(key.capitalize() for key in keys)
    body = []
    for index, (component, meta) in enumerate(analysis["summary"].items()):
        # One row per component: position, name, then every meta value.
        body.append([index, component, *meta.values()])
    msg.table(body, header=header, divider=True, multiline=True)

    problems = analysis["problems"]
    n_problems = sum(len(unmet) for unmet in problems.values())
    has_problems = any(unmet for unmet in problems.values())
    if not has_problems:
        msg.good("No problems found.")
        return
    msg.divider(f"Problems ({n_problems})")
    for name, unmet in problems.items():
        if not unmet:
            continue
        msg.warn(f"'{name}' requirements not met: {', '.join(unmet)}")
def evaluate_model(model, eval_path):
    """
    Load a trained model, score it on the JSONL annotations at eval_path and
    print the text classification F-score as a one-row table.
    """
    loading_text = f"Loading model '{model}'..."
    with msg.loading(loading_text):
        nlp = spacy.load(model)
    # format_data returns a pair; only the first element is evaluated here.
    examples, _ = format_data(srsly.read_jsonl(eval_path))
    scorer = nlp.evaluate(examples)
    f_score = f"{scorer.textcat_score:.3f}"
    msg.table([("F-Score", f_score)])
def validate() -> None:
    """Report installed pipeline packages and their spaCy compatibility.

    Prints a table of installed packages, suggests download commands for
    incompatible packages that have a compatible release, lists the ones
    that do not, and exits with status 1 if any package is incompatible.
    """
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")

    incompat_models = {
        pkg_data["name"] for pkg_data in model_pkgs.values() if not pkg_data["compat"]
    }
    # Split the incompatible packages into "updatable" and "not available".
    na_models = []
    update_models = []
    for pkg_name in incompat_models:
        if pkg_name in current_compat:
            update_models.append(pkg_name)
        else:
            na_models.append(pkg_name)
    spacy_dir = Path(__file__).parent.parent

    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if not model_pkgs:
        msg.text("No pipeline packages found in your current environment.", exits=0)
    else:
        rows = []
        for pkg_data in model_pkgs.values():
            if pkg_data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(pkg_data["version"], color="green", no_print=True)
            else:
                version = msg.text(pkg_data["version"], color="yellow", no_print=True)
                comp = f"--> {current_compat.get(pkg_data['name'], ['n/a'])[0]}"
            rows.append((pkg_data["name"], pkg_data["spacy"], version, comp))
        msg.table(rows, header=("NAME", "SPACY", "VERSION", ""))

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
def info(model=None, markdown=False, silent=False):
    """
    Print info about the spaCy installation. If a model (package name or
    shortcut link) is specified as an argument, print and return that model's
    meta information instead. Flag --markdown prints details in Markdown for
    easy copy-pasting to GitHub issues; silent suppresses printing and only
    returns the collected data.
    """
    if not model:
        data = {
            "spaCy version": about.__version__,
            "Location": path2str(Path(__file__).parent.parent),
            "Platform": platform.platform(),
            "Python version": platform.python_version(),
            "Models": list_models(),
        }
        if not silent:
            title = "Info about spaCy"
            if markdown:
                print_markdown(data, title=title)
            else:
                msg.table(data, title=title)
        return data

    if util.is_package(model):
        model_path = util.get_package_path(model)
    else:
        model_path = util.get_data_path() / model
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)

    resolved_path = model_path.resolve()
    if resolved_path == model_path:
        meta["source"] = path2str(model_path)
    else:
        # The given path resolves elsewhere: record both the link and its target.
        meta["link"] = path2str(model_path)
        meta["source"] = path2str(resolved_path)

    if not silent:
        title = "Info about model '{}'".format(model)
        hidden_keys = ("accuracy", "speed")
        model_meta = {}
        for key, value in meta.items():
            if key not in hidden_keys:
                model_meta[key] = value
        if markdown:
            print_markdown(model_meta, title=title)
        else:
            msg.table(model_meta, title=title)
    return meta
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
    """Print the per-label span characteristics and their averages as a table.

    span_characteristics (Dict[str, Any]): Must provide the keys "lengths",
        "sd", "bd", "labels", "avg_length", "avg_sd" and "avg_bd".
    """
    # Table body: one column group per characteristic, formatted per label
    columns = [span_characteristics[key] for key in ("lengths", "sd", "bd")]
    rows = _format_span_row(span_data=columns, labels=span_characteristics["labels"])
    # Table footer: the averages, rounded to two decimals
    averages = [
        span_characteristics[key] for key in ("avg_length", "avg_sd", "avg_bd")
    ]
    footer = ["Wgt. Average"]
    for average in averages:
        footer.append(str(round(average, 2)))
    msg.table(
        rows,
        footer=footer,
        header=("Span Type", "Length", "SD", "BD"),
        divider=True,
    )
def debug_config(
    config_path: Path,
    *,
    overrides: Dict[str, Any] = {},
    show_funcs: bool = False,
    show_vars: bool = False,
):
    """Validate a config file and optionally list its variables and functions.

    config_path (Path): Path to the config file to load.
    overrides (Dict[str, Any]): Overrides passed on to util.load_config.
    show_funcs (bool): Also print every registered function used in the config.
    show_vars (bool): Also print a table of the config's variables.
    """
    msg.divider("Config validation")
    with show_validation_error(config_path):
        loaded = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(loaded)
        config = nlp.config.interpolate()

    msg.divider("Config validation for [initialize]")
    with show_validation_error(config_path):
        # Resolved only for validation; the result itself is not needed.
        registry.resolve(config["initialize"], schema=ConfigSchemaInit)

    msg.divider("Config validation for [training]")
    with show_validation_error(config_path):
        training = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        corpus_names = [training["train_corpus"], training["dev_corpus"]]
        util.resolve_dot_names(config, corpus_names)
    msg.good("Config is valid")

    if show_vars:
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        msg.table(
            variables,
            header=("Variable", "Value"),
            divider=True,
            widths=(41, 34),
            spacing=2,
        )

    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            location = f"{func['file']} (line {func['line_no']})"
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": location,
            }
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())
def wps(model, data):
    """
    Measure the processing speed in words per second and print the document
    count, word count and words/s as a table. It's recommended to use a larger
    corpus of raw text here (e.g. a few million words).
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (record["text"] for record in srsly.read_jsonl(data))

    doc_count = 0
    word_count = 0
    started = timer()
    for doc in nlp.pipe(texts):
        doc_count += 1
        word_count += len(doc)
    elapsed = timer() - started

    words_per_second = int(word_count / elapsed)
    rows = [
        ("Docs", f"{doc_count:,}"),
        ("Words", f"{word_count:,}"),
        ("Words/s", f"{words_per_second:,}"),
    ]
    msg.table(rows, widths=(7, 12), aligns=("l", "r"))
def main(self, args: DeviceInfoArguments) -> int:
    """Print connection and status information for a single device.

    args (DeviceInfoArguments): Parsed arguments; args.device_name selects
        the device to look up through the client.
    RETURNS (int): 1 if the device type is unknown, otherwise 0.
    """
    device_info = self.get_client().device_info(args.device_name)
    if device_info.device_type is None:
        msg.fail(
            f"Unknown device {args.device_name}",
            text="See `labby devices` for a list of available devices.",
        )
        return 1

    # The friendly name must be interpolated; without the braces the literal
    # attribute path was printed in the divider.
    msg.divider(f"{args.device_name} ({device_info.device_type.friendly_name})")
    if device_info.is_connected:
        msg.table(
            [
                ("Connection", render.good("OK")),
                *self._render_device_info(device_info),
            ]
        )
    else:
        msg.table([("Connection", render.fail("Error"))])
        msg.text(
            f"{color(device_info.error_type, bold=True)}: "
            + f"{device_info.error_message}"
        )
    return 0
def cli_print_problems(environment: str, difficulty: str, number: int):
    """Print a set of generated problems from a given environment.

    This is useful when developing new environment types, for verifying that
    the problems you're generating take the form you expect.

    environment (str): Environment name used to build the gym env id.
    difficulty (str): Difficulty name used to build the gym env id.
    number (int): How many problems to generate and list.
    """
    import gym
    from mathy_envs.gym import MathyGymEnv

    env_name = f"mathy-{environment}-{difficulty}-v0"
    env: MathyGymEnv = gym.make(env_name)  # type:ignore
    msg.divider(env_name)

    header = ("Complexity", "Is Valid", "Text")
    widths = (10, 8, 62)
    aligns = ("c", "c", "l")
    data = []
    with msg.loading(f"Generating {number} problems..."):
        for _ in range(number):
            _, problem = env.mathy.get_initial_state(
                env.env_problem_args, print_problem=False
            )
            text = problem.text
            try:
                env.mathy.parser.parse(problem.text)
                valid = True
            except Exception as error:
                # Only ordinary errors mark a problem as invalid; Ctrl-C and
                # SystemExit must still be able to stop the command.
                valid = False
                text = f"parse failed for '{problem.text}' with error: {error}"
            data.append((problem.complexity, "✔" if valid else "✘", text))
    msg.good(f"\nGenerated {number} problems!")

    # A printing Printer.table() writes the table itself and returns None, so
    # only print the result when a rendered string is actually handed back.
    rendered = msg.table(
        data, header=header, divider=True, widths=widths, aligns=aligns
    )
    if rendered is not None:
        print(rendered)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Emulate a CLI help screen from the information in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. With a subcommand,
        the help for that command or workflow is shown. Without one, the
        project title and the available commands and workflows are listed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {}
    for cmd in config_commands:
        commands[cmd["name"]] = cmd
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir

    if not subcommand:
        # Top-level help: title, then commands and workflows if defined.
        print("")
        title = config.get("title")
        if title:
            print(f"{title}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            command_rows = [(cmd["name"], cmd.get("help", "")) for cmd in config_commands]
            msg.table(command_rows)
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            workflow_rows = []
            for name, steps in workflows.items():
                workflow_rows.append((name, " -> ".join(steps)))
            msg.table(workflow_rows)
        return

    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
        return
    if subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = []
        for position, step in enumerate(steps, start=1):
            steps_data.append((f"{position}. {step}", commands[step].get("help", "")))
        msg.table(steps_data)
        help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
def evaluate(
    model,
    data_path,
    gpu_id=-1,
    gold_preproc=False,
    displacy_path=None,
    displacy_limit=25,
    return_scores=False,
):
    """
    Evaluate a model on the data at data_path and print a results table.
    To render a sample of parses in a HTML file, set an output directory as
    the displacy_path argument. If return_scores is set, the scorer's scores
    are returned.
    """
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)

    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)

    corpus = GoldCorpus(data_path, data_path)
    blank_prefix = "blank:"
    if model.startswith(blank_prefix):
        nlp = spacy.blank(model.replace(blank_prefix, ""))
    else:
        nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))

    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    elapsed = end - begin
    nwords = 0
    for doc_gold in dev_docs:
        nwords += len(doc_gold[0])

    results = {
        "Time": "%.2f s" % elapsed,
        "Words": nwords,
        "Words/s": "%.0f" % (nwords / elapsed),
        "TOK": "%.2f" % scorer.token_acc,
        "POS": "%.2f" % scorer.tags_acc,
        "UAS": "%.2f" % scorer.uas,
        "LAS": "%.2f" % scorer.las,
        "NER P": "%.2f" % scorer.ents_p,
        "NER R": "%.2f" % scorer.ents_r,
        "NER F": "%.2f" % scorer.ents_f,
        "Textcat": "%.2f" % scorer.textcat_score,
    }
    msg.table(results, title="Results")

    if displacy_path:
        docs, golds = zip(*dev_docs)
        # Only render what the loaded pipeline can actually produce.
        pipeline = nlp.meta.get("pipeline", [])
        render_deps = "parser" in pipeline
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
        )
        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
    if return_scores:
        return scorer.scores
def validate():
    """
    Check that the installed models (packages and shortcut links) are
    compatible with the currently installed version of spaCy, using the
    online compatibility table. Should be run after `pip install -U spacy`.
    Exits with status 1 if any incompatible model or link is found.
    """
    with msg.loading("Loading compatibility table..."):
        response = requests.get(about.__compatibility__)
        if response.status_code != 200:
            msg.fail(
                "Server error ({})".format(response.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = response.json()["spacy"]
    # Development versions share the table entry of their base version.
    version = about.__version__.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )

    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]

    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {
        link for link, link_data in model_links.items() if not link_data["compat"]
    }
    incompat_models = set()
    for pkg_data in model_pkgs.values():
        if not pkg_data["compat"]:
            incompat_models.add(pkg_data["name"])
    for link_data in model_links.values():
        if not link_data["compat"]:
            incompat_models.add(link_data["name"])
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if not (model_links or model_pkgs):
        msg.text("No models found in your current environment.", exits=0)
    else:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = [
            get_model_row(current_compat, name, data, msg)
            for name, data in model_pkgs.items()
        ]
        rows.extend(
            get_model_row(current_compat, name, data, msg, "link")
            for name, data in model_links.items()
        )
        msg.table(rows, header=header)

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
def speech_to_text_demo(asr: "ASROnlineAudioFrame") -> None:
    """Speech to Text (ASR) Microphone Demo.

    Lists the available input devices, asks for a device ID on stdin, then
    streams microphone audio through `asr.transcribe` and prints the text.
    Interrupt the notebook's kernel to stop the app from recording.

    asr (ASROnlineAudioFrame): The streaming recognizer. It is reset first,
        and its `params` and `chunk_size` configure the audio stream.
    """
    asr.reset()
    audio = pyaudio.PyAudio()
    try:
        offset = {"count": 0}
        columns = []
        devices = []
        for idx in range(audio.get_device_count()):
            device = audio.get_device_info_by_index(idx)
            # Skip devices that report no input channels.
            if not device.get("maxInputChannels"):
                continue
            devices.append(idx)
            columns.append((idx, device.get("name")))

        if not devices:
            msg.fail("ERROR", "No audio input device found.")
            return

        msg.good("Found the following input devices!")
        msg.table(columns, header=("ID", "Devices"), divider=True)

        device_index = None
        while device_index not in devices:
            msg.info("Please enter the device ID")
            try:
                device_index = int(input())
            except ValueError:
                # Not a number: ask again instead of crashing.
                device_index = None

        def callback(in_data, frame_count, time_info, status):
            signal = np.frombuffer(in_data, dtype=np.int16)
            text = asr.transcribe(signal)
            if text:
                print(text, end="")
                offset["count"] = asr.params.offset
            elif offset["count"] > 0:
                # Count down the empty chunks; emit a space when it reaches 0.
                offset["count"] -= 1
                if offset["count"] == 0:
                    print(" ", end="")
            return (in_data, pyaudio.paContinue)

        stream = audio.open(
            input=True,
            format=pyaudio.paInt16,
            input_device_index=device_index,
            stream_callback=callback,
            channels=asr.params.channels,
            rate=asr.params.sample_rate,
            frames_per_buffer=asr.chunk_size,
        )
        try:
            # `msg.loading` is a context manager and does nothing when merely
            # called, so announce the state with a plain message instead.
            msg.info("Listening...")
            stream.start_stream()
            try:
                while stream.is_active():
                    time.sleep(0.1)
            except (KeyboardInterrupt, Exception) as e:
                msg.warn("WARNING: ASR stream stopped.", e)
        finally:
            # Release the stream however the loop ended.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the PortAudio handle, including on early exits.
        audio.terminate()
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    """Analyze the training and development data referenced by a config.

    Loads the pipeline from the config, initializes it with the training
    corpus, and reports statistics and potential problems per component
    (spancat, ner, textcat, textcat_multilabel, tagger, morphologizer,
    parser). Ends with a summary of the message counts and exits with
    status 1 if any failure message was recorded.

    config_path (Path): Path to the config file.
    config_overrides (Dict[str, Any]): Overrides passed to util.load_config.
    ignore_warnings (bool): Passed to the Printer as ignore_warnings.
    verbose (bool): Also show the detail messages (those using show=verbose).
    no_format (bool): Disable pretty formatting (Printer pretty=not no_format).
    silent (bool): Passed to the Printer as no_print.
    """
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
        train_dataset, factory_names, nlp, make_proj=False
    )
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    # Check the dataset itself: the compiled gold data is a dict that always
    # carries its statistic keys, so its length says nothing about the docs.
    if not dev_dataset:
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "spancat" in factory_names:
        model_labels_spancat = _get_labels_from_spancat(nlp)
        has_low_data_warning = False
        has_no_neg_warning = False

        msg.divider("Span Categorization")
        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)

        msg.text("Label counts in train data: ", show=verbose)
        for spans_key, data_labels in gold_train_data["spancat"].items():
            msg.text(
                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
                show=verbose,
            )
        # Data checks: only take the spans keys in the actual spancat components
        data_labels_in_component = {
            spans_key: gold_train_data["spancat"][spans_key]
            for spans_key in model_labels_spancat.keys()
        }
        for spans_key, data_labels in data_labels_in_component.items():
            for label, count in data_labels.items():
                # Check for missing labels
                spans_key_in_model = spans_key in model_labels_spancat.keys()
                if (spans_key_in_model) and (
                    label not in model_labels_spancat[spans_key]
                ):
                    msg.warn(
                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
                        "Performance may degrade after training."
                    )
                # Check for low number of examples per label
                if count <= NEW_LABEL_THRESHOLD:
                    msg.warn(
                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
                    )
                    has_low_data_warning = True
                # Check for negative examples
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label, "spancat", spans_key
                    )
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if has_low_data_warning:
            msg.text(
                f"To train a new span type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        else:
            msg.good("Good amount of examples for all labels")

        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of spans "
                "in context, as well as examples without a given span "
                "type.",
                show=verbose,
            )
        else:
            msg.good("Examples without ocurrences available for all labels")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_dataset, label, "ner")
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes."
                )
        else:
            msg.warn(
                "The train data contains only instances with "
                "mutually-exclusive classes. You can potentially use the "
                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences."
            )

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' "
                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (
                gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                and DELIMITER in label
            ):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}"
                )

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance."
            )
            msg.warn(
                "Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_nonproj']} nonprojective "
                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        # Any recorded failure makes the command exit with a non-zero status.
        sys.exit(1)