Example #1
def print_pipe_analysis(
    analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
    *,
    keys: List[str] = DEFAULT_KEYS,
) -> None:
    """Print a formatted version of the pipe analysis produced by analyze_pipes.

    analysis (Dict[str, Dict[str, Union[List[str], Dict]]]): The analysis.
    keys (List[str]): The meta keys to show in the table.
    """
    msg.divider("Pipeline Overview")
    header = ["#", "Component", *[key.capitalize() for key in keys]]
    summary: ItemsView = analysis["summary"].items()
    body = [[i, n, *m.values()] for i, (n, m) in enumerate(summary)]
    msg.table(body, header=header, divider=True, multiline=True)
    n_problems = sum(len(p) for p in analysis["problems"].values())
    if any(p for p in analysis["problems"].values()):
        msg.divider(f"Problems ({n_problems})")
        for name, problem in analysis["problems"].items():
            if problem:
                msg.warn(
                    f"'{name}' requirements not met: {', '.join(problem)}")
    else:
        msg.good("No problems found.")
Example #2
def evaluate_model(model, eval_path):
    """
    Evaluate a trained model on Prodigy annotations and print the accuracy.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    data, _ = format_data(srsly.read_jsonl(eval_path))
    sc = nlp.evaluate(data)
    result = [("F-Score", f"{sc.textcat_score:.3f}")]
    msg.table(result)
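Note that wasabi's msg.loading is a context manager: the animated indicator runs while the with-block executes, as in this minimal standalone sketch:

import time
from wasabi import msg

with msg.loading("Working..."):
    time.sleep(1)  # stand-in for a slow call such as spacy.load(model)
msg.good("Done")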
Example #3
def validate() -> None:
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
    incompat_models = {d["name"] for d in model_pkgs.values() if not d["compat"]}
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if model_pkgs:
        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            if data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(data["version"],
                                   color="green",
                                   no_print=True)
            else:
                version = msg.text(data["version"],
                                   color="yellow",
                                   no_print=True)
                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)
    else:
        msg.text("No pipeline packages found in your current environment.",
                 exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
Example #4
def info(model=None, markdown=False, silent=False):
    """
    Print info about spaCy installation. If a model shortcut link is
    specified as an argument, print model information. Flag --markdown
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    if model:
        if util.is_package(model):
            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail("Can't find model meta.json", meta_path, exits=1)
        meta = srsly.read_json(meta_path)
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
        else:
            meta["source"] = path2str(model_path)
        if not silent:
            title = "Info about model '{}'".format(model)
            model_meta = {
                k: v
                for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    data = {
        "spaCy version": about.__version__,
        "Location": path2str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": list_models(),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
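A hedged usage sketch; the package name is a placeholder for any installed model:

meta = info("en_core_web_sm")  # prints the model's meta table and returns the dict
info(markdown=True)  # prints general install info as Markdown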
Example #5
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
    """Print all span characteristics into a table"""
    headers = ("Span Type", "Length", "SD", "BD")
    # SD = span distinctiveness, BD = boundary distinctiveness
    # Prepare table data with all span characteristics
    table_data = [
        span_characteristics["lengths"],
        span_characteristics["sd"],
        span_characteristics["bd"],
    ]
    table = _format_span_row(span_data=table_data,
                             labels=span_characteristics["labels"])
    # Prepare table footer with weighted averages
    footer_data = [
        span_characteristics["avg_length"],
        span_characteristics["avg_sd"],
        span_characteristics["avg_bd"],
    ]
    footer = ["Wgt. Average"] + [str(round(f, 2)) for f in footer_data]
    msg.table(table, footer=footer, header=headers, divider=True)
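_format_span_row is not shown here; a hypothetical sketch of its shape, assuming each entry in span_data maps a label to a numeric value:

def _format_span_row(span_data, labels):
    # One display row per label: (label, length, SD, BD), rounded to 2 decimals.
    return [
        (label, *[str(round(metrics[label], 2)) for metrics in span_data])
        for label in labels
    ]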
Example #6
def debug_config(
    config_path: Path,
    *,
    overrides: Dict[str, Any] = {},
    show_funcs: bool = False,
    show_vars: bool = False,
):
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(config)
        config = nlp.config.interpolate()
    msg.divider("Config validation for [initialize]")
    with show_validation_error(config_path):
        T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    msg.divider("Config validation for [training]")
    with show_validation_error(config_path):
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        dot_names = [T["train_corpus"], T["dev_corpus"]]
        util.resolve_dot_names(config, dot_names)
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        head = ("Variable", "Value")
        msg.table(variables,
                  header=head,
                  divider=True,
                  widths=(41, 34),
                  spacing=2)
    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": f"{func['file']} (line {func['line_no']})",
            }
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())
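A minimal invocation sketch; the config path is a placeholder:

from pathlib import Path

# Hypothetical call: validate config.cfg and also list its variables
# and registered functions.
debug_config(Path("config.cfg"), show_funcs=True, show_vars=True)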
Example #7
def wps(model, data):
    """
    Measure the processing speed in words per second. It's recommended to
    use a larger corpus of raw text here (e.g. a few million words).
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (eg["text"] for eg in srsly.read_jsonl(data))
    n_docs = 0
    n_words = 0
    start_time = timer()
    for doc in nlp.pipe(texts):
        n_docs += 1
        n_words += len(doc)
    end_time = timer()
    wps = int(n_words / (end_time - start_time))
    result = [
        ("Docs", f"{n_docs:,}"),
        ("Words", f"{n_words:,}"),
        ("Words/s", f"{wps:,}"),
    ]
    msg.table(result, widths=(7, 12), aligns=("l", "r"))
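A hedged sketch of preparing input for wps; the file and model names are placeholders, and each JSONL record needs a "text" key to match the generator above:

import srsly

# Hypothetical data file: one {"text": ...} record per line.
srsly.write_jsonl("texts.jsonl", [{"text": "This is a sentence."}] * 1000)
wps("en_core_web_sm", "texts.jsonl")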
Example #8
    def main(self, args: DeviceInfoArguments) -> int:
        device_info = self.get_client().device_info(args.device_name)
        if device_info.device_type is None:
            msg.fail(
                f"Unknown device {args.device_name}",
                text="See `labby devices` for a list of available devices.",
            )
            return 1

        msg.divider(
            f"{args.device_name} ({device_info.device_type.friendly_name})")

        if device_info.is_connected:
            msg.table([
                ("Connection", render.good("OK")),
                *self._render_device_info(device_info),
            ])
        else:
            msg.table([("Connection", render.fail("Error"))])
            msg.text(f"{color(device_info.error_type, bold=True)}: " +
                     f"{device_info.error_message}")

        return 0
Example #9
def cli_print_problems(environment: str, difficulty: str, number: int):
    """Print a set of generated problems from a given environment.

    This is useful when developing new environment types, to verify
    that the problems you're generating take the form you expect."""
    import gym
    from mathy_envs.gym import MathyGymEnv

    env_name = f"mathy-{environment}-{difficulty}-v0"
    env: MathyGymEnv = gym.make(env_name)  # type:ignore
    msg.divider(env_name)
    with msg.loading(f"Generating {number} problems..."):
        header = ("Complexity", "Is Valid", "Text")
        widths = (10, 8, 62)
        aligns = ("c", "c", "l")
        data = []
        for i in range(number):
            state, problem = env.mathy.get_initial_state(env.env_problem_args,
                                                         print_problem=False)
            valid = False
            text = problem.text
            try:
                env.mathy.parser.parse(problem.text)
                valid = True
            except BaseException as error:
                text = f"parse failed for '{problem.text}' with error: {error}"
            data.append((
                problem.complexity,
                "✔" if valid else "✘",
                text,
            ))
    msg.good(f"\nGenerated {number} problems!")

    # msg.table prints the table itself; no need to wrap it in print()
    msg.table(data,
              header=header,
              divider=True,
              widths=widths,
              aligns=aligns)
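A hypothetical invocation; the environment and difficulty names are assumptions plugged into the mathy-{environment}-{difficulty}-v0 pattern above:

cli_print_problems("poly", "easy", 10)  # builds "mathy-poly-easy-v0", prints 10 problems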
Example #10
def print_run_help(project_dir: Path,
                   subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        if subcommand in commands:
            help_text = commands[subcommand].get("help")
            if help_text:
                print(f"\n{help_text}\n")
        elif subcommand in workflows:
            steps = workflows[subcommand]
            print(f"\nWorkflow consisting of {len(steps)} commands:")
            steps_data = [(f"{i + 1}. {step}", commands[step].get("help", ""))
                          for i, step in enumerate(steps)]
            msg.table(steps_data)
            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
            print(f"For command details, run: {help_cmd}")
    else:
        print("")
        title = config.get("title")
        if title:
            print(f"{title}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            msg.table([(cmd["name"], cmd.get("help", ""))
                       for cmd in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps))
                       for name, steps in workflows.items()])
Example #11
def evaluate(
    model,
    data_path,
    gpu_id=-1,
    gold_preproc=False,
    displacy_path=None,
    displacy_limit=25,
    return_scores=False,
):
    """
    Evaluate a model. To render a sample of parses in an HTML file, set an
    output directory as the displacy_path argument.
    """
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found",
                 displacy_path,
                 exits=1)
    corpus = GoldCorpus(data_path, data_path)
    if model.startswith("blank:"):
        nlp = spacy.blank(model.replace("blank:", ""))
    else:
        nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
    results = {
        "Time": "%.2f s" % (end - begin),
        "Words": nwords,
        "Words/s": "%.0f" % (nwords / (end - begin)),
        "TOK": "%.2f" % scorer.token_acc,
        "POS": "%.2f" % scorer.tags_acc,
        "UAS": "%.2f" % scorer.uas,
        "LAS": "%.2f" % scorer.las,
        "NER P": "%.2f" % scorer.ents_p,
        "NER R": "%.2f" % scorer.ents_r,
        "NER F": "%.2f" % scorer.ents_f,
        "Textcat": "%.2f" % scorer.textcat_score,
    }
    msg.table(results, title="Results")

    if displacy_path:
        docs, golds = zip(*dev_docs)
        render_deps = "parser" in nlp.meta.get("pipeline", [])
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
        )
        msg.good("Generated {} parses as HTML".format(displacy_limit),
                 displacy_path)
    if return_scores:
        return scorer.scores
Example #12
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {name for name, d in model_links.items() if not d["compat"]}
    incompat_models = {d["name"] for d in model_pkgs.values() if not d["compat"]}
    incompat_models.update(
        d["name"] for d in model_links.values() if not d["compat"])
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text("The following models are not available for spaCy "
                 "v{}: {}".format(about.__version__, ", ".join(na_models)))
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path())))
    if incompat_models or incompat_links:
        sys.exit(1)
Example #13
def speech_to_text_demo(asr: "ASROnlineAudioFrame") -> None:
    """Speech to Text (ASR) Microphone Demo.

    Interrupt the notebook's kernel to stop the app from recording.
    """
    asr.reset()
    audio = pyaudio.PyAudio()

    offset = {"count": 0}
    columns = []
    devices = []

    for idx in range(audio.get_device_count()):
        device = audio.get_device_info_by_index(idx)
        if not device.get("maxInputChannels"):
            continue
        devices.append(idx)
        columns.append((idx, device.get("name")))

    if columns:
        msg.good("Found the following input devices!")
        msg.table(columns, header=("ID", "Devices"), divider=True)

    if devices:
        device_index = -2
        while device_index not in devices:
            msg.info("Please enter the device ID")
            device_index = int(input())

        def callback(in_data, frame_count, time_info, status):
            signal = np.frombuffer(in_data, dtype=np.int16)
            text = asr.transcribe(signal)
            if text:
                print(text, end="")
                offset["count"] = asr.params.offset
            elif offset["count"] > 0:
                offset["count"] -= 1
                if offset["count"] == 0:
                    print(" ", end="")
            return (in_data, pyaudio.paContinue)

        stream = audio.open(
            input=True,
            format=pyaudio.paInt16,
            input_device_index=device_index,
            stream_callback=callback,
            channels=asr.params.channels,
            rate=asr.params.sample_rate,
            frames_per_buffer=asr.chunk_size,
        )

        msg.loading("Listening...")
        stream.start_stream()

        try:
            while stream.is_active():
                time.sleep(0.1)
        except (KeyboardInterrupt, Exception) as e:
            stream.stop_stream()
            stream.close()
            audio.terminate()
            msg.warn("WARNING: ASR stream stopped.", e)
    else:
        msg.fail("ERROR", "No audio input device found.")
Example #14
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exist if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
        train_dataset, factory_names, nlp, make_proj=False
    )
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
    frozen_components = T["frozen_components"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "spancat" in factory_names:
        model_labels_spancat = _get_labels_from_spancat(nlp)
        has_low_data_warning = False
        has_no_neg_warning = False

        msg.divider("Span Categorization")
        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)

        msg.text("Label counts in train data: ", show=verbose)
        for spans_key, data_labels in gold_train_data["spancat"].items():
            msg.text(
                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
                show=verbose,
            )
        # Data checks: only take the spans keys in the actual spancat components
        data_labels_in_component = {
            spans_key: gold_train_data["spancat"][spans_key]
            for spans_key in model_labels_spancat.keys()
        }
        for spans_key, data_labels in data_labels_in_component.items():
            for label, count in data_labels.items():
                # Check for missing labels
                spans_key_in_model = spans_key in model_labels_spancat.keys()
                if (spans_key_in_model) and (
                    label not in model_labels_spancat[spans_key]
                ):
                    msg.warn(
                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
                        "Performance may degrade after training."
                    )
                # Check for low number of examples per label
                if count <= NEW_LABEL_THRESHOLD:
                    msg.warn(
                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
                    )
                    has_low_data_warning = True
                # Check for negative examples
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label, "spancat", spans_key
                    )
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if has_low_data_warning:
            msg.text(
                f"To train a new span type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        else:
            msg.good("Good amount of examples for all labels")

        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of spans "
                "in context, as well as examples without a given span "
                "type.",
                show=verbose,
            )
        else:
            msg.good("Examples without ocurrences available for all labels")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        labels_with_counts = [
            (label, count)
            for label, count in label_counts.most_common()
            if label != "-"
        ]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_dataset, label, "ner")
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes."
                )
        else:
            msg.warn(
                "The train data contains only instances with "
                "mutually-exclusive classes. You can potentially use the "
                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences."
            )

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' "
                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (
                gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                and DELIMITER in label
            ):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}"
                )

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance."
            )
            msg.warn(
                "Projectivized labels with low numbers of examples:",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_nonproj']} nonprojective "
                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
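The summary above relies on wasabi's Printer keeping per-type counters; a minimal standalone sketch of that bookkeeping:

from wasabi import MESSAGES, Printer

msg = Printer()
msg.good("Check passed")
msg.warn("Something to review")
print(msg.counts[MESSAGES.GOOD], msg.counts[MESSAGES.WARN])  # 1 1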