Example #1
def collect_cmd(
    spec_file: typer.FileText,
    jobs: int = typer.Option(1, "--jobs", "-j"),
    commit_in: str = typer.Option(
        get_current_commit().hash, "--commit", show_default=True
    ),
    branch: str = typer.Option(get_branch(), "--branch", show_default=True),
) -> None:
    spec = load_spec(spec_file)
    storage = Storage(spec.storage_dir)

    commit = Commit(
        hash=str(commit_in),
        date=get_commit_date(commit_in),
        message=get_commit_message(commit_in),
    )
    parent = storage.get_branch_tip(branch)
    assert commit != parent, "It looks like this commit has already been collected"

    msg.info(f"#jobs: {jobs}")
    msg.info(f"on commit:     {commit}")
    msg.info(f"parent commit: {parent}")

    if jobs > 1:
        msg.warn(
            "If you're running benchmarks from the collect call,"
            " concurrency can affect results"
        )

    assert jobs > 0, "Jobs value must be positive"

    msg.good("Spec loaded successfully")
    msg.divider()

    try:
        results = run_collectors(spec.collectors, jobs=jobs)
    except CollectorError as e:
        msg.fail("Collector returned invalid format")
        typer.echo(str(e.exc))
        return

    msg.good("Collection completed")

    run = Run(
        commit=commit,
        parent=parent,
        branch=branch,
        date=datetime.now(),
        results=sum((r.metrics for r in results), []),
        context={},
    )

    storage.store_run(run)
Example #2
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.sytdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
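The docstring above specifies one JSON object per line with a "text" key. A minimal sketch of preparing such an input, assuming srsly is available (the file name and texts are illustrative):

# Hypothetical helper: write texts in the JSONL shape profile() expects.
import srsly

def write_profile_inputs(texts, path="inputs.jsonl"):  # path is illustrative
    srsly.write_jsonl(path, ({"text": t} for t in texts))

# write_profile_inputs(["First document.", "Second document."])
# profile("en_core_web_sm", inputs="inputs.jsonl", n_texts=2)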
Example #3
def generate_meta(model_path, existing_meta, msg):
    meta = existing_meta or {}
    settings = [
        ("lang", "Model language", meta.get("lang", "en")),
        ("name", "Model name", meta.get("name", "model")),
        ("version", "Model version", meta.get("version", "0.0.0")),
        ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
        ("description", "Model description", meta.get("description", False)),
        ("author", "Author", meta.get("author", False)),
        ("email", "Author email", meta.get("email", False)),
        ("url", "Author website", meta.get("url", False)),
        ("license", "License", meta.get("license", "CC BY-SA 3.0")),
    ]
    nlp = util.load_model_from_path(Path(model_path))
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
        "vectors": len(nlp.vocab.vectors),
        "keys": nlp.vocab.vectors.n_keys,
        "name": nlp.vocab.vectors.name,
    }
    msg.divider("Generating meta.json")
    msg.text(
        "Enter the package settings for your model. The following information "
        "will be read from your model data: pipeline, vectors."
    )
    for setting, desc, default in settings:
        response = get_raw_input(desc, default)
        meta[setting] = default if response == "" and default else response
    if about.__title__ != "spacy":
        meta["parent_package"] = about.__title__
    return meta
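For reference, a hedged sketch of the meta dict generate_meta produces when every prompt is accepted with its default (the concrete values are illustrative; "pipeline" and "vectors" are read from the model data):

# Illustrative shape only, not real output.
example_meta = {
    "lang": "en",
    "name": "model",
    "version": "0.0.0",
    "spacy_version": ">=2.2.0,<3.0.0",  # derived from about.__version__
    "license": "CC BY-SA 3.0",
    "pipeline": ["tagger", "parser", "ner"],       # from nlp.pipe_names
    "vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
}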
Example #4
def print_pipe_analysis(
    analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
    *,
    keys: List[str] = DEFAULT_KEYS,
) -> None:
    """Print a formatted version of the pipe analysis produced by analyze_pipes.

    analysis (Dict[str, Dict[str, Union[List[str], Dict]]]): The analysis.
    keys (List[str]): The meta keys to show in the table.
    """
    msg.divider("Pipeline Overview")
    header = ["#", "Component", *[key.capitalize() for key in keys]]
    summary: ItemsView = analysis["summary"].items()
    body = [[i, n, *[v for v in m.values()]]
            for i, (n, m) in enumerate(summary)]
    msg.table(body, header=header, divider=True, multiline=True)
    n_problems = sum(len(p) for p in analysis["problems"].values())
    if any(p for p in analysis["problems"].values()):
        msg.divider(f"Problems ({n_problems})")
        for name, problem in analysis["problems"].items():
            if problem:
                msg.warn(
                    f"'{name}' requirements not met: {', '.join(problem)}")
    else:
        msg.good("No problems found.")
Example #5
def profile(model: str,
            inputs: Optional[Path] = None,
            n_texts: int = 10000) -> None:
    if inputs is not None:
        texts = _read_inputs(inputs, msg)
        texts = list(itertools.islice(texts, n_texts))
    if inputs is None:
        try:
            import ml_datasets
        except ImportError:
            msg.fail(
                "This command, when run without an input file, "
                "requires the ml_datasets library to be installed: "
                "pip install ml_datasets",
                exits=1,
            )

        with msg.loading("Loading IMDB dataset via ml_datasets..."):
            imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
            texts, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
    with msg.loading(f"Loading pipeline '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded pipeline '{model}'")
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
Example #6
def train(
    config_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
    config_path = util.ensure_path(config_path)
    output_path = util.ensure_path(output_path)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if not output_path:
        msg.info("No output directory provided")
    else:
        if not output_path.exists():
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        msg.info(f"Saving to output directory: {output_path}")
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides, interpolate=False)
    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
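A minimal sketch of calling this wrapper directly; the paths and the override key are assumptions:

# Hypothetical invocation: train on CPU with a single config override.
train(
    "config.cfg",               # config_path; "-" would read the config from stdin per the check above
    output_path="./output",
    use_gpu=-1,                 # -1 selects CPU
    overrides={"training.max_epochs": 1},  # illustrative override key
)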
Example #7
def assemble_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(...,
                            help="Path to config file",
                            exists=True,
                            allow_dash=True),
    output_path: Path = Arg(
        ..., help="Output directory to store assembled pipeline in"),
    code_path: Optional[Path] = Opt(
        None,
        "--code",
        "-c",
        help=
        "Path to Python file with additional code (registered functions) to be imported"
    ),
    verbose: bool = Opt(
        False,
        "--verbose",
        "-V",
        "-VV",
        help="Display more information for debugging purposes"),
    # fmt: on
):
    """
    Assemble a spaCy pipeline from a config file. The config file includes
    all settings for initializing the pipeline. To override settings in the
    config, e.g. settings that point to local paths or that you want to
    experiment with, you can override them as command line options. The
    --code argument lets you pass in a Python file that can be used to
    register custom functions that are referenced in the config.

    DOCS: https://spacy.io/api/cli#assemble
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-"
                           and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    with show_validation_error(config_path):
        config = util.load_config(config_path,
                                  overrides=overrides,
                                  interpolate=False)
    msg.divider("Initializing pipeline")
    nlp = load_model_from_config(config, auto_fill=True)
    config = config.interpolate()
    sourced = get_sourced_components(config)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    with nlp.select_pipes(disable=[*sourced]):
        nlp.initialize()
    msg.good("Initialized pipeline")
    msg.divider("Serializing to disk")
    if output_path is not None and not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory: {output_path}")
    nlp.to_disk(output_path)
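As the docstring notes, extra command-line arguments become config overrides via parse_config_overrides. An illustrative sketch (paths are assumptions):

# Illustrative CLI usage; args after the known options become config overrides:
#   python -m spacy assemble config.cfg ./output --code functions.py --paths.data ./data
overrides = parse_config_overrides(["--paths.data", "./data"])
# -> {"paths.data": "./data"}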
Example #8
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(
                project_dir,
                cmd,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
            )
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)
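A minimal usage sketch; the project directory and command names are assumptions:

# Hypothetical calls: preview one named command, then force-rerun a workflow.
from pathlib import Path

project_run(Path("./my_project"), "preprocess", dry=True)            # print, don't execute
project_run(Path("./my_project"), "all", force=True, capture=False)  # hand control to the commands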
Example #9
    def main(self, args: BaseArgumentParser) -> int:
        list_devices_response = self.get_client().list_devices()
        msg.divider("Registered Devices")
        for device in list_devices_response.devices:
            if device.is_available:
                msg.good(f"{device.name}")
            else:
                msg.fail(f"{device.name}:")
                msg.text(
                    f"  {color(device.error_type, bold=True)}: {device.error_message}"
                )
        return 0
Example #10
def validate() -> None:
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
    incompat_models = {
        d["name"]
        for _, d in model_pkgs.items() if not d["compat"]
    }
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if model_pkgs:
        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            if data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(data["version"],
                                   color="green",
                                   no_print=True)
            else:
                version = msg.text(data["version"],
                                   color="yellow",
                                   no_print=True)
                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)
    else:
        msg.text("No pipeline packages found in your current environment.",
                 exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
Example #11
def project_run(project_dir: Path,
                subcommand: str,
                *,
                force: bool = False,
                dry: bool = False) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    """
    config = load_project_config(project_dir)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
        check_spacy_commit = check_bool_env_var(
            ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            rerun = check_rerun(current_dir,
                                cmd,
                                check_spacy_commit=check_spacy_commit)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry)
                if not dry:
                    update_lockfile(current_dir, cmd)
Example #12
def debug_config(
    config_path: Path,
    *,
    overrides: Dict[str, Any] = {},
    show_funcs: bool = False,
    show_vars: bool = False,
):
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(config)
        config = nlp.config.interpolate()
    msg.divider("Config validation for [initialize]")
    with show_validation_error(config_path):
        T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    msg.divider("Config validation for [training]")
    with show_validation_error(config_path):
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        dot_names = [T["train_corpus"], T["dev_corpus"]]
        util.resolve_dot_names(config, dot_names)
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        head = ("Variable", "Value")
        msg.table(variables,
                  header=head,
                  divider=True,
                  widths=(41, 34),
                  spacing=2)
    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": f"{func['file']} (line {func['line_no']})",
            }
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())
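A minimal sketch of invoking the validator with both optional reports enabled (the config path and override key are assumptions):

# Hypothetical invocation: validate a config and print variables and registered functions.
from pathlib import Path

debug_config(
    Path("config.cfg"),
    overrides={"paths.train": "./corpus/train.spacy"},  # illustrative override
    show_funcs=True,
    show_vars=True,
)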
Example #13
def cli_print_problems(environment: str, difficulty: str, number: int):
    """Print a set of generated problems from a given environment.

    This is useful when developing new environment types, for verifying
    that the problems you're generating take the form you expect.
    """
    import gym
    from mathy_envs.gym import MathyGymEnv

    env_name = f"mathy-{environment}-{difficulty}-v0"
    env: MathyGymEnv = gym.make(env_name)  # type:ignore
    msg.divider(env_name)
    with msg.loading(f"Generating {number} problems..."):
        header = ("Complexity", "Is Valid", "Text")
        widths = (10, 8, 62)
        aligns = ("c", "c", "l")
        data = []
        for i in range(number):
            state, problem = env.mathy.get_initial_state(env.env_problem_args,
                                                         print_problem=False)
            valid = False
            text = problem.text
            try:
                env.mathy.parser.parse(problem.text)
                valid = True
            except BaseException as error:
                text = f"parse failed for '{problem.text}' with error: {error}"
            data.append((
                problem.complexity,
                "✔" if valid else "✘",
                text,
            ))
    msg.good(f"\nGenerated {number} problems!")

    print(
        msg.table(data,
                  header=header,
                  divider=True,
                  widths=widths,
                  aligns=aligns))
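A usage sketch; the environment and difficulty names are assumptions about what mathy_envs registers:

# Hypothetical: print ten generated problems from a polynomial environment.
cli_print_problems("poly", "easy", 10)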
Example #14
def fitted_ann_kb(nlp, entities, aliases):
    kb = AnnKnowledgeBase(nlp.vocab, entity_vector_length=300)

    entity_ids = []
    descriptions = []
    freqs = []
    for e in entities:
        entity_ids.append(e["id"])
        descriptions.append(e.get("description", ""))
        freqs.append(100)

    msg.divider("Apply EntityEncoder")

    with msg.loading("Applying EntityEncoder to descriptions"):
        # get the pretrained entity vectors
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # set the entities, can also be done by calling `kb.add_entity` for each entity
        for entity, freq, emb in zip(entity_ids, freqs, embeddings):
            if not kb.contains_entity(entity):
                kb.add_entity(entity, freq, emb)

        for a in aliases:
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            n_ents = len(ents)
            if n_ents > 0:
                prior_prob = [1.0 / n_ents] * n_ents
                kb.add_alias(alias=a["alias"],
                             entities=ents,
                             probabilities=prior_prob)

    kb.fit_index(verbose=True)

    return kb
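A hedged sketch of the entities/aliases shape this function reads (IDs and texts are hypothetical):

# Hypothetical inputs mirroring the keys used above ("id", "description", "alias", "entities").
entities = [
    {"id": "Q1", "description": "first entity"},
    {"id": "Q2", "description": "second entity"},
]
aliases = [{"alias": "example", "entities": ["Q1", "Q2"]}]
# kb = fitted_ann_kb(nlp, entities, aliases)  # nlp: a loaded spaCy pipeline with vectors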
Example #15
    def main(self, args: DeviceInfoArguments) -> int:
        device_info = self.get_client().device_info(args.device_name)
        if device_info.device_type is None:
            msg.fail(
                f"Unknown device {args.device_name}",
                text="See `labby devices` for a list of available devices.",
            )
            return 1

        msg.divider(
            f"{args.device_name} ({device_info.device_type.friendly_name})")

        if device_info.is_connected:
            msg.table([
                ("Connection", render.good("OK")),
                *self._render_device_info(device_info),
            ])
        else:
            msg.table([("Connection", render.fail("Error"))])
            msg.text(f"{color(device_info.error_type, bold=True)}: " +
                     f"{device_info.error_message}")

        return 0
Example #16
def fix_annotations(
    example: Example,
    corrections: List[Correction],
    case_sensitive: bool = False,
    dryrun: bool = False,
) -> Example:
    """Fix annotations in a copy of List[Example] data.
    
    This function will NOT add annotations to your data.
    It will only remove erroneous annotations and fix the
    labels for specific spans.
    
    Args:
        example (Example): Input Example
        corrections (List[Correction]): List of corrections mapping entity text to a new label.
            If to_label is None, the annotation will be removed
        case_sensitive (bool, optional): Consider case of text for each correction
        dryrun (bool, optional): Treat corrections as a dryrun and just print all changes to be made
    
    Returns:
        Example: Example with fixed annotations
    """

    if not case_sensitive:
        for c in corrections:
            c.annotation = c.annotation.lower()

    corrections_map: Dict[str,
                          Correction] = {c.annotation: c
                                         for c in corrections}
    prints: List[str] = []

    ents_to_remove: List[int] = []
    for i, s in enumerate(example.spans):
        t = s.text if case_sensitive else s.text.lower()

        if t in corrections_map:
            c = corrections_map[t]
            if c.to_label is None and s.label in c.from_labels:
                if dryrun:
                    prints.append(f"Deleting span: {s.text}")
                else:
                    ents_to_remove.append(i)
            elif s.label in c.from_labels or "ANY" in c.from_labels:
                if dryrun:
                    prints.append(
                        f"Correction span: {s.text} from labels: {c.from_labels} to label: {c.to_label}"
                    )
                else:
                    s.label = cast(str, c.to_label)

    # Delete in reverse index order so earlier deletions don't shift later indices
    for idx in reversed(ents_to_remove):
        del example.spans[idx]

    if dryrun:
        msg.divider("Example Text")
        msg.text(example.text)
        for line in prints:
            msg.text(line)

    return example
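A hedged usage sketch, assuming Correction is constructed from the annotation/from_labels/to_label fields used above (all labels hypothetical):

# Hypothetical corrections: relabel "apple" spans from ORG to PRODUCT,
# and delete "nyc" spans labeled GPE (to_label=None removes the span).
corrections = [
    Correction(annotation="apple", from_labels=["ORG"], to_label="PRODUCT"),
    Correction(annotation="nyc", from_labels=["GPE"], to_label=None),
]
# fixed = fix_annotations(example, corrections, dryrun=True)  # prints planned changes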
Example #17
def debug_model(
    config,
    resolved_train_config,
    nlp,
    model: Model,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider(f"STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    X = _get_docs()
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus, ) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    nlp.initialize(
                        lambda: [Example.from_dict(x, {}) for x in X])
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                    exits=1,
                )

    if print_settings.get("print_after_init"):
        msg.divider(f"STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec listeners
    tok2vec = None
    if model.has_ref("tok2vec") and model.get_ref(
            "tok2vec").name == "tok2vec-listener":
        tok2vec = nlp.get_pipe("tok2vec")
    goldY = None
    for e in range(3):
        if tok2vec:
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        if goldY is None:
            goldY = _simulate_gold(Y)
        dY = get_gradient(goldY, Y, model.ops)
        get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.divider(f"STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good(f"Succesfully ended analysis - model looks good.")
Example #18
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    msg = Printer(no_print=silent,
                  pretty=not no_format,
                  ignore_warnings=ignore_warnings)
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [
        p for p in sourced_components if p not in frozen_components
    ]
    pipeline = nlp.pipe_names
    factory_names = [
        nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names
    ]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset,
                                    factory_names,
                                    nlp,
                                    make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(train_dataset,
                                                   factory_names,
                                                   nlp,
                                                   make_proj=False)
    gold_dev_data = _compile_gold(dev_dataset,
                                  factory_names,
                                  nlp,
                                  make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
    frozen_components = T["frozen_components"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(
            f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)")
        n_missing_vectors = sum(
            gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            )
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_train_data["ner"]
                     if label not in ("O", "-", None))
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        labels_with_counts = [(label, count)
                              for label, count in label_counts.most_common()
                              if label != "-"]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}",
                 show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if gold_train_data["ws_ents"]:
            msg.fail(
                f"{gold_train_data['ws_ents']} invalid whitespace entity spans"
            )
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label)
                if neg_docs == 0:
                    msg.warn(
                        f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good(
                "No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text("Entity spans consisting of or starting/ending "
                     "with whitespace characters are considered invalid.")

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}.")
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task.")
        if (gold_train_data["n_cats_bad_values"] > 0
                or gold_dev_data["n_cats_bad_values"] > 0):
            msg.fail("Unsupported values for cats: the supported values are "
                     "1.0/True and 0.0/False.")
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'.")
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'.")

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}.")
        if (gold_train_data["n_cats_bad_values"] > 0
                or gold_dev_data["n_cats_bad_values"] > 0):
            msg.fail("Unsupported values for cats: the supported values are "
                     "1.0/True and 0.0/False.")
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes.")
        else:
            msg.warn("The train data contains only instances with "
                     "mutually-exclusive classes. You can potentially use the "
                     "component 'textcat' instead of 'textcat_multilabel'.")
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes.")

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(
            gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences.")

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD:
                msg.warn(f"Low number of examples for label '{label}' "
                         f"({gold_train_unpreprocessed_data['deps'][label]})")
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                    and DELIMITER in label):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}")

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance.")
            msg.warn(
                f"Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available.")

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(f"Found {gold_train_data['n_nonproj']} nonprojective "
                     f"projectivized train sentence(s)")
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed"
        )
    if warn_counts:
        msg.warn(
            f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
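A minimal sketch of calling the validator directly with printed, formatted output (the config path and override key are assumptions):

# Hypothetical invocation: run data debugging with visible output.
from pathlib import Path

debug_data(
    Path("config.cfg"),
    config_overrides={"paths.train": "./corpus/train.spacy"},  # illustrative
    verbose=True,
    no_format=False,
    silent=False,
)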
Example #19
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(...,
                            help="Path to config file",
                            exists=True,
                            allow_dash=True),
    output_path: Optional[Path] = Opt(
        None,
        "--output",
        "--output-path",
        "-o",
        help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(
        None,
        "--code",
        "-c",
        help=
        "Path to Python file with additional code (registered functions) to be imported"
    ),
    verbose: bool = Opt(
        False,
        "--verbose",
        "-V",
        "-VV",
        help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
    convert data from other formats, use the `spacy convert` command. The
    config file includes all settings and hyperparameters used during training.
    To override settings in the config, e.g. settings that point to local
    paths or that you want to experiment with, you can override them as
    command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]". The --code argument
    lets you pass in a Python file that's imported before training. It can be
    used to register custom functions and architectures that can then be
    referenced in the config.

    DOCS: https://spacy.io/api/cli#train
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-"
                           and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if not output_path:
        msg.info("No output directory provided")
    else:
        if not output_path.exists():
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        msg.info(f"Saving to output directory: {output_path}")
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path,
                                  overrides=overrides,
                                  interpolate=False)
    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
    train(nlp,
          output_path,
          use_gpu=use_gpu,
          stdout=sys.stdout,
          stderr=sys.stderr)
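The docstring's override example corresponds to an invocation like the following (paths are illustrative):

# Illustrative CLI usage; --training.batch_size 128 overrides "batch_size" in [training]:
#   python -m spacy train config.cfg --output ./output --code functions.py \
#       --gpu-id -1 --training.batch_size 128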
Example #20
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    conv_depth=4,
    bilstm_depth=0,
    cnn_pieces=3,
    sa_depth=0,
    use_chars=False,
    cnn_window=1,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc. to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if output_dir.exists() and [p for p in output_dir.iterdir()]:
        msg.warn(
            "Output directory is not empty",
            "It is better to use an empty directory or refer to a new output path, "
            "then the new directory will be created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory: {}".format(output_dir))
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=conv_depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            subword_features=not use_chars,  # Set to False for Chinese etc
            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            if not epoch_start:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
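                # stdin is a one-shot stream of unknown length, so cap each
                # "epoch" at roughly 10M words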
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
Example #21
0
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
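    # Strip any ".dev" suffix so dev installs map onto a release row of the table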
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
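    # Collect every known model name and normalize the version pins listed for
    # each spaCy release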
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
    incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
    incompat_models.update(
        d["name"] for _, d in model_links.items() if not d["compat"]
    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text("The following models are not available for spaCy "
                 "v{}: {}".format(about.__version__, ", ".join(na_models)))
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path())))
    if incompat_models or incompat_links:
        sys.exit(1)
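# A minimal sketch of the compatibility-table shape that 'validate' consumes.
# The actual payload served at about.__compatibility__ may differ; the models
# and versions below are illustrative assumptions inferred from the lookups
# above (r.json()["spacy"], then spaCy version -> model -> version list).
compat_payload = {
    "spacy": {
        "2.2.0": {
            "en_core_web_sm": ["2.2.0", "2.1.0"],
            "de_core_news_sm": ["2.2.0"],
        }
    }
}

current_compat = compat_payload["spacy"].get("2.2.0", {})
assert "2.2.0" in current_compat.get("en_core_web_sm", [])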
Example #22
0
def debug_model(
    config,
    resolved_train_config,
    nlp,
    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{pipe}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider(f"STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )

    if print_settings.get("print_after_init"):
        msg.divider(f"STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec/Transformer listeners
    upstream_component = None
    if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
        upstream_component = nlp.get_pipe("tok2vec")
    if model.has_ref("tok2vec") and "transformer-listener" in model.get_ref("tok2vec").name:
        upstream_component = nlp.get_pipe("transformer")
    for e in range(3):
        if upstream_component:
            upstream_component.update(examples)
        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider(f"STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict([ex.predicted for ex in examples])
    if print_settings.get("print_prediction"):
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good(f"Succesfully ended analysis - model looks good.")