Example #1
def debug_data_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
    no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
    # fmt: on
):
    """
    Analyze, debug and validate your training and development data. Outputs
    useful stats, and can help you find problems like invalid entity annotations,
    cyclic dependencies, low data labels and more.

    DOCS: https://spacy.io/api/cli#debug-data
    """
    if ctx.command.name == "debug-data":
        msg.warn(
            "The debug-data command is now available via the 'debug data' "
            "subcommand (without the hyphen). You can run python -m spacy debug "
            "--help for an overview of the other available debugging commands."
        )
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    debug_data(
        config_path,
        config_overrides=overrides,
        ignore_warnings=ignore_warnings,
        verbose=verbose,
        no_format=no_format,
        silent=False,
    )
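For orientation: the extra tokens Typer leaves in ctx.args become dotted-path config overrides, roughly as sketched below (hypothetical values):

# e.g. ctx.args == ["--training.max_epochs", "3"]
# parse_config_overrides(ctx.args) -> {"training.max_epochs": 3}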
Example #2
def collect_cmd(
    spec_file: typer.FileText,
    jobs: int = typer.Option(1, "--jobs", "-j"),
    commit_in: str = typer.Option(
        get_current_commit().hash, "--commit", show_default=True
    ),
    branch: str = typer.Option(get_branch(), "--branch", show_default=True),
) -> None:
    spec = load_spec(spec_file)
    storage = Storage(spec.storage_dir)

    commit = Commit(
        hash=str(commit_in),
        date=get_commit_date(commit_in),
        message=get_commit_message(commit_in),
    )
    parent = storage.get_branch_tip(branch)
    assert commit != parent, "It looks like we already ran on this commit"

    msg.info(f"#jobs: {jobs}")
    msg.info(f"on commit:     {commit}")
    msg.info(f"parent commit: {parent}")

    assert jobs > 0, "Jobs value must be positive"

    if jobs > 1:
        msg.warn(
            "If you're running benchmarks from the collect call,"
            " concurrency can affect results"
        )

    msg.good("Spec loaded successfully")
    msg.divider()

    try:
        results = run_collectors(spec.collectors, jobs=jobs)
    except CollectorError as e:
        msg.fail("Collector returned invalid format")
        typer.echo(str(e.exc))
        return

    msg.good("Collection completed")

    run = Run(
        commit=commit,
        parent=parent,
        branch=branch,
        date=datetime.now(),
        results=sum((r.metrics for r in results), []),
        context={},
    )

    storage.store_run(run)
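For context, a minimal sketch of the containers this snippet assumes; the field names are taken from the usage above, the exact types are guesses:

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

@dataclass
class Commit:
    hash: str
    date: datetime
    message: Optional[str] = None

@dataclass
class Run:
    commit: Commit
    parent: Optional[Commit]
    branch: str
    date: datetime
    results: List[Any]
    context: Dict[str, Any] = field(default_factory=dict)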
Example #3
def print_pipe_analysis(
    analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
    *,
    keys: List[str] = DEFAULT_KEYS,
) -> None:
    """Print a formatted version of the pipe analysis produced by analyze_pipes.

    analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
    keys (List[str]): The meta keys to show in the table.
    """
    msg.divider("Pipeline Overview")
    header = ["#", "Component", *[key.capitalize() for key in keys]]
    summary: ItemsView = analysis["summary"].items()
    body = [[i, n, *[v for v in m.values()]]
            for i, (n, m) in enumerate(summary)]
    msg.table(body, header=header, divider=True, multiline=True)
    n_problems = sum(len(p) for p in analysis["problems"].values())
    if any(p for p in analysis["problems"].values()):
        msg.divider(f"Problems ({n_problems})")
        for name, problem in analysis["problems"].items():
            if problem:
                msg.warn(
                    f"'{name}' requirements not met: {', '.join(problem)}")
    else:
        msg.good("No problems found.")
Example #4
 def eval_dataset(set_id):
     DB = connect()
     data = DB.get_dataset(set_id)
     accepted = [
         eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
     ]
     rejected = [eg for eg in data if eg["answer"] == "reject"]
     ignored = [eg for eg in data if eg["answer"] == "ignore"]
     if not accepted and not rejected:
         msg.warn("No annotations collected", exits=1)
     counts = Counter()
     for eg in accepted:
         for model_id in eg["accept"]:
             counts[model_id] += 1
     preference, _ = counts.most_common(1)[0]
     ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
     msg.info(f"Evaluating data from '{set_id}'")
     msg.text(
         f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
     if counts["A"] == counts["B"]:
         msg.warn(f"No preference ({ratio})")
     else:
         pc = counts[preference] / sum(counts.values())
         msg.good(
             f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
         msg.text(mapping[preference])
Example #5
def git_checkout(repo: str,
                 subpath: str,
                 dest: Path,
                 *,
                 branch: str = "master",
                 sparse: bool = False):
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse and git_version >= (2, 22):
        return git_sparse_checkout(repo, subpath, dest, branch)
    elif sparse:
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        err_old = (
            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
            f"that doesn't fully support sparse checkout yet.")
        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
        msg.warn(
            f"{err_unk if git_version == (0, 0) else err_old} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above.")
    with make_tempdir() as tmp_dir:
        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
        run_command(cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
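The version gate relies on a helper returning a (major, minor) tuple, with (0, 0) signalling an undetectable version, as the warning branch implies. A minimal sketch of that contract (not the actual implementation):

import re
import subprocess

def get_git_version_sketch() -> tuple:
    # (0, 0) is the "unknown version" sentinel checked above
    out = subprocess.run(["git", "--version"], capture_output=True, text=True).stdout
    match = re.search(r"(\d+)\.(\d+)", out)
    return (int(match.group(1)), int(match.group(2))) if match else (0, 0)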
Example #6
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if output_dir.exists() and [p for p in output_dir.iterdir()]:
        if resume_path:
            msg.warn(
                "Output directory is not empty.",
                "If you're resuming a run in this directory, the old weights "
                "for the consecutive epochs will be overwritten with the new ones.",
            )
        else:
            msg.warn(
                "Output directory is not empty.",
                "It is better to use an empty directory or a new output path; "
                "if the directory doesn't exist yet, it will be created for you.",
            )
    if resume_path is not None:
        if resume_path.is_dir():
            # This is necessary because Windows gives a Permission Denied when we
            # try to open the directory later, which is confusing. See #7878
            msg.fail(
                f"--resume-path should be a weights file, but {resume_path} is a directory.",
                exits=True,
            )
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        if not model_name and not epoch_resume:
            msg.fail(
                "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
                exits=True,
            )
        elif not model_name and epoch_resume < 0:
            msg.fail(
                f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
                exits=True,
            )
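For reference, the filename check above accepts spaCy's default epoch naming:

# re.search(r"model\d+\.bin", "model12.bin")  -> match (epoch can be inferred)
# re.search(r"model\d+\.bin", "weights.bin")  -> None (needs --epoch-resume)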
Example #7
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from.
    sparse_checkout (bool): Whether to use sparse checkout if supported.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
        msg.fail(err, exits=1)
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')",
             project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good(f"Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
Example #8
def profile_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read current calling context
    model: str = Arg(..., help="Trained pipeline to load"),
    inputs: Optional[Path] = Arg(None,
                                 help="Location of input file. '-' for stdin.",
                                 exists=True,
                                 allow_dash=True),
    n_texts: int = Opt(10000,
                       "--n-texts",
                       "-n",
                       help="Maximum number of texts to use if available"),
    # fmt: on
):
    """
    Profile which functions take the most time in a spaCy pipeline.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.

    DOCS: https://spacy.io/api/cli#debug-profile
    """
    if ctx.parent.command.name == NAME:  # type: ignore[union-attr]    # called as top-level command
        msg.warn(
            "The profile command is now available via the 'debug profile' "
            "subcommand. You can run python -m spacy debug --help for an "
            "overview of the other available debugging commands.")
    profile(model, inputs=inputs, n_texts=n_texts)
Example #9
def link(*args, **kwargs):
    """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained
    pipeline packages using their full names or from a directory path."""
    msg.warn(
        "As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
        "pipeline packages using their full names or from a directory path."
    )
Example #10
def project_assets(project_dir: Path,
                   *,
                   sparse_checkout: bool = False) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.",
                    exits=1)
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
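Reconstructed from the key accesses above, the two asset shapes this loop handles look roughly like this (all values are placeholders):

url_asset = {
    "dest": "assets/data.json",              # required
    "url": "https://example.com/data.json",  # omit for private assets
    "checksum": None,                        # optional
}
git_asset = {
    "dest": "assets/repo-data",
    "git": {"repo": "https://github.com/user/repo",  # required
            "path": "data",                          # "" for the whole repo
            "branch": "master"},
}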
Example #11
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors,
                    vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Sucessfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
Example #12
 async def insert_commons_categories(self, row, delay):
     try:
         await asyncio.sleep(delay)
         self.process_data(row,
                           config.CORPUS_ARTICLES_HEADER[31],
                           data=Extract(self.project, row["qid"]).commons_categories())
         self.utils.report_time(self.start_time)
     except pwb.exceptions.NoPage:
         msg.warn("Sin elemento de Wikidata")
         self.utils.report_time(self.start_time)
Example #13
 async def insert_wikidata_references_p143(self, row, delay):
     try:
         await asyncio.sleep(delay)
         self.process_data(row,
                           config.CORPUS_ARTICLES_HEADER[25],
                           data=Extract(self.project, row["qid"]).wikidata_references()[1])
         self.utils.report_time(self.start_time)
     except pwb.exceptions.NoPage:
         msg.warn("Sin elemento de Wikidata")
         self.utils.report_time(self.start_time)
Example #14
    def process_data(self, row, header_id, data=None):
        if data is None:
            data = row

        msg.info(f"id: {row['qid']}")
        msg.good(f"{data}")

        try:
            self.insert(data, row["id"], header_id)
            self.export()
        except pwb.exceptions.InvalidTitle:
            msg.warn("Título invalido: {row['qid']}")
            logging.error("Título invalido: {row['qid']}")
            self.utils.should_continue()
        except pwb.exceptions.NoPage:
            msg.warn("No tiene página en eswiki: {row['qid']}")
            logging.error("No tiene página en eswiki: {row['qid']}")

        except pwb.exceptions.IsRedirectPage:
            # TODO: add a mechanism to detect that this is a redirect, get the
            # target page, and then work with that instead.
            msg.warn(f"It's a redirect: {row['qid']}")
            logging.error(f"It's a redirect: {row['qid']}")
        # TODO: redefine this bare except
        except:
            msg.warn(f"Unexpected error: {sys.exc_info()[0]}")
            logging.error(f"Unexpected error: {sys.exc_info()[0]}")
Example #15
def validate() -> None:
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
    incompat_models = {
        d["name"]
        for _, d in model_pkgs.items() if not d["compat"]
    }
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if model_pkgs:
        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            if data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(data["version"],
                                   color="green",
                                   no_print=True)
            else:
                version = msg.text(data["version"],
                                   color="yellow",
                                   no_print=True)
                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)
    else:
        msg.text("No pipeline packages found in your current environment.",
                 exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
Example #16
    def apply_bilou_schema(self, message: Dict) -> List[Text]:
        """Apply BILOU schema to a gold standard JSON example.

        Args:
            message (dict): message dict.

        Returns:
            a list of BILOU tags.
        """
        tokens = self.tokens_without_cls(message)
        entity_offsets = get_entity_offsets(message)
        entities = bilou_tags_from_offsets(tokens, entity_offsets)

        collected = []
        for t, e in zip(tokens, entities):
            if e == "-":
                collected.append(t)
            elif collected:
                collected_text = " ".join([t.text for t in collected])
                msg and msg.warn(
                    f"Misaligned entity annotation for '{collected_text}' "
                    f"in sentence: \"{message['text']}\". "
                    f"Make sure the start and end values of the "
                    f"annotated training examples end at token "
                    f"boundaries (e.g. don't include trailing "
                    f"whitespaces or punctuation).")
                collected = []

        return entities
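For reference, a worked BILOU tagging of a two-token entity (illustrative, not output of the code above):

# tokens: ["San", "Francisco", "is", "foggy"]
# tags:   ["B-LOC", "L-LOC", "O", "O"]
# A "-" tag marks tokens whose annotated offsets don't line up with token
# boundaries, which is exactly what triggers the warning above.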
Example #17
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
        msg.warn(err)
    else:
        if not checksum:
            msg.good(f"Asset already exists: {dest}")
        elif checksum == get_checksum(dest):
            msg.good(f"Asset exists with matching checksum: {dest}")
        else:
            msg.fail(f"Asset available but with incorrect checksum: {dest}")
Example #18
def fetch_asset(project_path: Path,
                url: str,
                dest: Path,
                checksum: Optional[str] = None) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination path, relative to the project directory.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists():
        # If there's already a file, check for checksum
        if checksum:
            if checksum == get_checksum(dest_path):
                msg.good(f"Skipping download with matching checksum: {dest}")
                return
        else:
            # If there's not a checksum, make sure the file is a possibly valid size
            if os.path.getsize(dest_path) == 0:
                msg.warn(
                    f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(dest_path)
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    if not dest_path.parent.exists():
        dest_path.parent.mkdir(parents=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            if Path(url).exists() and Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(
            f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
Example #19
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
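A worked example of the rewrite (hypothetical repository path):

# in:  https://github.com/user/repo/blob/master/assets/data.json
# out: https://raw.githubusercontent.com/user/repo/master/assets/data.json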
Example #20
    def __init__(self, vectorizer, architecture, state_dict_path, labels):
        """
        - param vectorizer(callable): a function that converts any string to a NumPy 1-D array.
        - param architecture(class): a `torch.nn.Module` child class to be instantiated into a neural net.
        - param state_dict_path(str): path to a PyTorch state dict that matches the architecture.
        - param labels(list of str): the classification labels, e.g. ["POSITIVE", "NEGATIVE"].
        """

        # set up label conversion
        self.label_encoder = {_label: i for i, _label in enumerate(labels)}
        self.label_decoder = {i: _label for i, _label in enumerate(labels)}
        self.num_classes = len(self.label_encoder)

        # set up vectorizer and the neural network with appropriate dimensions
        self.vectorizer = vectorizer
        vec_dim = self.vectorizer("").shape[0]
        self.nn = architecture(vec_dim, self.num_classes)

        # if a state dict exists, load it and create a backup copy
        import os

        if os.path.isfile(state_dict_path):
            from shutil import copyfile

            try:
                self.nn.load_state_dict(torch.load(state_dict_path))
            except Exception as e:
                logger.warn(f"Load VectorNet state path failed with {type(e)}: e")

            state_dict_backup_path = (
                f"{state_dict_path}.{datetime.now().strftime('%Y%m%d%H%M%S')}"
            )
            copyfile(state_dict_path, state_dict_backup_path)

        # set a path to store updated parameters
        self.nn_update_path = state_dict_path

        # initialize an optimizer object and a dict to hold dynamic parameters
        self.nn_optimizer = torch.optim.Adam(self.nn.parameters())
        self._dynamic_params = {"optimizer": {"lr": 0.01, "betas": (0.9, 0.999)}}
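A minimal class satisfying the assumed architecture interface, i.e. a torch.nn.Module subclass instantiated as architecture(vec_dim, num_classes); illustrative only:

import torch

class LinearNet(torch.nn.Module):
    def __init__(self, vec_dim: int, num_classes: int):
        super().__init__()
        self.fc = torch.nn.Linear(vec_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)  # raw logits, one per class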
Example #21
def main(name: ("模型名称", "positional", None, None, trf_list),
         make_cache_dir: (" 创建缓存文件夹", "flag", "mk"),
         use_local_class: ("不使用网络读取", "flag", "local")):
    if make_cache_dir:
        c_path = ensure_path(f"{cache_path + name}")
        if c_path.exists():
            msg.warn(f"{cache_path + name} already exists")
        else:
            c_path.mkdir()
            msg.good(f" 缓存文件夹已创建:\t{cache_path}{name}")

    msg.warn("\n================url================\n")

    config_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[name]

    model_file = ALL_PRETRAINED_MODEL_ARCHIVE_MAP[name]
    msg.text(f"{config_file}\n{model_file}\n")

    vocab = get_tokenizer(name, use_local_class)
    pretrained_vocab_files_map = vocab.pretrained_vocab_files_map
    for vocab_file in pretrained_vocab_files_map.values():
        msg.text(f"{vocab_file[name]}\n")

    msg.warn("\n================url================\n")
    msg.good("\n使用下载工具下载后,将模型文件放入缓存文件夹中。")
def get_source_files(lang):
    exercises_path = Path(EXERCISES_DIR)
    if not exercises_path.exists():
        msg.fail(f"Can't find exercises directory: {EXERCISES_DIR}", exits=1)
    for lang_path in exercises_path.iterdir():
        if lang_path.is_dir():
            lang_name = lang_path.stem
            if lang and lang_name != lang:
                continue
            for py_file in lang_path.iterdir():
                if py_file.name.startswith("test_"):
                    solution_name = f"solution_{py_file.name.split('test_')[1]}"
                    solution_file = lang_path / solution_name
                    if not solution_file.exists():
                        if py_file.name == GENERAL_TEST:
                            yield (lang_name, py_file, None)
                        else:
                            msg.warn(
                                f"Didn't find solution for test: {py_file.stem} ({lang_path})"
                            )
                    else:
                        yield (lang_name, py_file, solution_file)
Example #23
def download(model: str,
             direct: bool = False,
             sdist: bool = False,
             *pip_args) -> None:
    if (not (is_package("spacy") or is_package("spacy-nightly"))
            and "--no-deps" not in pip_args):
        msg.warn(
            "Skipping pipeline package dependencies and setting `--no-deps`. "
            "You don't seem to have the spaCy package itself installed "
            "(maybe because you've built from source?), so installing the "
            "package dependencies would cause spaCy to be downloaded, which "
            "probably isn't what you want. If the pipeline package has other "
            "dependencies, you'll have to install them manually.")
        pip_args = pip_args + ("--no-deps", )
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
    if direct:
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
        download_model(dl_tpl.format(m=model_name, v=version, s=suffix),
                       pip_args)
    else:
        model_name = model
        if model in OLD_MODEL_SHORTCUTS:
            msg.warn(
                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
            )
            model_name = OLD_MODEL_SHORTCUTS[model]
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
        download_model(dl_tpl.format(m=model_name, v=version, s=suffix),
                       pip_args)
    msg.good(
        "Download and installation successful",
        f"You can now load the package via spacy.load('{model_name}')",
    )
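For example, dl_tpl expands as follows (hypothetical package and version):

# m="en_core_web_sm", v="3.0.0", s=".tar.gz" ->
# "en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0"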
Example #24
def main(in_file, model_file=None, config_file=None, spacy_model=None):
    """Train CRF entity tagger."""
    if config_file:
        msg.info(f"Loading config: {config_file}")
        component_config = srsly.read_json(config_file)
    else:
        component_config = None

    model_file = model_file or "model.pkl"
    msg.info("Loading model from file", model_file)
    crf_extractor = CRFExtractor(
        component_config=component_config).from_disk(model_file)
    msg.good("Successfully loaded CRF tagger", crf_extractor)

    msg.info("Loading dev dataset from file", in_file)
    dev_examples = read_file(in_file)
    msg.good(f"Successfully loaded {len(dev_examples)} dev examples.")

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        msg.info(f"Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    dev_crf_examples = [
        gold_example_to_crf_tokens(ex,
                                   tokenizer=tokenizer,
                                   use_dense_features=use_dense_features)
        for ex in dev_examples
    ]

    f1_score, classification_report = crf_extractor.eval(dev_crf_examples)
    msg.warn(f"f1 score: {f1_score}")
    print(classification_report)
Example #25
def check_workflows(workflows: List[str],
                    workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}",
            exits=1,
        )
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'")
Example #26
 def eval_dataset(set_id):
     """Output summary about user agreement with the model."""
     DB = connect()
     data = DB.get_dataset(set_id)
     accepted = [
         eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
     ]
     rejected = [eg for eg in data if eg["answer"] == "reject"]
     if not accepted and not rejected:
         msg.warn("No annotations collected", exits=1)
     high_conf = 0.8
     agree_count = 0
     disagree_high_conf = len(
         [e for e in rejected if e["confidence"] > high_conf])
     for eg in accepted:
         choice = eg["accept"][0]
         score_choice = [
             o["score"] for o in eg["options"] if o["id"] == choice
         ][0]
         score_other = [
             o["score"] for o in eg["options"] if o["id"] != choice
         ][0]
         if score_choice > score_other:
             agree_count += 1
         elif eg["confidence"] > high_conf:
             disagree_high_conf += 1
     pc = agree_count / (len(accepted) + len(rejected))
     text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})"
     msg.info(f"Evaluating data from '{set_id}'")
     if pc > 0.5:
         msg.good(text)
     else:
         msg.fail(text)
     msg.text(
         f"You disagreed on {disagree_high_conf} high confidence scores")
     msg.text(f"You rejected {len(rejected)} suggestions as not similar")
Example #27
 def eval_dataset(set_id):
     DB = connect()
     data = DB.get_dataset(set_id)
     accepted = [
         eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
     ]
     rejected = [eg for eg in data if eg["answer"] == "reject"]
     ignored = [eg for eg in data if eg["answer"] == "ignore"]
     if not accepted and not rejected:
         msg.warn("No annotations collected", exits=1)
     total_count = 0
     agree_count = 0
     for eg in accepted:
         total_count += len(eg.get("options", []))
         agree_count += len(eg.get("accept", []))
     msg.info(f"Evaluating data from '{set_id}'")
     msg.text(
         f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
     pc = agree_count / total_count
     text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
     if pc > 0.5:
         msg.good(text)
     else:
         msg.fail(text)
Example #28
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
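    # For example, with the defaults above, compounding(100.0, 1000.0, 1.001)
    # yields 100.0, 100.1, 100.2001, ... compounding toward the 1000.0 cap,
    # while decaying(0.2, 0.2, 0.0) is simply the constant 0.2.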

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text(
                    "Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
                msg.text(
                    "Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline])
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec,
                                              base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err,
                                 "Original error message: {}".format(e),
                                 exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(
                                    epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg[
                                            "beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    ))
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs,
                                                             verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[
                                    metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[
                                    metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(iter_current -
                                                 iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (final_meta["speed"]["cpu"] is None
                    and final_meta["speed"]["gpu"] is None):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(
                    meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path,
                                                  best_pipes)
        msg.good("Created best model", best_model_path)
Example #29
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
                            needed.
    extra (bool): Whether to download all assets, including those marked as 'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    assets = [
        asset
        for asset in config.get("assets", [])
        if extra or not asset.get("extra", EXTRA_DEFAULT)
    ]
    if not assets:
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")

    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                "Cloning spaCy project templates requires Git and the 'git' command. "
                "Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.", exits=1
                )
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines an asset without a URL that the user
                # has to place manually
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
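A hypothetical project.yml assets section that project_assets above would consume, shown as the Python structure load_project_config returns; every name, URL, and checksum here is invented for illustration:

# Hypothetical parsed project.yml (all values invented).
config = {
    "assets": [
        {
            # Asset checked out from a Git repository
            "dest": "assets/training.spacy",
            "checksum": "63373dd656daa1fd3043ce166a59474c",  # fake checksum
            "git": {
                "repo": "https://github.com/explosion/projects",
                "path": "pipelines/ner_demo/assets",
                "branch": "v3",
            },
        },
        {
            # Asset downloaded from a plain URL
            "dest": "assets/vectors.zip",
            "url": "https://example.com/vectors.zip",  # invented URL
        },
        {
            # Private asset: no URL, the user places the file manually
            "dest": "assets/private.jsonl",
            "checksum": "5113dc04e03f079525edd8df3f4f39e3",  # fake checksum
            "extra": True,  # skipped unless extra=True is passed
        },
    ],
}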
Example #30
from pathlib import Path

from sense2vec import Sense2Vec
from wasabi import msg

# Note: the get_*_keys helper functions used below are defined elsewhere in
# the source script this example was taken from.


def main(model_path,
         out_dir,
         min_freq_ratio=0.0,
         min_distance=0.0,
         check_keys=""):
    # Optional comma-separated list of keys to inspect instead of pruning
    check_keys_list = [k.strip() for k in check_keys.split(",")] if check_keys else []

    msg.info("Loading vectors")
    s2v = Sense2Vec().from_disk(model_path)
    output_path = Path(out_dir)
    vocab = dict(s2v.frequencies)             # key -> frequency
    vectors = {key: val for key, val in s2v}  # key -> vector
    vector_size = len(next(iter(vectors.values())))
    all_senses = s2v.senses
    msg.info("Loaded vectors")

    if check_keys_list:
        # Inspection mode: report why each requested key would be discarded
        blacklist = {}
        whitelist = []
        blacklisted_sense_keys = get_blacklisted_sense_keys(vocab)
        markdown_and_url_keys = get_markdown_and_url_keys(vocab)
        minority_keys = get_minority_keys(vocab, min_freq_ratio)
        redundant_keys = get_redundant_keys(vocab, vectors, min_distance)
        for k in check_keys_list:
            if k in blacklisted_sense_keys:
                blacklist[k] = 'sense'
            elif k in markdown_and_url_keys:
                blacklist[k] = 'markdown / url'
            elif k in minority_keys:
                blacklist[k] = 'minority'
            elif k in redundant_keys:
                blacklist[k] = 'redundant'
            else:
                whitelist.append(k)
        msg.warn("Blacklist")
        for k, v in blacklist.items():
            msg.warn(f"{k}: {v}")
        msg.good("Whitelist")
        for k in whitelist:
            msg.good(k)
    else:
        discarded = set()
        discarded.update(get_blacklisted_sense_keys(vocab))
        discarded.update(get_markdown_and_url_keys(vocab))
        discarded.update(get_minority_keys(vocab, min_freq_ratio))
        discarded.update(get_redundant_keys(vocab, vectors, min_distance))
        n_vectors = len(vectors) - len(discarded)
        s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
        for key, vector in vectors.items():
            if key not in discarded:
                s2v.add(key, vector)
                if key in vocab:
                    s2v.set_freq(key, vocab[key])
        msg.good("Created the sense2vec model")
        msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
        s2v.to_disk(output_path)
        msg.good("Saved model to directory", out_dir)