Example #1
    def pipe_(self, operations: List[Union[str, OperationState]]) -> None:
        """Run a sequence of operations on dataset data.
        Internally calls Dataset.apply_ and will resolve named
        operations in registry.operations

        Args:
            operations (List[Union[str, OperationState]]): List of operations
        """

        msg = Printer(no_print=not self.verbose)
        msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}")

        for op in operations:
            op_name = op.name if isinstance(op, OperationState) else op
            msg.text(f"|_ {op_name}")

        for op in operations:
            if isinstance(op, str):
                op_name = op
                args = []
                kwargs = {}
                initial_state = None
            elif isinstance(op, OperationState):
                op_name = op.name
                args = op.args
                kwargs = op.kwargs
                initial_state = op
            else:
                raise ValueError(f"Unsupported operation type: {type(op)}")

            operation = registry.operations.get(op_name)

            self.apply_(operation, *args, initial_state=initial_state, **kwargs)
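A minimal usage sketch for pipe_ follows; the dataset, its data and the operation names are assumptions for illustration, not part of the snippet above.

# Hypothetical usage -- "ds", "examples" and the operation names are illustrative.
ds = Dataset("train", data=examples)
ds.pipe_([
    "upcase_labels",                                             # resolved via registry.operations
    OperationState(name="filter_overlaps", args=[], kwargs={}),  # carries saved args/kwargs
])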
Example #2
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    msg = Printer()
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't load model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    package_path.mkdir(parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
Example #3
def main(path, name="bert-base-uncased", lang="en"):
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading(f"Setting up the pipeline..."):
        nlp = PyTT_Language(pytt_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, name))
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
Example #4
class Output:
    def __init__(self, stages, *args, **kwargs):
        self.stages = stages
        self.context = None
        self.printer = Printer()

    def success(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.GOOD, icon=MESSAGES.GOOD,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def info(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.INFO, icon=MESSAGES.INFO,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def error(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.FAIL, icon=MESSAGES.FAIL,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def warning(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.WARN, icon=MESSAGES.WARN,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def set_description(self, *args, **kwargs):
        # No-op: kept for interface compatibility (e.g. with tqdm-style consumers)
        pass

    def close(self, *args, **kwargs):
        # No-op: kept for interface compatibility
        pass

    def write(self, text):
        click.echo(text)

    def set_context(self, context: str):
        self.context = context

    def line_prefix(self) -> str:
        return f"[{self.context}] " if self.context else ""

    def with_prefix(self, title) -> str:
        # line_prefix() already ends with a space when a context is set
        return f"{self.line_prefix()}{title}"

    def __iter__(self):
        return iter(self.stages)
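A short usage sketch for Output; the stage names and context below are invented for illustration.

# Hypothetical usage -- stage names and context are illustrative.
out = Output(stages=["fetch", "train", "package"])
out.set_context("train")
out.info("Starting stage")   # echoes roughly "[train] Starting stage" with the info icon
for stage in out:            # __iter__ yields the configured stages
    out.set_context(stage)
    out.success("done")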
Example #5
def main(path, name="bert-base-uncased", lang="en"):
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading(f"Setting up the pipeline..."):
        nlp = TransformersLanguage(trf_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
    with msg.loading("Verifying model loads..."):
        nlp.from_disk(path)
    msg.good("Model loads!")
Example #6
def save_config(config: Config,
                output_file: Path,
                is_stdout: bool = False,
                silent: bool = False) -> None:
    no_print = is_stdout or silent
    msg = Printer(no_print=no_print)
    if is_stdout:
        print(config.to_str())
    else:
        if not output_file.parent.exists():
            output_file.parent.mkdir(parents=True)
        config.to_disk(output_file, interpolate=False)
        msg.good("Saved config", output_file)
        msg.text("You can now add your data and train your pipeline:")
        variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
        if not no_print:
            print(
                f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}"
            )
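A hedged example of calling save_config, assuming "config" is an existing Config object; the file name and the "-" stdout convention are placeholders for illustration.

# Illustrative calls -- "config" is assumed to exist and the file name is a placeholder.
save_config(config, Path("configs/config.cfg"))  # writes the file and prints next steps
save_config(config, Path("-"), is_stdout=True)   # prints the config to stdout instead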
Example #7
def generate_meta(existing_meta: Dict[str, Any],
                  msg: Printer) -> Dict[str, Any]:
    meta = existing_meta or {}
    settings = [
        ("lang", "Pipeline language", meta.get("lang", "en")),
        ("name", "Pipeline name", meta.get("name", "pipeline")),
        ("version", "Package version", meta.get("version", "0.0.0")),
        ("description", "Package description", meta.get("description", None)),
        ("author", "Author", meta.get("author", None)),
        ("email", "Author email", meta.get("email", None)),
        ("url", "Author website", meta.get("url", None)),
        ("license", "License", meta.get("license", "MIT")),
    ]
    msg.divider("Generating meta.json")
    msg.text(
        "Enter the package settings for your pipeline. The following information "
        "will be read from your pipeline data: pipeline, vectors.")
    for setting, desc, default in settings:
        response = get_raw_input(desc, default)
        meta[setting] = default if response == "" and default else response
    return meta
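generate_meta might be driven like this, assuming a wasabi Printer; the existing meta values are placeholders.

# Illustrative call -- the existing meta values are placeholders.
msg = Printer()
meta = generate_meta({"lang": "en", "name": "demo"}, msg)  # prompts for each setting in turn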
Example #8
    def apply_(
        self,
        operation: Union[str, Callable[[Any], OperationResult]],
        *args: Any,
        initial_state: Optional[OperationState] = None,
        **kwargs: Any,
    ) -> None:
        """Apply an operation to all data inplace.

        Args:
            operation (Callable[[Any], OperationResult]): Any operation that
                changes data in place. See recon.operations.registry.operations
        """
        if isinstance(operation, str):
            op_callable = registry.operations.get(operation)
            if op_callable is None:
                raise ValueError(
                    f"Can't find operation '{operation}' in the operations registry."
                )
            operation = cast(Callable, op_callable)

        name = getattr(operation, "name", None)
        if name is None or name not in registry.operations:
            raise ValueError(
                "This function is not an operation. Ensure your function is registered in the operations registry."
            )

        msg = Printer(no_print=not self.verbose)
        msg.text(f"=> Applying operation '{name}' inplace")
        result: OperationResult = operation(self, *args, initial_state=initial_state, verbose=self.verbose, **kwargs)  # type: ignore
        msg.good(f"Completed operation '{name}'")

        self.operations.append(result.state)
        dataset_changed = any(
            (
                result.state.examples_added,
                result.state.examples_removed,
                result.state.examples_changed,
            )
        )
        if dataset_changed:
            self.data = result.data
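A sketch of apply_ in use, assuming an operation registered in registry.operations; "ds", the operation name and its arguments are assumptions for illustration.

# Hypothetical usage -- the operation name and arguments are assumptions.
ds.apply_("fix_annotations", label_map={"PER": "PERSON"})  # resolved via the registry
ds.apply_(fix_annotations, label_map={"PER": "PERSON"})    # or pass the registered callable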
Example #9
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    msg = Printer()
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
    incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
    incompat_models.update(
        d["name"] for _, d in model_links.items() if not d["compat"]
    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text("The following models are not available for spaCy "
                 "v{}: {}".format(about.__version__, ", ".join(na_models)))
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path())))
    if incompat_models or incompat_links:
        sys.exit(1)
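For orientation, the code above expects the fetched compatibility table to have roughly this shape (an illustrative sketch; the versions and model names are made up):

# Illustrative shape of the fetched JSON -- versions and names are invented.
compat_table = {
    "spacy": {
        "2.1.8": {"en_core_web_sm": ["2.1.0"], "de_core_news_sm": ["2.1.0"]},
        "2.1.7": {"en_core_web_sm": ["2.1.0"]},
    }
}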
Example #10
def init_config(
    *,
    lang: str,
    pipeline: List[str],
    optimize: str,
    gpu: bool,
    pretraining: bool = False,
    silent: bool = True,
) -> Config:
    msg = Printer(no_print=silent)
    with TEMPLATE_PATH.open("r") as f:
        template = Template(f.read())
    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [
        pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")
    ]
    defaults = RECOMMENDATIONS["__default__"]
    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()
    variables = {
        "lang": lang,
        "components": pipeline,
        "optimize": optimize,
        "hardware": "gpu" if gpu else "cpu",
        "transformer_data": reco["transformer"],
        "word_vectors": reco["word_vectors"],
        "has_letters": reco["has_letters"],
    }
    if variables["transformer_data"] and not has_spacy_transformers():
        msg.warn(
            "To generate a more effective transformer-based config (GPU-only), "
            "install the spacy-transformers package and re-run this command. "
            "The config generated now does not use transformers.")
        variables["transformer_data"] = None
    base_template = template.render(variables).strip()
    # Giving up on getting the newlines right in jinja for now
    base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
    # Access variables declared in templates
    template_vars = template.make_module(variables)
    use_case = {
        "Language": lang,
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": variables["hardware"].upper(),
        "Transformer": template_vars.transformer.get("name")  # type: ignore[attr-defined]
        if template_vars.use_transformer  # type: ignore[attr-defined]
        else None,
    }
    msg.info("Generated config template specific for your use case")
    for label, value in use_case.items():
        msg.text(f"- {label}: {value}")
    with show_validation_error(hint_fill=False):
        config = util.load_config_from_str(base_template)
        nlp = util.load_model_from_config(config, auto_fill=True)
        config = nlp.config
        if pretraining:
            validate_config_for_pretrain(config, msg)
            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
            config = pretrain_config.merge(config)
    msg.good("Auto-filled config with all values")
    return config
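A hedged call sketch for init_config; the component names and options are examples only.

# Illustrative call -- component names and options are examples only.
config = init_config(
    lang="en",
    pipeline=["tagger", "ner"],
    optimize="efficiency",
    gpu=False,
    silent=False,  # let the Printer report progress
)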
Example #11
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
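A minimal sketch of invoking pretrain, assuming a JSONL file of texts and an installed vectors model; the paths and the model name are placeholders.

# Illustrative call -- paths and the vectors model name are placeholders.
pretrain(
    texts_loc="texts.jsonl",            # JSONL input, e.g. one {"text": ...} per line
    vectors_model="en_vectors_web_lg",
    output_dir="./pretrain-output",
    n_iter=100,
)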
Example #12
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r")
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" %
                                (epoch, is_temp_str))).open("wb") as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
                util.minibatch_by_words(((text, None) for text in texts),
                                        size=batch_size)):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(model,
                               docs,
                               optimizer,
                               objective=loss_func,
                               drop=dropout)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10**7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
Example #13
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """

    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                       device=use_gpu)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(nlp,
                                       noise_level=noise_level,
                                       gold_preproc=gold_preproc,
                                       max_length=0)
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(
                                epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded,
                                                gold_preproc=gold_preproc))
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs,
                                                         verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(i - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path,
                                                  nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #14
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            nlp.add_pipe(nlp.create_pipe(pipe))

    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #15
class Engine(ClassNursery):
    def __init__(
        self,
        model: nn.Module,
        datasets_manager: DatasetsManager,
        optimizer: optim,
        batch_size: int,
        save_dir: str,
        num_epochs: int,
        save_every: int,
        log_train_metrics_every: int,
        train_metric: BaseMetric,
        validation_metric: BaseMetric,
        test_metric: BaseMetric,
        experiment_name: Optional[str] = None,
        experiment_hyperparams: Optional[Dict[str, Any]] = None,
        tensorboard_logdir: str = None,
        track_for_best: str = "loss",
        collate_fn=list,
        device: Union[torch.device, str] = torch.device("cpu"),
        gradient_norm_clip_value: Optional[float] = 5.0,
        lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        use_wandb: bool = False,
        sample_proportion: float = 1.0,
        seeds: Dict[str, int] = None,
    ):
        """ Engine runs the models end to end. It iterates through the train dataset and passes
        it through the model. During training it helps in tracking a lot of parameters for the run
        and saving the parameters. It also reports validation and test parameters from time to time.
        Many utilities required for end-end running of the model is here.

        Parameters
        ----------
        model : nn.Module
            A pytorch module defining a model to be run
        datasets_manager : DatasetsManager
            A datasets manager that handles all the different datasets
        optimizer : torch.optim
            Any Optimizer object instantiated using  ``torch.optim``
        batch_size : int
            Batch size for the dataset. The same batch size is used for ``train``, ``valid``
            and ``test`` dataset
        save_dir : str
            The experiments are saved in ``save_dir``. We save checkpoints, the best model,
            logs and other information into the save dir
        num_epochs : int
            The number of epochs to run the training
        save_every : int
            The model will be checkpointed every ``save_every`` epochs
        log_train_metrics_every : int
            The train metrics will be reported every ``log_train_metrics_every`` iterations
            during training
        train_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating training metrics
        validation_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating validation metrics
        test_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating test metrics
        experiment_name : str
            The experiment should be given a name for ease of tracking. If an experiment
            name is not given, we generate a unique 10-character SHA for the experiment.
        experiment_hyperparams : Dict[str, Any]
            This is mostly used for tracking the different hyper-params of the experiment
            being run. This may be used by ``wandb`` to save the hyper-params
        tensorboard_logdir : str
            The directory where all the tensorboard runs are stored. If ``None`` is passed
            then it defaults to the tensorboard default of storing the log in the current directory.
        track_for_best : str
            The metric to track for deciding the best model. Anything that
            the metric emits as a single value can be used for tracking. The default value
            is ``loss``. If it is loss, the best value is the lowest one. For some
            other metrics like ``macro_fscore``, the best value is the one that has the highest
            value
        collate_fn : Callable[[List[Any]], List[Any]]
            Collates the different examples into a single batch of examples.
            This is the same terminology adopted from ``pytorch`` and serves the same purpose.
        device : torch.device
            The device on which the model will be placed. If this is "cpu", then the model
            and the tensors will all be on cpu. If this is "cuda:0", then the model and
            the tensors will be placed on cuda device 0. You can mention any other cuda
            device that is suitable for your environment
        gradient_norm_clip_value : float
            To avoid gradient explosion, the gradient norm will be clipped
            if it exceeds this value
        lr_scheduler : torch.optim.lr_scheduler
            Any pytorch ``lr_scheduler`` can be used to reduce the learning rate
            when performance on the validation set stops improving.
        use_wandb : bool
            ``wandb`` (Weights & Biases) is a tool used to track experiments
            online. SciWING comes with inbuilt functionality to track experiments
            on Weights & Biases
        seeds: Dict[str, int]
            The dict of seeds to be set: ``random_seed``, ``pytorch_seed`` and
            ``numpy_seed``, following
            https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py
        """

        if isinstance(device, str):
            device = torch.device(device)

        if seeds is None:
            seeds = {}
        self.seeds = seeds

        self._set_seeds()

        self.model = model
        self.datasets_manager = datasets_manager
        self.train_dataset = self.datasets_manager.train_dataset
        self.validation_dataset = self.datasets_manager.dev_dataset
        self.test_dataset = self.datasets_manager.test_dataset
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.save_dir = pathlib.Path(save_dir)
        self.num_epochs = num_epochs
        self.msg_printer = Printer()
        self.save_every = save_every
        self.log_train_metrics_every = log_train_metrics_every
        self.tensorboard_logdir = tensorboard_logdir
        self.train_metric_calc = train_metric
        self.validation_metric_calc = validation_metric
        self.test_metric_calc = test_metric
        self.summaryWriter = SummaryWriter(log_dir=tensorboard_logdir)
        self.track_for_best = track_for_best
        self.collate_fn = collate_fn
        self.device = device
        self.best_track_value = None
        self.set_best_track_value(self.best_track_value)
        self.gradient_norm_clip_value = gradient_norm_clip_value
        self.lr_scheduler = lr_scheduler
        self.lr_scheduler_is_plateau = isinstance(
            self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
        self.use_wandb = wandb and use_wandb
        self.sample_proportion = sample_proportion
        self.label_namespaces = self.datasets_manager.label_namespaces
        self.datasets_manager.print_stats()

        if experiment_name is None:
            hash_ = hashlib.sha1()
            hash_.update(str(time.time()).encode("utf-8"))
            digest = hash_.hexdigest()
            experiment_name = digest[:10]

        self.experiment_name = experiment_name
        self.experiment_hyperparams = experiment_hyperparams or {}

        if self.use_wandb:
            wandb.init(
                project="project-scwing",
                name=self.experiment_name,
                config=self.experiment_hyperparams,
            )

        if not self.save_dir.is_dir():
            self.save_dir.mkdir(parents=True)

        with open(self.save_dir.joinpath("hyperparams.json"), "w") as fp:
            json.dump(self.experiment_hyperparams, fp)

        self.num_workers = 1
        self.model.to(self.device)

        self.train_loader = self.get_loader(self.train_dataset)
        self.validation_loader = self.get_loader(self.validation_dataset)
        self.test_loader = self.get_loader(self.test_dataset)

        # refresh the iters at the beginning of every epoch
        self.train_iter = None
        self.validation_iter = None
        self.test_iter = None

        # initializing loss meters
        self.train_loss_meter = LossMeter()
        self.validation_loss_meter = LossMeter()

        self.msg_printer.divider("ENGINE STARTING")
        time.sleep(3)

        # get the loggers ready
        self.train_log_filename = self.save_dir.joinpath("train.log")
        self.validation_log_filename = self.save_dir.joinpath("validation.log")
        self.test_log_filename = self.save_dir.joinpath("test.log")

        self.train_logger = logzero.setup_logger(
            name="train-logger",
            logfile=self.train_log_filename,
            level=logging.INFO)
        self.validation_logger = logzero.setup_logger(
            name="valid-logger",
            logfile=self.validation_log_filename,
            level=logging.INFO,
        )
        self.test_logger = logzero.setup_logger(name="test-logger",
                                                logfile=self.test_log_filename,
                                                level=logging.INFO)

        if self.lr_scheduler_is_plateau:
            if self.track_for_best == "loss" and self.lr_scheduler.mode == "max":
                self.msg_printer.warn(
                    "You are optimizing loss and lr scheduler mode is max instead of min"
                )
            if (self.track_for_best in ("macro_fscore", "fscore")
                    and self.lr_scheduler.mode == "min"):
                self.msg_printer.warn(
                    "You are optimizing for macro_fscore and lr scheduler mode is min instead of max"
                )
            if (self.track_for_best == "micro_fscore"
                    and self.lr_scheduler.mode == "min"):
                self.msg_printer.warn(
                    "You are optimizing for micro_fscore and lr scheduler mode is min instead of max"
                )

    def get_loader(self, dataset: Dataset) -> DataLoader:
        """ Returns the DataLoader for the Dataset

        Parameters
        ----------
        dataset : Dataset

        Returns
        -------
        DataLoader
            A pytorch DataLoader

        """
        dataset_size = len(dataset)
        sample_size = int(np.floor(dataset_size * self.sample_proportion))
        indices = np.random.choice(range(dataset_size),
                                   size=sample_size,
                                   replace=False)
        sampler = SubsetRandomSampler(indices=indices)
        loader = DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=True,
            sampler=sampler,
        )
        return loader

    def is_best_lower(self, current_best=None):
        """ Returns True if the current value of the metric is lower than the best value so far.
        This is useful for tracking metrics like loss where the lower the value, the better.

        Parameters
        ----------
        current_best : float
            The current value for the metric that is being tracked

        Returns
        -------
        bool

        """
        return current_best < self.best_track_value

    def is_best_higher(self, current_best=None):
        """ Returns ``True`` if the current value of the metric is higher than the best value so far.
        This is useful for tracking metrics like fscore where the higher the value, the better.

        Parameters
        ----------
        current_best : float
            The current value for the metric that is being tracked

        Returns
        -------
        bool
        """
        return current_best >= self.best_track_value

    def set_best_track_value(self, current_best=None):
        """ Sets the best value of the metric being tracked

        Parameters
        ----------
        current_best : float
            The current value that is best

        """
        if self.track_for_best == "loss":
            self.best_track_value = np.inf if current_best is None else current_best
        elif self.track_for_best == "macro_fscore" or self.track_for_best == "fscore":
            self.best_track_value = 0 if current_best is None else current_best
        elif self.track_for_best == "micro_fscore":
            self.best_track_value = 0 if current_best is None else current_best

    def run(self):
        """
        Run the engine: train and validate for ``num_epochs`` epochs,
        then evaluate the best model on the test dataset.
        """
        for epoch_num in range(self.num_epochs):
            self.train_epoch(epoch_num)
            self.validation_epoch(epoch_num)

        self.test_epoch(self.num_epochs - 1)

    def train_epoch(self, epoch_num: int):
        """ Run the training for one epoch

        Parameters
        ----------
        epoch_num : int
            The current epoch number
        """

        # refresh everything necessary before training begins
        num_iterations = 0
        train_iter = self.get_iter(self.train_loader)
        self.train_loss_meter.reset()
        self.train_metric_calc.reset()
        self.model.train()

        self.msg_printer.info(
            f"Starting Training Epoch: {epoch_num+1}/{self.num_epochs}")
        while True:
            try:
                # N*T, N * 1, N * 1
                lines_labels = next(train_iter)
                lines_labels = list(zip(*lines_labels))
                lines = lines_labels[0]
                labels = lines_labels[1]
                batch_size = len(lines)

                model_forward_out = self.model(
                    lines=lines,
                    labels=labels,
                    is_training=True,
                    is_validation=False,
                    is_test=False,
                )
                self.train_metric_calc.calc_metric(
                    lines=lines,
                    labels=labels,
                    model_forward_dict=model_forward_out)

                try:
                    self.optimizer.zero_grad()
                    loss = model_forward_out["loss"]
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(),
                        max_norm=self.gradient_norm_clip_value)
                    self.optimizer.step()
                    self.train_loss_meter.add_loss(loss.item(), batch_size)

                except KeyError:
                    self.msg_printer.fail(
                        "The model output dictionary does not have "
                        "a key called loss. Please make sure the model "
                        "output contains a loss")
                num_iterations += 1
                if (num_iterations + 1) % self.log_train_metrics_every == 0:
                    metrics = self.train_metric_calc.report_metrics()
                    for label_namespace, table in metrics.items():
                        self.msg_printer.divider(
                            text=f"Train Metrics for {label_namespace.upper()}"
                        )
                        print(table)
            except StopIteration:
                self.train_epoch_end(epoch_num)
                break

    def train_epoch_end(self, epoch_num: int):
        """ Performs house-keeping at the end of a training epoch

        At the end of the training epoch, it does some house-keeping. It reports the average loss, the
        average metric and other information.

        Parameters
        ----------
        epoch_num : int
            The current epoch number (0 based)

        """
        self.msg_printer.divider(f"Training end @ Epoch {epoch_num + 1}")
        average_loss = self.train_loss_meter.get_average()
        self.msg_printer.text("Average Loss: {0}".format(average_loss))
        self.train_logger.info(
            f"Average loss @ Epoch {epoch_num+1} - {average_loss}")
        metric = self.train_metric_calc.get_metric()

        if self.use_wandb:
            wandb.log({"train_loss": average_loss}, step=epoch_num + 1)
            if self.track_for_best != "loss":
                for label_namespace in self.label_namespaces:
                    wandb.log(
                        {
                            f"train_{self.track_for_best}_{label_namespace}":
                            metric[label_namespace][self.track_for_best]
                        },
                        step=epoch_num + 1,
                    )

        # save the model after every `self.save_every` epochs
        if (epoch_num + 1) % self.save_every == 0:
            torch.save(
                {
                    "epoch_num": epoch_num,
                    "optimizer_state": self.optimizer.state_dict(),
                    "model_state": self.model.state_dict(),
                    "loss": average_loss,
                },
                self.save_dir.joinpath(f"model_epoch_{epoch_num+1}.pt"),
            )

        # log loss to tensor board
        self.summaryWriter.add_scalars(
            "train_validation_loss",
            {"train_loss": average_loss or np.inf},
            epoch_num + 1,
        )

    def validation_epoch(self, epoch_num: int):
        """ Runs one validation epoch on the validation dataset

        Parameters
        ----------
        epoch_num : int
            The current 0-based epoch number

        """
        self.model.eval()
        valid_iter = iter(self.validation_loader)
        self.validation_loss_meter.reset()
        self.validation_metric_calc.reset()

        self.msg_printer.info(
            f"Starting Validation Epoch: {epoch_num + 1}/{self.num_epochs}")
        while True:
            try:
                lines_labels = next(valid_iter)
                lines_labels = list(zip(*lines_labels))
                lines = lines_labels[0]
                labels = lines_labels[1]
                batch_size = len(lines)

                with torch.no_grad():
                    model_forward_out = self.model(
                        lines=lines,
                        labels=labels,
                        is_training=False,
                        is_validation=True,
                        is_test=False,
                    )
                loss = model_forward_out["loss"]
                self.validation_loss_meter.add_loss(loss.item(), batch_size)
                self.validation_metric_calc.calc_metric(
                    lines=lines,
                    labels=labels,
                    model_forward_dict=model_forward_out)
            except StopIteration:
                self.validation_epoch_end(epoch_num)
                break

    def validation_epoch_end(self, epoch_num: int):
        """Performs house-keeping at the end of validation epoch

        Parameters
        ----------
        epoch_num : int
            The current epoch number
        """

        self.msg_printer.divider(f"Validation @ Epoch {epoch_num+1}")

        metric_report = self.validation_metric_calc.report_metrics()

        average_loss = self.validation_loss_meter.get_average()

        for label_namespace, table in metric_report.items():
            self.msg_printer.divider(
                text=f"Validation Metrics for {label_namespace.upper()}")
            print(table)

        self.msg_printer.text(f"Average Loss: {average_loss}")

        self.validation_logger.info(
            f"Validation Loss @ Epoch {epoch_num+1} - {average_loss}")

        if self.use_wandb:
            wandb.log({"validation_loss": average_loss}, step=epoch_num + 1)
            metric = self.validation_metric_calc.get_metric()
            if self.track_for_best != "loss":
                for label_namespace in self.label_namespaces:
                    wandb.log(
                        {
                            f"validation_{self.track_for_best}_{label_namespace}":
                            metric[label_namespace][self.track_for_best]
                        },
                        step=epoch_num + 1,
                    )

        self.summaryWriter.add_scalars(
            "train_validation_loss",
            {"validation_loss": average_loss or np.inf},
            epoch_num + 1,
        )

        is_best: Optional[bool] = None
        value_tracked: Optional[float] = None
        if self.track_for_best == "loss":
            value_tracked = average_loss
            is_best = self.is_best_lower(average_loss)
        elif self.track_for_best in ("micro_fscore", "macro_fscore", "fscore"):
            # If there are multiple namespaces for the metric
            # we decide the best model based on the average score
            values_tracked = []
            metrics = self.validation_metric_calc.get_metric()
            for label_namespace in self.label_namespaces:
                value_tracked = metrics[label_namespace][self.track_for_best]
                values_tracked.append(value_tracked)

            value_tracked = sum(values_tracked) / len(values_tracked)
            is_best = self.is_best_higher(current_best=value_tracked)

        if self.lr_scheduler is not None:
            self.lr_scheduler.step(value_tracked)

        if is_best:
            self.set_best_track_value(current_best=value_tracked)
            self.msg_printer.good(f"Found Best Model @ epoch {epoch_num + 1}")
            torch.save(
                {
                    "epoch_num": epoch_num,
                    "optimizer_state": self.optimizer.state_dict(),
                    "model_state": self.model.state_dict(),
                    "loss": average_loss,
                },
                self.save_dir.joinpath("best_model.pt"),
            )

    def test_epoch(self, epoch_num: int):
        """Runs the test epoch for ``epoch_num``

        Loads the best model that is saved during the training
        and runs the test dataset.

        Parameters
        ----------
        epoch_num : int
            zero based epoch number for which the test dataset is run
            This is after the last training epoch.

        """
        self.msg_printer.divider("Running on Test Batch")
        self.load_model_from_file(self.save_dir.joinpath("best_model.pt"))
        self.model.eval()
        test_iter = iter(self.test_loader)
        while True:
            try:
                lines_labels = next(test_iter)
                lines_labels = list(zip(*lines_labels))
                lines = lines_labels[0]
                labels = lines_labels[1]

                with torch.no_grad():
                    model_forward_out = self.model(
                        lines=lines,
                        labels=labels,
                        is_training=False,
                        is_validation=False,
                        is_test=True,
                    )
                self.test_metric_calc.calc_metric(
                    lines=lines,
                    labels=labels,
                    model_forward_dict=model_forward_out)
            except StopIteration:
                self.test_epoch_end(epoch_num)
                break

    def test_epoch_end(self, epoch_num: int):
        """ Performs house-keeping at the end of the test epoch

        It reports the metric that is being traced at the end
        of the test epoch

        Parameters
        ----------
        epoch_num : int
            Epoch num after which the test dataset is run

        """
        metric_report = self.test_metric_calc.report_metrics()
        for label_namespace, table in metric_report.items():
            self.msg_printer.divider(
                text=f"Test Metrics for {label_namespace.upper()}")
            print(table)

        precision_recall_fmeasure = self.test_metric_calc.get_metric()
        self.msg_printer.divider(f"Test @ Epoch {epoch_num+1}")
        self.test_logger.info(
            f"Test Metrics @ Epoch {epoch_num+1} - {precision_recall_fmeasure}"
        )
        if self.use_wandb:
            wandb.log({"test_metrics": str(precision_recall_fmeasure)})

        self.summaryWriter.close()

    def get_train_dataset(self):
        """ Returns the train dataset of the experiment

        Returns
        -------
        Dataset
            Anything that conforms to the pytorch style dataset.

        """
        return self.train_dataset

    def get_validation_dataset(self):
        """ Returns the validation dataset of the experiment

        Returns
        -------
        Dataset
            Anything that conforms to the pytorch style dataset.

        """
        return self.validation_dataset

    def get_test_dataset(self):
        """ Returns the test dataset of the experiment

        Returns
        -------
        Dataset
            Anything that conforms to the pytorch style dataset.

        """
        return self.test_dataset

    @staticmethod
    def get_iter(loader: DataLoader) -> Iterator:
        """ Returns the iterator for a pytorch data loader.

        The ``loader`` is a pytorch DataLoader that iterates
        over the dataset in batches and employs many strategies to do
        so. We want an iterator that returns the dataset in batches.
        The end of the iterator would signify the end of an epoch
        and then we can use that information to perform house-keeping.


        Parameters
        ----------
        loader : DataLoader
            a pytorch data loader

        Returns
        -------
        Iterator
            An iterator over the data loader
        """
        iterator = iter(loader)
        return iterator

    def load_model_from_file(self, filename: str):
        self.msg_printer.divider("LOADING MODEL FROM FILE")
        with self.msg_printer.loading(
                f"Loading Pytorch Model from file {filename}"):
            model_chkpoint = torch.load(filename)

        self.msg_printer.good("Finished Loading the Model")

        model_state = model_chkpoint["model_state"]
        self.model.load_state_dict(model_state)

    def _set_seeds(self):
        seed = self.seeds.get("random_seed", 17290)
        numpy_seed = self.seeds.get("numpy_seed", 1729)
        torch_seed = self.seeds.get("pytorch_seed", 172)

        if seed is not None:
            random.seed(seed)
        if numpy_seed is not None:
            np.random.seed(numpy_seed)
        if torch_seed is not None:
            torch.manual_seed(torch_seed)
            # Seed all GPUs with the same seed if available.
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(torch_seed)
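A hedged usage sketch for the Engine above. The model, datasets manager, and metrics passed in are placeholders for objects satisfying the interfaces documented in the constructor; the hyperparameters are illustrative, not recommendations:

import torch.optim as optim

def build_engine(model, datasets_manager, train_metric, validation_metric, test_metric):
    # Wires up an Engine run; every argument mirrors a parameter documented above.
    return Engine(
        model=model,
        datasets_manager=datasets_manager,
        optimizer=optim.Adam(model.parameters(), lr=1e-3),
        batch_size=32,
        save_dir="experiments/run_01",
        num_epochs=10,
        save_every=1,
        log_train_metrics_every=50,
        train_metric=train_metric,
        validation_metric=validation_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",  # higher is better for fscore-style metrics
        device="cpu",
    )

# engine = build_engine(...)
# engine.run()  # trains, validates each epoch, then tests the best checkpoint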
Example #16
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")
    # Load the data in one go – might take a while but okay in this case
    train_data = _load_file(train_path, msg)
    dev_data = _load_file(dev_path, msg)

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Develoment data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    with msg.loading("Analyzing corpus..."):
        train_data = read_json_object(train_data)
        dev_data = read_json_object(dev_data)
        corpus = GoldCorpus(train_data, dev_data)
        train_docs = list(corpus.train_docs(nlp))
        dev_docs = list(corpus.dev_docs(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_data = _compile_gold(train_docs, pipeline)
    train_texts = gold_data["texts"]
    dev_texts = set([doc.text for doc, gold in dev_docs])

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
        )
    )
    most_common_words = gold_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
        label_counts = gold_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        if new_labels:
            labels_with_counts = [
                (label, count)
                for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

        if gold_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"]))
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_data["textcat"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_data["textcat"].most_common(), counts=True
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")
        labels = [label for label in gold_data["deps"]]
        msg.info(
            "{} {} in data".format(
                len(labels), "label" if len(labels) == 1 else "labels"
            )
        )
        labels_with_counts = _format_labels(
            gold_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))

    if fail_counts:
        sys.exit(1)
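debug_data relies on a small _format_labels helper that is not shown in this example. A plausible minimal sketch, assuming it joins labels (optionally paired with their counts) into a single comma-separated string:

def _format_labels(labels, counts=False):
    # With counts=True, `labels` is assumed to be (label, count) pairs,
    # e.g. from Counter.most_common(); otherwise a plain iterable of labels.
    if counts:
        return ", ".join("'{}' ({})".format(l, c) for l, c in labels)
    return ", ".join("'{}'".format(l) for l in labels)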
Example #17
class Terminal:
    def __init__(self):
        self.printer = Printer()

        self._pending_ecode = False

    @property
    def pending_ecode(self):
        """Whether a console task is awaiting exit code."""
        return self._pending_ecode

    @pending_ecode.setter
    def pending_ecode(self, value):
        """Set whether a console task is awaiting exit code."""
        self._pending_ecode = value

    def cmd_head(self, task_name: str, task_text: str):
        script_name = color(SCRIPT_NAME, bold=True)
        self.printer.text(f"{script_name} {task_name}: {task_text}")

    def echo(self, title: str, details: str = None, spaced: bool = True):
        if details is not None:
            self.printer.text(title=title, text=details, spaced=spaced)
        else:
            self.printer.text(title=title, spaced=spaced)

    def task(self, task_name: str, task_details: str = None, wait: bool = True):
        if self.pending_ecode:
            raise Exception("Task already running")

        # A waiting task prints without trailing spacing and stays pending
        # until ok()/fail() reports its exit status.
        self.echo(title=task_name, details=task_details, spaced=not wait)
        if wait:
            self.pending_ecode = True

    def ok(self, exit: int = None):
        if not self.pending_ecode:
            raise errors.NoPendingStatusError()

        self.printer.good("OK")
        self.pending_ecode = False

        if exit is not None:
            sys.exit(exit)

    def fail(self, exit: int = None):
        if not self.pending_ecode:
            raise errors.NoPendingStatusError()

        self.printer.fail("FAIL")
        self.pending_ecode = False

        if exit is not None:
            sys.exit(exit)

    def exit(self, code: int = 0, message: str = None):
        if self.pending_ecode:
            self.fail()

        if message is not None:
            self.echo(message)

        # Terminate with the requested exit code.
        sys.exit(code)
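A hedged usage sketch for the Terminal wrapper above; build_fn is a hypothetical callable standing in for the actual work of a task:

def run_build_step(term: Terminal, build_fn):
    # Start a waiting task, then report OK or FAIL based on the outcome.
    term.task("build", "compiling sources", wait=True)
    try:
        build_fn()
        term.ok()
    except Exception:
        term.fail(exit=1)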
Example #18
def link(origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).
    """
    msg = Printer()
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
            "The data should be located in {}".format(path2str(model_path)),
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        msg.fail(
            "Can't find the spaCy data path to create model symlink",
            "Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be located "
            "here:".format(path=spacy_loc),
            exits=1,
        )
    link_path = util.get_data_path() / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
            "Link '{}' already exists".format(link_name),
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )
    elif link_path.is_symlink():  # does a symlink exist?
        # NB: It's important to check for is_symlink here and not for exists,
        # because invalid/outdated symlinks would return False otherwise.
        link_path.unlink()
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
            "Can't overwrite symlink '{}'".format(link_name),
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        msg.fail(
            "Couldn't link model to '{}'".format(link_name),
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.",
        )
        msg.text(details)
        raise
    msg.good("Linking successful", details)
    msg.text(
        "You can now load the model via spacy.load('{}')".format(link_name))
Example #19
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    msg = Printer()
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
    incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
    incompat_models.update(
        [d["name"] for _, d in model_links.items() if not d["compat"]]
    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
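The version handling above strips development suffixes before looking up the compatibility table, so dev builds resolve to their release entry; a minimal sketch of that step:

def strip_dev_suffix(version: str) -> str:
    # "2.1.0.dev3" -> "2.1.0", mirroring version.rsplit(".dev", 1)[0] above.
    return version.rsplit(".dev", 1)[0]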
Example #20
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # TODO: move validation to GoldCorpus in order to be able to load from dir
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Develoment data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    loading_train_error_message = ""
    loading_dev_error_message = ""
    with msg.loading("Loading corpus..."):
        corpus = GoldCorpus(train_path, dev_path)
        try:
            train_docs = list(corpus.train_docs(nlp))
            train_docs_unpreprocessed = list(
                corpus.train_docs_without_preprocessing(nlp))
        except ValueError as e:
            loading_train_error_message = "Training data cannot be loaded: {}".format(
                str(e))
        try:
            dev_docs = list(corpus.dev_docs(nlp))
        except ValueError as e:
            loading_dev_error_message = "Development data cannot be loaded: {}".format(
                str(e))
    if loading_train_error_message or loading_dev_error_message:
        if loading_train_error_message:
            msg.fail(loading_train_error_message)
        if loading_dev_error_message:
            msg.fail(loading_dev_error_message)
        sys.exit(1)
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_train_data = _compile_gold(train_docs, pipeline)
    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed,
                                                   pipeline)
    gold_dev_data = _compile_gold(dev_docs, pipeline)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail(
            "Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(
            "{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs))
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info("{} total {} in the data ({} unique)".format(
        n_words, "word" if n_words == 1 else "words",
        len(gold_train_data["words"])))
    if gold_train_data["n_misaligned_words"] > 0:
        msg.warn("{} misaligned tokens in the training data".format(
            gold_train_data["n_misaligned_words"]))
    if gold_dev_data["n_misaligned_words"] > 0:
        msg.warn("{} misaligned tokens in the dev data".format(
            gold_dev_data["n_misaligned_words"]))
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info("{} vectors ({} unique keys, {} dimensions)".format(
            len(nlp.vocab.vectors),
            nlp.vocab.vectors.n_keys,
            nlp.vocab.vectors_length,
        ))
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_train_data["ner"]
                     if label not in ("O", "-"))
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info("{} new {}, {} existing {}".format(
            len(new_labels),
            "label" if len(new_labels) == 1 else "labels",
            len(existing_labels),
            "label" if len(existing_labels) == 1 else "labels",
        ))
        missing_values = label_counts["-"]
        msg.text("{} missing {} (tokens with '-' label)".format(
            missing_values, "value" if missing_values == 1 else "values"))
        if new_labels:
            labels_with_counts = [
                (label, count) for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts,
                                                counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text("Existing: {}".format(_format_labels(existing_labels)),
                     show=verbose)

        if gold_train_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity spans".format(
                gold_train_data["ws_ents"]))
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]))
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(
                            label))
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good(
                "No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} instances of the new label".format(
                    NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid.")

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_train_data["textcat"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info("Text Classification: {} new label(s), {} existing label(s)".
                 format(len(new_labels), len(existing_labels)))
        if new_labels:
            labels_with_counts = _format_labels(
                gold_train_data["textcat"].most_common(), counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text("Existing: {}".format(_format_labels(existing_labels)),
                     show=verbose)

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info("{} {} in data ({} {} in tag map)".format(
            len(labels),
            "label" if len(labels) == 1 else "labels",
            len(tag_map),
            "label" if len(tag_map) == 1 else "labels",
        ))
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(
                nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang))

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info("Found {} sentence{} with an average length of {:.1f} words.".
                 format(
                     gold_train_data["n_sents"],
                     "s" if len(train_docs) > 1 else "",
                     gold_train_data["n_words"] / gold_train_data["n_sents"]))

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            msg.info("Found {} nonprojective train sentence{}".format(
                gold_train_unpreprocessed_data["n_nonproj"], "s"
                if gold_train_unpreprocessed_data["n_nonproj"] > 1 else ""))
        if gold_dev_data["n_nonproj"] > 0:
            msg.info("Found {} nonprojective dev sentence{}".format(
                gold_dev_data["n_nonproj"],
                "s" if gold_dev_data["n_nonproj"] > 1 else ""))

        msg.info("{} {} in train data".format(
            len(labels_train_unpreprocessed),
            "label" if len(labels_train) == 1 else "labels"))
        msg.info("{} {} in projectivized train data".format(
            len(labels_train),
            "label" if len(labels_train) == 1 else "labels"))

        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD:
                msg.warn("Low number of examples for label '{}' ({})".format(
                    label, gold_train_unpreprocessed_data["deps"][label]))
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            count = gold_train_data["deps"][label]
            if count <= DEP_LABEL_THRESHOLD and "||" in label:
                rare_projectivized_labels.append("{}: {}".format(label, count))

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                "Low number of examples for {} label{} in the "
                "projectivized dependency trees used for training. You may "
                "want to projectivize labels such as punct before "
                "training in order to improve parser performance.".format(
                    len(rare_projectivized_labels),
                    "s" if len(rare_projectivized_labels) > 1 else ""))
            msg.warn("Projectivized labels with low numbers of examples: "
                     "{}".format("\n".join(rare_projectivized_labels)),
                     show=verbose)
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn("The following labels were found only in the train data: "
                     "{}".format(
                         ", ".join(set(labels_train) - set(labels_dev))),
                     show=verbose)

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn("The following labels were found only in the dev data: "
                     "{}".format(
                         ", ".join(set(labels_dev) - set(labels_train))),
                     show=verbose)

        if has_low_data_warning:
            msg.text(
                "To train a parser, your data should include at "
                "least {} instances of each label.".format(
                    DEP_LABEL_THRESHOLD),
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                "Multiple root labels ({}) ".format(", ".join(
                    gold_train_unpreprocessed_data["roots"])) +
                "found in training data. spaCy's parser uses a single root "
                "label ROOT so this distinction will not be available.")

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                "Found {} nonprojective projectivized train sentence{}".format(
                    gold_train_data["n_nonproj"],
                    "s" if gold_train_data["n_nonproj"] > 1 else ""))
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                "Found {} projectivized train sentence{} with cycles".format(
                    gold_train_data["n_cycles"],
                    "s" if gold_train_data["n_cycles"] > 1 else ""))

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good("{} {} passed".format(
            good_counts, "check" if good_counts == 1 else "checks"))
    if warn_counts:
        msg.warn("{} {}".format(warn_counts,
                                "warning" if warn_counts == 1 else "warnings"))
    if fail_counts:
        msg.fail("{} {}".format(fail_counts,
                                "error" if fail_counts == 1 else "errors"))

    if fail_counts:
        sys.exit(1)
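
The summary block above relies on wasabi's Printer keeping a tally of every good/warn/fail call it handles. A minimal sketch of that counting pattern (assuming wasabi is installed; the messages are placeholders):

from wasabi import Printer, MESSAGES

msg = Printer()
msg.good("Data looks fine")
msg.warn("Low number of examples")
msg.warn("Labels only in train data")
print(msg.counts[MESSAGES.GOOD], msg.counts[MESSAGES.WARN])  # 1 2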
Example No. 21
0
def package(input_dir,
            output_dir,
            meta_path=None,
            create_meta=False,
            force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    msg = Printer()
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if meta_path.is_file():
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good("Loaded meta.json from file", meta_path)
        else:
            meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory {path} and try again, or use "
                "the `--force` flag to overwrite existing "
                "directories.".format(path=path2str(package_path)),
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path),
                    path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v),
             main_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
Example No. 22
0
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    bilstm_depth=2,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc. to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # The weight file keeps the default name, so read epoch_start
            # from it by stripping the 'model' prefix and '.bin' suffix
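            # (e.g. a default file name like 'model24.bin' resumes at epoch 25)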
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            if not epoch_start:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" %
                epoch_start)
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r")
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" %
                                (epoch, is_temp_str))).open("wb") as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
                util.minibatch_by_words(((text, None) for text in texts),
                                        size=batch_size)):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(model,
                               docs,
                               optimizer,
                               objective=loss_func,
                               drop=dropout)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10**7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn("Skipped {count} empty values".format(count=skip_counter))
    msg.good("Successfully finished pretrain")
Example No. 23
0
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and any(p.is_dir() for p in output_path.iterdir()):
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
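    # For example, with the defaults above the batch size compounds from 100
    # towards 1000 (100.0, 100.1, 100.2, ... via repeated * 1.001), while the
    # dropout generator yields a constant 0.2, since dropout_from equals
    # dropout_to and the decay rate is 0.0.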

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                       device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = [
        "Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F",
        "Tag %", "Token %", "CPU WPS", "GPU WPS"
    ]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(nlp,
                                           noise_level=noise_level,
                                           gold_preproc=gold_preproc,
                                           max_length=0)
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(
                                epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded,
                                                gold_preproc=gold_preproc))
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(i - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path,
                                                  nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example No. 24
0
    def fit(self, kb_aliases: List[str], verbose: bool = False):
        """Build tfidf vectorizer and ann index.
        Warning: Running this function can take a lot of memory
        
        kb_aliases (List[str]): Aliases in the KnoweledgeBase to fit 
            the ANN index on.
        verbose (bool, optional): Set to True to get print updates while fitting the index. Defaults to False.
        
        RETURNS (CandidateGenerator): An initialized CandidateGenerator
        """
        msg = Printer(no_print=not verbose)

        # kb_aliases = self.kb.get_alias_strings()
        short_aliases = {a for a in kb_aliases if len(a) < 4}

        # nmslib hyperparameters (very important)
        # guide: https://github.com/nmslib/nmslib/blob/master/python_bindings/parameters.md
        # Typical values: M = 100, efConstruction = 2000 (`C` for
        # Construction, set to the maximum recommended value; improves recall
        # at the expense of longer indexing time) and indexThreadQty = 60
        # (set based on the machine). The actual values come from self.*.
        index_params = {
            "M": self.m_parameter,
            "indexThreadQty": self.n_threads,
            "efConstruction": self.ef_construction,
            "post": 0,
        }

        # NOTE: here we are creating the tf-idf vectorizer with float32 type, but we can serialize the
        # resulting vectors using float16, meaning they take up half the memory on disk. Unfortunately
        # we can't use the float16 format to actually run the vectorizer, because of this bug in sparse
        # matrix representations in scipy: https://github.com/scipy/scipy/issues/7408
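        # A hedged illustration of that serialization trick (not part of the
        # original code): cast with alias_tfidfs.astype(np.float16) before
        # writing to disk, and back to np.float32 after loading.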

        msg.text(f"Fitting tfidf vectorizer on {len(kb_aliases)} aliases")
        tfidf_vectorizer = TfidfVectorizer(analyzer="char_wb",
                                           ngram_range=(3, 3),
                                           min_df=1,
                                           dtype=np.float32)
        start_time = timer()
        alias_tfidfs = tfidf_vectorizer.fit_transform(kb_aliases)
        end_time = timer()
        total_time = end_time - start_time
        msg.text(
            f"Fitting and saving vectorizer took {round(total_time)} seconds")

        msg.text(f"Finding empty (all zeros) tfidf vectors")
        non_empty_tfidfs_boolean_flags = np.array(
            alias_tfidfs.sum(axis=1) != 0).reshape(-1, )
        number_of_empty_tfidfs = sum(non_empty_tfidfs_boolean_flags == False)  # pylint: disable=singleton-comparison
        total_number_of_tfidfs = np.size(alias_tfidfs, 0)

        msg.text(
            f"Deleting {number_of_empty_tfidfs}/{total_number_of_tfidfs} aliases because their tfidf is empty"
        )
        # remove empty tfidf vectors, otherwise nmslib will crash
        aliases = [
            alias
            for alias, flag in zip(kb_aliases, non_empty_tfidfs_boolean_flags)
            if flag
        ]
        alias_tfidfs = alias_tfidfs[non_empty_tfidfs_boolean_flags]
        assert len(aliases) == np.size(alias_tfidfs, 0)

        msg.text(f"Fitting ann index on {len(aliases)} aliases")
        start_time = timer()
        ann_index = nmslib.init(
            method="hnsw",
            space="cosinesimil_sparse",
            data_type=nmslib.DataType.SPARSE_VECTOR,
        )
        ann_index.addDataPointBatch(alias_tfidfs)
        ann_index.createIndex(index_params, print_progress=verbose)
        query_time_params = {"efSearch": self.ef_search}
        ann_index.setQueryTimeParams(query_time_params)
        end_time = timer()
        total_time = end_time - start_time
        msg.text(f"Fitting ann index took {round(total_time)} seconds")

        self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer,
                         alias_tfidfs)
        return self
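
A hedged sketch of how fit() might be used, assuming a CandidateGenerator instance `candidate_generator` built around a populated spaCy KnowledgeBase `kb` (both placeholders):

aliases = kb.get_alias_strings()
candidate_generator = candidate_generator.fit(aliases, verbose=True)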
Example No. 25
0
def link(origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).
    """
    msg = Printer()
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
            "The data should be located in {}".format(path2str(model_path)),
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        msg.fail(
            "Can't find the spaCy data path to create model symlink",
            "Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be "
            "located here: {path}".format(path=path2str(spacy_loc)),
            exits=1,
        )
    link_path = util.get_data_path() / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
            "Link '{}' already exists".format(link_name),
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )
    elif link_path.is_symlink():  # does a symlink exist?
        # NB: It's important to check for is_symlink here and not for exists,
        # because invalid/outdated symlinks would return False otherwise.
        link_path.unlink()
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
            "Can't overwrite symlink '{}'".format(link_name),
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        msg.fail(
            "Couldn't link model to '{}'".format(link_name),
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.",
        )
        msg.text(details)
        raise
    msg.good("Linking successful", details)
    msg.text("You can now load the model via spacy.load('{}')".format(link_name))