def to_disk(self, path: Path, **kwargs):
        """Serialize CandidateGenerator to disk
        
        path (Path): Directory to serialize to
        """
        cfg = {
            "k": self.k,
            "m_parameter": self.m_parameter,
            "ef_search": self.ef_search,
            "ef_construction": self.ef_construction,
            "n_threads": self.n_threads,
        }
        serializers = {
            "cg_cfg": lambda p: srsly.write_json(p, cfg),
            "aliases": lambda p: srsly.write_json(p.with_suffix(".json"), self.aliases),
            "short_aliases": lambda p: srsly.write_json(p.with_suffix(".json"), self.short_aliases),
            "ann_index": lambda p: self.ann_index.saveIndex(str(p.with_suffix(".bin"))),
            "tfidf_vectorizer": lambda p: joblib.dump(self.vectorizer, p.with_suffix(".joblib")),
            "tfidf_vectors_sparse": lambda p: scipy.sparse.save_npz(
                p.with_suffix(".npz"), self.alias_tfidfs.astype(np.float16)),
        }

        to_disk(path, serializers, {})
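The closing `to_disk(path, serializers, {})` call is a small helper that maps each serializer key to a file under `path`. A minimal sketch of that pattern, assuming it mirrors spaCy's `util.to_disk` (the third argument being keys to exclude):

from pathlib import Path

def to_disk(path, serializers, exclude):
    # Create the target directory, then call each writer with "path / key".
    path = Path(path)
    if not path.exists():
        path.mkdir(parents=True)
    for key, writer in serializers.items():
        if key not in exclude:
            writer(path / key)
    return path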
Example #2
    def from_bytes(self, bytes_data):
        msg = srsly.msgpack_loads(bytes_data)
        config_dict = msg["config"]
        tok_dict = msg["tokenizer"]
        if config_dict:
            with make_tempdir() as temp_dir:
                config_file = temp_dir / "config.json"
                srsly.write_json(config_file, config_dict)
                config = AutoConfig.from_pretrained(config_file)
                for x, x_bytes in tok_dict.items():
                    Path(temp_dir / x).write_bytes(x_bytes)
                tokenizer = AutoTokenizer.from_pretrained(
                    str(temp_dir.absolute()))

            transformer = AutoModel.from_config(config)
            self._hfmodel = HFObjects(tokenizer, transformer,
                                      SimpleFrozenDict(), SimpleFrozenDict())
            self._model = transformer
            filelike = BytesIO(msg["state"])
            filelike.seek(0)
            ops = get_current_ops()
            if ops.device_type == "cpu":
                map_location = "cpu"
            else:  # pragma: no cover
                device_id = torch.cuda.current_device()
                map_location = f"cuda:{device_id}"
            self._model.load_state_dict(
                torch.load(filelike, map_location=map_location))
            self._model.to(map_location)
        else:
            self._hfmodel = HFObjects(None, None,
                                      msg["_init_tokenizer_config"],
                                      msg["_init_transformer_config"])
        return self
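The msgpack payload unpacked here would be produced by a matching `to_bytes`. A rough sketch, assuming `HFObjects` exposes a `tokenizer` attribute and ignoring the branch for an uninitialized model:

    def to_bytes(self):
        # Hypothetical counterpart to from_bytes: pack the HF config, the
        # tokenizer files and the PyTorch state dict under the same keys.
        buf = BytesIO()
        torch.save(self._model.state_dict(), buf)
        tok_dict = {}
        with make_tempdir() as temp_dir:
            self._hfmodel.tokenizer.save_pretrained(str(temp_dir.absolute()))
            for f in temp_dir.iterdir():
                tok_dict[f.name] = f.read_bytes()
        msg = {
            "config": self._model.config.to_dict(),
            "tokenizer": tok_dict,
            "state": buf.getvalue(),
        }
        return srsly.msgpack_dumps(msg)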
Example #3
 def to_disk(self, path: Union[str, Path], exclude=tuple(), disable=None):
     """Overrides Language.to_disk to save `nlp.lang` properly."""
     path = Path(path)
     super().to_disk(path, exclude, disable)
     meta = self.meta
     meta["lang"] = self.lang
     srsly.write_json(path / "meta.json", meta)
Example #4
    def to_disk(self, output_path: Path, force: bool = False, save_examples: bool = True) -> None:
        """Save Corpus to Disk

        Args:
            output_path (Path): Output file path to save data to
            force (bool): Force save to directory. Create parent directories
                or overwrite existing data.
            save_examples (bool): Save the example store along with the state.
        """
        output_path = ensure_path(output_path)
        output_dir = output_path.parent
        state_dir = output_dir / ".recon" / self.name
        if force:
            output_dir.mkdir(parents=True, exist_ok=True)

            if not state_dir.exists():
                state_dir.mkdir(parents=True, exist_ok=True)

        ds_op_state = DatasetOperationsState(
            name=self.name, commit=self.commit_hash, size=len(self), operations=self.operations
        )
        srsly.write_json(state_dir / "state.json", ds_op_state.dict())

        if save_examples:
            self.example_store.to_disk(state_dir / "example_store.jsonl")

        srsly.write_jsonl(output_path, [e.dict() for e in self.data])
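A usage sketch for the `Corpus.to_disk` method above (paths are illustrative): the examples go to the given file, while state is written to a `.recon/<name>/` directory next to it.

corpus.to_disk(Path("data/train.jsonl"), force=True)
# Produces:
#   data/train.jsonl                         - one JSON object per example
#   data/.recon/<name>/state.json            - the DatasetOperationsState
#   data/.recon/<name>/example_store.jsonl   - the example store (save_examples=True)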
Example #5
def convert(
    input_file,
    output_dir="-",
    file_type="jsonl",
    n_sents=1,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with the train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe it forward to a JSONL file:
    $ spacy convert some_file.conllu > some_file.jsonl
    """
    msg = Printer()
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    input_data = input_path.open("r", encoding="utf-8").read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
Example #6
def sort(path: Path):
    """Sort the strings from the vocabulary of a spaCy model.

    For the original code of StringStore.to_disk(), see https://github.com/explosion/spaCy/blob/53a3b967ac704ff0a67a7102ede6d916e2a4545a/spacy/strings.pyx#L219-L227.
    """
    st = StringStore().from_disk(path)
    strings = sorted(st)
    srsly.write_json(path, strings)
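Usage is a single call on an existing `strings.json`; reading the file back with srsly confirms the entries are sorted (the path is illustrative).

from pathlib import Path
import srsly

sort(Path("my_model/vocab/strings.json"))
strings = srsly.read_json("my_model/vocab/strings.json")
assert strings == sorted(strings)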
Example #7
def _write_docs_to_file(data: Any, output_file: Path, output_type: str) -> None:
    if not output_file.parent.exists():
        output_file.parent.mkdir(parents=True)
    if output_type == "json":
        srsly.write_json(output_file, data)
    else:
        with output_file.open("wb") as file_:
            file_.write(data)
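A usage sketch for the helper above; the `output_type` flag decides between srsly's JSON writer and a raw binary dump (the variable names and the non-JSON type value are hypothetical).

_write_docs_to_file(doc_dicts, Path("out/docs.json"), "json")       # via srsly.write_json
_write_docs_to_file(docbin_bytes, Path("out/docs.spacy"), "spacy")  # raw bytes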
Example #8
def _init_labels(nlp, output_path):
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, component.label_data)
            msg.good(
                f"Saving label data for component '{name}' to {output_file}")
        else:
            msg.info(f"No label data found for component '{name}'")
Example #9
def main(
    # fmt: off
    spacy_version: str = typer.Argument(">=3.0.0rc1,<3.1.0",
                                        help="The spaCy version range"),
    spacy_streamlit_version: str = typer.Argument(
        ">=1.0.0rc0,<1.1.0", help="The version range of spacy-streamlit"),
    req_path: Path = typer.Option(Path(__file__).parent / "requirements.txt",
                                  "--requirements-path",
                                  "-rp",
                                  help="Path to requirements.txt"),
    desc_path: Path = typer.Option(
        Path(__file__).parent / "models.json",
        "--models-json-path",
        "-mp",
        help="Path to models.json with model details for dropdown"),
    package: str = typer.Option(
        "spacy-nightly",
        "--package",
        "-p",
        help="The parent package (spacy, spacy-nightly, etc.)"),
    exclude: str = typer.Option("en_vectors_web_lg,xx_ent_wiki_sm",
                                "--exclude",
                                "-e",
                                help="Comma-separated model names to exclude"),
    # fmt: on
):
    exclude = [name.strip() for name in exclude.split(",")]
    r = requests.get(COMPAT_URL)
    r.raise_for_status()
    compat = r.json()["spacy"]
    data = None
    for version_option in compat:
        if is_compatible_version(version_option, spacy_version):
            data = compat[version_option]
            break
    if data is None:
        raise ValueError(f"No compatible models found for {spacy_version}")
    reqs = [
        f"# Auto-generated by {Path(__file__).name}",
        f"{package}{spacy_version}",
        f"spacy-streamlit{spacy_streamlit_version}",
    ]
    models = {}
    for model_name, model_versions in data.items():
        if model_name not in exclude and model_versions:
            reqs.append(
                URL_TEMPLATE.format(name=model_name,
                                    version=model_versions[0]))
            lang = model_name.split("_", 1)[0]
            lang_name = get_lang_class(lang).__name__
            models[model_name] = f"{lang_name} ({model_name})"
    with Path(req_path).open("w", encoding="utf8") as f:
        f.write("\n".join(reqs))
    srsly.write_json(desc_path, models)
    print(
        f"Generated requirements.txt and models.json for {len(reqs) - 1} models"
    )
Example #10
    def load_with_spacy(self):
        """
        This function will convert the CoNLL02/03 format to JSON format for spaCy.
        As the function will return a spacy.gold.GoldCorpus, which needs a dev set,
        this function also splits the dataset into a 70/30 split, as is done by
        Pan et al. (2017).

        - Pan et al. (2017): https://aclweb.org/anthology/P17-1178
        :return:
        """
        import srsly
        from spacy.cli.converters import conll_ner2json
        from spacy.gold import GoldCorpus
        from spacy.gold import Path

        conll_path = os.path.join(self.dataset_dir,
                                  self.dataset_name + self.file_extension)
        dev_json_path = os.path.join(self.dataset_dir,
                                     self.dataset_name + "dev.json")
        train_json_path = os.path.join(self.dataset_dir,
                                       self.dataset_name + "train.json")

        if not os.path.isfile(dev_json_path) or not os.path.isfile(
                train_json_path):
            # Convert the conll ner files to json
            with open(conll_path, 'r') as file:
                file_as_string = file.read()
                file_as_json = conll_ner2json(file_as_string)

                all_sents = file_as_json[0]['paragraphs'][0]['sentences']
                train_sents, dev_sents = train_test_split(all_sents,
                                                          test_size=0.3,
                                                          random_state=42)

                train_json = [{
                    'id': 0,
                    'paragraphs': [{
                        'sentences': train_sents
                    }]
                }]
                dev_json = [{
                    'id': 0,
                    'paragraphs': [{
                        'sentences': dev_sents
                    }]
                }]

                srsly.write_json(train_json_path, train_json)
                srsly.write_json(dev_json_path, dev_json)

        assert os.path.isfile(train_json_path) and os.path.isfile(
            dev_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
Example #11
    def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Serialize a Sense2Vec object to a directory.

        path (unicode / Path): The path.
        exclude (list): Names of serialization fields to exclude.
        """
        path = Path(path)
        self.vectors.to_disk(path)
        srsly.write_json(path / "cfg", self.cfg)
        srsly.write_json(path / "freqs.json", list(self.freqs.items()))
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
Example #12
def test_issue4402():
    nlp = English()
    with make_tempdir() as tmpdir:
        print("temp", tmpdir)
        json_path = tmpdir / "test4402.json"
        srsly.write_json(json_path, json_data)

        corpus = GoldCorpus(str(json_path), str(json_path))

        train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0))
        # assert that the data got split into 4 sentences
        assert len(train_docs) == 4
Example #13
def _collate_best_model(meta, output_path, components):
    bests = {}
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
    shutil.copytree(output_path / "model-final", best_dest)
    for component, best_component_src in bests.items():
        shutil.rmtree(best_dest / component)
        shutil.copytree(best_component_src / component, best_dest / component)
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
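The `_find_best` helper isn't shown; a plausible sketch is that it scans the per-epoch model directories, reads each `accuracy.json`, and returns the directory with the best scores for the component's metrics (an assumption based on how its result is consumed above).

def _find_best(experiment_dir, component):
    # Hypothetical: rank epoch directories by the component's metrics.
    accuracies = []
    for epoch_model in experiment_dir.iterdir():
        if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
            accs = srsly.read_json(epoch_model / "accuracy.json")
            scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
            accuracies.append((scores, epoch_model))
    return max(accuracies)[1] if accuracies else None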
Example #14
def save_train_dev_data(gold_docs, split, train_file, dev_file):
    # shuffle the docs
    random.seed(27)
    random.shuffle(gold_docs)

    # split the gold data into training and evaluation
    num_training_tasks = round(len(gold_docs) * split / 100)
    train_docs = gold_docs[:num_training_tasks]
    dev_docs = gold_docs[num_training_tasks:]

    print("{} training entities".format(str(entity_count(train_docs))))
    print("{} dev entities".format(str(entity_count(dev_docs))))

    srsly.write_json(train_file, [docs_to_json(train_docs)])
    srsly.write_json(dev_file, [docs_to_json(dev_docs)])
Example #15
    def to_disk(self,
                path: Union[str, Path],
                *,
                exclude: Iterable[str] = tuple()) -> None:
        """Serialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://nightly.spacy.io/api/transformer#to_disk
        """
        def save_model(p):
            trf_dir = Path(p).absolute()
            if not trf_dir.exists():
                trf_dir.mkdir()
            self.model.attrs["tokenizer"].save_pretrained(str(trf_dir))
            transformer = self.model.layers[0].shims[0]._model
            torch.save(transformer.state_dict(), trf_dir / WEIGHTS_NAME)
            transformer.config.to_json_file(trf_dir / CONFIG_NAME)

        serialize = {}
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["model"] = lambda p: save_model(p)
        util.to_disk(path, serialize, exclude)
Example #16
    def to_disk(self, path, **kwargs):
        path = util.ensure_path(path)

        def save_pkuseg_model(path):
            if self.pkuseg_seg:
                if not path.exists():
                    path.mkdir(parents=True)
                self.pkuseg_seg.model.save(path)
                self.pkuseg_seg.feature_extractor.save(path)

        def save_pkuseg_processors(path):
            if self.pkuseg_seg:
                data = (
                    _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie),
                    self.pkuseg_seg.postprocesser.do_process,
                    sorted(list(self.pkuseg_seg.postprocesser.common_words)),
                    sorted(list(self.pkuseg_seg.postprocesser.other_words)),
                )
                srsly.write_msgpack(path, data)

        serializers = OrderedDict((
            ("cfg", lambda p: srsly.write_json(p, self._get_config())),
            ("pkuseg_model", lambda p: save_pkuseg_model(p)),
            ("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
        ))
        return util.to_disk(path, serializers, [])
Example #17
def _collate_best_model(meta, output_path, components):
    bests = {}
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for component, best_component_src in bests.items():
        shutil.rmtree(path2str(best_dest / component))
        shutil.copytree(
            path2str(best_component_src / component), path2str(best_dest / component)
        )
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
Example #18
    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save.
        **kwargs: Other config parameters, mostly for consistency.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})
Example #19
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (str / Path): The JSONL file to save.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})
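Both variants support two call styles, as the `path.suffix` check shows; a short usage sketch with an assumed `ruler` instance and illustrative paths:

ruler.to_disk("my_patterns.jsonl")   # saves only the patterns, as JSONL
ruler.to_disk("my_entity_ruler")     # saves a directory with "patterns.jsonl" and "cfg"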
Example #20
    def to_disk(self, path, exclude=tuple(), disable=None):
        """Save the current state to a directory.  If a model is loaded, this
        will include the model.

        path (unicode or Path): Path to a directory, which will be created if
            it doesn't exist.
        exclude (list): Names of components or serialization fields to exclude.

        DOCS: https://spacy.io/api/language#to_disk
        """
        if disable is not None:
            warnings.warn(Warnings.W014, DeprecationWarning)
            exclude = disable
        path = util.ensure_path(path)
        serializers = OrderedDict()
        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
            p, exclude=["vocab"])
        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)

        for name, proc in self.pipeline:
            if not hasattr(proc, "name"):
                continue
            if name in exclude:
                continue
            if not hasattr(proc, "to_disk"):
                continue
            serializers[name] = lambda p, proc=proc: proc.to_disk(
                p, exclude=["vocab"])
        serializers["vocab"] = lambda p: self.vocab.to_disk(p)
        util.to_disk(path, serializers, exclude)
Example #21
 def to_disk(self, path, **kwargs):
     path = util.ensure_path(path)
     serializers = OrderedDict(
         (
             ("cfg", lambda p: srsly.write_json(p, self._get_config())),
         )
     )
     return util.to_disk(path, serializers, [])
Example #22
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(...,
                            help="Path to config file",
                            exists=True,
                            allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(
        None,
        "--code",
        "-c",
        help=
        "Path to Python file with additional code (registered functions) to be imported"
    ),
    verbose: bool = Opt(
        False,
        "--verbose",
        "-V",
        "-VV",
        help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, component.label_data)
            msg.good(
                f"Saving label data for component '{name}' to {output_file}")
        else:
            msg.info(f"No label data found for component '{name}'")
Example #23
 def to_disk(self, path, exclude=tuple()):
     path = util.ensure_path(path)
     serializers = {
         "cfg": lambda p: srsly.write_json(p, self.cfg),
         "model": lambda p: self.model.to_disk(p),
         "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
         "trees": lambda p: self.trees.to_disk(p),
     }
     util.to_disk(path, serializers, exclude)
Example #24
def main(textfile, output, dummymodel, labellist):
    # Need a dummy model to create an nlp object, with the aim of transforming a txt file to JSON
    nlp = spacy.load(dummymodel)
    sr_transform = load_SRs_file(textfile)

    sr_transform_string = eval(spacy_format(sr_transform, labellist))
    docs = []
    for text, annot in sr_transform_string:
        doc = nlp(text)
        doc.is_parsed = True
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        entities = spans_from_biluo_tags(doc, tags)
        doc.ents = entities
        docs.append(doc)
    # Create the JSON file in the same directory as textfile

    mkdir_p(os.path.split(output)[0])
    srsly.write_json(output, [spacy.gold.docs_to_json(docs)])
Example #25
    def to_disk(self, path: Path, exclude: Tuple = tuple(), **kwargs):
        """Serialize AnnLinker to disk.
        
        path (Path): directory to serialize to
        exclude (Tuple, optional): config to exclude. Defaults to tuple().
        """        
        path = util.ensure_path(path)
        if not path.exists():
            path.mkdir()

        cfg = {
            "threshold": self.threshold,
            "no_description_threshold": self.no_description_threshold,
            "disambiguate": self.disambiguate
        }
        srsly.write_json(path / "cfg", cfg)

        self.kb.dump(path / "kb")
        self.cg.to_disk(path)
Example #26
    def to_disk(self, path: Path, exclude: Tuple = tuple(), **kwargs):
        """Serialize RemoteAnnLinker to disk.
        
        path (Path): directory to serialize to
        exclude (Tuple, optional): config to exclude. Defaults to tuple().
        """
        path = ensure_path(path)
        serializers = {"cfg": lambda p: srsly.write_json(p, self.cfg)}

        to_disk(path, serializers, {})
Example #27
def main(test_data_dir, experiment_dir, corpus):
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)

    treebank_code = nlp.meta["treebank"]
    for section in ("test", "dev"):
        if section == "dev":
            section_dir = "conll17-ud-development-2017-03-19"
        else:
            section_dir = "conll17-ud-test-2017-05-09"
        text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
        udpipe_path = (
            test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
        )
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")

        header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
        print("\t".join(header))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
        for input_type in ("udp", "raw"):
            input_path = inputs[input_type]
            output_path = (
                experiment_dir / corpus / "{section}.conllu".format(section=section)
            )

            parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)

            accuracy = print_results(input_type, test_scores)
            acc_path = (
                experiment_dir
                / corpus
                / "{section}-accuracy.json".format(section=section)
            )
            srsly.write_json(acc_path, accuracy)
Example #28
    def dump(self, path: Path):
        path = ensure_path(path)

        super().dump(str(path / "kb"))

        cfg = {
            "k": self.k,
            "m_parameter": self.m_parameter,
            "ef_search": self.ef_search,
            "ef_construction": self.ef_construction,
            "n_threads": self.n_threads,
        }

        cg_cfg_path = path / "cg_cfg"
        aliases_path = path / "aliases.json"
        short_aliases_path = path / "short_aliases.json"
        ann_index_path = path / "ann_index.bin"
        tfidf_vectorizer_path = path / "tfidf_vectorizer.joblib"
        tfidf_vectors_path = path / "tfidf_vectors_sparse.npz"

        srsly.write_json(cg_cfg_path, cfg)
        srsly.write_json(aliases_path, self.aliases)
        srsly.write_json(short_aliases_path, list(self.short_aliases))

        self.ann_index.saveIndex(str(ann_index_path))
        joblib.dump(self.vectorizer, tfidf_vectorizer_path)
        scipy.sparse.save_npz(tfidf_vectors_path,
                              self.alias_tfidfs.astype(np.float16))
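A rough sketch of the corresponding load step, assuming the same file layout and attribute names as `dump` above (nmslib's `loadIndex`, `joblib.load` and `scipy.sparse.load_npz` being the usual inverses of the calls shown):

    def load(self, path: Path):
        # Hypothetical inverse of dump(), reading back the files written above.
        path = ensure_path(path)
        cfg = srsly.read_json(path / "cg_cfg")
        self.k = cfg["k"]
        self.m_parameter = cfg["m_parameter"]
        self.ef_search = cfg["ef_search"]
        self.ef_construction = cfg["ef_construction"]
        self.n_threads = cfg["n_threads"]

        self.aliases = srsly.read_json(path / "aliases.json")
        self.short_aliases = set(srsly.read_json(path / "short_aliases.json"))

        # Assumes self.ann_index was already created with the right space/parameters.
        self.ann_index.loadIndex(str(path / "ann_index.bin"))
        self.vectorizer = joblib.load(str(path / "tfidf_vectorizer.joblib"))
        self.alias_tfidfs = scipy.sparse.load_npz(str(path / "tfidf_vectors_sparse.npz"))
        return self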
Example #29
def test_load_model_version_compat():
    """Test warnings for various spacy_version specifications in meta. Since
    this is more of a hack for v2, manually specify the current major.minor
    version to simplify test creation."""
    nlp = util.load_model("blank:en")
    assert nlp.meta["spacy_version"].startswith(">=2.3")
    with make_tempdir() as d:
        # no change: compatible
        nlp.to_disk(d)
        meta_path = Path(d / "meta.json")
        util.get_model_meta(d)

        # additional compatible upper pin
        nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0"
        srsly.write_json(meta_path, nlp.meta)
        util.get_model_meta(d)

        # incompatible older version
        nlp.meta["spacy_version"] = ">=2.2.5"
        srsly.write_json(meta_path, nlp.meta)
        with pytest.warns(UserWarning):
            util.get_model_meta(d)

        # invalid version specification
        nlp.meta["spacy_version"] = ">@#$%_invalid_version"
        srsly.write_json(meta_path, nlp.meta)
        with pytest.warns(UserWarning):
            util.get_model_meta(d)
Example #30
    def load_with_spacy(self):
        """
        Loads the DDT with spaCy. 
        
        This function converts the conllu files to json in the spaCy format.

        :return: GoldCorpus

        .. note:: Not using jsonl because of:
            https://github.com/explosion/spaCy/issues/3523
        """
        import srsly
        from spacy.cli.converters import conllu2json
        from spacy.gold import GoldCorpus
        from spacy.gold import Path

        for part in ['train', 'dev', 'test']:
            conll_path = os.path.join(
                self.dataset_dir, '{}.{}{}'.format(self.dataset_name, part,
                                                   self.file_extension))
            json_path = os.path.join(self.dataset_dir,
                                     "ddt.{}.json".format(part))

            if not os.path.isfile(
                    json_path):  # Convert the conllu files to json
                with open(conll_path, 'r') as file:
                    file_as_string = file.read()
                    file_as_string = file_as_string.replace(
                        "name=", "").replace("|SpaceAfter=No", "")
                    file_as_json = conllu2json(file_as_string)

                    srsly.write_json(json_path, file_as_json)

        train_json_path = os.path.join(self.dataset_dir, "ddt.train.json")
        dev_json_path = os.path.join(self.dataset_dir, "ddt.dev.json")

        assert os.path.isfile(train_json_path)
        assert os.path.isfile(dev_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
Example #31
def main(test_data_dir, experiment_dir, corpus):
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)

    treebank_code = nlp.meta["treebank"]
    for section in ("test", "dev"):
        if section == "dev":
            section_dir = "conll17-ud-development-2017-03-19"
        else:
            section_dir = "conll17-ud-test-2017-05-09"
        text_path = test_data_dir / "input" / section_dir / (treebank_code +
                                                             ".txt")
        udpipe_path = (test_data_dir / "input" / section_dir /
                       (treebank_code + "-udpipe.conllu"))
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code +
                                                            ".conllu")

        header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
        print("\t".join(header))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
        for input_type in ("udp", "raw"):
            input_path = inputs[input_type]
            output_path = (experiment_dir / corpus /
                           "{section}.conllu".format(section=section))

            parsed_docs, test_scores = evaluate(nlp, input_path, gold_path,
                                                output_path)

            accuracy = print_results(input_type, test_scores)
            acc_path = (experiment_dir / corpus /
                        "{section}-accuracy.json".format(section=section))
            srsly.write_json(acc_path, accuracy)
Example #32
def test_roundtrip_docs_to_docbin(doc):
    text = doc.text
    idx = [t.idx for t in doc]
    tags = [t.tag_ for t in doc]
    pos = [t.pos_ for t in doc]
    morphs = [str(t.morph) for t in doc]
    lemmas = [t.lemma_ for t in doc]
    deps = [t.dep_ for t in doc]
    heads = [t.head.i for t in doc]
    cats = doc.cats
    ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    # roundtrip to DocBin
    with make_tempdir() as tmpdir:
        # use a separate vocab to test that all labels are added
        reloaded_nlp = English()
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        output_file = tmpdir / "roundtrip.spacy"
        DocBin(docs=[doc]).to_disk(output_file)
        reader = Corpus(output_file)
        reloaded_examples = list(reader(reloaded_nlp))
    assert len(doc) == sum(len(eg) for eg in reloaded_examples)
    reloaded_example = reloaded_examples[0]
    assert text == reloaded_example.reference.text
    assert idx == [t.idx for t in reloaded_example.reference]
    assert tags == [t.tag_ for t in reloaded_example.reference]
    assert pos == [t.pos_ for t in reloaded_example.reference]
    assert morphs == [str(t.morph) for t in reloaded_example.reference]
    assert lemmas == [t.lemma_ for t in reloaded_example.reference]
    assert deps == [t.dep_ for t in reloaded_example.reference]
    assert heads == [t.head.i for t in reloaded_example.reference]
    assert ents == [
        (e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents
    ]
    assert "TRAVEL" in reloaded_example.reference.cats
    assert "BAKING" in reloaded_example.reference.cats
    assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
    assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
Example #33
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            nlp.add_pipe(nlp.create_pipe(pipe))

    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #34
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc. to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
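Since `_save_model` appends one JSON line per checkpoint to `log.jsonl`, the training log can be inspected afterwards with srsly (assuming the same `output_dir`):

import srsly

for entry in srsly.read_jsonl(output_dir / "log.jsonl"):
    print(entry["epoch"], entry["nr_word"], entry["epoch_loss"])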