Example #1
def init_model(lang, output_dir, freqs_loc=None,
               vectors_loc=None, no_expand_vectors=False,
               meta_overrides=None, prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
    probs, oov_prob = read_freqs(freqs_loc, min_freq=min_word_frequency) if freqs_loc is not None else ({}, -20)
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys, not no_expand_vectors, prune_vectors)

    # Swap in our tokenizer. The rule-based combined_rule_tokenizer is left
    # disabled here; a bare Tokenizer (whitespace-only splitting) is used instead.
    # nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.tokenizer = Tokenizer(nlp.vocab)

    if meta_overrides is not None:
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
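
A minimal call sketch for the init_model function above; the language code, paths, and frequency cutoff are hypothetical placeholders, not values taken from the source.

# Hypothetical usage; all paths below are placeholders.
nlp = init_model(
    "en",
    "output/custom_model",
    freqs_loc="data/word_freqs.txt",
    vectors_loc="data/vectors.txt.gz",
    min_word_frequency=50,
)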
Example #2
def test_de_tagger_tagset(NLP, test_file):
    """Check that no tags outside the tagset are used."""
    gold_tags = set([
        "$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART",
        "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS",
        "NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT",
        "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG",
        "PTKVZ", "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP",
        "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF",
        "VVIZU", "VVPP", "XY"
    ])

    data_path = os.path.join(TEST_FILES_DIR, test_file)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        raise FileNotFoundError("Test corpus not found", data_path)
    corpus = GoldCorpus(data_path, data_path)
    dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

    pred_tags = set()
    tagger = NLP.get_pipe('tagger')

    for doc, _ in dev_docs:
        tagger(doc)
        pred_tags.update(t.tag_ for t in doc)

    assert len(pred_tags - gold_tags) == 0
Example #3
    def from_disk(self, path, **kwargs):
        """Load waterwheel from a file. Expects file to contain
        a bytestring of the following dict format:
        {
            'stop_words': {},
            'vocab': {},
            'wikidata': {},
            'doc_bins': doc_bins_bytes,
        }

        Parameters
        ----------
        path : Path
            path to the serialized file.
        
        Returns
        -------
        self : WaterWheel
            The loaded WaterWheel object.
        """

        path = ensure_path(path)
        with open(path, 'rb') as file:
            serial = file.read()
        self.from_bytes(serial)
        return self
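
A hedged round-trip sketch pairing this loader with the to_disk method shown in Example #21; `ww` is assumed to be an already constructed WaterWheel instance and the file name is a placeholder.

# Hypothetical round trip; "waterwheel_data.bin" is a placeholder path.
ww.to_disk("waterwheel_data.bin")    # serialize (see Example #21)
ww.from_disk("waterwheel_data.bin")  # restore the same data in place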
Example #4
def main(name: ("model name", "positional", None, None, trf_list),
         make_cache_dir: ("create the cache directory", "flag", "mk"),
         use_local_class: ("read locally instead of over the network", "flag", "local")):
    if make_cache_dir:
        c_path = ensure_path(cache_path + name)
        if c_path.exists():
            msg.warn(f"{cache_path + name} already exists")
        else:
            c_path.mkdir()
            msg.good(f"Cache directory created:\t{cache_path}{name}")

    msg.warn("\n================url================\n")

    config_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[name]

    model_file = ALL_PRETRAINED_MODEL_ARCHIVE_MAP[name]
    msg.text(f"{config_file}\n{model_file}\n")

    vocab = get_tokenizer(name, use_local_class)
    pretrained_vocab_files_map = vocab.pretrained_vocab_files_map
    for vocab_file in pretrained_vocab_files_map.values():
        msg.text(f"{vocab_file[name]}\n")

    msg.warn("\n================url================\n")
    msg.good("\n使用下载工具下载后,将模型文件放入缓存文件夹中。")
Example #5
    def to_disk(self, output_path: Path, force: bool = False, save_examples: bool = True) -> None:
        """Save Corpus to Disk

        Args:
            output_path (Path): Output file path to save data to
            force (bool): Force save to directory. Create parent directories
                or overwrite existing data.
            save_examples (bool): Save the example store along with the state.
        """
        output_path = ensure_path(output_path)
        output_dir = output_path.parent
        state_dir = output_dir / ".recon" / self.name
        if force:
            output_dir.mkdir(parents=True, exist_ok=True)

            if not state_dir.exists():
                state_dir.mkdir(parents=True, exist_ok=True)

        ds_op_state = DatasetOperationsState(
            name=self.name, commit=self.commit_hash, size=len(self), operations=self.operations
        )
        srsly.write_json(state_dir / "state.json", ds_op_state.dict())

        if save_examples:
            self.example_store.to_disk(state_dir / "example_store.jsonl")

        srsly.write_jsonl(output_path, [e.dict() for e in self.data])
Example #6
    def from_disk(
        self,
        data_dir: Path,
        train_file: str = "train.jsonl",
        dev_file: str = "dev.jsonl",
        test_file: str = "test.jsonl",
        loader_func: Callable = read_jsonl,
    ) -> "Corpus":
        """Load Corpus from disk given a directory with files
        named explicitly train.jsonl, dev.jsonl, and test.jsonl

        Args:
            data_dir (Path): directory to load from.
            train_file (str, optional): Filename of train data under data_dir. Defaults to train.jsonl.
            dev_file (str, optional): Filename of dev data under data_dir. Defaults to dev.jsonl.
            test_file (str, optional): Filename of test data under data_dir. Defaults to test.jsonl.
            loader_func (Callable, optional): Callable that reads a file and returns a List of examples.
                Defaults to [read_jsonl][recon.loaders.read_jsonl]
        """
        data_dir = ensure_path(data_dir) / self.name

        train = Dataset("train").from_disk(data_dir / train_file)
        dev = Dataset("dev").from_disk(data_dir / dev_file)

        try:
            test = Dataset("test").from_disk(data_dir / test_file)
            corpus = self(self.name, train, dev, test=test)
        except ValueError:
            corpus = self(self.name, train, dev)
        return corpus
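
A hedged round-trip sketch (Example #28 shows the matching to_disk); `corpus` is assumed to be an already constructed Corpus instance, "data/" is a placeholder directory, and the calling convention is an assumption mirroring the code above. Both methods use the layout <data_dir>/<corpus.name>/{train,dev,test}.jsonl.

# Hypothetical round trip; treat the exact call style as an assumption.
corpus.to_disk("data/", force=True)
reloaded = corpus.from_disk("data/")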
Example #7
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    # add ner pipe
    ner1 = nlp1.add_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.initialize()
    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]
    # Add the label explicitly. Previously we didn't require this.
    ner1.add_label("MY_ORG")
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels
    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)
        config = {}
        ner2 = nlp1.create_pipe("ner", config=config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
Example #8
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()
    # add ner pipe
    ner = nlp.add_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.initialize()
    # Add entity ruler
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    # the bug only appeared when the ruler was added before the NER ("after" worked fine)
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(patterns)
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"
Example #9
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(entity_list=["Q1", "Q1"],
                    freq_list=[32, 111],
                    vector_list=[vector1, vector2])

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1
Example #10
    def dump(self, path: Path):
        path = ensure_path(path)

        super().dump(str(path / "kb"))

        cfg = {
            "k": self.k,
            "m_parameter": self.m_parameter,
            "ef_search": self.ef_search,
            "ef_construction": self.ef_construction,
            "n_threads": self.n_threads,
        }

        cg_cfg_path = path / "cg_cfg"
        aliases_path = path / "aliases.json"
        short_aliases_path = path / "short_aliases.json"
        ann_index_path = path / "ann_index.bin"
        tfidf_vectorizer_path = path / "tfidf_vectorizer.joblib"
        tfidf_vectors_path = path / "tfidf_vectors_sparse.npz"

        srsly.write_json(cg_cfg_path, cfg)
        srsly.write_json(aliases_path, self.aliases)
        srsly.write_json(short_aliases_path, list(self.short_aliases))

        self.ann_index.saveIndex(str(ann_index_path))
        joblib.dump(self.vectorizer, tfidf_vectorizer_path)
        scipy.sparse.save_npz(tfidf_vectors_path,
                              self.alias_tfidfs.astype(np.float16))
Example #11
def test_en_parser_depset(NLP, test_file):
    """Check that no tags outside the tagset are produced."""
    gold_deps = set([
        "ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos",
        "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj",
        "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj",
        "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass",
        "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj",
        "predet", "prep", "prt", "punct", "quantmod", "relcl", "root", "xcomp"
    ])

    data_path = os.path.join(TEST_FILES_DIR, test_file)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        raise FileNotFoundError("Test corpus not found", data_path)
    corpus = GoldCorpus(data_path, data_path)
    dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

    pred_deps = set()
    parser = NLP.get_pipe('parser')

    for doc, _ in dev_docs:
        parser(doc)
        pred_deps.update(t.dep_ for t in doc)

    print(pred_deps - gold_deps)
    assert len(pred_deps - gold_deps) == 0
Example #12
def test_tokenizer_handle_text_from_file(combined_rule_tokenizer_fixture,
                                         file_name):
    loc = util.ensure_path(__file__).parent / file_name
    with loc.open('r', encoding='utf8') as infile:
        text = infile.read()
    assert len(text) != 0
    tokens = combined_rule_tokenizer_fixture(text)
    assert len(tokens) > 100
Example #13
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    loc = ensure_path(__file__).parent / file_name
    with loc.open("r", encoding="utf8") as infile:
        text = infile.read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100
Example #14
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"
Example #15
def load_texts(path):
    """Load inputs from a jsonl file.
    
    Each line should be a dict like {"text": "..."}
    """
    path = ensure_path(path)
    with path.open("r", encoding="utf8") as file_:
        texts = [json.loads(line) for line in file_]
    random.shuffle(texts)
    return texts
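
A hypothetical call for load_texts; the file name is a placeholder, and each line of the file is expected to be a JSON object with a "text" key, as the docstring states.

texts = load_texts("inputs.jsonl")  # hypothetical path
print(texts[0]["text"])             # entries come back shuffled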
Example #16
    def to_disk(self, path: Path, exclude: Tuple = tuple(), **kwargs):
        """Serialize RemoteAnnLinker to disk.
        
        path (Path): directory to serialize to
        exclude (Tuple, optional): config to exclude. Defaults to tuple().
        """
        path = ensure_path(path)
        serializers = {"cfg": lambda p: srsly.write_json(p, self.cfg)}

        to_disk(path, serializers, {})
Example #17
def test_de_tagger_corpus(NLP, test_file, accuracy_threshold):
    data_path = os.path.join(TEST_FILES_DIR, test_file)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        raise FileNotFoundError("Test corpus not found", data_path)
    corpus = GoldCorpus(data_path, data_path)
    dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
    scorer = NLP.evaluate(dev_docs)

    assert scorer.tags_acc > accuracy_threshold
Example #18
    def from_disk(self, path: Path, loader_func: Callable = read_jsonl) -> "Dataset":
        """Load Dataset from disk given a path and a loader function that reads the data
        and returns an iterator of Examples

        Args:
            path (Path): path to load from
            loader_func (Callable, optional): Callable that reads a file and returns a List of examples.
                Defaults to [read_jsonl][recon.loaders.read_jsonl]
        """
        path = ensure_path(path)
        ds_op_state = None
        if (path.parent / ".recon" / self.name).exists():
            state = srsly.read_json(path.parent / ".recon" / self.name / "state.json")
            ds_op_state = DatasetOperationsState(**state)
            self.operations = ds_op_state.operations

        data = loader_func(path)
        self.data = data
        for example in self.data:
            self.example_store.add(example)

        if ds_op_state and self.commit_hash != ds_op_state.commit:
            # Dataset changed, examples added
            self.operations.append(
                OperationState(
                    name="examples_added_external",
                    status=OperationStatus.COMPLETED,
                    ts=datetime.now(),
                    examples_added=max(len(self) - ds_op_state.size, 0),
                    examples_removed=max(ds_op_state.size - len(self), 0),
                    examples_changed=0,
                    transformations=[],
                )
            )

            for op in self.operations:
                op.status = OperationStatus.NOT_STARTED

        seen: Set[str] = set()
        operations_to_run: Dict[str, OperationState] = {}

        for op in self.operations:
            if (
                op.name not in operations_to_run
                and op.name in registry.operations
                and op.status != OperationStatus.COMPLETED
            ):
                operations_to_run[op.name] = op

        for op_name, state in operations_to_run.items():
            op = registry.operations.get(op_name)
            self.apply_(op, *state.args, initial_state=state, **state.kwargs)  # type: ignore

        return self
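
A hypothetical call for the loader above; the dataset name and path are placeholders. As the code shows, any operations recorded in the .recon state directory that are not marked completed are re-applied after the raw examples are loaded.

ds = Dataset("train")                  # hypothetical dataset name
ds = ds.from_disk("data/train.jsonl")  # hypothetical path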
Example #19
    def to_disk(self, path: Path) -> None:
        """Save store to disk
        
        Args:
            path (Path): Path to save store to
        """
        path = ensure_path(path)
        examples = []
        for example_hash, example in self._map.items():
            examples.append({"example_hash": example_hash, "example": example.dict()})

        srsly.write_jsonl(path, examples)
Example #20
    def __init__(
        self,
        path: Union[str, Path],
        *,
        limit: int = 0,
        min_length: int = 0,
        max_length: int = 0,
    ) -> None:
        self.path = util.ensure_path(path)
        self.limit = limit
        self.min_length = min_length
        self.max_length = max_length
Example #21
    def to_disk(self, path, **kwargs):
        """Serialize waterwheel data to a file.
        
        Parameters
        ----------
        path : Path
            path to file.
        """

        path = ensure_path(path)
        serial = self.to_bytes()
        srsly.write_msgpack(path, serial)
Example #22
def open_file(loc):
    '''Handle .gz, .tar.gz or unzipped files'''
    loc = ensure_path(loc)
    print("Open loc")
    if tarfile.is_tarfile(str(loc)):
        return tarfile.open(str(loc), 'r:gz')
    elif loc.parts[-1].endswith('gz'):
        return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
    elif loc.parts[-1].endswith('zip'):
        zip_file = zipfile.ZipFile(str(loc))
        names = zip_file.namelist()
        file_ = zip_file.open(names[0])
        return (line.decode('utf8') for line in file_)
    else:
        return loc.open('r', encoding='utf8')
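
A hedged usage sketch for open_file; the path is a placeholder. For .gz and .zip inputs the function returns a generator of decoded lines and for plain files an open text handle, so line-by-line iteration works for those branches (the tarfile branch instead returns a TarFile object).

# Hypothetical usage; "corpus.txt.gz" is a placeholder path.
for line in open_file("corpus.txt.gz"):
    line = line.strip()
    # process each line here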
Example #23
    def from_disk(self, path: Path, **kwargs):
        """Deserialize saved RemoteAnnLinker from disk.
        
        path (Path): directory to deserialize from
        
        RETURNS (RemoteAnnLinker): Initialized RemoteAnnLinker
        """
        path = ensure_path(path)
        cfg = {}
        deserializers = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
        from_disk(path, deserializers, {})
        self.cfg.update(cfg)
        self.base_url = cfg.get('base_url')
        self.headers = cfg.get('headers', {})

        return self
Example #24
def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.to_disk(str(file_path))
        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.from_disk(str(file_path))

    # final assertions
    _check_kb(kb2)
Example #25
    def from_disk(self, path: Path) -> "ExampleStore":
        """Load store from disk

        Args:
            path (Path): Path to file to load from
        
        Returns:
            ExampleStore: Initialized ExampleStore
        """
        path = ensure_path(path)
        examples = srsly.read_jsonl(path)
        for e in examples:
            example_hash = e["example_hash"]
            raw_example = e["example"]
            example = Example(**raw_example)
            assert hash(example) == example_hash
            self.add(example)

        return self
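
A hedged round trip pairing this loader with the to_disk method in Example #19; `store` is assumed to be an existing ExampleStore, the path is a placeholder, and constructing ExampleStore with no arguments is an assumption.

store.to_disk("example_store.jsonl")                     # see Example #19
store2 = ExampleStore().from_disk("example_store.jsonl")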
Example #26
    def to_disk(self, path: Path, exclude: Tuple = tuple(), **kwargs):
        """Serialize AnnLinker to disk.
        
        path (Path): directory to serialize to
        exclude (Tuple, optional): config to exclude. Defaults to tuple().
        """        
        path = util.ensure_path(path)
        if not path.exists():
            path.mkdir()

        cfg = {
            "threshold": self.threshold,
            "no_description_threshold": self.no_description_threshold,
            "disambiguate": self.disambiguate
        }
        srsly.write_json(path / "cfg", cfg)

        self.kb.dump(path / "kb")
        self.cg.to_disk(path)
Example #27
def main(input_path: str, output_path: str, min_word_frequency: int):
    if input_path is not None:
        input_path = cached_path(input_path)
        input_path = ensure_path(input_path)

    probs, oov_prob = (
        read_freqs(input_path, min_freq=min_word_frequency)
        if input_path is not None
        else ({}, -20)
    )

    with open(output_path, "w") as _jsonl_file:
        _jsonl_file.write(
            json.dumps({"lang": "en", "settings": {"oov_prob": -20.502029418945312}})
        )
        _jsonl_file.write("\n")

        for word, prob in probs.items():
            _jsonl_file.write(json.dumps({"orth": word, "prob": prob}))
            _jsonl_file.write("\n")
Example #28
    def to_disk(self, data_dir: Path, force: bool = False) -> None:
        """Save Corpus to Disk

        Args:
            data_dir (Path): Directory to save data to
            force (bool): Force save to directory. Create parent directories
                or overwrite existing data.
        """
        data_dir = ensure_path(data_dir) / self.name
        state_dir = data_dir / ".recon"
        if force:
            data_dir.mkdir(parents=True, exist_ok=True)

            if not state_dir.exists():
                state_dir.mkdir(parents=True, exist_ok=True)

        self._train.to_disk(data_dir / "train.jsonl", force=force, save_examples=False)
        self._dev.to_disk(data_dir / "dev.jsonl", force=force, save_examples=False)
        if self._test:
            self._test.to_disk(data_dir / "test.jsonl", force=force, save_examples=False)

        self.example_store.to_disk(state_dir / "example_store.jsonl")
Example #29
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
Example #30
    def load_bulk(self, path: Path):
        path = ensure_path(path)

        super().load_bulk(str(path / "kb"))

        aliases_path = path / "aliases.json"
        short_aliases_path = path / "short_aliases.json"
        ann_index_path = path / "ann_index.bin"
        tfidf_vectorizer_path = path / "tfidf_vectorizer.joblib"
        tfidf_vectors_path = path / "tfidf_vectors_sparse.npz"

        cfg = srsly.read_json(path / "cg_cfg")

        self.k = cfg.get("k", 5)
        self.m_parameter = cfg.get("m_parameter", 100)
        self.ef_search = cfg.get("ef_search", 200)
        self.ef_construction = cfg.get("ef_construction", 2000)
        self.n_threads = cfg.get("n_threads", 60)

        aliases = srsly.read_json(aliases_path)
        short_aliases = set(srsly.read_json(short_aliases_path))
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        alias_tfidfs = scipy.sparse.load_npz(tfidf_vectors_path).astype(
            np.float32)
        ann_index = nmslib.init(
            method="hnsw",
            space="cosinesimil_sparse",
            data_type=nmslib.DataType.SPARSE_VECTOR,
        )
        ann_index.addDataPointBatch(alias_tfidfs)
        ann_index.loadIndex(str(ann_index_path))
        query_time_params = {"efSearch": self.ef_search}
        ann_index.setQueryTimeParams(query_time_params)

        self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer,
                         alias_tfidfs)

        return self
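
A hedged sketch pairing this loader with the dump method from Example #10; `cg` is assumed to be an existing instance of the same knowledge-base subclass, and the directory is a placeholder.

# Hypothetical round trip; "kb_dir" is a placeholder directory.
cg.dump("kb_dir")       # writes kb, cg_cfg, aliases, ANN index, tf-idf data
cg.load_bulk("kb_dir")  # restores the same state (ensure_path handles the string)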
Example #31
def test_util_ensure_path_succeeds(text):
    path = util.ensure_path(text)
    assert isinstance(path, Path)
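
For reference, spacy.util.ensure_path wraps a plain string in a pathlib.Path and returns other values unchanged, which is why the examples above call it before using Path methods such as exists() or open(). A small illustration:

from pathlib import Path
from spacy import util

p = util.ensure_path("models/en_core")  # hypothetical path string
assert isinstance(p, Path)
assert util.ensure_path(p) == p         # Path objects pass through unchanged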
Example #32
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    loc = ensure_path(__file__).parent / file_name
    text = loc.open("r", encoding="utf8").read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100