Example #1
    def add_word_to_stringstore(self, word, path):
        try:
            # Extend an existing store on disk if one is present.
            self.stringstore = StringStore().from_disk(path)
            self.stringstore.add(word)
        except Exception:
            # No readable store yet: start a fresh one seeded with the word.
            # StringStore expects an iterable of strings, so wrap the word.
            self.stringstore = StringStore([word])
        self.stringstore.to_disk(path)
Example #2
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    sstore1_b = sstore1.to_bytes()
    sstore2_b = sstore2.to_bytes()
    if strings1 == strings2:
        assert sstore1_b == sstore2_b
    else:
        assert sstore1_b != sstore2_b
    sstore1 = sstore1.from_bytes(sstore1_b)
    assert sstore1.to_bytes() == sstore1_b
    new_sstore1 = StringStore().from_bytes(sstore1_b)
    assert new_sstore1.to_bytes() == sstore1_b
    assert list(new_sstore1) == strings1
Example #3
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)

        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(
                batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
Example #4
def build_doc_term_matrix(terms_lists, weighted=True):
    '''
    Construct a sparse document-term matrix, optionally weighting each term by
    its position (rank) within its document, e.g. the rank of a URL in a SERP.

    :param terms_lists: iterable of documents, each a sequence of (str) terms (e.g. URLs)
    :param weighted: if True, weight a term at 0-based rank i by 1. / (i**2 + 2); otherwise use 1
    :return: sparse CSR matrix of shape (# docs, # unique terms)
    '''
    stringstore = StringStore()

    data = []
    rows = []
    cols = []
    for row_idx, terms_list in enumerate(terms_lists):
        bow = tuple((stringstore[term] - 1, 1. / (i**2 + 2) if weighted else 1)
                    for i, term in enumerate(terms_list) if term)

        data.extend(count for _, count in bow)
        cols.extend(term_id for term_id, _ in bow)
        rows.extend(itertools.repeat(row_idx, times=len(bow)))

    doc_term_matrix = sp.coo_matrix((data, (rows, cols)),
                                    dtype=float if weighted else int).tocsr()

    return doc_term_matrix
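
A small usage sketch for the function above: the URL lists are made up, and it assumes the spaCy 1.x-era StringStore that hands out small sequential integer ids, which is what the stringstore[term] - 1 trick relies on.

# Illustrative SERP-style input: each inner list is the ranked list of URLs
# returned for one query.
terms_lists = [
    ["example.com", "docs.example.com", "blog.example.com"],
    ["example.com", "other.org"],
]
dtm = build_doc_term_matrix(terms_lists, weighted=True)
print(dtm.shape)      # (2, number of unique URLs)
print(dtm.toarray())  # rank-discounted weights 1 / (rank**2 + 2)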
Example #5
    def __init__(
            self,
            shape: tuple = (1000, 128),
            strings: StringStore = None,
            senses: List[str] = [],
            vectors_name: str = "sense2vec",
            overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in methods
            that generate the best sense or other senses.
        vectors_name (unicode): Optional name to assign to the Vectors object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g. {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            "senses": senses,
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)
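
A minimal construction sketch based only on the __init__ shown above; the shape and sense labels are illustrative, and in practice a pretrained model would be loaded from disk instead.

from spacy.strings import StringStore
from sense2vec import Sense2Vec

# Empty Sense2Vec with a fresh StringStore and two illustrative sense labels.
s2v = Sense2Vec(shape=(1000, 128), strings=StringStore(), senses=["NOUN", "VERB"])
assert s2v.cfg["senses"] == ["NOUN", "VERB"]
assert isinstance(s2v.strings, StringStore)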
Example #6
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "strings1"
        file_path2 = d / "strings2"
        sstore1.to_disk(file_path1)
        sstore2.to_disk(file_path2)
        sstore1_d = StringStore().from_disk(file_path1)
        sstore2_d = StringStore().from_disk(file_path2)
        assert list(sstore1_d) == list(sstore1)
        assert list(sstore2_d) == list(sstore2)
        if strings1 == strings2:
            assert list(sstore1_d) == list(sstore2_d)
        else:
            assert list(sstore1_d) != list(sstore2_d)
Example #7
def docs_to_gensim(spacy_docs,
                   spacy_vocab,
                   lemmatize=True,
                   filter_stops=True,
                   filter_punct=True,
                   filter_nums=False):
    """
    Convert multiple ``spacy.Doc`` s into a gensim dictionary and bag-of-words corpus.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc is
            a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        if lemmatize is True:
            bow = (
                (spacy_vocab[tok_id], count)
                for tok_id, count in spacy_doc.count_by(attrs.LEMMA).items())
        else:
            bow = ((spacy_vocab[tok_id], count)
                   for tok_id, count in spacy_doc.count_by(attrs.ORTH).items())

        if filter_stops is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums is True:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)

        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        doc_freqs.update(tok_id for tok_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)

        gcorpus.append(bow)

    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
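
A hypothetical driver for docs_to_gensim(). The texts and model name are placeholders; note that the snippet dates from spaCy 1.x, where StringStore assigned small sequential ids rather than hashes, so it would need adjustments for current spaCy releases.

import spacy

nlp = spacy.load("en")  # placeholder model name (spaCy 1.x era)
texts = ["The quick brown fox jumps.", "Foxes jump over lazy dogs."]
spacy_docs = [nlp(text) for text in texts]

gdict, gcorpus = docs_to_gensim(spacy_docs, nlp.vocab, lemmatize=True)
print(gdict.token2id)  # word -> integer id mapping
print(gcorpus[0])      # first doc as (word id, count) pairs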
Example #8
def test_dump_load(sstore):
    id_ = sstore[u'qqqqq']
    with tempfile.TemporaryFile('w+t') as file_:
        sstore.dump(file_)
        file_.seek(0)
        new_store = StringStore()
        new_store.load(file_)
    assert new_store[id_] == u'qqqqq'
Example #9
    def add_stringstore_to_vocab_temporarely(self, file):
        try:
            self.stringstore = StringStore().from_disk(file)
            for word in self.stringstore:
                lex = self.nlp.vocab[word]
                lex.is_oov = False
        except Exception:
            print("cannot read stringstore in file " + file)
Example #10
def sort(path: Path):
    """Sort the strings from the vocabulary of a spaCy model.

    For the original code of StringStore.to_disk(), see https://github.com/explosion/spaCy/blob/53a3b967ac704ff0a67a7102ede6d916e2a4545a/spacy/strings.pyx#L219-L227.
    """
    st = StringStore().from_disk(path)
    strings = sorted(st)
    srsly.write_json(path, strings)
Example #11
def test_pickle_string_store(text1, text2):
    stringstore = StringStore()
    store1 = stringstore[text1]
    store2 = stringstore[text2]
    data = srsly.pickle_dumps(stringstore, protocol=-1)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1] == store1
    assert unpickled[text2] == store2
    assert len(stringstore) == len(unpickled)
Example #12
def docs_to_gensim(spacy_docs,
                   spacy_vocab,
                   lemmatize=True,
                   lowercase=False,
                   filter_stops=True,
                   filter_punct=True,
                   filter_nums=False):
    """
    Convert a sequence of ``spacy.Doc`` s into a gensim-friendly corpus and a
    string that can be loaded into a :class:`gensim.corpora.Dictionary`.

    Args:
        spacy_docs (Iterable[``spacy.Doc``])
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words
        lowercase (bool): if True (and ``lemmatize`` is False), use lowercased
            strings for words
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        str: words, their integer ids, and their document frequencies in
            ``spacy_docs``, as a string formatted like `id[TAB]word[TAB]df[NEWLINE]`;
            when written to file, can be converted into a gensim ``Dictionary``
            via :meth:`gensim.corpora.Dictionary.load_from_text()`
        List[List[Tuple[int, int]]]: list of documents as bags-of-words, where
            each doc is a list of (integer word ID, word count) 2-tuples
    """
    count_by = (attrs.LEMMA if lemmatize is True else
                attrs.LOWER if lowercase is True else attrs.ORTH)
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        bow = ((spacy_vocab[tok_id], count)
               for tok_id, count in spacy_doc.count_by(count_by).items())
        bow = ((lex, count) for lex, count in bow if not lex.is_space)
        if filter_stops is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums is True:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)
        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        doc_freqs.update(tok_id for tok_id, _ in bow)
        gcorpus.append(bow)

    gdict_str = '\n'.join(
        '{}\t{}\t{}'.format(i, s, doc_freqs[i])
        for i, s in sorted(enumerate(stringstore), key=itemgetter(1)))

    return (gdict_str, gcorpus)
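
A hypothetical driver for this variant, which returns the dictionary as a TSV string; as the docstring notes, that string can be written to a file and reloaded via gensim.corpora.Dictionary.load_from_text(). File name, texts and model name are placeholders, and the same spaCy 1.x sequential-id assumption applies as in Example #7.

import io
import spacy
from gensim.corpora import Dictionary

nlp = spacy.load("en")  # placeholder model name (spaCy 1.x era)
docs = [nlp(t) for t in ["The quick brown fox.", "Lazy dogs sleep all day."]]
gdict_str, gcorpus = docs_to_gensim(docs, nlp.vocab, lemmatize=True)

with io.open("gensim_dict.txt", "w", encoding="utf-8") as f:
    f.write(gdict_str + "\n")
gdict = Dictionary.load_from_text("gensim_dict.txt")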
Example #13
def test_unapplicable_trees():
    strings = StringStore()
    trees = EditTrees(strings)
    tree3 = trees.add("deelt", "delen")

    # Replacement fails.
    assert trees.apply(tree3, "deeld") == None

    # Suffix + prefix are too large.
    assert trees.apply(tree3, "de") == None
Example #14
def test_dutch():
    strings = StringStore()
    trees = EditTrees(strings)
    tree = trees.add("deelt", "delen")
    assert (trees.tree_to_str(tree) ==
            "(m 0 3 () (m 0 2 (s '' 'l') (s 'lt' 'n')))")

    tree = trees.add("gedeeld", "delen")
    assert (trees.tree_to_str(tree) ==
            "(m 2 3 (s 'ge' '') (m 0 2 (s '' 'l') (s 'ld' 'n')))")
Example #15
    def test_encode_decode(self):
        strings = StringStore()
        hello_id = strings[u'Hello']
        world_id = strings[u'World']

        self.assertNotEqual(hello_id, world_id)

        self.assertEqual(strings[hello_id], u'Hello')
        self.assertEqual(strings[world_id], u'World')

        self.assertEqual(strings[u'Hello'], hello_id)
        self.assertEqual(strings[u'World'], world_id)
Example #16
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Movers
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval [0.0, 1.0],
            where larger values correspond to more similar documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for improved
            SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
            in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
            Proceedings of the 32nd International Conference on Machine Learning
            (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector:
            # spaCy 1.x reserves id 0 for the empty string, hence the -1 offset.
            if stringstore[word.text] - 1 == n:
                word_vecs.append(word.vector)
                n += 1
    distance_mat = pairwise_distances(np.array(word_vecs),
                                      metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    vec1 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc1)
        if word.has_vector)
    vec1 = np.array(
        [vec1[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    vec2 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc2)
        if word.has_vector)
    vec2 = np.array(
        [vec2[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    return 1.0 - emd(vec1, vec2, distance_mat)
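
A sketch of how word_movers() might be called, assuming a pipeline whose vocabulary ships word vectors, textacy's extract module and the pyemd dependency providing emd(), plus the spaCy 1.x sequential-id StringStore the function relies on; model name and sentences are placeholders.

import spacy

nlp = spacy.load("en_core_web_md")  # placeholder; must provide word vectors
doc1 = nlp("The cat sat on the mat.")
doc2 = nlp("A kitten rested on the rug.")
print(word_movers(doc1, doc2, metric="cosine"))  # value in [0.0, 1.0]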
Example #17
def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
Example #18
    def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a bytestring.

        bytes_data (bytes): The data to load.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        data = srsly.msgpack_loads(bytes_data)
        self.vectors = Vectors().from_bytes(data["vectors"])
        self.freqs = dict(data.get("freqs", []))
        self.cfg.update(data.get("cfg", {}))
        if "strings" not in exclude and "strings" in data:
            self.strings = StringStore().from_bytes(data["strings"])
        return self
Example #19
    def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a directory.

        path (unicode / Path): The path to load from.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        path = Path(path)
        strings_path = path / "strings.json"
        freqs_path = path / "freqs.json"
        self.vectors = Vectors().from_disk(path)
        self.cfg.update(srsly.read_json(path / "cfg"))
        if freqs_path.exists():
            self.freqs = dict(srsly.read_json(freqs_path))
        if "strings" not in exclude and strings_path.exists():
            self.strings = StringStore().from_disk(strings_path)
        return self
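
A short loading sketch for Sense2Vec.from_disk() shown above; the directory path is a placeholder for a pretrained sense2vec package.

from sense2vec import Sense2Vec

# Restores vectors, cfg, freqs and (unless excluded) the StringStore.
s2v = Sense2Vec().from_disk("/path/to/s2v_model")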
Example #20
def test_from_to_bytes():
    strings = StringStore()
    trees = EditTrees(strings)
    trees.add("deelt", "delen")
    trees.add("gedeeld", "delen")

    b = trees.to_bytes()

    trees2 = EditTrees(strings)
    trees2.from_bytes(b)

    # Verify that the nodes did not change.
    assert len(trees) == len(trees2)
    for i in range(len(trees)):
        assert trees.tree_to_str(i) == trees2.tree_to_str(i)

    # Reinserting the same trees should not add new nodes.
    trees2.add("deelt", "delen")
    trees2.add("gedeeld", "delen")
    assert len(trees) == len(trees2)
Example #21
def test_from_to_disk():
    strings = StringStore()
    trees = EditTrees(strings)
    trees.add("deelt", "delen")
    trees.add("gedeeld", "delen")

    trees2 = EditTrees(strings)
    with make_tempdir() as temp_dir:
        trees_file = temp_dir / "edit_trees.bin"
        trees.to_disk(trees_file)
        trees2 = trees2.from_disk(trees_file)

    # Verify that the nodes did not change.
    assert len(trees) == len(trees2)
    for i in range(len(trees)):
        assert trees.tree_to_str(i) == trees2.tree_to_str(i)

    # Reinserting the same trees should not add new nodes.
    trees2.add("deelt", "delen")
    trees2.add("gedeeld", "delen")
    assert len(trees) == len(trees2)
Example #22
    def __init__(self):
        import os
        from sagas.conf.conf import cf
        from pyltp import Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
        from spacy.strings import StringStore

        self.stringstore = StringStore()

        MODELDIR = f'{cf.conf_dir}/ai/ltp/ltp_data_v3.4.0'
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        par_model_path = os.path.join(MODELDIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

        self.conf = AnalConf('zh')
        self.conf.setup(self)
Example #23
def build_doc_term_matrix(terms_lists,
                          weighting='tf',
                          normalize=False, sublinear_tf=False, smooth_idf=True,
                          min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None):
    """
    Build a document-term matrix of shape (# docs, # unique terms) from a sequence
    of documents, each represented as a sequence of (str) terms, with a variety of
    weighting and normalization schemes for the matrix values.

    Args:
        terms_lists (iterable(iterable(str))): a sequence of documents, each as a
            sequence of (str) terms; note that the terms in each doc are to be
            counted, so these probably shouldn't be sets containing *unique* terms;
            example inputs::

                >>> ([tok.lemma_ for tok in spacy_doc]
                ...  for spacy_doc in spacy_docs)
                >>> ((ne.text for ne in doc.named_entities())
                ...  for doc in textcorpus)
                >>> (tuple(ng.text for ng in itertools.chain.from_iterable(doc.ngrams(i) for i in range(1, 3)))
                ...  for doc in docs)

        weighting (str {'tf', 'tfidf', 'binary'}, optional): if 'tf', matrix values
            (i, j) correspond to the number of occurrences of term j in doc i; if
            'tfidf', term frequencies (tf) are multiplied by their corresponding
            inverse document frequencies (idf); if 'binary', all non-zero values
            are set equal to 1
        normalize (bool, optional): if True, normalize term frequencies by the
            L2 norms of the vectors
        sublinear_tf (bool, optional): if True, apply sub-linear term-frequency
            scaling, i.e. tf => 1 + log(tf)
        smooth_idf (bool, optional): if True, add 1 to all document frequencies,
            equivalent to adding a single document to the corpus containing every
            unique term
        min_df (float or int, optional): if float, value is the fractional proportion
            of the total number of documents, which must be in [0.0, 1.0]; if int,
            value is the absolute number; filter terms whose document frequency
            is less than ``min_df``
        max_df (float or int, optional): if float, value is the fractional proportion
            of the total number of documents, which must be in [0.0, 1.0]; if int,
            value is the absolute number; filter terms whose document frequency
            is greater than ``max_df``
        min_ic (float, optional): filter terms whose information content is less
            than `min_ic`; value must be in [0.0, 1.0]
        max_n_terms (int, optional): only include terms whose document frequency
            is within the top ``max_n_terms``

    Returns:
        :class:`scipy.sparse.csr_matrix <scipy.sparse.csr_matrix>`: sparse matrix
            of shape (# docs, # unique terms), where value (i, j) is the weight
            of term j in doc i
        dict: id to term mapping, where keys are unique integers as term ids and
            values are corresponding strings
    """
    stringstore = StringStore()
    data = []; rows = []; cols = []
    for row_idx, terms_list in enumerate(terms_lists):

        # an empty string always occupies index 0 in the stringstore, which causes
        # an empty first col in the doc-term matrix that we don't want;
        # so, we subtract 1 from the stringstore's assigned id
        bow = tuple((stringstore[term] - 1, count)
                    for term, count in collections.Counter(terms_list).items()
                    if term)

        data.extend(count for _, count in bow)
        cols.extend(term_id for term_id, _ in bow)
        rows.extend(itertools.repeat(row_idx, times=len(bow)))

    doc_term_matrix = sp.coo_matrix((data, (rows, cols)), dtype=int).tocsr()
    # ignore the 0-index empty string in stringstore, as above
    id_to_term = {term_id - 1: term for term_id, term in enumerate(stringstore)
                  if term_id != 0}

    # filter terms by document frequency or information content?
    if max_df != 1.0 or min_df != 1 or max_n_terms is not None:
        doc_term_matrix, id_to_term = filter_terms_by_df(
            doc_term_matrix, id_to_term,
            max_df=max_df, min_df=min_df, max_n_terms=max_n_terms)
    if min_ic != 0.0:
        doc_term_matrix, id_to_term = filter_terms_by_ic(
            doc_term_matrix, id_to_term,
            min_ic=min_ic, max_n_terms=max_n_terms)

    if weighting == 'binary':
        doc_term_matrix = binarize_mat(doc_term_matrix, threshold=0.0, copy=False)
    else:
        if sublinear_tf is True:
            doc_term_matrix = doc_term_matrix.astype(np.float64)
            np.log(doc_term_matrix.data, doc_term_matrix.data)
            doc_term_matrix.data += 1
        if weighting == 'tfidf':
            doc_term_matrix = apply_idf_weighting(doc_term_matrix,
                                                  smooth_idf=smooth_idf)

    if normalize is True:
        doc_term_matrix = normalize_mat(doc_term_matrix,
                                        norm='l2', axis=1, copy=False)

    return (doc_term_matrix, id_to_term)
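
An illustrative call of the richer variant above, exercising tf-idf weighting with L2 normalization. The token lists are made up, and the same spaCy 1.x sequential-id StringStore assumption applies here as well.

terms_lists = [
    ["cat", "sat", "mat", "cat"],
    ["dog", "sat", "log"],
    ["cat", "dog", "sat"],
]
doc_term_matrix, id_to_term = build_doc_term_matrix(
    terms_lists, weighting='tfidf', normalize=True, min_df=1)
print(doc_term_matrix.shape)  # (3, number of unique terms)
print(id_to_term)             # e.g. {0: 'cat', 1: 'sat', 2: 'mat', ...}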
Example #24
def test_empty_strings():
    strings = StringStore()
    trees = EditTrees(strings)
    no_change = trees.add("xyz", "xyz")
    empty = trees.add("", "")
    assert no_change == empty
Example #25
    def test_create(self):
        strings = StringStore()
        lemmatizer = Lemmatizer({}, {}, {})
        morphology = Morphology(strings, {}, lemmatizer)
Example #26
def morphology():
    morphology = Morphology(StringStore())
    morphology.add("Feat1=Val1|Feat2=Val2")
    morphology.add("Feat3=Val3|Feat4=Val4")
    return morphology
Example #27
def morphology():
    lemmatizer = Lemmatizer(Lookups())
    return Morphology(StringStore(), {}, lemmatizer)
Example #28
def sstore():
    return StringStore()
Example #29
def stringstore():
    return StringStore()
Example #30
def test_stringstore_to_bytes(stringstore, text):
    store = stringstore.add(text)
    serialized = stringstore.to_bytes()
    new_stringstore = StringStore().from_bytes(serialized)
    assert new_stringstore[store] == text