Example #1
0
def test_multisegment():
    # Commit each word separately with merge=False so the index starts
    # out with one segment per document (a non-atomic reader).
    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    domain = u("special specious spectacular spongy spring specials").split()
    expected = sorted(domain)
    for term in domain:
        writer = ix.writer()
        writer.add_document(text=term)
        writer.commit(merge=False)

    with ix.reader() as reader:
        assert not reader.is_atomic()
        assert_equal(list(dawg.flatten(reader.word_graph("text"))), expected)
        assert_equal(reader.corrector("text").suggest("specail", maxdist=2),
                     ["special", "specials"])

    # Merging everything down to one segment must keep the spelling
    # word graph and the corrector results intact.
    ix.optimize()
    with ix.reader() as reader:
        assert reader.is_atomic()
        assert_equal(list(reader.lexicon("text")), expected)
        assert reader.has_word_graph("text")
        assert_equal(list(dawg.flatten(reader.word_graph("text"))), expected)
        assert_equal(reader.corrector("text").suggest("specail", maxdist=2),
                     ["special", "specials"])
Example #2
0
def test_multisegment():
    """The spelling word graph must work over multiple segments and
    survive an optimize() down to a single segment."""
    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    domain = u("special specious spectacular spongy spring specials").split()

    # One commit per word with merge=False -> one segment per document.
    for token in domain:
        seg_writer = ix.writer()
        seg_writer.add_document(text=token)
        seg_writer.commit(merge=False)

    def check_words(reader):
        # Flattening the word graph yields every indexed word in order.
        assert_equal(list(dawg.flatten(reader.word_graph("text"))),
                     sorted(domain))
        assert_equal(reader.corrector("text").suggest("specail", maxdist=2),
                     ["special", "specials"])

    with ix.reader() as r:
        assert not r.is_atomic()
        check_words(r)

    ix.optimize()
    with ix.reader() as r:
        assert r.is_atomic()
        assert_equal(list(r.lexicon("text")), sorted(domain))
        assert r.has_word_graph("text")
        check_words(r)
Example #3
0
def test_dawg():
    """Flattening a field-rooted DAWG edge recovers the inserted words."""
    from whoosh.support.dawg import DawgBuilder

    with TempStorage() as st:
        target = st.create_file("test.dawg")

        builder = DawgBuilder(field_root=True)
        builder.insert(["test"] + list("special"))
        builder.insert(["test"] + list("specials"))
        builder.write(target)

        words = list(dawg.flatten(builder.root.edge("test")))
        assert_equal(words, ["special", "specials"])
Example #4
0
def test_dawg():
    """Words inserted under a field root flatten back out in order."""
    from whoosh.support.dawg import DawgBuilder

    with TempStorage() as storage:
        outfile = storage.create_file("test.dawg")

        builder = DawgBuilder(field_root=True)
        for suffix in ("special", "specials"):
            builder.insert(["test"] + list(suffix))
        builder.write(outfile)

        under_test = builder.root.edge("test")
        assert_equal(list(dawg.flatten(under_test)), ["special", "specials"])
Example #5
0
def test_bypass_stemming():
    """With spelling=True the word graph keeps the unstemmed surface
    forms even though the lexicon holds the stemmed terms."""
    from whoosh.support.dawg import flatten

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(text=u("rendering shading modeling reactions"))
    writer.commit()

    with ix.reader() as reader:
        stemmed = list(reader.lexicon("text"))
        assert_equal(stemmed, ["model", "reaction", "render", "shade"])
        surface = list(flatten(reader.word_graph("text")))
        assert_equal(surface,
                     ["modeling", "reactions", "rendering", "shading"])
Example #6
0
def test_bypass_stemming():
    from whoosh.support.dawg import flatten

    # Index one document through a stemming analyzer with spelling on.
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("rendering shading modeling reactions"))
    w.commit()

    with ix.reader() as r:
        # The lexicon sees stems; the word graph bypasses stemming so
        # the speller can suggest real words.
        checks = [
            (list(r.lexicon("text")),
             ["model", "reaction", "render", "shade"]),
            (list(flatten(r.word_graph("text"))),
             ["modeling", "reactions", "rendering", "shading"]),
        ]
        for got, wanted in checks:
            assert_equal(got, wanted)
Example #7
0
    def add_reader(self, reader):
        """Copy the contents of an existing index reader into this writer.

        Appends the reader's stored fields, field lengths, vectors,
        spelling word graphs, and postings, renumbering the reader's
        document numbers to follow the documents already added to this
        writer. Deleted documents in the reader are skipped.

        :param reader: the index reader to copy documents from.
        """
        self._check_state()
        # Document numbers from the reader are offset by the number of
        # documents already in this writer.
        startdoc = self.docnum

        has_deletions = reader.has_deletions()
        if has_deletions:
            # Maps old (reader) docnum -> new (writer) docnum; only needed
            # when deletions make the renumbering non-contiguous.
            docmap = {}

        # Only copy fields that exist in this writer's schema.
        fieldnames = set(self.schema.names())

        # Add stored documents, vectors, and field lengths
        for docnum in reader.all_doc_ids():
            if (not has_deletions) or (not reader.is_deleted(docnum)):
                # Filter stored fields down to the schema's field names.
                d = dict(item
                         for item in iteritems(reader.stored_fields(docnum))
                         if item[0] in fieldnames)
                # We have to append a dictionary for every document, even if
                # it's empty.
                self.storedfields.append(d)

                if has_deletions:
                    docmap[docnum] = self.docnum

                for fieldname in reader.schema.scorable_names():
                    length = reader.doc_field_length(docnum, fieldname)
                    if length and fieldname in fieldnames:
                        self.pool.add_field_length(self.docnum, fieldname,
                                                   length)

                for fieldname in reader.schema.vector_names():
                    if (fieldname in fieldnames
                            and reader.has_vector(docnum, fieldname)):
                        vpostreader = reader.vector(docnum, fieldname)
                        self._add_vector_reader(self.docnum, fieldname,
                                                vpostreader)

                # Advance only for documents actually copied.
                self.docnum += 1

        # Add dawg contents to word sets for fields that require separate
        # handling
        for fieldname in self.schema.separate_spelling_names():
            if reader.has_word_graph(fieldname):
                graph = reader.word_graph(fieldname)
                self.add_spell_words(fieldname, flatten(graph))

        # Add postings
        for fieldname, text in reader.all_terms():
            if fieldname in fieldnames:
                postreader = reader.postings(fieldname, text)
                while postreader.is_active():
                    docnum = postreader.id()
                    valuestring = postreader.value()
                    # Translate the reader's docnum into this writer's
                    # numbering (via docmap when deletions exist).
                    if has_deletions:
                        newdoc = docmap[docnum]
                    else:
                        newdoc = startdoc + docnum

                    self.pool.add_posting(fieldname, text, newdoc,
                                          postreader.weight(), valuestring)
                    postreader.next()

        self._added = True
Example #8
0
    def add_reader(self, reader):
        """Append all documents from an existing reader to this writer.

        Stored fields, field lengths, vectors, separate spelling word
        graphs, and postings are copied, with document numbers remapped
        to continue after this writer's current documents. Documents
        marked deleted in the reader are not copied.

        :param reader: the index reader to copy documents from.
        """
        self._check_state()
        # New documents are numbered starting after those already added.
        startdoc = self.docnum

        has_deletions = reader.has_deletions()
        if has_deletions:
            # old (reader) docnum -> new (writer) docnum; required because
            # skipped (deleted) docs make the mapping non-contiguous.
            docmap = {}

        # Restrict copying to fields present in this writer's schema.
        fieldnames = set(self.schema.names())

        # Add stored documents, vectors, and field lengths
        for docnum in reader.all_doc_ids():
            if (not has_deletions) or (not reader.is_deleted(docnum)):
                # Keep only stored fields that exist in the schema.
                d = dict(item for item
                         in iteritems(reader.stored_fields(docnum))
                         if item[0] in fieldnames)
                # We have to append a dictionary for every document, even if
                # it's empty.
                self.storedfields.append(d)

                if has_deletions:
                    docmap[docnum] = self.docnum

                for fieldname in reader.schema.scorable_names():
                    length = reader.doc_field_length(docnum, fieldname)
                    if length and fieldname in fieldnames:
                        self.pool.add_field_length(self.docnum, fieldname,
                                                   length)

                for fieldname in reader.schema.vector_names():
                    if (fieldname in fieldnames
                        and reader.has_vector(docnum, fieldname)):
                        vpostreader = reader.vector(docnum, fieldname)
                        self._add_vector_reader(self.docnum, fieldname,
                                                vpostreader)

                # Only incremented for documents actually copied.
                self.docnum += 1

        # Add dawg contents to word sets for fields that require separate
        # handling
        for fieldname in self.schema.separate_spelling_names():
            if reader.has_word_graph(fieldname):
                graph = reader.word_graph(fieldname)
                self.add_spell_words(fieldname, flatten(graph))

        # Add postings
        for fieldname, text in reader.all_terms():
            if fieldname in fieldnames:
                postreader = reader.postings(fieldname, text)
                while postreader.is_active():
                    docnum = postreader.id()
                    valuestring = postreader.value()
                    # Remap to the writer's numbering; use docmap when
                    # deletions broke the simple offset relationship.
                    if has_deletions:
                        newdoc = docmap[docnum]
                    else:
                        newdoc = startdoc + docnum

                    self.pool.add_posting(fieldname, text, newdoc,
                                          postreader.weight(), valuestring)
                    postreader.next()

        self._added = True