Code example #1
    def __init__(self,
                 minsize=2,
                 maxsize=4,
                 stored=False,
                 field_boost=1.0,
                 tokenizer=None,
                 at=None,
                 queryor=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param field_boost: A boost factor to apply to the score of any
            posting in this field.
        :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer`
            used to break the text into words.
        :param at: if 'start', only takes N-grams from the start of the word.
            If 'end', only takes N-grams from the end. Otherwise the default
            is to take all N-grams from each word.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        """

        self.analyzer = NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at)
        self.format = formats.Frequency(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
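
A minimal usage sketch for this constructor, assuming it belongs to whoosh.fields.NGRAMWORDS (the schema, field name, and sample data below are illustrative, not from the source):

from whoosh import fields, qparser
from whoosh.filedb.filestore import RamStorage

# Index 2- to 4-character N-grams taken from the start of each word
schema = fields.Schema(name=fields.NGRAMWORDS(minsize=2, maxsize=4,
                                              stored=True, at="start"))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(name=u"whoosh search library")

with ix.searcher() as s:
    # The partial word "sear" is broken into the grams "se", "sea", "sear",
    # all of which occur at the start of "search", so the document matches
    qp = qparser.QueryParser("name", schema)
    assert len(s.search(qp.parse(u"sear"))) == 1

Because queryor defaults to False, the parsed grams are combined with an And query; passing queryor=True would loosen this to an Or query.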
Code example #2
def test_copyfield():
    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, None))
    assert_equal(text_type(qp.parse("hello b:matt")), "(a:hello AND b:matt AND c:matt)")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup))
    assert_equal(text_type(qp.parse("hello b:matt")), "(a:hello AND (b:matt ANDMAYBE c:matt))")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup))
    assert_equal(text_type(qp.parse("hello (there OR b:matt)")), "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup))
    assert_equal(text_type(qp.parse("hello there")), "((a:hello OR c:hello) AND (a:there OR c:there))")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, mirror=True))
    assert_equal(text_type(qp.parse("hello c:matt")), "(a:hello AND (c:matt OR b:matt))")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"c": "a"}, mirror=True))
    assert_equal(text_type(qp.parse("hello c:matt")), "((a:hello OR c:hello) AND (c:matt OR a:matt))")

    ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter()
    fmt = formats.Frequency()
    schema = fields.Schema(name=fields.KEYWORD, name_phone=fields.FieldType(fmt, ana, multitoken_query="or"))
    qp = qparser.QueryParser("name", schema)
    qp.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"}))
    assert_equal(text_type(qp.parse(u("spruce view"))), "((name:spruce OR name_phone:SPRS) AND (name:view OR name_phone:F OR name_phone:FF))")
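
An end-to-end sketch of the phonetic copy-field pattern from the last block above (the sample document and the misspelled query are illustrative assumptions):

from whoosh import analysis, fields, formats, qparser
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import plugins

ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter()
fmt = formats.Frequency()
schema = fields.Schema(name=fields.KEYWORD(stored=True),
                       name_phone=fields.FieldType(fmt, ana,
                                                   multitoken_query="or"))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    # name_phone indexes double-metaphone codes, e.g. SPRS for "spruce"
    w.add_document(name=u"spruce view", name_phone=u"spruce view")

qp = qparser.QueryParser("name", schema)
qp.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"}))
with ix.searcher() as s:
    # "sproose" shares the metaphone code SPRS with "spruce", so the
    # copied name_phone clause matches even though name:sproose does not
    assert len(s.search(qp.parse(u"sproose"))) == 1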
Code example #3
    def __init__(self,
                 stored=False,
                 lowercase=False,
                 commas=False,
                 vector=None,
                 scorable=False,
                 unique=False,
                 field_boost=1.0,
                 spelling=False):
        """
        :param stored: Whether to store the value of the field with the
            document.
        :param lowercase: Whether to lowercase the tokens.
        :param commas: Whether this is a comma-separated field. If this is
            False (the default), it is treated as a space-separated field.
        :param scorable: Whether this field is scorable.
        """

        self.analyzer = KeywordAnalyzer(lowercase=lowercase, commas=commas)
        self.format = formats.Frequency(field_boost=field_boost)
        self.scorable = scorable
        self.stored = stored
        self.unique = unique
        self.spelling = spelling

        if vector:
            if type(vector) is type:
                # A Format class was passed; instantiate it
                vector = vector()
            elif isinstance(vector, formats.Format):
                # Already a Format instance; use it as-is
                pass
            else:
                # Any other truthy value (e.g. True): fall back to the
                # field's own format for the vector
                vector = self.format
        else:
            vector = None
        self.vector = vector
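
A minimal usage sketch for this constructor, assuming it belongs to whoosh.fields.KEYWORD (the field name and values are illustrative):

from whoosh import fields, query
from whoosh.filedb.filestore import RamStorage

# A stored, scorable keyword field whose values are comma-separated
# and lowercased before indexing
schema = fields.Schema(tags=fields.KEYWORD(stored=True, lowercase=True,
                                           commas=True, scorable=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(tags=u"Python, Full-Text Search, Whoosh")

with ix.searcher() as s:
    # Each comma-separated value is indexed as a single lowercase term
    assert len(s.search(query.Term("tags", u"python"))) == 1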
Code example #4
def test_huge_postfile():
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")

        # Seek 5 GB into the file so postings start beyond the 32-bit offset range
        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        pf.write("\x00\x00\x00\x00")
        assert_equal(pf.tell(), gb5 + 4)

        fpw = FilePostingWriter(pf)
        format = formats.Frequency(None)
        offset = fpw.start(format)
        for i in xrange(10):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert_equal(posttotal, 10)
        fpw.close()

        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, format)
        i = 0
        while pfr.is_active():
            assert_equal(pfr.id(), i)
            assert_equal(pfr.weight(), float(i))
            assert_equal(pfr.value(), struct.pack("!I", i))
            pfr.next()
            i += 1
        pf.close()
Code example #5
    def test_vector_unicode(self):
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(content=fields.TEXT(vector=formats.Frequency(
            analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_unicode")
        try:
            writer = ix.writer()
            writer.add_document(
                content=u"\u1234\u2345\u3456 \u4567\u5678\u6789")
            writer.add_document(
                content=u"\u0123\u1234\u4567 \u4567\u5678\u6789")
            writer.commit()

            writer = ix.writer()
            writer.add_document(
                content=u"\u2345\u3456\u4567 \u789a\u789b\u789c")
            writer.add_document(
                content=u"\u0123\u1234\u4567 \u2345\u3456\u4567")
            writer.commit()

            reader = ix.reader()
            vec = list(reader.vector_as("frequency", 0, 0))
            self.assertEqual(vec, [(u'\u3456\u4567', 1),
                                   (u'\u789a\u789b\u789c', 1)])
        finally:
            pass
Code example #6
def test_more_like_this(model=classify.Bo2Model):
    docs = [
        u("alfa bravo charlie delta echo foxtrot golf"),
        u("delta echo foxtrot golf hotel india juliet"),
        u("echo foxtrot golf hotel india juliet kilo"),
        u("foxtrot golf hotel india juliet kilo lima"),
        u("golf hotel india juliet kilo lima mike"),
        u("foxtrot golf hotel india alfa bravo charlie")
    ]

    def _check(schema, **kwargs):
        ix = RamStorage().create_index(schema)
        with ix.writer() as w:
            for i, text in enumerate(docs):
                w.add_document(id=text_type(i + 1), text=text)

        with ix.searcher() as s:
            docnum = s.document_number(id=u("1"))
            r = s.more_like(docnum, "text", model=model, **kwargs)
            assert [hit["id"] for hit in r] == ["6", "2", "3"]

    schema = fields.Schema(id=fields.ID(stored=True),
                           text=fields.TEXT(stored=True))
    _check(schema)

    ana = analysis.StandardAnalyzer()
    schema = fields.Schema(id=fields.ID(stored=True),
                           text=fields.TEXT(analyzer=ana,
                                            vector=formats.Frequency()))
    _check(schema)

    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    _check(schema, text=docs[0])
Code example #7
File: test_ramindex.py  Project: ws-os/oh-mainline
def make_index():
    ana = analysis.StandardAnalyzer(stoplist=None)
    sc = fields.Schema(id=fields.ID(stored=True),
                       text=fields.TEXT(analyzer=ana,
                                        vector=formats.Frequency()),
                       subs=fields.NUMERIC(int, stored=True))
    ix = RamIndex(sc)
    ix.add_document(
        id=u("fieldtype"),
        text=u("The FieldType object supports the following attributes"),
        subs=56)
    ix.add_document(id=u("format"),
                    text=u("the storage format for the field contents"),
                    subs=100)
    ix.add_document(
        id=u("vector"),
        text=u("the storage format for the field vectors (forward index)"),
        subs=23)
    ix.add_document(
        id=u("scorable"),
        text=u("whether searches against this field may be scored."),
        subs=34)
    ix.add_document(
        id=u("stored"),
        text=u(
            "whether the content of this field is stored for each document."),
        subs=575)
    ix.add_document(
        id=u("unique"),
        text=u("whether this field value is unique to each document."),
        subs=2)
    ix.add_document(id=u("const"),
                    text=u("The constructor for the base field type simply"),
                    subs=58204)
    return ix
Code example #8
    def test_vector_merge(self):
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(
            title=fields.TEXT,
            content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_merge")
        try:
            writer = ix.writer()
            writer.add_document(
                title=u"one",
                content=u"This is the story of the black hole story")
            writer.commit()

            writer = ix.writer()
            writer.add_document(title=u"two",
                                content=u"You can read along in your book")
            writer.commit()

            reader = ix.reader()
            searcher = Searcher(reader)

            docnum = searcher.document_number(title=u"one")
            vec = list(reader.vector(docnum, "content").items_as("frequency"))
            self.assertEqual(vec, [(u'black', 1), (u'hole', 1), (u'story', 2)])

            docnum = searcher.document_number(title=u"two")
            vec = list(searcher.vector_as("frequency", docnum, "content"))
            self.assertEqual(vec, [(u'along', 1), (u'book', 1), (u'read', 1),
                                   (u'your', 1)])
        finally:
            pass
Code example #9
File: test_vectors.py  Project: datakortet/whoosh
def test_vector_merge():
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectormerge") as ix:
        writer = ix.writer()
        writer.add_document(title=u("one"),
                            content=u("This is the story of the black hole " +
                                      "story"))
        writer.commit()

        writer = ix.writer()
        writer.add_document(title=u("two"),
                            content=u("You can read along in your book"))
        writer.commit()

        with ix.searcher() as s:
            r = s.reader()

            docnum = s.document_number(title=u("one"))
            vec = list(r.vector_as("frequency", docnum, "content"))
            assert_equal(vec,
                         [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])

            docnum = s.document_number(title=u("two"))

            vec = list(r.vector_as("frequency", docnum, "content"))
            assert_equal(vec,
                         [(u('along'), 1), (u('book'), 1), (u('read'), 1)])
Code example #10
File: test_vectors.py  Project: datakortet/whoosh
def test_vector_reading():
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectorreading") as ix:
        writer = ix.writer()
        writer.add_document(title=u("one"),
                            content=u("This is the story of the black " +
                                      "hole story"))
        writer.commit()

        with ix.reader() as r:
            assert_equal(list(r.vector_as("frequency", 0, "content")),
                         [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])
Code example #11
def test_vector_values():
    field = fields.TEXT(vector=formats.Frequency())
    st, codec, seg = _make_codec()
    content = u("alfa bravo charlie alfa")

    dw = codec.per_document_writer(st, seg)
    dw.start_doc(0)
    vals = sorted(field.vector.word_values(content, field.analyzer))
    dw.add_vector_items("f1", field, vals)
    dw.finish_doc()
    dw.close()

    vr = codec.vector_reader(st, seg)
    m = vr.matcher(0, "f1", field.vector)
    assert_equal(list(m.items_as("frequency")), [("alfa", 2), ("bravo", 1),
                                                 ("charlie", 1)])
Code example #12
File: test_classify.py  Project: CuteCha/dssm-theano
def create_index():
    analyzer = analysis.StandardAnalyzer()
    vector_format = formats.Frequency()
    schema = fields.Schema(path=fields.ID(stored=True),
                           content=fields.TEXT(analyzer=analyzer,
                                               vector=vector_format))

    ix = RamStorage().create_index(schema)

    w = ix.writer()
    from string import ascii_lowercase
    for letter, content in zip(ascii_lowercase, domain):
        w.add_document(path=u("/%s") % letter, content=content)
    w.commit()

    return ix
Code example #13
    def test_vector_phrase(self):
        ana = analysis.StandardAnalyzer()
        ftype = fields.FieldType(formats.Frequency(ana),
                                 formats.Positions(ana),
                                 scorable=True)
        schema = fields.Schema(name=fields.ID(stored=True), value=ftype)
        storage = RamStorage()
        ix = storage.create_index(schema)
        writer = ix.writer()
        writer.add_document(name=u"A",
                            value=u"Little Miss Muffet sat on a tuffet")
        writer.add_document(name=u"B", value=u"Miss Little Muffet tuffet")
        writer.add_document(name=u"C", value=u"Miss Little Muffet tuffet sat")
        writer.add_document(
            name=u"D",
            value=u"Gibberish blonk falunk miss muffet sat tuffet garbonzo")
        writer.add_document(name=u"E", value=u"Blah blah blah pancakes")
        writer.commit()

        searcher = ix.searcher()

        def names(results):
            # Collect the stored "name" field of each hit, sorted for comparison
            return sorted([hit['name'] for hit in results])

        q = query.Phrase("value",
                         [u"little", u"miss", u"muffet", u"sat", u"tuffet"])
        sc = q.scorer(searcher)
        self.assertEqual(sc.__class__.__name__, "VectorPhraseScorer")

        self.assertEqual(names(searcher.search(q)), ["A"])

        q = query.Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
        self.assertEqual(names(searcher.search(q)), ["A", "D"])

        q = query.Phrase("value", [u"falunk", u"gibberish"])
        self.assertEqual(names(searcher.search(q)), [])

        q = query.Phrase("value", [u"gibberish", u"falunk"], slop=2)
        self.assertEqual(names(searcher.search(q)), ["D"])

        #q = query.Phrase("value", [u"blah"] * 4)
        #self.assertEqual(names(searcher.search(q)), []) # blah blah blah blah

        q = query.Phrase("value", [u"blah"] * 3)
        self.assertEqual(names(searcher.search(q)), ["E"])
Code example #14
def test_vector_unicode():
    schema = fields.Schema(content=fields.TEXT(vector=formats.Frequency()))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(content=u("\u1234\u2345\u3456 \u4567\u5678\u6789"))
    writer.add_document(content=u("\u0123\u1234\u4567 \u4567\u5678\u6789"))
    writer.commit()

    writer = ix.writer()
    writer.add_document(content=u("\u2345\u3456\u4567 \u789a\u789b\u789c"))
    writer.add_document(content=u("\u0123\u1234\u4567 \u2345\u3456\u4567"))
    writer.commit()

    with ix.reader() as r:
        vec = list(r.vector_as("frequency", 0, "content"))
        assert_equal(vec, [(u('\u3456\u4567'), 1),
                           (u('\u789a\u789b\u789c'), 1)])
Code example #15
    def test_vector_reading(self):
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(
            title=fields.TEXT,
            content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_reading")
        try:
            writer = ix.writer()
            writer.add_document(
                title=u"one",
                content=u"This is the story of the black hole story")
            writer.commit()

            reader = ix.reader()
            self.assertEqual(list(reader.vector_as("frequency", 0, "content")),
                             [(u'black', 1), (u'hole', 1), (u'story', 2)])
        finally:
            pass
Code example #16
def test_keyterms():
    ana = analysis.StandardAnalyzer()
    vectorformat = formats.Frequency()
    schema = fields.Schema(path=fields.ID,
                           content=fields.TEXT(analyzer=ana,
                                               vector=vectorformat))
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(path=u("a"), content=u("This is some generic content"))
    w.add_document(path=u("b"), content=u("This is some distinctive content"))
    w.commit()

    with ix.searcher() as s:
        docnum = s.document_number(path=u("b"))
        keyterms = list(s.key_terms([docnum], "content"))
        assert len(keyterms) > 0
        assert keyterms[0][0] == "distinctive"

        r = s.search(query.Term("path", u("b")))
        keyterms2 = list(r.key_terms("content"))
        assert len(keyterms2) > 0
        assert keyterms2[0][0] == "distinctive"
Code example #17
File: test_quality.py  Project: ws-os/oh-mainline
def test_lowlevel_block_writing():
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)
    # blocklimit=4: the first four postings fill one block...
    fpw.write(0, 1.0, fmt.encode(1.0), 1)
    fpw.write(1, 2.0, fmt.encode(2.0), 2)
    fpw.write(2, 12.0, fmt.encode(12.0), 6)
    fpw.write(5, 6.5, fmt.encode(6.5), 420)

    # ...and the next four fill a second block
    fpw.write(11, 1.5, fmt.encode(1.5), 1)
    fpw.write(12, 2.5, fmt.encode(2.5), 2)
    fpw.write(26, 100.5, fmt.encode(100.5), 21)
    fpw.write(50, 8.0, fmt.encode(8.0), 1020)
    ti = fpw.finish()

    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
Code example #18
    def ranked_set_baseline(self, basefile):
        # Helper from http://effbot.org/zone/element-lib.htm
        def flatten(elem, include_tail=0):
            text = elem.text or ""
            for e in elem:
                text += flatten(e, 1)
                if include_tail and elem.tail: text += elem.tail
            return text
        # step 1: Create a temporary whoosh index in order to find out
        # the most significant words for each article

        ana = analysis.StandardAnalyzer()
        # ana = analysis.StemmingAnalyzer()
        vectorformat = formats.Frequency(ana)
        schema = fields.Schema(article=fields.ID(unique=True),
                               title=fields.TEXT(stored=True),
                               content=fields.TEXT(analyzer=ana,
                                                   vector=vectorformat))

        st = RamStorage()
        tmpidx = st.create_index(schema)
        w = tmpidx.writer()

        XHT_NS = "{http://www.w3.org/1999/xhtml}"
        tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall("//"+XHT_NS+"div")
        articles = []
        for el in els:
            if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
                text = Util.normalizeSpace(flatten(el))
                article = unicode(el.attrib['id'][1:])
                articles.append(article)
                w.update_document(article=article,
                                  title="Article " + article,
                                  content=text)

        w.commit()
        self.log.info("Indexed %d articles" % len(articles))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, for each article, use the 20 most distinctive terms
        # (filtering away numbers) to create a query against that index

        # things to vary:
        # * numterms
        # * connector (AND or OR)
        # * scoring (weighting=scoring.Cosine())
        numterms = 5
        connector = " AND "
        indexdir = os.path.sep.join([self.config['datadir'],'ecj','index'])
        storage = FileStorage(indexdir)
        idx = storage.open_index()
        searcher = idx.searcher(weighting=scoring.BM25F())

        tempsearch = tmpidx.searcher()

        rankedset = {}
        
        for article in articles:
            rankedset[article] = []
            r = tempsearch.search(query.Term("article",article))
            terms = [t[0] for t in r.key_terms("content", numterms=numterms+1) if not t[0].isdigit()][:numterms]
            print "Article %s:%r" % (article, terms)
            parser = qparser.QueryParser("content")
            q = parser.parse(connector.join(terms))
            results = searcher.search(q, limit=10)
            resultidx = 0
            for result in results:
                reslbl = "%s (%s)" % (result['title'], results.score(resultidx))
                rankedset[article].append([result['basefile'], reslbl])
                print u"\t%s (%s)" % (result['title'], results.score(resultidx))
                resultidx += 1

        return rankedset