Exemple #1
0
    def __init__(self, doc_path, stopwords=None):
        """Build an in-memory Whoosh index from a tab-separated graph file.

        Every row of ``doc_path`` has two tab-separated columns.  The first
        column is split on whitespace; its second token is indexed as
        ``entity1_name`` and its third token as ``fieldname`` (the first
        token is not used).  The second column is indexed as
        ``entity2_name``.

        :param doc_path: path of the tab-separated file to index.
        :param stopwords: optional argument passed to ``read_file_as_dict``.
            When truthy, ``remove_stopwords_while_indexing`` is set to True
            and the result is kept in ``stopwords_dict``.
        :raises IndexError: if the first column of a row has fewer than
            three whitespace-separated tokens.
        """
        st = RamStorage()
        st.create()
        schema = Schema(entity1_name=TEXT(stored=True),
                        fieldname=TEXT(stored=True),
                        entity2_name=TEXT(stored=True))
        self.ix = st.create_index(schema)

        # Always define both attributes so later readers never hit an
        # AttributeError when no stopword file was supplied.
        self.remove_stopwords_while_indexing = bool(stopwords)
        self.stopwords_dict = read_file_as_dict(stopwords) if stopwords else {}

        with open(doc_path, 'r') as graph_file:
            reader = csv.DictReader(graph_file,
                                    delimiter="\t",
                                    fieldnames=['e1_relation', 'e2'])
            # The writer is opened only once the input file is available and
            # is used as a context manager: it commits on success and is
            # cancelled (releasing its lock) if indexing raises.
            with self.ix.writer() as writer:
                for row in tqdm(reader):
                    tokens = row['e1_relation'].split()
                    writer.add_document(entity1_name=tokens[1],
                                        fieldname=tokens[2],
                                        entity2_name=row['e2'])
Exemple #2
0
def test_finalweighting():
    """Results must be ordered by the value returned from ``final()``.

    Four documents are indexed with a stored ``n_comments`` number; the
    custom weighting returns that number as the final score, so the expected
    order is by descending comment count.
    """
    from whoosh.scoring import Frequency

    class CommentWeighting(Frequency):
        # NOTE(review): presumably the flag that makes the searcher call
        # final() for each hit -- confirm against the Whoosh scoring docs.
        use_final = True

        def final(self, searcher, docnum, score):
            return searcher.stored_fields(docnum).get("n_comments", 0)

    schema = fields.Schema(id=fields.ID(stored=True),
                           summary=fields.TEXT,
                           n_comments=fields.STORED)
    index = RamStorage().create_index(schema)

    # (id, summary, n_comments)
    documents = [
        ("1", "alfa bravo", 5),
        ("2", "alfa", 12),
        ("3", "bravo", 2),
        ("4", "bravo bravo", 7),
    ]
    writer = index.writer()
    for doc_id, summary, n_comments in documents:
        writer.add_document(id=u(doc_id), summary=u(summary),
                            n_comments=n_comments)
    writer.commit()

    parser = qparser.QueryParser("summary", None)
    with index.searcher(weighting=CommentWeighting()) as searcher:
        hits = searcher.search(parser.parse("alfa OR bravo"))
        found = [hit["id"] for hit in hits]
        assert_equal(["2", "4", "1", "3"], found)
class ToolBoxSearch( object ):
    """
    Support searching tools in a toolbox. This implementation uses
    the "whoosh" search library.
    """
    
    def __init__( self, toolbox ):
        """
        Create a searcher for `toolbox`. The index is only built when the
        module-level `tool_search_enabled` flag is true; the flag's value at
        construction time is remembered in `self.enabled`.
        """
        self.toolbox = toolbox
        self.enabled = tool_search_enabled
        if tool_search_enabled:
            self.build_index()
        
    def build_index( self ):
        """
        Build an in-memory index with one document per tool in
        `self.toolbox.tools_by_id`, storing the tool's id, name (as `title`),
        description and help.
        """
        self.storage = RamStorage()
        self.index = self.storage.create_index( schema )
        writer = self.index.writer()
        ## TODO: would also be nice to search section headers.
        for tool_id, tool in self.toolbox.tools_by_id.iteritems():
            writer.add_document( id=tool_id, title=to_unicode(tool.name), description=to_unicode(tool.description), help=to_unicode(tool.help) )
        writer.commit()
        
    def search( self, query, return_attribute='id' ):
        """
        Search title, description and help for `query` and return the
        `return_attribute` value of every hit scoring at least 2.0.
        Returns an empty list when tool search is disabled.
        """
        # Also honour self.enabled: if the index was never built there is no
        # self.index to search, even if the global flag was switched on later.
        if not ( tool_search_enabled and self.enabled ):
            return []
        # Change field boosts for searcher to place more weight on title, description than help.
        weighting = BM25F( field_B={ 'title_B' : 3, 'description_B' : 2, 'help_B' : 1 } )
        searcher = self.index.searcher( weighting=weighting )
        try:
            # Set query to search title, description, and help.
            parser = MultifieldParser( [ 'title', 'description', 'help' ], schema = schema )
            results = searcher.search( parser.parse( query ), minscore=2.0 )
            # Copy the wanted values out before the searcher is closed.
            return [ result[ return_attribute ] for result in results ]
        finally:
            # Release the reader resources held by the searcher.
            searcher.close()
 def test_pages(self):
     """Exercise ``search_page`` on six documents ranked by term frequency.

     Document "1" contains "alfa" six times, document "6" once, so a
     frequency-based ranking returns the ids in ascending order.
     """
     from whoosh.scoring import Frequency

     schema = fields.Schema(id=fields.ID(stored=True), c=fields.TEXT)
     index = RamStorage().create_index(schema)

     writer = index.writer()
     for rank, doc_id in enumerate(u"123456"):
         # rank 0 -> six repetitions ... rank 5 -> one repetition
         writer.add_document(id=doc_id, c=u" ".join([u"alfa"] * (6 - rank)))
     writer.commit()

     searcher = index.searcher(weighting=Frequency)
     term = query.Term("c", u"alfa")

     hits = searcher.search(term)
     self.assertEqual([hit["id"] for hit in hits],
                      ["1", "2", "3", "4", "5", "6"])

     page = searcher.search_page(term, 2, pagelen=2)
     self.assertEqual([hit["id"] for hit in page], ["3", "4"])

     # Asking for a page past the end: the assertions below expect the last
     # available page (page 2, holding the remaining two hits).
     page = searcher.search_page(term, 10, pagelen=4)
     self.assertEqual(page.total, 6)
     self.assertEqual(page.pagenum, 2)
     self.assertEqual(page.pagelen, 2)
Exemple #5
0
def test_add_sortable():
    """Columns must appear only after ``sorting.add_sortable`` is applied.

    Two batches of documents are written, the reader is checked to have no
    columns, then sortable columns are added for both fields and the first
    document's column values are verified.
    """
    schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC)
    ix = RamStorage().create_index(schema)

    first_batch = [("alfa", 100), ("bravo", 200),
                   ("charlie", 300), ("delta", 400)]
    second_batch = [("bravo", 500), ("alfa", 600),
                    ("delta", 100), ("charlie", 200)]

    with ix.writer() as writer:
        for chapter, price in first_batch:
            writer.add_document(chapter=u(chapter), price=price)
    with ix.writer() as writer:
        for chapter, price in second_batch:
            writer.add_document(chapter=u(chapter), price=price)
        # NOTE(review): presumably prevents merging on commit -- confirm.
        writer.merge = False

    with ix.reader() as reader:
        for column in ("chapter", "price"):
            assert not reader.has_column(column)

    with ix.writer() as writer:
        sorting.add_sortable(writer, "chapter",
                             sorting.StoredFieldFacet("chapter"))
        sorting.add_sortable(writer, "price", sorting.FieldFacet("price"))
        writer.schema.test = 100

    with ix.reader() as reader:
        assert reader.has_column("chapter")
        assert reader.has_column("price")

        chapters = reader.column_reader("chapter")
        prices = reader.column_reader("price")
        assert chapters[0] == "alfa"
        assert prices[0] == 100
Exemple #6
0
def test_midlevel_writing():
    """Inspect the term info of "alfa" after one and after two commits."""
    schema = fields.Schema(t=fields.TEXT(phrase=False))
    index = RamStorage().create_index(schema)
    alfa_key = ("t", u("alfa"))

    # First document: seven tokens, three of them "alfa".
    writer = index.writer()
    writer.add_document(t=u("alfa bravo charlie delta alfa bravo alfa"))
    writer.commit()

    with index.reader() as reader:
        info = reader.termsindex[alfa_key]
        assert_equal(info.weight(), 3.0)
        assert_equal(info.doc_frequency(), 1)
        assert_equal(info.min_length(), 7)
        assert_equal(info.max_length(), 7)
        assert_equal(info.max_weight(), 3.0)
        assert_almost_equal(info.max_wol(), 3.0 / 7)
        assert_equal(info.postings, ((0,), (3.0,), (b('\x00\x00\x00\x03'),)))

    # Second document: three tokens, two of them "alfa".
    writer = index.writer()
    writer.add_document(t=u("alfa charlie alfa"))
    writer.commit()

    with index.reader() as reader:
        info = reader.termsindex[alfa_key]
        assert_equal(info.weight(), 5.0)
        assert_equal(info.doc_frequency(), 2)
        assert_equal(info.min_length(), 3)
        assert_equal(info.max_length(), 7)
        assert_equal(info.max_weight(), 3.0)
        assert_almost_equal(info.max_wol(), 2.0 / 3)
        # NOTE(review): postings is no longer an inline tuple here; 0 looks
        # like an offset into the postings file -- confirm.
        assert_equal(info.postings, 0)
 def test_finalweighting(self):
     """Hits must be ordered by the integer value of stored ``n_comments``.

     The custom weighting scores every posting as 0 and returns the stored
     comment count from ``final()``.
     """
     from whoosh.scoring import Weighting

     class CommentWeighting(Weighting):
         def score(self, *args, **kwargs):
             return 0

         def final(self, searcher, docnum, score):
             return int(searcher.stored_fields(docnum).get("n_comments"))

     schema = fields.Schema(id=fields.ID(stored=True),
                            summary=fields.TEXT,
                            n_comments=fields.ID(stored=True))
     index = RamStorage().create_index(schema)

     # (id, summary, n_comments)
     documents = (
         (u"1", u"alfa bravo", u"5"),
         (u"2", u"alfa", u"12"),
         (u"3", u"bravo", u"2"),
         (u"4", u"bravo bravo", u"7"),
     )
     writer = index.writer()
     for doc_id, summary, n_comments in documents:
         writer.add_document(id=doc_id, summary=summary,
                             n_comments=n_comments)
     writer.commit()

     searcher = index.searcher(weighting=CommentWeighting())
     parsed = qparser.QueryParser("summary").parse("alfa OR bravo")
     hits = searcher.search(parsed)
     found = [hit["id"] for hit in hits]
     self.assertEqual(found, ["2", "4", "1", "3"])
Exemple #8
0
def test_not2():
    """Check ``NOT`` queries before and after deleting documents by term."""
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    documents = [
        ("a", "alfa bravo charlie delta echo"),
        ("b", "bravo charlie delta echo foxtrot"),
        ("c", "charlie delta echo foxtrot golf"),
        ("d", "delta echo golf hotel india"),
        ("e", "echo golf hotel india juliet"),
    ]
    writer = ix.writer()
    for name, value in documents:
        writer.add_document(name=u(name), value=u(value))
    writer.commit()

    parser = qparser.QueryParser("value", None)

    def names_matching(searcher, querystring):
        # Sorted names of all documents matching the parsed query string.
        hits = searcher.search(parser.parse(querystring))
        return sorted([hit["name"] for hit in hits])

    with ix.searcher() as searcher:
        assert_equal(names_matching(searcher, "echo NOT golf"), ["a", "b"])
        assert_equal(names_matching(searcher, "echo NOT bravo"),
                     ["c", "d", "e"])

    # Removes documents "a" and "b", the only ones containing "bravo".
    ix.delete_by_term("value", u("bravo"))

    with ix.searcher() as searcher:
        assert_equal(names_matching(searcher, "echo NOT charlie"), ["d", "e"])
Exemple #9
0
def test_missing_field_scoring():
    """Searching must work when a later document omits a scored field.

    The second document has no ``hobbies`` value; field lengths and a
    multi-field search for "baseball" are checked afterwards.
    """
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(name=u('Frank'), hobbies=u('baseball, basketball'))
    writer.commit()

    reader = ix.reader()
    assert_equal(reader.field_length("hobbies"), 2)
    assert_equal(reader.field_length("name"), 1)
    reader.close()

    # Second document deliberately leaves "hobbies" out.
    writer = ix.writer()
    writer.add_document(name=u('Jonny'))
    writer.commit()

    with ix.searcher() as searcher:
        reader = searcher.reader()
        assert_equal(len(ix._segments()), 1)
        assert_equal(reader.field_length("hobbies"), 2)
        assert_equal(reader.field_length("name"), 2)

        parser = qparser.MultifieldParser(['name', 'hobbies'], schema)
        hits = searcher.search(parser.parse(u("baseball")))
        assert_equal(len(hits), 1)
Exemple #10
0
    def test_weighting(self):
        """Hits of a range query must be ordered by stored ``n_comments``.

        The custom weighting's ``score`` returns the integer value of the
        matching document's stored comment count.
        """
        from whoosh.scoring import Weighting

        class CommentWeighting(Weighting):
            def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
                stored = searcher.stored_fields(docnum)
                return int(stored.get("n_comments", "0"))

        schema = fields.Schema(id=fields.ID(stored=True),
                               n_comments=fields.ID(stored=True))
        ix = RamStorage().create_index(schema)

        # (id, n_comments)
        documents = ((u"1", u"5"), (u"2", u"12"), (u"3", u"2"), (u"4", u"7"))
        writer = ix.writer()
        for doc_id, n_comments in documents:
            writer.add_document(id=doc_id, n_comments=n_comments)
        writer.commit()

        searcher = ix.searcher(weighting=CommentWeighting())
        parsed = qparser.QueryParser("id").parse("[1 TO 4]")
        hits = searcher.search(parsed)
        found = [hit["id"] for hit in hits]
        self.assertEqual(found, ["2", "4", "1", "3"])
def search_column_headers(entities, graph, table):
    """Attach candidate property URIs to every column of ``table``.

    The values of all entities whose ``type`` is ``"property"`` are indexed
    in an in-memory n-gram index.  Each whitespace-separated word of a
    column's ``header`` is then searched in that index and the ``uri`` of
    every hit is added to the column's ``candidates`` set.

    :param entities: mapping of URI -> entity; entities expose ``type`` and
        ``values``.
    :param graph: unused by this function; kept for interface compatibility.
    :param table: object exposing ``columns``; each column exposes ``header``
        and ``candidates``.
    """
    # Initialize the n-gram index.
    schema = Schema(
        title=NGRAMWORDS(minsize=2, maxsize=4, stored=True, field_boost=1.0, 
                         tokenizer=None, at='start', queryor=False, sortable=False),
        uri=TEXT(stored=True)
    )

    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    for e in entities:
        entity = entities[e]
        if entity.type != "property":
            continue
        
        for value in entity.values:
            writer.add_document(title=unicode(value), uri=unicode(e))
    
    writer.commit()

    # The parser and the searcher do not depend on the column, so build them
    # once instead of once per column.
    qp = QueryParser("title", schema=ix.schema)
    with ix.searcher() as searcher:
        # Loop over the literal column headers.
        for column in table.columns:
            # split() without arguments already discards surrounding
            # whitespace, so the words need no further stripping.
            for word in column.header.split():
                for result in searcher.search(qp.parse(word)):
                    column.candidates.add(result['uri'])
Exemple #12
0
    def test_deletion(self):
        """Delete one of three documents and check counts and the lexicon."""
        schema = fields.Schema(key=fields.ID, name=fields.TEXT,
                               value=fields.TEXT)
        ix = RamStorage().create_index(schema)

        # (key, name, value)
        rows = (
            (u"A", u"Yellow brown", u"Blue red green purple?"),
            (u"B", u"Alpha beta", u"Gamma delta epsilon omega."),
            (u"C", u"One two", u"Three four five."),
        )
        writer = ix.writer()
        for key, name, value in rows:
            writer.add_document(key=key, name=name, value=value)
        writer.commit()

        deleted = ix.delete_by_term("key", u"B")
        self.assertEqual(deleted, 1)
        ix.commit()

        # The deleted document still counts towards doc_count_all().
        self.assertEqual(ix.doc_count_all(), 3)
        self.assertEqual(ix.doc_count(), 2)

        ix.optimize()
        self.assertEqual(ix.doc_count(), 2)
        reader = ix.reader()
        # After optimizing, the terms of document "B" are gone.
        self.assertEqual(list(reader.lexicon("name")),
                         ["brown", "one", "two", "yellow"])
        reader.close()
Exemple #13
0
    def test_merged(self):
        """A term must stay findable after a second commit merges segments."""
        schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
        ix = RamStorage().create_index(schema)

        def assert_bravo_found():
            # Exactly one hit for "bravo", and it is the "bravo" document.
            searcher = ix.searcher()
            hits = searcher.search(query.Term("content", u"bravo"))
            self.assertEqual(len(hits), 1)
            self.assertEqual(hits[0]["id"], "bravo")

        writer = ix.writer()
        for word in (u"alfa", u"bravo", u"charlie", u"delta"):
            writer.add_document(id=word, content=word)
        writer.commit()

        assert_bravo_found()

        writer = ix.writer()
        writer.add_document(id=u"echo", content=u"echo")
        writer.commit()
        self.assertEqual(len(ix.segments), 1)

        assert_bravo_found()
Exemple #14
0
    def test_frequency_text(self):
        """Check document and total frequencies of terms in a KEYWORD field."""
        schema = fields.Schema(content=fields.KEYWORD)
        ix = RamStorage().create_index(schema)

        writer = ix.writer()
        for text in (u"alfa bravo charlie delta echo",
                     u"bravo bravo bravo bravo charlie delta delta",
                     u"delta echo foxtrot"):
            writer.add_document(content=text)
        writer.commit()

        # (term, number of documents containing it, total occurrences)
        expected = (
            (u"bravo", 2, 5),
            (u"echo", 2, 2),
            (u"alfa", 1, 1),
            (u"delta", 3, 4),
            (u"foxtrot", 1, 1),
            (u"zulu", 0, 0),  # never indexed
        )

        reader = ix.reader()
        for term, doc_freq, total_freq in expected:
            self.assertEqual(reader.doc_frequency("content", term), doc_freq)
            self.assertEqual(reader.frequency("content", term), total_freq)

        # Iterating the reader yields (field, term, doc freq, total freq).
        all_terms = list(reader)
        self.assertEqual(all_terms,
                         [(0, u"alfa", 1, 1), (0, u"bravo", 2, 5),
                          (0, u"charlie", 2, 2), (0, u"delta", 3, 4),
                          (0, u"echo", 2, 2), (0, u"foxtrot", 1, 1)])
        reader.close()
Exemple #15
0
    def test_frequency_keyword(self):
        """Check frequencies of upper-case keywords in a KEYWORD field."""
        schema = fields.Schema(content=fields.KEYWORD)
        ix = RamStorage().create_index(schema)

        writer = ix.writer()
        for text in (u"A B C D E", u"B B B B C D D", u"D E F"):
            writer.add_document(content=text)
        writer.commit()

        # (term, number of documents containing it, total occurrences)
        expected = (
            (u"B", 2, 5),
            (u"E", 2, 2),
            (u"A", 1, 1),
            (u"D", 3, 4),
            (u"F", 1, 1),
            (u"Z", 0, 0),  # never indexed
        )

        reader = ix.reader()
        for term, doc_freq, total_freq in expected:
            self.assertEqual(reader.doc_frequency("content", term), doc_freq)
            self.assertEqual(reader.frequency("content", term), total_freq)

        # Iterating the reader yields (field, term, doc freq, total freq).
        all_terms = list(reader)
        self.assertEqual(all_terms,
                         [(0, u"A", 1, 1), (0, u"B", 2, 5),
                          (0, u"C", 2, 2), (0, u"D", 3, 4),
                          (0, u"E", 2, 2), (0, u"F", 1, 1)])
        reader.close()
Exemple #16
0
    def test_merged_lengths(self):
        """Field lengths must be readable across several unmerged commits."""
        schema = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                               f2=fields.KEYWORD(stored=True, scorable=True))
        ix = RamStorage().create_index(schema)

        def write_batch(pairs, *commit_args):
            # Add every (f1, f2) pair with a fresh writer, then commit it
            # with the given commit arguments.
            writer = ix.writer()
            for f1, f2 in pairs:
                writer.add_document(f1=f1, f2=f2)
            writer.commit(*commit_args)

        write_batch([(u"A B C", u"X"), (u"B C D E", u"Y Z")])
        write_batch([(u"A", u"B C D E X Y"), (u"B C", u"X")], NO_MERGE)
        write_batch([(u"A B X Y Z", u"B C"), (u"Y X", u"A B")], NO_MERGE)

        reader = ix.reader()
        self.assertEqual(reader.stored_fields(0)["f1"], u"A B C")
        # Document numbers follow insertion order across the three batches.
        self.assertEqual(reader.doc_field_length(0, "f1"), 3)
        self.assertEqual(reader.doc_field_length(2, "f2"), 6)
        self.assertEqual(reader.doc_field_length(4, "f1"), 5)
        reader.close()
Exemple #17
0
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the "whoosh" search library.
    """
    def __init__(self, toolbox):
        """
        Create a searcher for `toolbox` and build its index immediately.
        """
        self.toolbox = toolbox
        self.build_index()

    def build_index(self):
        """
        Build an in-memory index with one document per tool in
        `self.toolbox.tools_by_id`, storing the tool's id, name (as `title`),
        description and help.
        """
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        ## TODO: would also be nice to search section headers.
        for tool_id, tool in self.toolbox.tools_by_id.iteritems():
            writer.add_document(id=tool_id,
                                title=to_unicode(tool.name),
                                description=to_unicode(tool.description),
                                help=to_unicode(tool.help))
        writer.commit()

    def search(self, query, return_attribute='id'):
        """
        Search title, description and help for `query` and return the
        `return_attribute` value of every hit.
        """
        # Change field boosts for searcher to place more weight on title, description than help.
        weighting = BM25F(field_B={'title_B': 3, 'description_B': 2, 'help_B': 1})
        searcher = self.index.searcher(weighting=weighting)
        try:
            # Set query to search title, description, and help.
            parser = MultifieldParser(['title', 'description', 'help'],
                                      schema=schema)
            results = searcher.search(parser.parse(query))
            # Copy the wanted values out before the searcher is closed.
            return [result[return_attribute] for result in results]
        finally:
            # Release the reader resources held by the searcher.
            searcher.close()
Exemple #18
0
    def test_finalweighting(self):
        """Hits must be ordered by the integer value of stored ``n_comments``.

        The custom weighting scores every posting as 0 and returns the stored
        comment count from ``final()``.
        """
        from whoosh.scoring import Weighting

        class CommentWeighting(Weighting):
            def score(self, *args, **kwargs):
                return 0

            def final(self, searcher, docnum, score):
                stored = searcher.stored_fields(docnum)
                return int(stored.get("n_comments"))

        schema = fields.Schema(id=fields.ID(stored=True),
                               summary=fields.TEXT,
                               n_comments=fields.ID(stored=True))
        ix = RamStorage().create_index(schema)

        # (id, summary, n_comments)
        documents = (
            (u"1", u"alfa bravo", u"5"),
            (u"2", u"alfa", u"12"),
            (u"3", u"bravo", u"2"),
            (u"4", u"bravo bravo", u"7"),
        )
        writer = ix.writer()
        for doc_id, summary, n_comments in documents:
            writer.add_document(id=doc_id, summary=summary,
                                n_comments=n_comments)
        writer.commit()

        searcher = ix.searcher(weighting=CommentWeighting())
        parsed = qparser.QueryParser("summary").parse("alfa OR bravo")
        hits = searcher.search(parsed)
        found = [hit["id"] for hit in hits]
        self.assertEqual(found, ["2", "4", "1", "3"])
Exemple #19
0
    def create_index(cls, app, wh):
        """Creates and opens an index for the given whoosheer and app.
        If the index already exists, it just opens it, otherwise it creates
        it first.

        When the app is configured with ``memory_storage`` a fresh in-memory
        index is created on every call.

        :param app: The application instance.
        :param wh: The whoosheer instance for which a index should be created.
        :returns: The opened or newly created index.
        """
        # TODO: do we really want/need to use camel casing?
        # everywhere else, there is just .lower()
        config = app.extensions['whooshee']
        if config['memory_storage']:
            storage = RamStorage()
            index = storage.create_index(wh.schema)
            assert index
            return index

        index_path = os.path.join(
            config['index_path_root'],
            getattr(wh, 'index_subdir', cls.camel_to_snake(wh.__name__)))
        if whoosh.index.exists_in(index_path):
            return whoosh.index.open_dir(index_path)

        # Create the directory without a separate existence check: another
        # process may create it between the check and makedirs().  An
        # OSError is only ignored when the directory is actually there.
        try:
            os.makedirs(index_path)
        except OSError:
            if not os.path.isdir(index_path):
                raise
        return whoosh.index.create_in(index_path, wh.schema)
Exemple #20
0
    def test_pages(self):
        """Exercise ``search_page`` on six documents ranked by term frequency.

        Document "1" contains "alfa" six times, document "6" once, so a
        frequency-based ranking returns the ids in ascending order.
        """
        from whoosh.scoring import Frequency

        schema = fields.Schema(id=fields.ID(stored=True), c=fields.TEXT)
        index = RamStorage().create_index(schema)

        writer = index.writer()
        for rank, doc_id in enumerate(u"123456"):
            # rank 0 -> six repetitions ... rank 5 -> one repetition
            writer.add_document(id=doc_id,
                                c=u" ".join([u"alfa"] * (6 - rank)))
        writer.commit()

        searcher = index.searcher(weighting=Frequency)
        term = query.Term("c", u"alfa")

        hits = searcher.search(term)
        self.assertEqual([hit["id"] for hit in hits],
                         ["1", "2", "3", "4", "5", "6"])

        page = searcher.search_page(term, 2, pagelen=2)
        self.assertEqual([hit["id"] for hit in page], ["3", "4"])

        # Asking for a page past the end: the assertions below expect the
        # last available page (page 2, holding the remaining two hits).
        page = searcher.search_page(term, 10, pagelen=4)
        self.assertEqual(page.total, 6)
        self.assertEqual(page.pagenum, 2)
        self.assertEqual(page.pagelen, 2)
Exemple #21
0
def test_term_inspection():
    """Check the reader's term-inspection helpers on a two-document index."""
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()

    content_terms = " ".join(reader.field_terms("content"))
    assert content_terms == "aa ab ax bb cc dd ee"

    a_terms = list(reader.expand_prefix("content", "a"))
    assert a_terms == [b('aa'), b('ab'), b('ax')]

    content_words = ('aa', 'ab', 'ax', 'bb', 'cc', 'dd', 'ee')
    title_words = ('document', 'my', 'other')
    expected_terms = set([('content', b(word)) for word in content_words]
                         + [('title', b(word)) for word in title_words])
    assert set(reader.all_terms()) == expected_terms

    # (text, doc_freq, index_freq)
    content_stats = _fstats(reader.iter_field("content"))
    assert content_stats == [(b('aa'), 2, 6), (b('ab'), 1, 1),
                             (b('ax'), 1, 2), (b('bb'), 2, 5),
                             (b('cc'), 2, 3), (b('dd'), 2, 2),
                             (b('ee'), 2, 4)]

    # The expected result contains every term from "c" onwards, not only
    # terms starting with "c".
    stats_from_c = _fstats(reader.iter_field("content", prefix="c"))
    assert stats_from_c == [(b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)]

    most_frequent = list(reader.most_frequent_terms("content"))
    assert most_frequent == [(6, b('aa')), (5, b('bb')), (4, b('ee')),
                             (3, b('cc')), (2, b('dd'))]

    most_frequent_a = list(reader.most_frequent_terms("content", prefix="a"))
    assert most_frequent_a == [(6, b('aa')), (2, b('ax')), (1, b('ab'))]

    most_distinctive = list(reader.most_distinctive_terms("content", 3))
    assert most_distinctive == [(1.3862943611198906, b('ax')),
                                (0.6931471805599453, b('ab')),
                                (0.0, b('ee'))]
def test_intersection():
    """``And`` of two terms must match across two separately committed batches."""
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    def commit_docs(pairs):
        # Write the (key, value) pairs with a fresh writer and commit.
        writer = ix.writer()
        for key, value in pairs:
            writer.add_document(key=u(key), value=u(value))
        writer.commit()

    commit_docs([("a", "alpha bravo charlie delta"),
                 ("b", "echo foxtrot alpha bravo"),
                 ("c", "charlie delta golf hotel")])
    commit_docs([("d", "india alpha bravo charlie"),
                 ("e", "delta bravo india bravo")])

    def both(first, second):
        # Query requiring both words in the "value" field.
        return And([Term("value", u(first)), Term("value", u(second))])

    with ix.searcher() as searcher:
        matcher = both("bravo", "delta").matcher(searcher)
        assert _keys(searcher, matcher.all_ids()) == ["a", "e"]

        matcher = both("bravo", "alpha").matcher(searcher)
        assert _keys(searcher, matcher.all_ids()) == ["a", "b", "d"]
Exemple #23
0
def test_merged():
    """A term must stay findable after a second commit merges segments."""
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    ix = RamStorage().create_index(schema)

    def assert_bravo_found():
        # Exactly one hit for "bravo", and it is the "bravo" document.
        with ix.searcher() as searcher:
            hits = searcher.search(Term("content", u("bravo")))
            assert_equal(len(hits), 1)
            assert_equal(hits[0]["id"], "bravo")

    writer = ix.writer()
    for word in ("alfa", "bravo", "charlie", "delta"):
        writer.add_document(id=u(word), content=u(word))
    writer.commit()

    assert_bravo_found()

    writer = ix.writer()
    writer.add_document(id=u("echo"), content=u("echo"))
    writer.commit()
    assert_equal(len(ix._segments()), 1)

    assert_bravo_found()
 def test_merged(self):
     """A term must stay findable after a second commit merges segments."""
     schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
     ix = RamStorage().create_index(schema)

     def assert_bravo_found():
         # Exactly one hit for "bravo", and it is the "bravo" document.
         searcher = ix.searcher()
         hits = searcher.search(query.Term("content", u"bravo"))
         self.assertEqual(len(hits), 1)
         self.assertEqual(hits[0]["id"], "bravo")

     writer = ix.writer()
     for word in (u"alfa", u"bravo", u"charlie", u"delta"):
         writer.add_document(id=word, content=word)
     writer.commit()

     assert_bravo_found()

     writer = ix.writer()
     writer.add_document(id=u"echo", content=u"echo")
     writer.commit()
     self.assertEqual(len(ix.segments), 1)

     assert_bravo_found()
Exemple #25
0
def test_weighting():
    """Hits of a term-range query must be ordered by stored ``n_comments``."""
    from whoosh.scoring import Weighting, BaseScorer

    # Fake Weighting implementation
    class CommentWeighting(Weighting):
        class CommentScorer(BaseScorer):
            def __init__(self, stored_fields):
                # Callable mapping a document number to its stored fields.
                self.stored_fields = stored_fields

            def score(self, matcher):
                stored = self.stored_fields(matcher.id())
                return stored.get("n_comments", 0)

        def scorer(self, searcher, fieldname, text, qf=1):
            return self.CommentScorer(searcher.stored_fields)

    schema = fields.Schema(id=fields.ID(stored=True),
                           n_comments=fields.STORED)
    ix = RamStorage().create_index(schema)

    # (id, n_comments)
    documents = [("1", 5), ("2", 12), ("3", 2), ("4", 7)]
    writer = ix.writer()
    for doc_id, n_comments in documents:
        writer.add_document(id=u(doc_id), n_comments=n_comments)
    writer.commit()

    with ix.searcher(weighting=CommentWeighting()) as searcher:
        id_range = TermRange("id", u("1"), u("4"), constantscore=False)

        hits = searcher.search(id_range)
        found = [hit["id"] for hit in hits]
        assert_equal(found, ["2", "4", "1", "3"])
Exemple #26
0
class ToolBoxSearch( object ):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """

    def __init__( self, toolbox, index_help=True ):
        """
        Create a searcher for `toolbox` and build its index immediately.
        `index_help` is passed on to `build_index`.
        """
        self.toolbox = toolbox
        self.build_index( index_help )

    def build_index( self, index_help=True ):
        """
        Build an in-memory index with one document per tool returned by
        `self.toolbox.tools()`, skipping tools of type 'manage_data'.
        Rendered help text is indexed only when `index_help` is true.
        """
        log.debug( 'Starting to build toolbox index.' )
        self.storage = RamStorage()
        self.index = self.storage.create_index( schema )
        writer = self.index.writer()
        for tool_id, tool in self.toolbox.tools():
            #  Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            # Look the panel section up once; the section name is only used
            # when the call returns exactly two items.
            panel_section = tool.get_panel_section()
            section_name = panel_section[1] if len( panel_section ) == 2 else ''
            add_doc_kwds = {
                "id": tool_id,
                "name": to_unicode( tool.name ),
                "description": to_unicode( tool.description ),
                "section": to_unicode( section_name ),
                "help": to_unicode( "" )
            }
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode( " ".join( tool.labels ) )
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render; the tool is indexed with empty help, but
                    # leave a trace so the failure can be diagnosed.
                    log.debug( 'Could not render help for tool %s; indexing it without help.', tool_id, exc_info=True )
            writer.add_document( **add_doc_kwds )
        writer.commit()
        log.debug( 'Toolbox index finished.' )

    def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_help_boost, tool_search_limit ):
        """
        Perform search on the in-memory index. Weight in the given boosts.

        The query `q` is wrapped in `*` wildcards and searched in the name,
        description, section, help and labels fields. Returns the list of
        ids of the hits, limited by `tool_search_limit`.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(
            weighting=BM25F(
                field_B={ 'name_B': float( tool_name_boost ),
                          'section_B': float( tool_section_boost ),
                          'description_B': float( tool_description_boost ),
                          'help_B': float( tool_help_boost ) }
            )
        )
        try:
            # Set query to search name, description, section, help, and labels.
            parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels' ], schema=schema )
            # Perform the search
            hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
            # Copy the ids out before the searcher is closed.
            return [ hit[ 'id' ] for hit in hits ]
        finally:
            # Release the reader resources held by the searcher.
            searcher.close()
Exemple #27
0
 def test_writer_delete(self):
     """Delete a document through the writer and commit with OPTIMIZE."""
     from whoosh.filedb.filewriting import OPTIMIZE

     schema = fields.Schema(key=fields.ID(stored=True),
                            value=fields.TEXT(stored=True))
     ix = RamStorage().create_index(schema)

     writer = ix.writer()
     for key, value in ((u"1", u"alfa"), (u"2", u"bravo"), (u"3", u"charlie")):
         writer.add_document(key=key, value=value)
     writer.commit()

     searcher = ix.searcher()
     for key, value in ((u"1", "alfa"), (u"2", "bravo"), (u"3", "charlie")):
         self.assertEqual(searcher.document(key=key)["value"], value)
     searcher.close()

     writer = ix.writer()
     writer.delete_by_term("key", u"2")
     writer.commit(OPTIMIZE)

     searcher = ix.searcher()
     for key, value in ((u"1", "alfa"), (u"3", "charlie")):
         self.assertEqual(searcher.document(key=key)["value"], value)
     # The deleted key must no longer appear in the lexicon.
     remaining_keys = list(searcher.reader().lexicon("key"))
     self.assertEqual(remaining_keys, ["1", "3"])
     searcher.close()
Exemple #28
0
    def test_intersection(self):
        """``And`` of two terms must match across two separately committed batches."""
        schema = fields.Schema(key=fields.ID(stored=True),
                               value=fields.TEXT(stored=True))
        ix = RamStorage().create_index(schema)

        def commit_docs(pairs):
            # Write the (key, value) pairs with a fresh writer and commit.
            writer = ix.writer()
            for key, value in pairs:
                writer.add_document(key=key, value=value)
            writer.commit()

        commit_docs(((u"a", u"alpha bravo charlie delta"),
                     (u"b", u"echo foxtrot alpha bravo"),
                     (u"c", u"charlie delta golf hotel")))
        commit_docs(((u"d", u"india alpha bravo charlie"),
                     (u"e", u"delta bravo india bravo")))

        searcher = ix.searcher()

        def keys_with_both(first, second):
            # Keys of the documents whose "value" contains both words.
            conjunction = And([Term("value", first), Term("value", second)])
            scorer = conjunction.scorer(searcher)
            return self._keys(searcher, scorer.all_ids())

        self.assertEqual(keys_with_both(u"bravo", u"delta"), ["a", "e"])
        self.assertEqual(keys_with_both(u"bravo", u"alpha"), ["a", "b", "d"])
    def create_index(cls, app, wh):
        """Creates and opens an index for the given whoosheer and app.
        If the index already exists, it just opens it, otherwise it creates
        it first.

        When the app is configured with ``memory_storage`` a fresh in-memory
        index is created on every call.

        :param app: The application instance.
        :param wh: The whoosheer instance for which a index should be created.
        :returns: The opened or newly created index.
        """
        # TODO: do we really want/need to use camel casing?
        # everywhere else, there is just .lower()
        config = app.extensions['whooshee']
        if config['memory_storage']:
            storage = RamStorage()
            index = storage.create_index(wh.schema)
            assert index
            return index

        index_path = os.path.join(config['index_path_root'],
                                  getattr(wh, 'index_subdir', cls.camel_to_snake(wh.__name__)))
        if whoosh.index.exists_in(index_path):
            return whoosh.index.open_dir(index_path)

        # Create the directory without a separate existence check: another
        # process may create it between the check and makedirs().  An
        # OSError is only ignored when the directory is actually there.
        try:
            os.makedirs(index_path)
        except OSError:
            if not os.path.isdir(index_path):
                raise
        return whoosh.index.create_in(index_path, wh.schema)
Exemple #30
0
def test_datetime():
    """Index one document per (month, day) pair and query the DATETIME field."""
    date_field = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=date_field)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    # xrange stops before its upper bound: months 1-11, days 1-27.
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            doc_id = u("%s-%s") % (month, day)
            stamp = datetime(2010, month, day, 14, 0, 0)
            writer.add_document(id=doc_id, date=stamp)
    writer.commit()

    with ix.searcher() as searcher:
        parser = qparser.QueryParser("id", schema)

        # A full date matches exactly one document.
        results = searcher.search(parser.parse("date:20100523"))
        assert len(results) == 1
        assert results[0]["id"] == "5-23"
        assert results[0]["date"].__class__ is datetime
        assert results[0]["date"].month == 5
        assert results[0]["date"].day == 23

        # A year/month matches every indexed day of that month.
        results = searcher.search(parser.parse("date:'2010 02'"))
        assert len(results) == 27

        # A month range is parsed into a numeric range that spans from the
        # first instant of the start month to the last instant of the end
        # month.
        range_query = parser.parse(u("date:[2010-05 to 2010-08]"))
        range_start = datetime(2010, 5, 1, 0, 0, 0, 0)
        range_end = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert range_query.__class__ is query.NumericRange
        assert range_query.start == times.datetime_to_long(range_start)
        assert range_query.end == times.datetime_to_long(range_end)
Exemple #31
0
def test_term_inspection():
    """Check the term-inspection methods of an index reader.

    Two documents are indexed, then the reader's lexicon, prefix
    expansion, full term listing, per-field statistics and most-frequent
    term listings are compared against the expected values.
    """
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()
    try:
        assert_equal(list(reader.lexicon("content")),
                     [u('aa'), u('ab'), u('ax'), u('bb'), u('cc'), u('dd'),
                      u('ee')])
        assert_equal(list(reader.expand_prefix("content", "a")),
                     [u('aa'), u('ab'), u('ax')])
        assert (set(reader.all_terms())
                == set([('content', u('aa')), ('content', u('ab')),
                        ('content', u('ax')), ('content', u('bb')),
                        ('content', u('cc')), ('content', u('dd')),
                        ('content', u('ee')), ('title', u('document')),
                        ('title', u('my')), ('title', u('other'))]))
        # (text, doc_freq, index_freq)
        assert_equal(_fstats(reader.iter_field("content")),
                     [(u('aa'), 2, 6), (u('ab'), 1, 1), (u('ax'), 1, 2),
                      (u('bb'), 2, 5), (u('cc'), 2, 3), (u('dd'), 2, 2),
                      (u('ee'), 2, 4)])
        # With prefix="c" the expected listing runs from 'cc' through the
        # end of the field, not only over terms that start with 'c'.
        assert_equal(_fstats(reader.iter_field("content", prefix="c")),
                     [(u('cc'), 2, 3), (u('dd'), 2, 2), (u('ee'), 2, 4)])
        assert_equal(list(reader.most_frequent_terms("content")),
                     [(6, u('aa')), (5, u('bb')), (4, u('ee')), (3, u('cc')),
                      (2, u('dd'))])
        assert_equal(list(reader.most_frequent_terms("content", prefix="a")),
                     [(6, u('aa')), (2, u('ax')), (1, u('ab'))])
    finally:
        # Release the reader even when one of the assertions above fails.
        reader.close()
Exemple #32
0
def test_frequency_keyword():
    """Check per-term document and occurrence counts on a KEYWORD field."""
    schema = fields.Schema(content=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for text in (u("A B C D E"), u("B B B B C D D"), u("D E F")):
        writer.add_document(content=text)
    writer.commit()

    # (term, number of documents containing it, total occurrences)
    expected_counts = [
        (u("B"), 2, 5),
        (u("E"), 2, 2),
        (u("A"), 1, 1),
        (u("D"), 3, 4),
        (u("F"), 1, 1),
        (u("Z"), 0, 0),  # never indexed
    ]

    with ix.reader() as reader:
        for term, doc_count, total in expected_counts:
            assert reader.doc_frequency("content", term) == doc_count
            assert reader.frequency("content", term) == total

        # Iterating the reader yields ((fieldname, text), terminfo) pairs.
        stats = []
        for (fieldname, text), info in reader:
            stats.append((fieldname, text,
                          info.doc_frequency(), info.weight()))

        assert stats == [("content", b("A"), 1, 1), ("content", b("B"), 2, 5),
                         ("content", b("C"), 2, 2), ("content", b("D"), 3, 4),
                         ("content", b("E"), 2, 2), ("content", b("F"), 1, 1)]
Exemple #33
0
def test_stored_fields():
    """Check that only stored fields come back from the searcher.

    Field ``c`` is a plain KEYWORD field in this schema and is absent
    from every expected result dictionary.
    """
    schema = fields.Schema(a=fields.ID(stored=True),
                           b=fields.STORED,
                           c=fields.KEYWORD,
                           d=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    rows = [
        (u("1"), "a", u("zulu"), u("Alfa")),
        (u("2"), "b", u("yankee"), u("Bravo")),
        (u("3"), "c", u("xray"), u("Charlie")),
    ]
    writer = ix.writer()
    for a_val, b_val, c_val, d_val in rows:
        writer.add_document(a=a_val, b=b_val, c=c_val, d=d_val)
    writer.commit()

    first = {"a": u("1"), "b": "a", "d": u("Alfa")}
    second = {"a": u("2"), "b": "b", "d": u("Bravo")}
    third = {"a": u("3"), "b": "c", "d": u("Charlie")}

    with ix.searcher() as searcher:
        # Lookup by document number.
        assert searcher.stored_fields(0) == first
        assert searcher.stored_fields(2) == third

        # Lookup by field value.
        assert searcher.document(a=u("1")) == first
        assert searcher.document(a=u("2")) == second
Exemple #34
0
def test_frequency_text():
    """Check per-term document and occurrence counts for whole-word terms."""
    schema = fields.Schema(content=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for text in (u("alfa bravo charlie delta echo"),
                 u("bravo bravo bravo bravo charlie delta delta"),
                 u("delta echo foxtrot")):
        writer.add_document(content=text)
    writer.commit()

    # (term, number of documents containing it, total occurrences)
    expected_counts = [
        (u("bravo"), 2, 5),
        (u("echo"), 2, 2),
        (u("alfa"), 1, 1),
        (u("delta"), 3, 4),
        (u("foxtrot"), 1, 1),
        (u("zulu"), 0, 0),  # never indexed
    ]

    with ix.reader() as reader:
        for term, doc_count, total in expected_counts:
            assert reader.doc_frequency("content", term) == doc_count
            assert reader.frequency("content", term) == total

        # Iterating the reader yields ((fieldname, text), terminfo) pairs.
        stats = []
        for (fieldname, text), info in reader:
            stats.append((fieldname, text,
                          info.doc_frequency(), info.weight()))

        assert stats == [("content", b("alfa"), 1, 1),
                         ("content", b("bravo"), 2, 5),
                         ("content", b("charlie"), 2, 2),
                         ("content", b("delta"), 3, 4),
                         ("content", b("echo"), 2, 2),
                         ("content", b("foxtrot"), 1, 1)]
Exemple #35
0
def _create_index():
    """Return a new in-memory index with KEYWORD fields f1 (stored), f2, f3."""
    schema = fields.Schema(
        f1=fields.KEYWORD(stored=True),
        f2=fields.KEYWORD,
        f3=fields.KEYWORD,
    )
    return RamStorage().create_index(schema)
 def test_intersection(self):
     """Check And queries over documents written in two separate commits."""
     schema = fields.Schema(key=fields.ID(stored=True),
                            value=fields.TEXT(stored=True))
     index = RamStorage().create_index(schema)

     # Each batch is written and committed with its own writer.
     batches = [
         [(u"a", u"alpha bravo charlie delta"),
          (u"b", u"echo foxtrot alpha bravo"),
          (u"c", u"charlie delta golf hotel")],
         [(u"d", u"india alpha bravo charlie"),
          (u"e", u"delta bravo india bravo")],
     ]
     for batch in batches:
         writer = index.writer()
         for key, value in batch:
             writer.add_document(key=key, value=value)
         writer.commit()

     searcher = index.searcher()

     # (words that must all be present, keys of the matching documents)
     cases = [
         ((u"bravo", u"delta"), ["a", "e"]),
         ((u"bravo", u"alpha"), ["a", "b", "d"]),
     ]
     for words, expected_keys in cases:
         conjunction = And([Term("value", word) for word in words])
         scorer = conjunction.scorer(searcher)
         self.assertEqual(self._keys(searcher, scorer.all_ids()),
                          expected_keys)
Exemple #37
0
def test_datetime():
    """Query a stored DATETIME field by exact day, by month and by range."""
    date_field = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=date_field)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    # Upper bounds are exclusive, so months 1-11 and days 1-27 are indexed.
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            doc_id = u("%s-%s") % (month, day)
            when = datetime(2010, month, day, 14, 0, 0)
            writer.add_document(id=doc_id, date=when)
    writer.commit()

    with ix.searcher() as searcher:
        parser = qparser.QueryParser("id", schema)

        # Exact day: a single hit carrying a real datetime value.
        day_hits = searcher.search(parser.parse("date:20100523"))
        assert len(day_hits) == 1
        assert day_hits[0]["id"] == "5-23"
        assert day_hits[0]["date"].__class__ is datetime
        assert day_hits[0]["date"].month == 5
        assert day_hits[0]["date"].day == 23

        # Whole month: one hit per indexed day.
        month_hits = searcher.search(parser.parse("date:'2010 02'"))
        assert len(month_hits) == 27

        # Month range: parsed to a numeric range whose ends are the first
        # instant of May and the last instant of August.
        parsed = parser.parse(u("date:[2010-05 to 2010-08]"))
        first_instant = datetime(2010, 5, 1, 0, 0, 0, 0)
        last_instant = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert parsed.__class__ is query.NumericRange
        assert parsed.start == times.datetime_to_long(first_instant)
        assert parsed.end == times.datetime_to_long(last_instant)
Exemple #38
0
def _create_index():
    """Build an empty in-memory index with three KEYWORD fields.

    Only ``f1`` is declared as stored.
    """
    keyword_schema = fields.Schema(f1=fields.KEYWORD(stored=True),
                                   f2=fields.KEYWORD,
                                   f3=fields.KEYWORD)
    return RamStorage().create_index(keyword_schema)
Exemple #39
0
def test_frequency_text():
    """Verify doc_frequency/frequency and the reader's term statistics."""
    schema = fields.Schema(content=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    texts = [
        u("alfa bravo charlie delta echo"),
        u("bravo bravo bravo bravo charlie delta delta"),
        u("delta echo foxtrot"),
    ]
    writer = ix.writer()
    for text in texts:
        writer.add_document(content=text)
    writer.commit()

    # (term, documents containing the term, occurrences across all documents)
    checks = [
        (u("bravo"), 2, 5),
        (u("echo"), 2, 2),
        (u("alfa"), 1, 1),
        (u("delta"), 3, 4),
        (u("foxtrot"), 1, 1),
        (u("zulu"), 0, 0),  # absent from every document
    ]

    with ix.reader() as reader:
        for term, in_docs, occurrences in checks:
            assert reader.doc_frequency("content", term) == in_docs
            assert reader.frequency("content", term) == occurrences

        # The reader iterates as ((fieldname, text), terminfo) pairs.
        collected = []
        for (fieldname, text), terminfo in reader:
            collected.append((fieldname, text,
                              terminfo.doc_frequency(), terminfo.weight()))

        assert collected == [("content", b("alfa"), 1, 1),
                             ("content", b("bravo"), 2, 5),
                             ("content", b("charlie"), 2, 2),
                             ("content", b("delta"), 3, 4),
                             ("content", b("echo"), 2, 2),
                             ("content", b("foxtrot"), 1, 1)]
Exemple #40
0
def test_add_sortable():
    """Check that add_sortable() adds columns to an already-written index."""
    storage = RamStorage()
    schema = fields.Schema(chapter=fields.ID(stored=True),
                           price=fields.NUMERIC)
    ix = storage.create_index(schema)

    first_batch = [
        (u("alfa"), 100),
        (u("bravo"), 200),
        (u("charlie"), 300),
        (u("delta"), 400),
    ]
    second_batch = [
        (u("bravo"), 500),
        (u("alfa"), 600),
        (u("delta"), 100),
        (u("charlie"), 200),
    ]

    with ix.writer() as writer:
        for chapter, price in first_batch:
            writer.add_document(chapter=chapter, price=price)
    with ix.writer() as writer:
        for chapter, price in second_batch:
            writer.add_document(chapter=chapter, price=price)
        # Set after the documents are added, before the writer is committed
        # on leaving the block.
        writer.merge = False

    # Neither field has a column yet.
    with ix.reader() as reader:
        for fieldname in ("chapter", "price"):
            assert not reader.has_column(fieldname)

    with ix.writer() as writer:
        sorting.add_sortable(writer, "chapter",
                             sorting.StoredFieldFacet("chapter"))
        sorting.add_sortable(writer, "price", sorting.FieldFacet("price"))
        writer.schema.test = 100

    # Both columns now exist and hold the first document's values.
    with ix.reader() as reader:
        for fieldname in ("chapter", "price"):
            assert reader.has_column(fieldname)

        chapter_column = reader.column_reader("chapter")
        price_column = reader.column_reader("price")
        assert chapter_column[0] == "alfa"
        assert price_column[0] == 100
Exemple #41
0
 def test_frequency_text(self):
     """Check per-term document and occurrence counts for whole-word terms.

     Three documents are indexed in one commit, then the reader's
     doc_frequency()/frequency() results and its full term listing are
     compared against the expected values.
     """
     s = fields.Schema(content=fields.KEYWORD)
     st = RamStorage()
     ix = st.create_index(s)

     w = ix.writer()
     w.add_document(content=u"alfa bravo charlie delta echo")
     w.add_document(content=u"bravo bravo bravo bravo charlie delta delta")
     w.add_document(content=u"delta echo foxtrot")
     w.commit()

     tr = ix.reader()
     try:
         self.assertEqual(tr.doc_frequency("content", u"bravo"), 2)
         self.assertEqual(tr.frequency("content", u"bravo"), 5)
         self.assertEqual(tr.doc_frequency("content", u"echo"), 2)
         self.assertEqual(tr.frequency("content", u"echo"), 2)
         self.assertEqual(tr.doc_frequency("content", u"alfa"), 1)
         self.assertEqual(tr.frequency("content", u"alfa"), 1)
         self.assertEqual(tr.doc_frequency("content", u"delta"), 3)
         self.assertEqual(tr.frequency("content", u"delta"), 4)
         self.assertEqual(tr.doc_frequency("content", u"foxtrot"), 1)
         self.assertEqual(tr.frequency("content", u"foxtrot"), 1)
         # A term that was never indexed reports zero for both counts.
         self.assertEqual(tr.doc_frequency("content", u"zulu"), 0)
         self.assertEqual(tr.frequency("content", u"zulu"), 0)
         self.assertEqual(list(tr), [(0, u"alfa", 1, 1), (0, u"bravo", 2, 5),
                                     (0, u"charlie", 2, 2), (0, u"delta", 3, 4),
                                     (0, u"echo", 2, 2), (0, u"foxtrot", 1, 1)])
     finally:
         # Close the reader even when an assertion above fails.
         tr.close()
Exemple #42
0
def test_frequency_keyword():
    """Verify doc_frequency/frequency and term statistics for KEYWORD terms."""
    schema = fields.Schema(content=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    texts = [u("A B C D E"), u("B B B B C D D"), u("D E F")]
    writer = ix.writer()
    for text in texts:
        writer.add_document(content=text)
    writer.commit()

    # (term, documents containing the term, occurrences across all documents)
    checks = [
        (u("B"), 2, 5),
        (u("E"), 2, 2),
        (u("A"), 1, 1),
        (u("D"), 3, 4),
        (u("F"), 1, 1),
        (u("Z"), 0, 0),  # absent from every document
    ]

    with ix.reader() as reader:
        for term, in_docs, occurrences in checks:
            assert reader.doc_frequency("content", term) == in_docs
            assert reader.frequency("content", term) == occurrences

        # The reader iterates as ((fieldname, text), terminfo) pairs.
        collected = []
        for (fieldname, text), terminfo in reader:
            collected.append((fieldname, text,
                              terminfo.doc_frequency(), terminfo.weight()))

        assert collected == [("content", b("A"), 1, 1),
                             ("content", b("B"), 2, 5),
                             ("content", b("C"), 2, 2),
                             ("content", b("D"), 3, 4),
                             ("content", b("E"), 2, 2),
                             ("content", b("F"), 1, 1)]
Exemple #43
0
 def test_frequency_keyword(self):
     """Check per-term document and occurrence counts on a KEYWORD field.

     Three documents are indexed in one commit, then the reader's
     doc_frequency()/frequency() results and its full term listing are
     compared against the expected values.
     """
     s = fields.Schema(content=fields.KEYWORD)
     st = RamStorage()
     ix = st.create_index(s)

     w = ix.writer()
     w.add_document(content=u"A B C D E")
     w.add_document(content=u"B B B B C D D")
     w.add_document(content=u"D E F")
     w.commit()

     tr = ix.reader()
     try:
         self.assertEqual(tr.doc_frequency("content", u"B"), 2)
         self.assertEqual(tr.frequency("content", u"B"), 5)
         self.assertEqual(tr.doc_frequency("content", u"E"), 2)
         self.assertEqual(tr.frequency("content", u"E"), 2)
         self.assertEqual(tr.doc_frequency("content", u"A"), 1)
         self.assertEqual(tr.frequency("content", u"A"), 1)
         self.assertEqual(tr.doc_frequency("content", u"D"), 3)
         self.assertEqual(tr.frequency("content", u"D"), 4)
         self.assertEqual(tr.doc_frequency("content", u"F"), 1)
         self.assertEqual(tr.frequency("content", u"F"), 1)
         # A term that was never indexed reports zero for both counts.
         self.assertEqual(tr.doc_frequency("content", u"Z"), 0)
         self.assertEqual(tr.frequency("content", u"Z"), 0)
         self.assertEqual(list(tr), [(0, u"A", 1, 1), (0, u"B", 2, 5),
                                     (0, u"C", 2, 2), (0, u"D", 3, 4),
                                     (0, u"E", 2, 2), (0, u"F", 1, 1)])
     finally:
         # Close the reader even when an assertion above fails.
         tr.close()
Exemple #44
0
 def test_merged_lengths(self):
     """Check stored values and field lengths after three commits.

     The first commit uses the default commit arguments; the second and
     third are committed with NO_MERGE. Field lengths are then read back
     by document number.
     """
     s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                       f2=fields.KEYWORD(stored=True, scorable=True))
     st = RamStorage()
     ix = st.create_index(s)
     w = ix.writer()
     w.add_document(f1=u"A B C", f2=u"X")
     w.add_document(f1=u"B C D E", f2=u"Y Z")
     w.commit()

     w = ix.writer()
     w.add_document(f1=u"A", f2=u"B C D E X Y")
     w.add_document(f1=u"B C", f2=u"X")
     w.commit(NO_MERGE)

     w = ix.writer()
     w.add_document(f1=u"A B X Y Z", f2=u"B C")
     w.add_document(f1=u"Y X", f2=u"A B")
     w.commit(NO_MERGE)

     dr = ix.reader()
     try:
         self.assertEqual(dr.stored_fields(0)["f1"], u"A B C")
         self.assertEqual(dr.doc_field_length(0, "f1"), 3)
         # Documents 2 and 4 are the first documents of the second and
         # third commits respectively.
         self.assertEqual(dr.doc_field_length(2, "f2"), 6)
         self.assertEqual(dr.doc_field_length(4, "f1"), 5)
     finally:
         # Close the reader even when an assertion above fails.
         dr.close()
Exemple #45
0
class MemoryCodec(base.Codec):
    """Codec bound to one private ``RamStorage`` and one ``MemSegment``.

    The ``storage`` and ``segment`` arguments accepted by the factory
    methods are ignored: every reader and writer handed out is built on
    this codec's own storage and segment.
    """

    def __init__(self):
        # Imported inside the method, as the original does; presumably to
        # avoid an import cycle -- TODO confirm.
        from whoosh.filedb.filestore import RamStorage

        self.storage = RamStorage()
        self.segment = MemSegment(self, "blah")

    def writer(self, schema):
        """Create an index for ``schema`` in the private storage and return
        a ``MemWriter`` on it, based at the segment's current doc count."""
        index = self.storage.create_index(schema)
        doc_offset = self.segment._doccount
        return MemWriter(index, _lk=False, codec=self, docbase=doc_offset)

    def reader(self, schema):
        """Return a ``SegmentReader`` over the private storage and segment."""
        return SegmentReader(self.storage, schema, self.segment, codec=self)

    def per_document_writer(self, storage, segment):
        """Return a ``MemPerDocWriter``; the arguments are not used."""
        return MemPerDocWriter(self.storage, self.segment)

    def field_writer(self, storage, segment):
        """Return a ``MemFieldWriter``; the arguments are not used."""
        return MemFieldWriter(self.storage, self.segment)

    def per_document_reader(self, storage, segment):
        """Return a ``MemPerDocReader``; the arguments are not used."""
        return MemPerDocReader(self.storage, self.segment)

    def terms_reader(self, storage, segment):
        """Return a ``MemTermsReader``; the arguments are not used."""
        return MemTermsReader(self.storage, self.segment)

    def new_segment(self, storage, indexname):
        """Return the codec's single segment; the arguments are not used."""
        return self.segment
Exemple #46
0
class MemoryCodec(base.Codec):
    """In-memory codec: one ``RamStorage`` plus one shared ``MemSegment``.

    All factory methods build their objects from ``self.storage`` and
    ``self.segment``; the ``storage``/``segment`` parameters they accept
    are not consulted.
    """

    def __init__(self):
        # Function-scope import kept from the original; presumably it
        # sidesteps a circular import -- TODO confirm.
        from whoosh.filedb.filestore import RamStorage

        self.storage = RamStorage()
        self.segment = MemSegment(self, "blah")

    def writer(self, schema):
        """Return a ``MemWriter`` on a new index created for ``schema``.

        The writer's ``docbase`` is the segment's current ``_doccount``.
        """
        memory_index = self.storage.create_index(schema)
        writer_options = {
            "_lk": False,
            "codec": self,
            "docbase": self.segment._doccount,
        }
        return MemWriter(memory_index, **writer_options)

    def reader(self, schema):
        """Return a ``SegmentReader`` for ``schema`` on the shared segment."""
        return SegmentReader(self.storage, schema, self.segment, codec=self)

    def per_document_writer(self, storage, segment):
        """Return a per-document writer on the codec's own storage/segment."""
        return MemPerDocWriter(self.storage, self.segment)

    def field_writer(self, storage, segment):
        """Return a field writer on the codec's own storage/segment."""
        return MemFieldWriter(self.storage, self.segment)

    def per_document_reader(self, storage, segment):
        """Return a per-document reader on the codec's own storage/segment."""
        return MemPerDocReader(self.storage, self.segment)

    def terms_reader(self, storage, segment):
        """Return a terms reader on the codec's own storage/segment."""
        return MemTermsReader(self.storage, self.segment)

    def new_segment(self, storage, indexname):
        """Return the shared segment instead of creating a new one."""
        return self.segment
Exemple #47
0
    def test_random_intersections(self):
        """Compare And-query results against a brute-force scan.

        Documents made of random words are written over several commits.
        For each random set of target words the scorer's ids are read
        twice (via ``all_ids()`` and by stepping manually), the two
        listings must agree, and the matching keys must equal those found
        by scanning the document texts directly.
        """
        words = [
            u"alpha", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot",
            u"golf", u"hotel", u"india", u"juliet", u"kilo", u"lima", u"mike"
        ]
        commit_count = 5
        docs_per_commit = 50
        length_bounds = (3, 10)
        corpus = []  # (key as str, document text) for the brute-force scan

        schema = fields.Schema(key=fields.ID(stored=True),
                               value=fields.TEXT(stored=True))
        index = RamStorage().create_index(schema)

        next_key = 0
        for _commit in xrange(commit_count):
            writer = index.writer()
            for _doc in xrange(docs_per_commit):
                # randint() is drawn before any choice(), as in the
                # original generator expression.
                length = randint(*length_bounds)
                text = u" ".join(choice(words) for _ in xrange(length))
                writer.add_document(key=unicode(next_key), value=text)
                corpus.append((str(next_key), text))
                next_key += 1
            writer.commit()
        self.assertNotEqual(len(index.segments), 1)

        rounds = 50
        target_bounds = (2, 5)

        searcher = index.searcher()
        for _round in xrange(rounds):
            # Keep sampling until at least one document has every target.
            expected_keys = []
            while not expected_keys:
                targets = sample(words, randint(*target_bounds))
                expected_keys = [key for key, text in corpus
                                 if all(target in text for target in targets)]
            expected_keys.sort()

            conjunction = And([Term("value", target) for target in targets])
            scorer = conjunction.scorer(searcher)
            bulk_ids = list(scorer.all_ids())

            # Rewind and walk the same scorer one id at a time.
            scorer.reset()
            stepped_ids = []
            while scorer.id is not None:
                stepped_ids.append(scorer.id)
                scorer.next()
            self.assertEqual(bulk_ids, stepped_ids)

            found_keys = self._keys(searcher, bulk_ids)
            self.assertEqual(found_keys, expected_keys)
Exemple #48
0
class ToolBoxSearch( object ):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """

    def __init__( self, toolbox, index_help=True ):
        """
        Create a searcher for `toolbox`.

        :param toolbox: object whose ``tools()`` yields ``(id, tool)`` pairs.
        :param index_help: when true, rendered tool help is indexed as well.
        """
        self.toolbox = toolbox
        self.build_index( index_help )

    def build_index( self, index_help=True ):
        """
        (Re)build the in-memory index from every tool in the toolbox.

        Tools whose ``tool_type`` is ``'manage_data'`` are left out. A tool
        whose help fails to render is still indexed, with empty help.
        """
        log.debug( 'Starting to build toolbox index.' )
        self.storage = RamStorage()
        self.index = self.storage.create_index( schema )
        writer = self.index.writer()
        for tool_id, tool in self.toolbox.tools():
            #  Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            # The section name is taken from index 1, and only when the
            # panel section has exactly two items.
            panel_section = tool.get_panel_section()
            section_name = panel_section[1] if len( panel_section ) == 2 else ''
            add_doc_kwds = {
                "id": tool_id,
                "name": to_unicode( tool.name ),
                "description": to_unicode( tool.description ),
                "section": to_unicode( section_name ),
                "help": to_unicode( "" )
            }
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render; keep the empty help and leave a trace.
                    log.debug( 'Could not render help for tool %s; indexing it without help.', tool_id, exc_info=True )
            writer.add_document( **add_doc_kwds )
        writer.commit()
        log.debug( 'Toolbox index finished.' )

    def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_help_boost, tool_search_limit ):
        """
        Perform search on the in-memory index. Weight in the given boosts.

        The query is wrapped in ``*`` wildcards and run against the name,
        description, section and help fields.

        :returns: list of the stored ``id`` values of the hits.
        """
        # Change field boosts for searcher
        # NOTE(review): Whoosh documents per-field B values as individual
        # ``<fieldname>_B`` keyword arguments; here they are wrapped in a
        # single ``field_B`` dict -- confirm the boosts actually take effect.
        weighting = BM25F(
            field_B={ 'name_B': float( tool_name_boost ),
                      'section_B': float( tool_section_boost ),
                      'description_B': float( tool_description_boost ),
                      'help_B': float( tool_help_boost ) }
        )
        # Set query to search name, description, section, and help.
        parser = MultifieldParser( [ 'name', 'description', 'section', 'help' ], schema=schema )
        # Perform the search. The searcher is closed on exit, so the ids
        # are copied out of the hits while it is still open.
        with self.index.searcher( weighting=weighting ) as searcher:
            hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
            return [ hit[ 'id' ] for hit in hits ]
Exemple #49
0
def create_tenant_schema(tenants):
    """Build and return an in-memory index of ``tenants``.

    Each tenant is read with ``["id"]`` and ``["name"]``; the name is
    lower-cased before it is indexed. Both fields are stored.
    """
    schema = Schema(name=TEXT(stored=True), id=NUMERIC(stored=True))
    index = RamStorage().create_index(schema)

    writer = index.writer()
    for tenant in tenants:
        tenant_id = tenant["id"]
        lowered_name = tenant["name"].lower()
        writer.add_document(id=tenant_id, name=lowered_name)
    writer.commit()

    return index
Exemple #50
0
 def __init__(self):
     """Index every ``Tenant`` (ordered by name) in memory as ``self.index``.

     Tenant names are lower-cased before they are indexed; both the name
     and the id are stored fields.
     """
     all_tenants = Tenant.objects.all().order_by('name')

     schema = Schema(name=TEXT(stored=True), id=NUMERIC(stored=True))
     index = RamStorage().create_index(schema)

     writer = index.writer()
     for tenant in all_tenants:
         tenant_id = tenant.id
         lowered_name = tenant.name.lower()
         writer.add_document(id=tenant_id, name=lowered_name)
     writer.commit()

     self.index = index
 def test_random_intersections(self):
     """Compare And-query results against a brute-force scan.

     Documents made of random words are written over several commits.
     For each random set of target words the scorer's ids are read
     twice (via ``all_ids()`` and by stepping manually), the two
     listings must agree, and the matching keys must equal those found
     by scanning the document texts directly.
     """
     words = [u"alpha", u"bravo", u"charlie", u"delta", u"echo",
              u"foxtrot", u"golf", u"hotel", u"india", u"juliet", u"kilo",
              u"lima", u"mike"]
     commit_count = 5
     docs_per_commit = 50
     length_bounds = (3, 10)
     corpus = []  # (key as str, document text) for the brute-force scan

     schema = fields.Schema(key=fields.ID(stored=True),
                            value=fields.TEXT(stored=True))
     index = RamStorage().create_index(schema)

     next_key = 0
     for _commit in xrange(commit_count):
         writer = index.writer()
         for _doc in xrange(docs_per_commit):
             # randint() is drawn before any choice(), as in the
             # original generator expression.
             length = randint(*length_bounds)
             text = u" ".join(choice(words) for _ in xrange(length))
             writer.add_document(key=unicode(next_key), value=text)
             corpus.append((str(next_key), text))
             next_key += 1
         writer.commit()
     self.assertNotEqual(len(index.segments), 1)

     rounds = 50
     target_bounds = (2, 5)

     searcher = index.searcher()
     for _round in xrange(rounds):
         # Keep sampling until at least one document has every target.
         expected_keys = []
         while not expected_keys:
             targets = sample(words, randint(*target_bounds))
             expected_keys = [key for key, text in corpus
                              if all(target in text for target in targets)]
         expected_keys.sort()

         conjunction = And([Term("value", target) for target in targets])
         scorer = conjunction.scorer(searcher)
         bulk_ids = list(scorer.all_ids())

         # Rewind and walk the same scorer one id at a time.
         scorer.reset()
         stepped_ids = []
         while scorer.id is not None:
             stepped_ids.append(scorer.id)
             scorer.next()
         self.assertEqual(bulk_ids, stepped_ids)

         found_keys = self._keys(searcher, bulk_ids)
         self.assertEqual(found_keys, expected_keys)
Exemple #52
0
class WhooshGuess(object):
    """Guess a key for a sentence using an in-memory Whoosh index.

    Usage is two-phase: add examples with ``train()``, call ``train_ok()``
    once to commit them and open the searcher, then call ``guess()``.
    """

    def __init__(self):
        self.storage = RamStorage()
        schema = Schema(key=ID(stored=True),
                        ask=BOOLEAN(stored=True),
                        content=TEXT(stored=True, analyzer=RegexTokenizer()))
        self.ix = self.storage.create_index(schema)
        self.writer = self.ix.writer()
        self.is_train = False

        # Seed the index with one greeting example per line.
        for s in greeting.split('\n'):
            self.train(u'matchinggreeting', s)

    @property
    def is_ok(self):
        """True once ``train_ok()`` has been called."""
        return self.is_train

    def train(self, key, line):
        """Add ``line`` as an example for ``key``.

        The line is tokenized with ``lang.tokenizezh`` and stored as
        space-joined tokens; the ``ask`` flag is computed from ``key``
        with ``lang.is_question``.
        """
        splits = u' '.join(lang.tokenizezh(line))
        ask = lang.is_question(key)
        self.writer.add_document(key=key, content=splits, ask=ask)

    def train_ok(self):
        """Commit the examples and prepare the searcher and query parser."""
        self.writer.commit(optimize=True)
        self.searcher = self.ix.searcher()
        self.parser = QueryParser("content", schema=self.ix.schema)
        self.is_train = True

    def guess(self, s, is_ask=None):
        """Return the key of the best matching example for ``s``.

        :param s: the sentence to look up.
        :param is_ask: restrict hits to examples whose ``ask`` flag equals
            this value. When ``None`` (the default) the flag is detected
            from ``s`` with ``lang.is_question``. An explicit ``False`` is
            honoured and no longer replaced by the detected value.
        :returns: the ``key`` of the first hit, or ``''`` when ``s`` yields
            no keywords or nothing matches.
        :raises AssertionError: if ``train_ok()`` has not been called.
        """
        assert self.is_train

        keys = list(lang.keyword(s))
        if not keys:
            return ''

        # Match either the keywords or the full token list.
        keys = u' '.join(keys)
        splits = u' '.join(lang.tokenizezh(s))
        q1 = self.parser.parse(keys)
        q2 = self.parser.parse(splits)
        q = q1 | q2

        # Only a missing value triggers detection; ``not is_ask`` would
        # also discard an explicit False.
        if is_ask is None:
            is_ask = lang.is_question(s)
        ask = query.Term(u"ask", is_ask)

        results = self.searcher.search(q, filter=ask)
        # Only the top hit is of interest.
        for hit in results:
            return hit['key']
        return ''