-
Notifications
You must be signed in to change notification settings - Fork 4
/
example.py
101 lines (83 loc) · 3.3 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from typing import Dict, List, Sequence
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import MultifieldParser
from whoosh.filedb.filestore import RamStorage
from whoosh.analysis import StemmingAnalyzer
import json
#
# Simple example indexing to an in-memory index and performing a search
# across multiple fields and returning an array of highlighted results.
#
# One lacking feature of Whoosh is the no-analyze option. In this example
# the SearchEngine modifies the given schema and adds a RAW field. When docs
# are added to the index, only stored fields in the schema are passed to Whoosh,
# along with a JSON-encoded version of the whole doc stashed in the RAW field.
#
# On query the <Hit> in the result is ignored and instead the RAW field is
# decoded containing any extra fields present in the original document.
#
class SearchEngine:
    """In-memory Whoosh index that returns the original documents on query.

    The given schema is extended with a stored ``raw`` field. Each indexed
    document keeps a JSON copy of itself in that field, so query results can
    include keys that are not part of the schema.
    """

    def __init__(self, schema):
        """Create an empty RAM-backed index for *schema*.

        NOTE: *schema* is modified in place (a ``raw`` field is added to it).
        """
        self.schema = schema
        schema.add('raw', TEXT(stored=True))
        self.ix = RamStorage().create_index(self.schema)

    def index_documents(self, docs: Sequence):
        """Add every mapping in *docs* to the index and commit.

        Only keys that are stored fields of the schema are passed to the
        writer; the complete document is JSON-encoded into ``raw``.

        If adding any document fails, the writer is cancelled (so it is not
        left open) and the exception is re-raised; nothing is committed.
        """
        # Loop-invariant: compute the stored field names once, as a set.
        stored_names = set(self.schema.stored_names())
        writer = self.ix.writer()
        try:
            for doc in docs:
                d = {k: v for k, v in doc.items() if k in stored_names}
                d['raw'] = json.dumps(doc)  # raw version of all of doc
                writer.add_document(**d)
        except Exception:
            writer.cancel()
            raise
        writer.commit(optimize=True)

    def get_index_size(self) -> int:
        """Return the index's ``doc_count_all()`` value."""
        return self.ix.doc_count_all()

    def query(self, q: str, fields: Sequence, highlight: bool=True) -> List[Dict]:
        """Search *fields* for *q* and return the matching original documents.

        Each result is the dict decoded from the hit's ``raw`` field. When
        *highlight* is true, every searched field whose stored value is a
        non-empty string is replaced by its highlighted excerpt, falling back
        to the stored value when no excerpt is produced.

        NOTE: ``search()`` is called without a ``limit`` argument, so Whoosh's
        default result limit applies.
        """
        search_results = []
        parser = MultifieldParser(fields, schema=self.schema)
        with self.ix.searcher() as searcher:
            for hit in searcher.search(parser.parse(q)):
                doc = json.loads(hit['raw'])
                if highlight:
                    for name in fields:
                        try:
                            stored_value = hit[name]
                        except KeyError:
                            # The document was indexed without this field;
                            # there is nothing to highlight.
                            continue
                        if stored_value and isinstance(stored_value, str):
                            doc[name] = hit.highlights(name) or stored_value
                search_results.append(doc)
        return search_results
if __name__ == '__main__':
    # Demo: index three small documents in memory, then run a handful of
    # queries across title/description/tags and print the highlighted hits.

    # Only these fields are indexed; "extra" survives via the raw JSON copy.
    schema = Schema(
        id=ID(stored=True),
        title=TEXT(stored=True),
        description=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        tags=KEYWORD(stored=True),
    )

    docs = [
        {
            "id": "1",
            "title": "First document banana",
            "description": "This is the first document we've added in San Francisco!",
            "tags": ["foo", "bar"],
            "extra": "kittens and cats",
        },
        {
            "id": "2",
            "title": "Second document hatstand",
            "description": "The second one is even more interesting!",
            "tags": ["alice"],
            "extra": "foals and horses",
        },
        {
            "id": "3",
            "title": "Third document slug",
            "description": "The third one is less interesting!",
            "tags": ["bob"],
            "extra": "bunny and rabbit",
        },
    ]

    engine = SearchEngine(schema)
    engine.index_documents(docs)
    print(f"indexed {engine.get_index_size()} documents")

    fields_to_search = ["title", "description", "tags"]
    sample_queries = (
        "hatstand",
        "banana",
        "first",
        "second",
        "alice",
        "bob",
        "san francisco",
    )
    separator = "-" * 70
    for q in sample_queries:
        print(f"Query:: {q}")
        hits = engine.query(q, fields_to_search, highlight=True)
        print("\t", hits)
        print(separator)