def search(humanReadableId):
    """Render full-text search results for the ZIM identified by *humanReadableId*.

    Reads the search terms from request arg ``q`` and the page number from
    request arg ``page`` (default 1). When no query is given, flashes an
    error message and renders the template with empty results.

    Returns the rendered ``zim/search.html`` response.
    """
    query = request.args.get('q', '').strip()
    pagination = None
    # BUGFIX: suggestion was previously only bound inside the `if query:`
    # branch, so an empty query raised NameError at render_template below.
    suggestion = None
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].find("Image:") == 0:
                return 1
            else:
                return 0

        # Build the sort criteria once; support older whoosh indexes that
        # do not have a reverse_links field by only appending that facet
        # when the schema actually defines it.
        facets = [
            sorting.FunctionFacet(image_pages_last),
            sorting.ScoreFacet(),
        ]
        if 'reverse_links' in ix.schema.names():
            facets.append(sorting.FieldFacet("reverse_links", reverse=True))
        sortedby = sorting.MultiFacet(facets)

        (pagination, suggestion) = paginated_search(
            ix, ["title", "content"], query, page,
            weighting=weighting, sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))
def test(ix):
    """Exercise title sorting, reverse sorting, and multi-facet sorting."""
    with ix.searcher() as searcher:

        def titles(results):
            # Collect the stored title of every hit, in result order.
            return [hit["title"] for hit in results]

        # Plain ascending sort on the title field (named-field shortcut).
        results = searcher.search(query.Every(), sortedby="title")
        assert titles(results) == sorted_titles

        # Descending sort on the same field via an explicit facet.
        by_title_desc = sorting.FieldFacet("title", reverse=True)
        results = searcher.search(query.Every(), sortedby=by_title_desc)
        assert titles(results) == list(reversed(sorted_titles))

        # Primary sort on num (-10 to 10); ties broken by reverse title.
        multi = sorting.MultiFacet()
        multi.add_field("num")
        multi.add_field("title", reverse=True)
        results = searcher.search(query.Every(), sortedby=multi)
        target = ["Visual and Statistical Thinking",
                  "Cognitive Style of Powerpoint",
                  "Beautiful Evidence",
                  "Visual Explanations",
                  "Visual Display of Quantitative Information, The",
                  "Envisioning Information",
                  ]
        assert titles(results) == target
def test_multifacet():
    """Group by a two-level tag/size MultiFacet and check the buckets."""
    schema = fields.Schema(tag=fields.ID(stored=True),
                           size=fields.ID(stored=True))
    with TempIndex(schema, "multifacet") as ix:
        # Index six documents; docnums 0..5 follow insertion order.
        documents = [("alfa", "small"), ("bravo", "medium"),
                     ("alfa", "large"), ("bravo", "small"),
                     ("alfa", "medium"), ("bravo", "medium")]
        writer = ix.writer()
        for tag, size in documents:
            writer.add_document(tag=u(tag), size=u(size))
        writer.commit()

        # Expected mapping of (tag, size) group key -> docnums.
        correct = {(u('bravo'), u('medium')): [1, 5],
                   (u('alfa'), u('large')): [2],
                   (u('alfa'), u('medium')): [4],
                   (u('alfa'), u('small')): [0],
                   (u('bravo'), u('small')): [3],
                   }

        with ix.searcher() as searcher:
            grouping = sorting.MultiFacet(["tag", "size"])
            results = searcher.search(query.Every(),
                                      groupedby={"tag/size": grouping})
            assert_equal(results.groups("tag/size"), correct)
def search(term, ix='indexdir', limit=None):
    """Search the index for *term*, sorted by filename then line number.

    *ix* may be an already-open index or a directory path to open.
    Returns the (lazy) whoosh results object.
    """
    # Accept a path and open it on demand.
    if isinstance(ix, str):
        ix = index.open_dir(ix)

    # NOTE(review): the searcher is intentionally left open — the returned
    # results are lazy and are consumed by the caller; verify the caller
    # eventually releases it.
    searcher = ix.searcher()

    # Parse the terms against the 'line' field.
    parsed = QueryParser('line', schema=ix.schema).parse(term)

    # Order hits by filename first, line number second.
    order = sorting.MultiFacet().add_field('filename').add_field('number')
    return searcher.search(parsed, limit=limit, sortedby=order)
def search(self, query, section=None, page=1, per_page=20,
           excerpt_fragmenter=None, excerpt_maxchars=None,
           excerpt_surround=None):
    """Run a paged full-text search over the title and content fields.

    Results are sorted by the ``priority`` field (descending).  When
    *section* is given, hits are restricted to that section with an
    additional Term filter.  The excerpt_* arguments tune how result
    excerpts are fragmented and highlighted.

    Returns a dict with keys 'items', 'pages', 'page' and 'per_page'.

    NOTE(review): this is Python 2 code (uses ``unicode``).
    """
    # Parse the user query against both indexed text fields.
    qp = MultifieldParser(['title', 'content'], self.schema)
    q = qp.parse(unicode(query))

    # Highest priority first.
    mf = sorting.MultiFacet()
    mf.add_field("priority", reverse=True)

    # Narrow to a single section when requested.
    if section is not None:
        q = And([q, Term('section', unicode(section))])

    def _make_item(hit):
        # Build the per-hit payload; the excerpt is highlighted from the
        # full content fetched via get_content, or None when unavailable.
        text = self.get_content(hit['path'], hit['section'])
        if text is not None:
            excerpt = hit.highlights('content', text=text)
        else:
            excerpt = None
        return {
            'path': hit['path'],
            'title': hit['title'],
            'excerpt': excerpt,
            # NOTE(review): this echoes the *section* argument (possibly
            # None), not the hit's own section — confirm this is intended.
            'section': section,
        }

    with self.whoosh_index.searcher() as searcher:
        rv = searcher.search_page(q, page, sortedby=mf, pagelen=per_page)
        # Configure excerpt rendering before _make_item reads highlights.
        frag, anal = make_fragmenter_and_analyzer(excerpt_fragmenter,
                                                  excerpt_maxchars,
                                                  excerpt_surround)
        rv.results.formatter = make_html_formatter()
        if frag is not None:
            rv.results.fragmenter = frag
        if anal is not None:
            rv.results.analyzer = anal
        return {
            'items': [_make_item(x) for x in rv.results],
            'pages': rv.pagecount,
            'page': page,
            'per_page': per_page
        }
def test_sorted_extend():
    """Extending one sorted results object with another keeps all hits."""
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    domain = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    # Count how many documents match "bravo" in the title, "kilo" in the
    # keywords, and the union of the two, while indexing.
    combined = tcount = kcount = 0
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(domain, 3)):
                key = keys[i % (len(domain) - 1)]
                has_bravo = "bravo" in words
                has_kilo = key == "kilo"
                tcount += has_bravo
                kcount += has_kilo
                combined += has_bravo or has_kilo
                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            # Highest num first; ties broken by score.
            facet = sorting.MultiFacet([sorting.FieldFacet("num", reverse=True),
                                        sorting.ScoreFacet()])
            r1 = s.search(query.Term("title", "bravo"), limit=None,
                          sortedby=facet)
            r2 = s.search(query.Term("keywords", "kilo"), limit=None,
                          sortedby=facet)
            assert len(r1) == tcount
            assert len(r2) == kcount

            r1.extend(r2)
            assert len(r1) == combined
def test_numeric_field_facet():
    """Sort by numeric v1 ascending, then v2 descending, across segments."""
    schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC,
                           v2=fields.NUMERIC)
    ix = RamStorage().create_index(schema)

    # First segment.
    writer = ix.writer()
    writer.add_document(id=1, v1=2, v2=100)
    writer.add_document(id=2, v1=1, v2=50)
    writer.commit()

    # Second segment.
    writer = ix.writer()
    writer.add_document(id=3, v1=2, v2=200)
    writer.add_document(id=4, v1=1, v2=100)
    writer.commit()

    # Third segment, kept separate by disabling merging.
    writer = ix.writer(merge=False)
    writer.add_document(id=5, v1=2, v2=50)
    writer.add_document(id=6, v1=1, v2=200)
    writer.commit()

    with ix.searcher() as searcher:
        order = (sorting.MultiFacet()
                 .add_field("v1")
                 .add_field("v2", reverse=True))
        hits = searcher.search(query.Every(), sortedby=order)
        assert [hit["id"] for hit in hits] == [6, 4, 2, 3, 1, 5]
def test_score_facet():
    """Sort by the b field first, breaking ties by relevance score."""
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT,
                           c=fields.ID)
    ix = RamStorage().create_index(schema)

    # Three separate segments; the last two commits skip merging.
    writer = ix.writer()
    writer.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c"))
    writer.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c"))
    writer.commit()

    writer = ix.writer()
    writer.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c"))
    writer.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c"))
    writer.commit(merge=False)

    writer = ix.writer()
    writer.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
    writer.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
    writer.commit(merge=False)

    with ix.searcher() as searcher:
        order = sorting.MultiFacet(["b", sorting.ScoreFacet()])
        hits = searcher.search(q=query.Term("a", u("alfa")), sortedby=order)
        assert [hit["id"] for hit in hits] == [6, 4, 5, 2, 1, 3]
def test_multisort():
    """A tag/id MultiFacet must match a plain (tag, id) tuple sort key."""
    mf = sorting.MultiFacet(["tag", "id"])

    def expected_key(d):
        return (d["tag"], d["id"])

    # Check every combination of reverse and limit.
    for extra in ({}, {"reverse": True}, {"limit": 5},
                  {"reverse": True, "limit": 5}):
        try_sort(mf, expected_key, **extra)
# # tags=",".join(one['tags']), # content=one['index_content'] # ) # counts += 1 # if counts == 200: # break count = 0 start = time.time() with ix.searcher(weighting=scoring.BM25F()) as searcher: query = MultifieldParser(["title", "content"], ix.schema).parse("xss") #query = QueryParser("content", ix.schema).parse("xss") mf = sorting.MultiFacet() mf.add_field("date", reverse=True) results = searcher.search(query, limit=10, sortedby=mf) #results = searcher.search_page(query, 2, pagelen=10) #print(results) print(len(results)) #results = results[-10:] for one in results: # print(one['content']) # print(one.highlights("content")) _id = ObjectId(one['nid']) res = collections.find({'_id':_id})[0] print(res['date'] + res['title']) print('-----------------------\n')