def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    elif function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
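# Hedged sketch (assumption, not from the original project): query_search and
# person_query_search below presuppose a Whoosh index containing the fields
# they reference. A minimal schema that would satisfy them could look like:
from whoosh import fields, index

_example_schema = fields.Schema(
    ID=fields.ID(stored=True),             # document id returned by the searches
    resname=fields.TEXT(stored=True),      # restaurant name
    categories=fields.KEYWORD(commas=True),
    address=fields.TEXT,
    city=fields.TEXT,
    state=fields.TEXT,
    rating=fields.NUMERIC(sortable=True),  # needed by FieldFacet('rating')
)
# ix = index.create_in('indexdir', _example_schema)  # 'indexdir' is hypothetical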
def person_query_search(indexdir, queries, user_id, E, n=10, function='BM25F'):
    prediction = user_cf(E, user_id, 3)
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        # with ix.searcher(weighting=scoring.BM25F(B=0.75, resname_B=1.0,
        #                                          categories_B=0.8, K1=1.2)) as s:
        #     per-field B weights for resname and categories
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    # note: negating before min-max normalizing maps the
                    # top-scoring hit to 0 and the lowest-scoring hit to 1
                    relevance[i] = -results[i].score
                # normalized score from 0 to 1
                relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    elif function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            scores = sorting.ScoreFacet()
            # was limit=m, but 'm' is unbound at this point in this branch;
            # use limit=None to mirror the BM25F branch
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                # normalized score from 0 to 1
                relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    return result_index
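# Hedged sketch (assumption): the re-ranking step of person_query_search,
# isolated with plain numpy. Given raw retrieval scores and per-item CF
# predictions, it min-max normalizes the negated scores and ranks items by the
# product, exactly as the loops above do (and, like them, it divides by zero
# when all scores are equal).
import numpy as np

def rerank(scores, predictions, n=10):
    relevance = -np.asarray(scores, dtype=float)
    relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())
    expected = relevance * np.asarray(predictions, dtype=float)
    order = np.argsort(expected)      # ascending by expected value
    return list(order[::-1][:n])      # indices of the n largest 'expected'

# e.g. rerank([12.0, 7.5, 3.1], [0.2, 0.9, 0.4], n=2) -> [1, 2]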
def search(humanReadableId):
    query = request.args.get('q', '').strip()
    pagination = None
    suggestion = None  # initialize: otherwise the no-query branch raises NameError
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].find("Image:") == 0:
                return 1
            else:
                return 0

        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
                sorting.FieldFacet("reverse_links", reverse=True),
            ])
        else:
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
            ])

        (pagination, suggestion) = paginated_search(ix, ["title", "content"], query, page,
                                                    weighting=weighting,
                                                    sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))
def post_search(pn=1, size=10):
    """Search posts by keyword.

    :param pn: page number
    :param size: page size
    :return: rendered search-result page
    """
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html', title='搜索',
                               message='搜索关键字不能为空')
    with whoosh_searcher.get_searcher('posts') as searcher:
        parser = qparser.MultifieldParser(['title', 'content'],
                                          whoosh_searcher.get_index('posts').schema)
        q = parser.parse(keyword)  # was parser.parser(keyword): no such method
        result = searcher.search_page(q, pagenum=pn, pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        page = models.Page(page_num=pn, per_page=size, result_list=result_list,
                           has_more=result.pagecount > pn,
                           total_page=result.pagecount, total=result.total)
        return render_template('search/list.html', title=keyword + '搜索结果',
                               page=page, kw=keyword)
def post_search(pn=1, size=10):
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html', title='搜索',
                               message='搜索关键字不能为空!')
    with whoosh_searcher.get_searcher('posts') as searcher:
        # q = query.Or([query.Term('title', keyword), query.Term('content', keyword)])
        parser = qparser.MultifieldParser(['title', 'content'],
                                          whoosh_searcher.get_index('posts').schema)
        q = parser.parse(keyword)
        result = searcher.search_page(q, pagenum=pn, pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        page = models.Page(pn, size, result=result_list,
                           has_more=result.pagecount > pn,
                           total_page=result.pagecount, total=result.total)
        print(page.result)
        # return jsonify(page)
        return render_template('search/list.html', title=keyword + '搜索结果',
                               page=page, kw=keyword)
def listarPorAtributo(busqueda="", categoria=[], order="", groupDic={},
                      nElementosPagina=20, pagina=1):
    tam = 0
    ix = index.open_dir("whooshLicor/licoresIndex")
    lista = []
    busqueda = busqueda.strip()
    with ix.searcher() as searcher:
        if not busqueda and not categoria:
            query = QueryParser("titulo", ix.schema).parse("*")
        elif not busqueda and categoria:
            query = QueryParser("titulo", ix.schema).parse("*") & queryCategoryGenerator(categoria)
        elif busqueda and not categoria:
            query = querySearchGenerator(busqueda)
        elif busqueda and categoria:
            query = querySearchGenerator(busqueda) & queryCategoryGenerator(categoria)
        # normalize() returns a new query; the original bare call was a no-op
        query = query.normalize()
        if not order:
            order = sorting.ScoreFacet()
        groupMap = agruparLista(groupDic)
        results = searcher.search(query, groupedby=groupMap,
                                  sortedby=[faceta_enStock(), order], limit=4000)
        grupo = range(0, searcher.doc_count())
        tam = len(results)
        if groupMap:
            try:
                if "precio/graduacion" in groupMap.keys():
                    tuplaKey = (groupDic["precio"], groupDic["graduacion"])
                elif "precio" in groupMap.keys():
                    tuplaKey = groupDic["precio"]
                else:
                    tuplaKey = groupDic["graduacion"]
                grupo = results.groups(next(iter(groupMap)))[tuplaKey]
            except KeyError:  # the requested group key does not exist
                grupo = []
            for documentIndex in grupo[(pagina - 1) * nElementosPagina:pagina * nElementosPagina]:
                elemento = searcher.stored_fields(documentIndex)
                lista.append(elemento['id'])
            tam = len(grupo)
        elif not groupDic:
            for r in results[(pagina - 1) * nElementosPagina:pagina * nElementosPagina]:
                lista.append(r['id'])
    return (lista, tam)
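# Hedged sketch (assumption): faceta_enStock, querySearchGenerator,
# queryCategoryGenerator and agruparLista are project helpers not shown here.
# A plausible definition of the stock facet used in sortedby above, which
# ranks in-stock items first, could be:
from whoosh import sorting

def faceta_enStock():
    # assumes a sortable boolean/numeric 'enStock' field in the schema
    return sorting.FieldFacet("enStock", reverse=True)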
def search_name(self, key_word):
    qp = QueryParser("novelName", schema=self.ix.schema)
    q = qp.parse(key_word)
    # sort by relevance score
    scores = sorting.ScoreFacet()
    results = self.searcher.search(q, limit=LIMIT, sortedby=[scores])
    print(len(results))
    for i in results:
        print(i)
    return results
def search(self, query_string, page="1", limit=20):
    results = []
    query_string = unicode(query_string, 'utf-8')  # Python 2 API
    with self.index.searcher() as searcher:
        query = QueryParser("content", self.index.schema).parse(query_string)
        scores = sorting.ScoreFacet()
        sortperson = sorting.FieldFacet("person")
        sortcollection = sorting.FieldFacet("collection", reverse=True)
        resultset = searcher.search_page(query, int(page), pagelen=int(limit),
                                         sortedby=[sortcollection, scores, sortperson])
        # NOTE: Need to copy plain dicts out, since once the searcher dies
        # (end of with block), the Hit results lose their reference to the data.
        for hit in resultset[0:]:
            # Grab a copy of the results as a plain dict.
            result = hit.fields()
            # Also grab the surrounding fragment as a highlight.
            # NOTE: This is pretty much the only point where we know "where"
            # in the matched document the hit occurs. The raw content we
            # indexed is stored in 'content', so we tell the Hit instance to
            # pull the surrounding text fragments from there.
            # These highlights are also pretty much the only reason we bother
            # stashing the entire document; otherwise the index could be even
            # smaller. Whoosh can hunt for the content in the original files
            # if they're available, but as our text content isn't large,
            # keeping it in the index seems faster.
            result['highlights'] = hit.highlights('content')
            results.append(result)
        results = {
            'matches': results,
            'matches_returned': resultset.scored_length(),
            'total_matches': len(resultset),
            'query': query_string,
        }
    return results
def search(self, key_word):
    # qp = QueryParser("novelName", schema=self.ix.schema)
    qp = MultifieldParser(["novelName", "novelAuthor", "novelIntroduction"],
                          schema=self.ix.schema)
    q = qp.parse(key_word)
    # sort by relevance score
    scores = sorting.ScoreFacet()
    results = self.searcher.search(q, limit=LIMIT, sortedby=[scores])
    print(len(results))
    for i in results:
        print(i)
        # print(i.highlights())
        # print(i.more_like_this("novelAuthor"))
    return results
def autocomplete(query_str, results=10):
    # drop tokens that are prefixes of "university"
    query_str = u' '.join([t.text for t in _analyzer(query_str)
                           if not 'university'.startswith(t.text)])
    q = _query_parser.parse(query_str)
    return [_ror_rows[row['ror']]
            for row in _searcher.search_page(
                q, 1, results,
                sortedby=[
                    sorting.FieldFacet('citation_score', reverse=True),
                    sorting.FieldFacet('num_students', reverse=True),
                    sorting.ScoreFacet(),
                ])]
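# Hedged sketch (assumption): autocomplete above relies on module-level
# globals that are not shown. One plausible way they could be initialized
# ('ror_index' and the 'name' field are hypothetical names):
from whoosh import index, qparser
from whoosh.analysis import StandardAnalyzer

_ix = index.open_dir('ror_index')
_searcher = _ix.searcher()
_analyzer = StandardAnalyzer()
_query_parser = qparser.QueryParser('name', schema=_ix.schema,
                                    group=qparser.OrGroup)
_ror_rows = {}  # maps a ROR id to its full record; populated at load time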
def test_sorted_extend():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    domain = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    combined = 0
    tcount = 0
    kcount = 0
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(domain, 3)):
                key = keys[i % (len(domain) - 1)]
                if "bravo" in words:
                    tcount += 1
                if key == "kilo":
                    kcount += 1
                if "bravo" in words or key == "kilo":
                    combined += 1
                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            facet = sorting.MultiFacet([sorting.FieldFacet("num", reverse=True),
                                        sorting.ScoreFacet()])
            r1 = s.search(query.Term("title", "bravo"), limit=None, sortedby=facet)
            r2 = s.search(query.Term("keywords", "kilo"), limit=None, sortedby=facet)
            assert len(r1) == tcount
            assert len(r2) == kcount

            r1.extend(r2)
            assert len(r1) == combined
def post_search(pn=1, size=10):
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html', title='搜索',
                               message='搜索关键字不能为空!')
    whoosh_searcher.clear('posts')
    writer = whoosh_searcher.get_writer('posts')
    for item in mongo.db['posts'].find({}, ['_id', 'title', 'content', 'create_at',
                                            'user_id', 'catalog_id']):
        item['obj_id'] = str(item['_id'])
        item['user_id'] = str(item['user_id'])
        item['catalog_id'] = str(item['catalog_id'])
        item.pop('_id')
        writer.add_document(**item)
    # commit the changes
    writer.commit()
    with whoosh_searcher.get_searcher('posts') as searcher:
        # parse the query string
        parser = qparser.MultifieldParser(['title', 'content'],
                                          whoosh_searcher.get_index('posts').schema)
        q = parser.parse(keyword)
        print('q:', q)
        # run the search
        result = searcher.search_page(q, pagenum=pn, pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        # build the page object
        page = Page(pn, size, result=result_list,
                    has_more=result.pagecount > pn,
                    page_count=result.pagecount, total=result.total)
        return render_template('search/list.html', title=keyword + '搜索结果',
                               page=page, kw=keyword)
def test_score_facet():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT, c=fields.ID)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c"))
    w.commit()
    w = ix.writer()
    w.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
    w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)

    with ix.searcher() as s:
        facet = sorting.MultiFacet(["b", sorting.ScoreFacet()])
        r = s.search(q=query.Term("a", u("alfa")), sortedby=facet)
        assert [h["id"] for h in r] == [6, 4, 5, 2, 1, 3]
def search(self, parameter):
    # pull out the query fields and build a parser
    keys = parameter['keys']
    parser = None
    if len(keys) == 1:
        parser = QueryParser(keys[0], schema=self.index.schema)
    elif len(keys) > 1:
        parser = MultifieldParser(keys, schema=self.index.schema)

    # search options (sorting, paging)
    score = sorting.ScoreFacet()  # relevance
    id = sorting.FieldFacet('id', reverse=False)  # id field
    _limit = None  # paging limit
    if 'page' in parameter and 'pagesize' in parameter:
        page = parameter['page']
        pagesize = parameter['pagesize']
        if page > 0 and pagesize != 0:
            _limit = page * pagesize

    # run the search
    query = parser.parse(parameter['keywords'])
    result = self.searcher.search(
        query,
        limit=_limit,
        sortedby=[score]  # sort by relevance
    )

    # build the response; was re.sub(..., re.S): the fourth positional
    # argument of re.sub is count, so the flag must be passed as flags=
    res = list()
    for hit in result:
        res.append({
            'title': hit['title'],
            'url': hit['url'],
            'content': re.sub(r'<[^>]+>', ' | ', hit.highlights('content'), flags=re.S),
            'score': str(hit.score)
        })
    return res
# coding=utf-8
from whoosh.qparser import QueryParser
from whoosh import qparser, sorting
from whoosh.index import open_dir
from whoosh.sorting import FieldFacet

index_filepath = "./index/"
# source_filepath = index_filepath + "0407_songs_dr2.csv"
default_index = open_dir(index_filepath, indexname='book')  # open the prebuilt index

# Default sort order: score + album + song
default_facet = []
default_facet.append(sorting.ScoreFacet())
# default_facet.append(FieldFacet("album_title", reverse=True))  # order results by field
default_facet.append(FieldFacet("book_tittle", reverse=True))  # field name kept as in the schema

# Default query mode is AND; default scope is all fields
default_group = qparser.syntax.AndGroup
default_range = ['book_tittle', 'book_author', 'year', 'publisher', 'ISBN']


# Basic single-item search
def basic_search(query, query_parse, group=default_group, facet=default_facet,
                 index=default_index):
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    myquery = parser.parse(query)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    # The original snippet is truncated here; presumably the query is executed
    # with the default facets and the results returned, e.g.:
    results = searcher.search(myquery, sortedby=facet)
    return results
from whoosh.qparser import QueryParser
from whoosh import index, sorting, scoring
from whoosh import qparser
from config import SEARCH_INDEX_DIR
import math
from test_search import CWeighting

ix = index.open_dir(SEARCH_INDEX_DIR)
qp = QueryParser("name", schema=ix.schema, group=qparser.OrGroup)
facet = sorting.FieldFacet("zvalue", reverse=True)
scores = sorting.ScoreFacet()


def do_search(txt, sumlevel=None, kind=None, tries=0):
    if kind:
        txt += " AND kind:{}".format(kind)
    if sumlevel:
        txt += " AND sumlevel:{}".format(sumlevel)
    if tries > 2:
        return [], []
    q = qp.parse(txt)
    with ix.searcher(weighting=CWeighting(txt)) as s:
        corrector = s.corrector("display")
        suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        results = s.search(q, sortedby=[scores])
        data = [[r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
                 r["sumlevel"]] for r in results]
        # The snippet is truncated here; presumably it returns the rows plus
        # the spelling suggestions, matching the early return above:
        return data, suggs
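# Hedged sketch (assumption): CWeighting is imported from test_search and its
# implementation is not shown. Whoosh supports custom scoring; one simple way
# to build a weighting from a plain function is scoring.FunctionWeighting:
from whoosh import scoring

def _example_score_fn(searcher, fieldname, text, matcher):
    # toy example: score by the term's weight in the matched document
    return matcher.weight()

ExampleWeighting = scoring.FunctionWeighting(_example_score_fn)
# ix.searcher(weighting=ExampleWeighting) would then use this score function.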
def sortResults(sortMethod, searcher, query):
    # ascending order by price
    if sortMethod == "1":
        try:
            results = searcher.search(query, limit=20, sortedby="price")
        except TermNotFound:
            results = []
    # descending order by price
    elif sortMethod == "2":
        try:
            results = searcher.search(query, limit=20, sortedby="price", reverse=True)
        except TermNotFound:
            results = []
    # sort by review polarity value with highest values first
    elif sortMethod == "3":
        try:
            results = searcher.search(query, limit=20, sortedby="reviewPolarity",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by rating with highest values first
    elif sortMethod == "4":
        try:
            results = searcher.search(query, limit=20, sortedby="rating", reverse=True)
        except TermNotFound:
            results = []
    # sort by highest savings absolute value
    elif sortMethod == "5":
        try:
            results = searcher.search(query, limit=20, sortedby="savings", reverse=True)
        except TermNotFound:
            results = []
    # sort by highest savings percentage
    elif sortMethod == "6":
        try:
            results = searcher.search(query, limit=20, sortedby="percentageSavings",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by highest review polarity value and highest ratings
    elif sortMethod == "7":
        try:
            rP = sorting.FieldFacet("reviewPolarity", reverse=True)
            r = sorting.FieldFacet("rating", reverse=True)
            weightedValue = sorting.TranslateFacet(average, rP, r)
            results = searcher.search(query, limit=20, sortedby=weightedValue)
        except TermNotFound:
            results = []
    # sort by lowest price, highest review polarity value, highest ratings
    elif sortMethod == "8":
        try:
            rP = sorting.FieldFacet("reviewPolarity", reverse=True)
            r = sorting.FieldFacet("rating", reverse=True)
            weightedValue = sorting.TranslateFacet(average, rP, r)
            results = searcher.search(query, limit=20, sortedby=[weightedValue, "price"])
        except TermNotFound:
            results = []
    # collect a price range and result count, then sort by price
    elif sortMethod == "9":
        try:
            priceRangeLow = int(input("Enter the minimum price : "))
            priceRangeHigh = int(input("Enter the maximum price : "))
            numResultsDisplayed = int(
                input("Enter the number of records to be shown : "))
            # note: the values collected above are not applied in this snippet;
            # presumably the caller filters and truncates with them
            results = searcher.search(query, limit=None, sortedby="price")
        except TermNotFound:
            results = []
    # restrict to a country of origin, ranked by relevance score
    elif sortMethod == "10":
        try:
            countryData = input("Enter the country of origin : ")
            if re.match(r"(?i)(us)", countryData):
                countryData = "usa"
            numResultsDisplayed = int(
                input("Enter the number of records to be shown : "))
            scores = sorting.ScoreFacet()
            # 'qp' and 'user_query' are module-level names defined elsewhere
            results = searcher.search(qp.parse(user_query + " " + countryData),
                                      limit=None, sortedby=scores)
        except TermNotFound:
            results = []
    return results
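# Hedged sketch (assumption): sortResults passes an 'average' callable to
# sorting.TranslateFacet, which calls it with the sort key produced by each
# wrapped facet. A plausible definition of that helper is simply:
def average(review_polarity, rating):
    # combine the two (already reverse-ordered) keys into one sort key
    return (review_polarity + rating) / 2.0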