def doSearch(self, text):
    q = self.qp.parse(text)  # build query
    with self.ix.searcher(weighting=scoring.Frequency) as s:  # simple scorer may help
        c = s.collector(limit=self.MaxResults)
        c = TimeLimitCollector(c, 0.5)
        try:
            s.search_with_collector(q, c)
        except TimeLimit:  # raised when the 0.5s budget is exhausted
            print("TIMEOUT!")
        results = c.results()  # partial results if hung
        self.searchResults.clear()
        # my_cf = highlight.PinpointFragmenter(maxchars=100, surround=60)
        my_cf = highlight.ContextFragmenter(maxchars=160, surround=30)
        # my_cf = highlight.SentenceFragmenter(maxchars=200, sentencechars='\n')
        results.fragmenter = my_cf
        if len(results) > 0:
            for res in results:
                # redundant: Hit.highlights() reads the fragmenter from the Results object
                res.fragmenter = my_cf
                # self.searchResults.append(res.highlights('Text', top=1) + '*--*\n' + res['MeetingLink'] + '\n')
                self.searchResults.append(res.highlights('Text', top=1))
                self.searchResults.append('-Link to Meeting -')
                self.searchResults.append(res['MeetingLink'] + '\n')
                self.searchResults.append('----------')
                self.searchResults.append('----------')
        cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
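The method above leans on imports from its enclosing module. A minimal sketch of the time-limited-search wiring it assumes, with ix, qp, and the result limit as hypothetical stand-ins:

from whoosh import scoring, highlight
from whoosh.collectors import TimeLimitCollector
from whoosh.searching import TimeLimit

with ix.searcher(weighting=scoring.Frequency) as s:
    # Wrap any collector to abort a slow search; whatever was collected
    # before the deadline stays available via c.results().
    c = TimeLimitCollector(s.collector(limit=20), timelimit=0.5)
    try:
        s.search_with_collector(qp.parse("example"), c)
    except TimeLimit:
        pass  # fall through to the partial results
    results = c.results()
    results.fragmenter = highlight.ContextFragmenter(maxchars=160, surround=30)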
def search(searcher, query_string):
    parser = qparser.QueryParser("title", schema=ix.schema)
    q = parser.parse(u(query_string))
    result = searcher.search(q, terms=True)
    result.fragmenter = highlight.ContextFragmenter()
    result.formatter = highlight.UppercaseFormatter()
    return result
def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"
def test_context_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "alfa BRAVO charlie...hotel INDIA juliet"
def test_context_at_start():
    terms = frozenset(["alfa"])
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=15)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "ALFA bravo charlie delta echo foxtrot"
def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('alfa <strong class="match term0">bravo</strong> '
                     'charlie...hotel <strong class="match term1">india</strong> juliet')
def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('<b class="match t0">alfa</b> <b class="match t1">bravo</b> '
                     '<b class="match t0">charlie</b>...<b class="match t1">delta</b> '
                     '<b class="match t0">echo</b> foxtrot')
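The five tests above reference a module-level _doc that is not shown; judging from the expected fragments, it is a NATO-alphabet string. A self-contained sketch with _doc reconstructed as an assumption:

from whoosh import analysis, highlight

_doc = "alfa bravo charlie delta echo foxtrot golf hotel india juliet kilo lima"

sa = analysis.StandardAnalyzer()
cf = highlight.ContextFragmenter(surround=6)
uc = highlight.UppercaseFormatter()
# highlight() runs the analyzer over the raw text, scores fragments that
# contain the search terms, and joins the top fragments with "...".
print(highlight.highlight(_doc, frozenset(["bravo", "india"]), sa, cf, uc))
# expected: alfa BRAVO charlie...hotel INDIA juliet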
def __getitem__(self, item):
    if item.start in self.saved_results:
        return self.saved_results[item.start]
    q, mask = self._query
    sortedby, reverse = self._query_sortedby
    page: ResultsPage = self.searcher.search_page(
        q,
        mask=mask,
        filter=self._query_filter,
        pagenum=math.floor(item.start / self.page_size) + 1,
        pagelen=self.page_size,
        sortedby=sortedby,
        reverse=reverse)
    page.results.fragmenter = highlight.ContextFragmenter(surround=50)
    page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
    if not self.first_score and len(page.results) > 0:
        self.first_score = page.results[0].score
    # top_n holds (score, docnum) pairs; rescale every score relative to
    # the best hit seen so far so the values fall in [0, 1].
    if self.first_score:
        page.results.top_n = list(
            map(lambda hit: (hit[0] / self.first_score, hit[1]),
                page.results.top_n))
    self.saved_results[item.start] = page
    return page
def search_law(self, q, callback):
    idx = index.open_dir(self.law_idx_path)
    with idx.searcher() as searcher:
        parsed_q = QueryParser("LEGAL_TEXT", idx.schema).parse(q)
        results = searcher.search(parsed_q)
        # results.fragmenter = highlight.PinpointFragmenter(surround=64, autotrim=True)
        results.fragmenter = highlight.ContextFragmenter(surround=128)
        results.formatter = highlight.UppercaseFormatter()
        callback(results)
def search(self):
    """
    Search function

    Searches all indexes for self.keyword and prints them.
    """
    ix = self.get_ix()
    try:
        search_term = str(self.keyword)
        from whoosh.qparser import QueryParser
        with ix.searcher() as searcher:
            query = QueryParser("content", ix.schema).parse("%s" % search_term)
            results = searcher.search(query, terms=True, limit=self.limit)
            results.fragmenter = highlight.ContextFragmenter(maxchars=200,
                                                             surround=20)
            # If stdin == stdout, the program's output is not being
            # piped and colored output is fine
            color = (self.color == 'always' or
                     (self.color == 'auto' and os.fstat(0) == os.fstat(1)))
            results.formatter = ColorFormatter(color=color)
            # Remove excluded filetypes from search results
            if self.exclude:
                results = [f for f in results
                           if os.path.splitext(f["path"])[1][1:] not in self.exclude]
            if self.include:
                results = [f for f in results
                           if os.path.splitext(f["path"])[1][1:] in self.include]
            print(results)
            for i, result in enumerate(results, start=1):
                if color:
                    print("Result %i: %s" % (i, colorama.Fore.GREEN +
                                             result["path"] +
                                             colorama.Fore.RESET))
                else:
                    print("Result %i: %s" % (i, result["path"]))
                with codecs.open(result["path"], encoding='utf-8',
                                 errors='ignore') as f:
                    file_content = f.read()
                print(result.highlights("content", text=file_content, top=10))
                print("\n")
    finally:
        ix.close()
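ColorFormatter above is project code, not part of Whoosh. Custom output styles are built by subclassing highlight.Formatter and overriding format_token; the sketch below is the bracket example from the Whoosh documentation:

from whoosh import highlight

class BracketFormatter(highlight.Formatter):
    """Puts square brackets around the matched terms."""

    def format_token(self, text, token, replace=False):
        # get_text returns the token's text from the original (or
        # analyzed, if replace=True) string
        tokentext = highlight.get_text(text, token, replace)
        return "[%s]" % tokentext

An instance of such a class is then assigned to results.formatter exactly like the built-in formatters in the other examples.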
def query_page(ix, query, page):
    searcher = ix.searcher()
    try:
        parsed_query = MultifieldParser(["content", "title", "correspondent"],
                                        ix.schema).parse(query)
        result_page = searcher.search_page(parsed_query, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()
        yield result_page
    finally:
        searcher.close()
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            # Build a "more like this" query: extract the 20 most
            # characteristic terms of the source document and OR them
            # together, weighted, masking out the source document itself.
            docnum = searcher.document_number(id=more_like_doc_id)
            kts = searcher.key_terms_from_text(
                'content', more_like_doc_content, numterms=20,
                model=classify.Bo1Model, normalize=False)
            more_like_q = query.Or(
                [query.Term('content', word, boost=weight)
                 for word, weight in kts])
            result_page = searcher.search_page(
                more_like_q, page, filter=str_q, mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError(
                "Either querystring or more_like_doc_id is required.")

        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
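Whoosh bundles this key-terms-into-Or-query pattern as a convenience method; a rough sketch of the near-equivalent call (not what the project above uses), assuming the same searcher, docnum, and content:

from whoosh import classify

# Extracts weighted key terms from the supplied text and searches for
# similar documents, excluding the source document from the results.
similar = searcher.more_like(docnum, 'content', text=more_like_doc_content,
                             numterms=20, model=classify.Bo1Model)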
def query_page(ix, querystring, page):
    searcher = ix.searcher()
    try:
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            ix.schema)
        qp.add_plugin(DateParserPlugin())
        q = qp.parse(querystring)
        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
def search(self, q: str):
    qp = MultifieldParser(['body', 'tags'], schema=self._schema)
    q_obj = qp.parse(q)
    ans = []
    with self._index.searcher() as srch:
        results = srch.search(q_obj, terms=True)
        results.fragmenter = highlight.ContextFragmenter(maxchars=100,
                                                         surround=30)
        for hit in results:
            item = dict(hit)
            full_path = os.path.join(self._data_dir, hit['path'])
            with open(full_path) as fr:
                full_text = fr.read()
            matched = dict(hit.matched_terms())
            if 'body' in matched:
                item['highlight'] = '...{} ... '.format(
                    hit.highlights('body', text=full_text))
            else:
                item['highlight'] = None
            item['path'] = item['path'].rsplit('.', 1)[0]
            ans.append(item)
    return ans
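One detail this example depends on: hit.matched_terms() only works when the search was executed with terms=True. A minimal sketch of that dependency (the index and query objects are hypothetical):

with ix.searcher() as srch:
    results = srch.search(q_obj, terms=True)  # record which query terms matched
    for hit in results:
        # Without terms=True, matched_terms() raises NoTermsException.
        # Each entry is a (fieldname, termtext) pair, so dict() maps
        # field names to one matched term each.
        print(dict(hit.matched_terms()))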
def search(query, page):
    ix = index.open_dir(index_dir)
    res = dict()
    with ix.searcher() as se:
        q = MultifieldParser(["filepath", "document"],
                             schema=ix.schema).parse(query)
        p = se.search_page(q, page, pagelen=page_len)
        fg = highlight.ContextFragmenter(maxchars=200, surround=100,
                                         charlimit=100000)
        p.results.fragmenter = fg
        res['cur_page'] = p.pagenum
        res['tot_page'] = p.pagecount
        res['list'] = list()
        for hit in p:
            t = time.localtime(hit['moditime'])
            timestr = time.strftime('%Y-%m-%d %H:%M:%S', t)
            res['list'].append({
                "path": hit['filepath'],
                "time": timestr,
                "path_highlight": hit.highlights('filepath'),
                "highlight": hit.highlights('document')
            })
    return res
def base_query():
    assert request.path == '/index'
    # print(dict(request.form)["query"][0])
    # print(dict(request.form))
    query_sentence = str(dict(request.form)["query"][0])
    logging.info("Query sentence: %s" % query_sentence)
    res = []
    with ix.searcher() as searcher:
        # Parse the query text. If the query asks for a specific field,
        # dispatch to a single-field parser; otherwise default to
        # multifield search.
        # highlight_xy marks whether the college (xueyuan) field should
        # be highlighted; it is on by default.
        highlight_xy = True
        # Default: multifield query
        query = qparser.MultifieldParser(
            ["content", "title", "mtext", "xueyuan"], ix.schema)
        if query_sentence.endswith("$姓名$"):  # query by name
            query = qparser.SimpleParser("title", ix.schema)
            query_sentence = query_sentence[:-len("$姓名$")]
        elif query_sentence.endswith("$学院$"):  # query by college
            query = qparser.SimpleParser("xueyuan", ix.schema)
            query_sentence = query_sentence[:-len("$学院$")]
        elif query_sentence.endswith("$网页$"):  # query by page content
            query = qparser.SimpleParser("content", ix.schema)
            query_sentence = query_sentence[:-len("$网页$")]
        # print(query_sentence)

        # Register query parser plugins
        query.add_plugin(qparser.WildcardPlugin)
        # query.remove_plugin_class(qparser.WildcardPlugin)
        query.add_plugin(qparser.PrefixPlugin())
        query.add_plugin(qparser.OperatorsPlugin)
        query.add_plugin(qparser.RegexPlugin)
        query.add_plugin(qparser.PhrasePlugin)

        # Parse into a query object
        q = query.parse(query_sentence)
        logging.info("Query parse result: %s" % str(q))
        print(q)

        # Run the search
        result = searcher.search(q, limit=20)
        # print(result)

        # Configure the fragmenter; allow larger fragments
        my_cf = highlight.ContextFragmenter(maxchars=200, surround=30)
        hf = highlight.HtmlFormatter(tagname='em', classname='match',
                                     termclass='term')
        hi = highlight.Highlighter(fragmenter=my_cf, formatter=hf)

        for hit in result:
            print(hit["picpath"])
            print(hit["title"])
            print(escape(hi.highlight_hit(hit, "content")))
            if highlight_xy:
                xueyuan = Markup(hi.highlight_hit(hit, "xueyuan"))
            else:
                xueyuan = hit["xueyuan"]
            if hit['picpath'] == '#':
                picpath = '#'
            else:
                picpath = "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                            hit['picpath'].split('/')[-1])
            res.append({
                "title": hit['title'],
                "xueyuan": xueyuan,
                "url": hit["url"],
                'shotpath': hit['shotpath'],
                "content": Markup(hi.highlight_hit(hit, "content")),
                "parenturl": hit["parenturl"],
                "picpath": picpath,
                "pagerank": scores[url_dict[hit["url"]]]
            })

        print(len(result))
        print(res)
        count = len(result)
        if count == 0:
            logging.warning("%d, no matching content found!" % 404)
            return "No matching content found!", 404
        else:
            # Log the query response
            log = "Response: "
            for item in res:
                log = log + " (name:%s,url:%s) " % (item["title"], item["url"])
            logging.info(log)
            # # Sort links by PageRank
            # res.sort(key=lambda k: (k.get("pagerank", 0)), reverse=True)
            # print(res)
            mysession["data"] = res  # pass data to the next view via the session
            return jsonify({"url": "/display/%d&%s" % (count, query_sentence)})
qp.add_plugin(qparser.PlusMinusPlugin)
query = qp.parse(search_input)
# print(query)

if search_type == "BM25":
    w = BM25F(B=0.75, K1=1.5)
elif search_type == "TFIDF":
    w = TF_IDF()
else:
    w = BM25F(B=0.75, K1=1.5)

with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    results.fragmenter = highlight.ContextFragmenter(maxchars=50, surround=50)
    # print(list(searcher.lexicon("content")))
    found_doc_num = results.scored_length()
    run_time = results.runtime

    # ------------------------------- for html use ---------------------------------
    if found_doc_num == 0:
        final_top_output = (
            "<h1> Sorry " + str(found_doc_num) + " Search Results Found.</h1>"
            "<h5>Search Results for " + search_input + " using " + search_type +
            " (" + str(run_time) + " seconds)</h5><br>")
    else:
        final_top_output = (
            "<h1> Top " + str(found_doc_num) + " Search Results </h1>"
            "<h5>Search Results for " + search_input + " using " + search_type +
            " Ranking and " + operation_type +
            " operation to score (" + str(run_time) + " seconds)</h5><br>")
    print(final_top_output)

    if results:
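For reference, both weighting models used above come from whoosh.scoring; a minimal sketch of the wiring, with ix and query as hypothetical stand-ins:

from whoosh.scoring import BM25F, TF_IDF

w = BM25F(B=0.75, K1=1.5)  # or w = TF_IDF()
with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    # scored_length() counts the scored hits; runtime is the search
    # time in seconds
    print(results.scored_length(), "hits in", results.runtime, "seconds")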
print(query.search_terms)
myquery = parser.parse(query.search_terms)
results = []
response = []
rr = {}
with self.queryIndex.searcher() as searcher:
    results = searcher.search(myquery)
    for result in results:
        temp = result['content'].split("\t")
        sugg = temp[0]
        print(result)
        rr['content'] = sugg
    results.fragmenter = highlight.ContextFragmenter(surround=40)
    results.formatter = highlight.UppercaseFormatter()
    response = parse_whoosh_trec('WhooshQueryEngine', query.search_terms,
                                 results)
return response

# ----- The following are Whoosh errors -----
# There's a problem with the Whoosh query created from the user's query
except QueryError as e:
    raise SearchEngineError("Whoosh Query Suggest Engine", e,
                            errorType="Whoosh", query=query)
# Our Whoosh index is empty
except EmptyIndexError as e:
    raise SearchEngineError("Whoosh Query Suggest Engine", e,
                            errorType="Whoosh")
class WhooshTRECNewsEngine(SearchEngine):
    """Whoosh Query log search engine."""

    def __init__(self, service, whoosh_trec_news_index_dir=""):
        super(WhooshTRECNewsEngine, self).__init__(service)
        try:
            self.docIndex = open_dir(whoosh_trec_news_index_dir)
            print("Whoosh Document index open")
            print(self.docIndex.doc_count())
        except Exception:
            print("Could not open Whoosh Document index at: " +
                  whoosh_trec_news_index_dir)

    def search(self, query, pos=0):
        """
        Search service for query log data held in a Whoosh TREC News
        Document index with a Schema()

        Parameters:

        * query (puppy.model.Query)

        Returns:

        * results puppy.model.Response

        Raises:

        * ?
        """

        def parse_whoosh_trec(site, query, results):
            response = Response()
            response.version = 'trec'
            response.feed.setdefault('title', "{0}: {1}".format(site, query))
            response.feed.setdefault('link', '')
            response.feed.setdefault(
                'description',
                "Search results for '{0}' at {1}".format(query, site))
            response.namespaces.setdefault(
                "opensearch", "http://a9.com/-/spec/opensearch/1.1/")
            response.feed.setdefault("opensearch_totalresults",
                                     results.pagecount)
            response.feed.setdefault("opensearch_itemsperpage", pagelen)
            response.feed.setdefault("opensearch_startindex", results.pagenum)
            response.feed.setdefault('query', query)
            try:
                r = 0
                if len(results) > 0:
                    for hit in results:
                        r = r + 1
                        title = hit["title"].strip()
                        if len(title) < 1:
                            title = query
                        rank = ((int(results.pagenum) - 1) *
                                results.pagelen) + r
                        link = ("/treconomics/" + str(hit.docnum) +
                                "?rank=" + str(rank))
                        desc = hit.highlights("content")
                        docid = hit["docid"].strip()
                        source = hit["source"]
                        response.entries.append({
                            'title': title,
                            'link': link,
                            'summary': desc,
                            'docid': docid,
                            'source': source
                        })
                else:
                    print("No hits found for query: " + query)
            except Exception:
                print("Converting results to OpenSearch failed")
            return response
        # end parse_whoosh_trec

        try:
            parser = QueryParser("content", self.docIndex.schema)
            # mparser = MultifieldParser(["title", "content"], schema=self.docIndex.schema)
            print("In WhooshTRECNewsEngine: " + query.search_terms)
            query_terms = parser.parse(query.search_terms)
            page = query.start_page
            pagelen = query.page_len
            # print(query_terms)
            # print("page len" + str(pagelen))
            results = []
            response = []
            with self.docIndex.searcher() as searcher:
                results = searcher.search_page(query_terms, page,
                                               pagelen=pagelen)
                # results = searcher.search(query_terms)
                results.fragmenter = highlight.ContextFragmenter(
                    maxchars=300, surround=300)
                results.formatter = highlight.HtmlFormatter()
                # raise the fragmenter's scan limit for long documents
                results.fragmenter.charlimit = 100000
                print("WhooshTRECNewsEngine found: " + str(len(results)) +
                      " results")
                print("Page %d of %d - PageLength of %d" %
                      (results.pagenum, results.pagecount, results.pagelen))
                response = parse_whoosh_trec('WhooshTRECNewsEngine',
                                             query.search_terms, results)
            return response
        except Exception:
            print("Error in Search Service: Whoosh TREC News search failed")