# (method of an index-searcher class; requires whoosh plus jieba's ChineseAnalyzer)
def __init__(self, pickle_path='index', index_name='telegram_searcher', from_scratch=False):
    analyzer = ChineseAnalyzer()
    schema = Schema(
        content=TEXT(stored=True, analyzer=analyzer),
        url=ID(stored=True, unique=True),
        chat_id=STORED(),
        post_time=DATETIME(stored=True),
    )
    if not Path(pickle_path).exists():
        Path(pickle_path).mkdir()

    def _clear():
        # Remove any existing files belonging to this index,
        # then recreate an empty index.
        pattern = re.compile(f'^_?{index_name}.*')
        for file in Path(pickle_path).iterdir():
            if pattern.match(file.name):
                os.remove(str(file))
        self.ix = create_in(pickle_path, schema, index_name)

    if from_scratch:
        _clear()

    self.ix = open_dir(pickle_path, index_name) \
        if exists_in(pickle_path, index_name) \
        else create_in(pickle_path, schema, index_name)

    self._clear = _clear  # use a closure to avoid introducing too many members
    self.query_parser = QueryParser('content', schema)
    self.highlighter = highlight.Highlighter()
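# A minimal usage sketch for the class above. Hedged: the enclosing class and
# the presence of already-indexed documents are assumptions not shown in the
# snippet; it only exercises the stored query_parser and highlighter.
def search_example(searcher_obj, keyword):
    q = searcher_obj.query_parser.parse(keyword)
    with searcher_obj.ix.searcher() as s:
        for hit in s.search(q, limit=10):
            # highlight_hit works here because 'content' is a stored field
            print(hit['url'], searcher_obj.highlighter.highlight_hit(hit, 'content'))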
from whoosh import fields, highlight, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def test_pinpoint():
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet "
               "kilo lima mike november oskar papa quebec romeo sierra tango")
    schema = fields.Schema(text=fields.TEXT(stored=True, chars=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=domain)
    w.commit()

    assert ix.schema["text"].supports("characters")
    with ix.searcher() as s:
        r = s.search(query.Term("text", "juliet"), terms=True)
        hit = r[0]
        hi = highlight.Highlighter()
        hi.formatter = highlight.UppercaseFormatter()

        # The default fragmenter cannot use character positions
        assert not hi.can_load_chars(r, "text")
        assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike november"

        hi.fragmenter = highlight.PinpointFragmenter()
        assert hi.can_load_chars(r, "text")
        assert hi.highlight_hit(hit, "text") == "ot golf hotel india JULIET kilo lima mike nove"

        hi.fragmenter.autotrim = True
        assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike"
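# A sketch of tuning the PinpointFragmenter used above. maxchars, surround and
# autotrim are real Whoosh parameters; the values here are illustrative only.
# PinpointFragmenter needs character positions, so the field must be indexed
# with chars=True and the search run with terms=True, as in the test.
hi = highlight.Highlighter(
    fragmenter=highlight.PinpointFragmenter(maxchars=60, surround=15,
                                            autotrim=True),
    formatter=highlight.UppercaseFormatter(),
)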
from flask import render_template, request
from whoosh import highlight, qparser
from whoosh.qparser.dateparse import DateParserPlugin


def render_results(s, qs, template):
    qp = qparser.QueryParser("content", s.schema)
    # Alternative: search several fields at once
    qp = qparser.MultifieldParser(["tgrams", "content"], s.schema)

    # Add the DateParserPlugin to the parser
    qp.add_plugin(DateParserPlugin())

    q = qp.parse(qs)
    results = s.search(q, limit=100)
    # Alternative: sort by a field instead of by score
    results = s.search(q, limit=100, sortedby="title", reverse=True)
    # Alternative: group ("facet") the results by a field
    results = s.search(q, limit=100, groupedby="chapter")
    q = results.q

    hf = highlight.HtmlFormatter()
    results.highlighter = highlight.Highlighter(formatter=hf)

    qc = None
    if not results:
        # Suggest a corrected query if nothing matched
        corrected = s.correct_query(q, qs, prefix=1)
        if corrected.query != q:
            qc = corrected.format_string(hf)

    def hilite(hit):
        with open(SOURCEDIR + hit["path"], "rb") as hitfile:
            text = hitfile.read().decode("utf-8")
        return hit.highlights("content", text)

    return render_template(template, qs=qs, q=q, results=results,
                           hilite=hilite, corrected=qc, args=request.args)
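# A hedged sketch of what the DateParserPlugin enables: free-form dates inside
# query strings. It reuses the searcher `s` from the snippet above but assumes
# the schema has a DATETIME field named "date"; the field name and query text
# are illustrative, not from the snippet.
qp = qparser.QueryParser("content", s.schema)
qp.add_plugin(DateParserPlugin(free=True))
q = qp.parse("render date:'last tuesday'")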
from markupsafe import Markup
from whoosh import highlight


def highlight_all(result, field):
    # Highlight the entire stored field value (no fragmenting), reusing the
    # formatter already attached to the results object; fall back to the raw
    # text when there is nothing to highlight.
    text = result[field]
    return Markup(
        highlight.Highlighter(
            fragmenter=highlight.WholeFragmenter(),
            formatter=result.results.highlighter.formatter
        ).highlight_hit(result, field, text=text)) or text
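# Usage sketch: expose highlight_all to Jinja templates. The Flask app object
# `app` is an assumption; a template could then render a stored field with
# {{ highlight_all(hit, "title") }}.
app.jinja_env.globals['highlight_all'] = highlight_all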
# (method of an index-searcher class; requires whoosh and an IndexMsg class
# that carries the schema)
def __init__(self, index_dir: Path, from_scratch: bool = False):
    index_name = 'index'
    if not Path(index_dir).exists():
        Path(index_dir).mkdir()

    def _clear():
        # Wipe the index directory and recreate an empty index in it.
        import shutil
        shutil.rmtree(index_dir)
        index_dir.mkdir()
        self.ix = index.create_in(index_dir, IndexMsg.schema, index_name)

    if from_scratch:
        _clear()

    self.ix = index.open_dir(index_dir, index_name) \
        if index.exists_in(index_dir, index_name) \
        else index.create_in(index_dir, IndexMsg.schema, index_name)

    # names() must be called: comparing the bound methods themselves
    # would always fail
    assert repr(self.ix.schema.names()) == repr(IndexMsg.schema.names()), \
        f"Incompatible schema in your index '{index_dir}'\n" \
        f"\tExpected: {IndexMsg.schema}\n" \
        f"\tOn disk: {self.ix.schema}"

    self._clear = _clear  # use a closure to avoid introducing too many members
    self.query_parser = QueryParser('content', IndexMsg.schema)
    self.highlighter = highlight.Highlighter()
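# IndexMsg is referenced above but not shown. A minimal hedged sketch of the
# shape the code relies on, with fields borrowed from the earlier schema in
# this file; this is an assumption, not the project's actual definition.
from whoosh.fields import Schema, TEXT, ID, STORED, DATETIME

class IndexMsg:
    schema = Schema(
        content=TEXT(stored=True),
        url=ID(stored=True, unique=True),
        chat_id=STORED(),
        post_time=DATETIME(stored=True),
    )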
from whoosh import fields, highlight, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def test_highlight_setters():
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("Hello"))
    w.commit()

    r = ix.searcher().search(query.Term("text", "hello"))
    hl = highlight.Highlighter()
    ucf = highlight.UppercaseFormatter()
    r.highlighter = hl
    r.formatter = ucf
    # Setting Results.formatter forwards to the attached Highlighter
    assert hl.formatter is ucf
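# The same proxying applies to the fragmenter (a sketch; Results.fragmenter
# and Results.formatter both delegate to the attached Highlighter):
r.fragmenter = highlight.WholeFragmenter()
assert hl.fragmenter is r.fragmenter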
from whoosh.index import open_dir
from whoosh.qparser import QueryParser


def search(keyword):
    results = []
    dirname = "index"
    ix = open_dir(dirname=dirname)
    querystring = u"{}".format(keyword)
    brf = BracketFormatter()
    parser = QueryParser("content", ix.schema)
    myquery = parser.parse(querystring)
    with ix.searcher() as searcher:
        hits = searcher.search(myquery, limit=None)
        # Results proxies this assignment to its Highlighter's formatter
        hits.formatter = brf
        for hit in hits:
            result = {
                'title': hit['title'],
                'abstract': hit.highlights('content', top=1),
                'url': hit['url'],
            }
            results.append(result)
    return results
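# BracketFormatter is not defined in the snippet above. This sketch follows
# the custom-formatter pattern from Whoosh's highlighting documentation:
# subclass Formatter and override format_token.
from whoosh import highlight

class BracketFormatter(highlight.Formatter):
    """Puts square brackets around the matched terms."""

    def format_token(self, text, token, replace=False):
        # get_text returns the token's text from the original string
        tokentext = highlight.get_text(text, token, replace)
        return "[%s]" % tokentext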
import logging

from flask import jsonify, request
from markupsafe import Markup
from whoosh import highlight, qparser

# ix (the Whoosh index), scores, url_dict and mysession are module-level
# globals defined elsewhere in the app.


def base_query():
    assert request.path == '/index'
    query_sentence = str(dict(request.form)["query"][0])
    logging.info("Query sentence: %s" % query_sentence)
    res = []
    with ix.searcher() as searcher:
        # Parse the query text. A trailing "$...$" marker selects a
        # field-specific search; otherwise query across several fields.
        # highlight_xy marks whether the school ("xueyuan") field should be
        # highlighted; it is by default.
        highlight_xy = True
        # Default: multifield query
        query = qparser.MultifieldParser(
            ["content", "title", "mtext", "xueyuan"], ix.schema)
        if query_sentence.endswith("$姓名$"):    # search by name
            query = qparser.SimpleParser("title", ix.schema)
            query_sentence = query_sentence[:-len("$姓名$")]
        elif query_sentence.endswith("$学院$"):  # search by school
            query = qparser.SimpleParser("xueyuan", ix.schema)
            query_sentence = query_sentence[:-len("$学院$")]
        elif query_sentence.endswith("$网页$"):  # search by page content
            query = qparser.SimpleParser("content", ix.schema)
            query_sentence = query_sentence[:-len("$网页$")]

        # Register query-parser plugins (instances, not classes)
        query.add_plugin(qparser.WildcardPlugin())
        # query.remove_plugin_class(qparser.WildcardPlugin)
        query.add_plugin(qparser.PrefixPlugin())
        query.add_plugin(qparser.OperatorsPlugin())
        query.add_plugin(qparser.RegexPlugin())
        query.add_plugin(qparser.PhrasePlugin())

        # Parse the query string into a query tree
        q = query.parse(query_sentence)
        logging.info("Query parse result: %s" % str(q))

        # Run the search
        result = searcher.search(q, limit=20)

        # Fragment/highlight settings: allow larger fragments
        my_cf = highlight.ContextFragmenter(maxchars=200, surround=30)
        hf = highlight.HtmlFormatter(tagname='em', classname='match',
                                     termclass='term')
        hi = highlight.Highlighter(fragmenter=my_cf, formatter=hf)

        for hit in result:
            xueyuan = (Markup(hi.highlight_hit(hit, "xueyuan"))
                       if highlight_xy else hit["xueyuan"])
            if hit['picpath'] == '#':
                picpath = '#'
            else:
                picpath = "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                            hit['picpath'].split('/')[-1])
            res.append({
                "title": hit['title'],
                "xueyuan": xueyuan,
                "url": hit["url"],
                'shotpath': hit['shotpath'],
                "content": Markup(hi.highlight_hit(hit, "content")),
                "parenturl": hit["parenturl"],
                "picpath": picpath,
                "pagerank": scores[url_dict[hit["url"]]],
            })
        count = len(result)

    if count == 0:
        logging.warning("%d, no matching results!" % 404)
        return "No matching results!", 404
    # Log the response
    log = "Response: "
    for item in res:
        log = log + " (name:%s,url:%s) " % (item["title"], item["url"])
    logging.info(log)
    # # Sort the links by PageRank score
    # res.sort(key=lambda k: (k.get("pagerank", 0)), reverse=True)
    mysession["data"] = res  # pass the results to the next view via the session
    return jsonify({"url": "/display/%d&%s" % (count, query_sentence)})
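# A hedged refactoring sketch of the suffix-routing logic above, keeping each
# "$marker$" -> field pair in one table. The helper name is hypothetical, not
# part of the original code.
def route_query(query_sentence, schema):
    suffixes = {"$姓名$": "title", "$学院$": "xueyuan", "$网页$": "content"}
    for suffix, field in suffixes.items():
        if query_sentence.endswith(suffix):
            return (qparser.SimpleParser(field, schema),
                    query_sentence[:-len(suffix)])
    default = qparser.MultifieldParser(
        ["content", "title", "mtext", "xueyuan"], schema)
    return default, query_sentence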