Example #1
0
    def __init__(self, pickle_path='index', index_name='telegram_searcher', from_scratch=False):
        analyzer = ChineseAnalyzer()
        schema = Schema(
            content=TEXT(stored=True, analyzer=analyzer),
            url=ID(stored=True, unique=True),
            chat_id=STORED(),
            post_time=DATETIME(stored=True),
        )

        if not Path(pickle_path).exists():
            Path(pickle_path).mkdir()

        def _clear():
            pattern = re.compile(f'^_?{index_name}.*')
            for file in Path(pickle_path).iterdir():
                if pattern.match(file.name):
                    os.remove(str(file))
            self.ix = create_in(pickle_path, schema, index_name)

        if from_scratch:
            _clear()

        self.ix = open_dir(pickle_path, index_name) \
            if exists_in(pickle_path, index_name) \
            else create_in(pickle_path, schema, index_name)

        self._clear = _clear  # use closure to avoid introducing to much members
        self.query_parser = QueryParser('content', schema)
        self.highlighter = highlight.Highlighter()
Example #2
0
def test_pinpoint():
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet "
               "kilo lima mike november oskar papa quebec romeo sierra tango")
    schema = fields.Schema(text=fields.TEXT(stored=True, chars=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=domain)
    w.commit()

    assert ix.schema["text"].supports("characters")
    with ix.searcher() as s:
        r = s.search(query.Term("text", "juliet"), terms=True)
        hit = r[0]
        hi = highlight.Highlighter()
        hi.formatter = highlight.UppercaseFormatter()

        assert not hi.can_load_chars(r, "text")
        assert hi.highlight_hit(
            hit, "text") == "golf hotel india JULIET kilo lima mike november"

        hi.fragmenter = highlight.PinpointFragmenter()
        assert hi.can_load_chars(r, "text")
        assert hi.highlight_hit(
            hit, "text") == "ot golf hotel india JULIET kilo lima mike nove"

        hi.fragmenter.autotrim = True
        assert hi.highlight_hit(
            hit, "text") == "golf hotel india JULIET kilo lima mike"
Example #3
0
def render_results(s, qs, template):
    qp = qparser.QueryParser("content", s.schema)
    qp = qparser.MultifieldParser(["tgrams", "content"], s.schema)

    # Add the DateParserPlugin to the parser
    qp.add_plugin(DateParserPlugin())

    q = qp.parse(qs)

    results = s.search(q, limit=100)
    results = s.search(q, limit=100, sortedby="title", reverse=True)
    results = s.search(q, limit=100, groupedby="chapter")
    q = results.q

    hf = highlight.HtmlFormatter()
    results.highlighter = highlight.Highlighter(formatter=hf)

    qc = None
    if not results:
        corrected = s.correct_query(q, qs, prefix=1)
        if corrected.query != q:
            qc = corrected.format_string(hf)

    def hilite(hit):
        with open(SOURCEDIR + hit["path"], "rb") as hitfile:
            text = hitfile.read().decode("utf-8")
        return hit.highlights("content", text)

    return render_template(template,
                           qs=qs,
                           q=q,
                           results=results,
                           hilite=hilite,
                           corrected=qc,
                           args=request.args)
Example #4
0
def highlight_all(result, field):
    text = result[field]
    return Markup(
        highlight.Highlighter(
            fragmenter=highlight.WholeFragmenter(),
            formatter=result.results.highlighter.formatter).highlight_hit(
                result, field, text=text)) or text
Example #5
0
    def __init__(self, index_dir: Path, from_scratch: bool = False):
        index_name = 'index'
        if not Path(index_dir).exists():
            Path(index_dir).mkdir()

        def _clear():
            import shutil
            shutil.rmtree(index_dir)
            index_dir.mkdir()
            self.ix = index.create_in(index_dir, IndexMsg.schema, index_name)

        if from_scratch:
            _clear()

        self.ix = index.open_dir(index_dir, index_name) \
            if index.exists_in(index_dir, index_name) \
            else index.create_in(index_dir, IndexMsg.schema, index_name)

        assert repr(self.ix.schema.names) == repr(IndexMsg.schema.names), \
            f"Incompatible schema in your index '{index_dir}'\n" \
            f"\tExpected: {IndexMsg.schema}\n" \
            f"\tOn disk:  {self.ix.schema}"

        self._clear = _clear  # use closure to avoid introducing too much members
        self.query_parser = QueryParser('content', IndexMsg.schema)
        self.highlighter = highlight.Highlighter()
def test_highlight_setters():
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("Hello"))
    w.commit()

    r = ix.searcher().search(query.Term("text", "hello"))
    hl = highlight.Highlighter()
    ucf = highlight.UppercaseFormatter()
    r.highlighter = hl
    r.formatter = ucf
    assert hl.formatter is ucf
Example #7
0
def search(keyword):

    results = []
    dirname = "index"
    ix = open_dir(dirname=dirname)
    querystring = u"{}".format(keyword)
    searcher = ix.searcher()
    hi = highlight.Highlighter()
    brf = BracketFormatter()

    parser = QueryParser("content", ix.schema)
    myquery = parser.parse(querystring)
    with ix.searcher() as searcher:
        hits = searcher.search(myquery, limit=None)
        hits.formatter = brf
        for hit in hits:
            result = {
                'title': hit['title'],
                'abstract': hit.highlights('content', top=1),
                'url': hit['url']
            }
            results.append(result)
    return results
Example #8
0
def base_query():
    assert request.path == '/index'
    #print(dict(request.form)["query"][0])
    #print(dict(request.form))
    query_sentence = str(dict(request.form)["query"][0])
    logging.info("Query sentence: %s" % query_sentence)
    res = []
    with ix.searcher() as searcher:
        # 对输入的查询文本进行解析,如果存在按域查询的需求则区分按域查询,默认采用多属性查询模式
        # mark 表示是否需要高亮学院查询区域,默认情况下需要
        highlight_xy = True
        # 默认的多域查询
        query = qparser.MultifieldParser(
            ["content", "title", "mtext", "xueyuan"], ix.schema)
        if query_sentence.endswith("$姓名$"):
            # 按名字查询
            query = qparser.SimpleParser("title", ix.schema)
            query_sentence = query_sentence.strip('$姓名$')
        elif query_sentence.endswith("$学院$"):
            # 按学院查询
            query = qparser.SimpleParser("xueyuan", ix.schema)
            query_sentence = query_sentence.strip('$学院$')

        elif query_sentence.endswith("$网页$"):
            # 按网页内容查询
            query = qparser.SimpleParser("content", ix.schema)
            query_sentence = query_sentence.strip('$网页$')

        #print(query_sentence)
        # 引入查询解析器插件
        query.add_plugin(qparser.WildcardPlugin)

        # query.remove_plugin_class(qparser.WildcardPlugin)
        query.add_plugin(qparser.PrefixPlugin())
        query.add_plugin(qparser.OperatorsPlugin)
        query.add_plugin(qparser.RegexPlugin)
        query.add_plugin(qparser.PhrasePlugin)

        # 解析得到查询器
        q = query.parse(query_sentence)
        logging.info("Query parse result: %s" % str(q))
        print(q)
        # 获取查询结果
        result = searcher.search(q, limit=20)
        # print(result)
        # 设置碎片的属性
        # Allow larger fragments
        my_cf = highlight.ContextFragmenter(maxchars=200, surround=30)
        hf = highlight.HtmlFormatter(tagname='em',
                                     classname='match',
                                     termclass='term')

        hi = highlight.Highlighter(fragmenter=my_cf, formatter=hf)
        for hit in result:
            print(hit["picpath"])
            print(hit["title"])
            print(escape(hi.highlight_hit(hit, "content")))
            if hit['picpath'] == '#':
                if highlight_xy:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        Markup(hi.highlight_hit(hit, "xueyuan")),
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        '#',
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
                else:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        hit["xueyuan"],
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        '#',
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
            else:
                if highlight_xy:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        Markup(hi.highlight_hit(hit, "xueyuan")),
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                          hit['picpath'].split('/')[-1]),
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
                else:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        hit["xueyuan"],
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                          hit['picpath'].split('/')[-1]),
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
        print(len(result))
        print(res)
    count = len(result)

    if count == 0:
        logging.warning("%d,没有查询到相关内容!" % 404)
        return "没有查询到相关内容!", 404
    else:
        # 记录查询日志
        log = "Response: "
        for item in res:
            log = log + " (name:%s,url:%s) " % (item["title"], item["url"])
        logging.info(log)

        # # 基于page rank 对链接进行排序
        # res.sort(key=lambda k:(k.get("pagerank",0)),reverse = True)
        # print(res)

        mysession["data"] = res  # 使用会话session传递参数
        return jsonify({"url": "/display/%d&%s" % (count, query_sentence)})