def doSearch(self, text):
    q = self.qp.parse(text)  # build query
    with self.ix.searcher(weighting=scoring.Frequency) as s:  # simple scorer may help
        c = s.collector(limit=self.MaxResults)
        c = TimeLimitCollector(c, 0.5)
        try:
            s.search_with_collector(q, c)
        except TimeLimit:  # raised when the 0.5s budget is exhausted
            print("TIMEOUT!")
        results = c.results()  # partial results if hung
        self.searchResults.clear()
        # my_cf = highlight.PinpointFragmenter(maxchars=100, surround=60)
        my_cf = highlight.ContextFragmenter(maxchars=160, surround=30)
        # my_cf = highlight.SentenceFragmenter(maxchars=200, sentencechars='\n')
        results.fragmenter = my_cf
        if len(results) > 0:
            for res in results:
                # redundant: Hit.highlights() reads the fragmenter from the Results object
                res.fragmenter = my_cf
                # self.searchResults.append(res.highlights('Text', top=1) + '*--*\n' + res['MeetingLink'] + '\n')
                self.searchResults.append(res.highlights('Text', top=1))
                self.searchResults.append('-Link to Meeting -')
                self.searchResults.append(res['MeetingLink'] + '\n')
                self.searchResults.append('----------')
                self.searchResults.append('----------')
        cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
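The method above leans on imports from its enclosing module. A minimal sketch of the time-limited-search wiring it assumes, with ix, qp, and the result limit as hypothetical stand-ins:

from whoosh import scoring, highlight
from whoosh.collectors import TimeLimitCollector
from whoosh.searching import TimeLimit

with ix.searcher(weighting=scoring.Frequency) as s:
    # Wrap any collector to abort a slow search; whatever was collected
    # before the deadline stays available via c.results().
    c = TimeLimitCollector(s.collector(limit=20), timelimit=0.5)
    try:
        s.search_with_collector(qp.parse("example"), c)
    except TimeLimit:
        pass  # fall through to the partial results
    results = c.results()
    results.fragmenter = highlight.ContextFragmenter(maxchars=160, surround=30)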
def search(searcher, query_string):
    parser = qparser.QueryParser("title", schema=ix.schema)
    q = parser.parse(u(query_string))
    result = searcher.search(q, terms=True)
    result.fragmenter = highlight.ContextFragmenter()
    result.formatter = highlight.UppercaseFormatter()
    return result
def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"
def test_context_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "alfa BRAVO charlie...hotel INDIA juliet"
def test_context_at_start():
    terms = frozenset(["alfa"])
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=15)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "ALFA bravo charlie delta echo foxtrot"
def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('alfa <strong class="match term0">bravo</strong> '
                     'charlie...hotel <strong class="match term1">india</strong> juliet')
def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('<b class="match t0">alfa</b> <b class="match t1">bravo</b> '
                     '<b class="match t0">charlie</b>...<b class="match t1">delta</b> '
                     '<b class="match t0">echo</b> foxtrot')
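The five tests above reference a module-level _doc that is not shown; judging from the expected fragments, it is a NATO-alphabet string. A self-contained sketch with _doc reconstructed as an assumption:

from whoosh import analysis, highlight

_doc = "alfa bravo charlie delta echo foxtrot golf hotel india juliet kilo lima"

sa = analysis.StandardAnalyzer()
cf = highlight.ContextFragmenter(surround=6)
uc = highlight.UppercaseFormatter()
# highlight() runs the analyzer over the raw text, scores fragments that
# contain the search terms, and joins the top fragments with "...".
print(highlight.highlight(_doc, frozenset(["bravo", "india"]), sa, cf, uc))
# expected: alfa BRAVO charlie...hotel INDIA juliet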
def __getitem__(self, item):
    if item.start in self.saved_results:
        return self.saved_results[item.start]
    q, mask = self._query
    sortedby, reverse = self._query_sortedby
    page: ResultsPage = self.searcher.search_page(
        q,
        mask=mask,
        filter=self._query_filter,
        pagenum=math.floor(item.start / self.page_size) + 1,
        pagelen=self.page_size,
        sortedby=sortedby,
        reverse=reverse)
    page.results.fragmenter = highlight.ContextFragmenter(surround=50)
    page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
    if not self.first_score and len(page.results) > 0:
        self.first_score = page.results[0].score
    # top_n holds (score, docnum) pairs; rescale every score relative to
    # the best hit seen so far so the values fall in [0, 1].
    if self.first_score:
        page.results.top_n = list(
            map(lambda hit: (hit[0] / self.first_score, hit[1]),
                page.results.top_n))
    self.saved_results[item.start] = page
    return page
def search_law(self, q, callback):
    idx = index.open_dir(self.law_idx_path)
    with idx.searcher() as searcher:
        parsed_q = QueryParser("LEGAL_TEXT", idx.schema).parse(q)
        results = searcher.search(parsed_q)
        # results.fragmenter = highlight.PinpointFragmenter(surround=64, autotrim=True)
        results.fragmenter = highlight.ContextFragmenter(surround=128)
        results.formatter = highlight.UppercaseFormatter()
        callback(results)
def search(self):
    """
    Search function

    Searches all indexes for self.keyword and prints them.
    """
    ix = self.get_ix()
    try:
        search_term = str(self.keyword)
        from whoosh.qparser import QueryParser
        with ix.searcher() as searcher:
            query = QueryParser("content", ix.schema).parse("%s" % search_term)
            results = searcher.search(query, terms=True, limit=self.limit)
            results.fragmenter = highlight.ContextFragmenter(maxchars=200,
                                                             surround=20)
            # If stdin == stdout, the program's output is not being
            # piped and colored output is fine
            color = (self.color == 'always' or
                     (self.color == 'auto' and os.fstat(0) == os.fstat(1)))
            results.formatter = ColorFormatter(color=color)
            # Remove excluded filetypes from search results
            if self.exclude:
                results = [f for f in results
                           if os.path.splitext(f["path"])[1][1:] not in self.exclude]
            if self.include:
                results = [f for f in results
                           if os.path.splitext(f["path"])[1][1:] in self.include]
            print(results)
            for i, result in enumerate(results, start=1):
                if color:
                    print("Result %i: %s" % (i, colorama.Fore.GREEN +
                                             result["path"] +
                                             colorama.Fore.RESET))
                else:
                    print("Result %i: %s" % (i, result["path"]))
                with codecs.open(result["path"], encoding='utf-8',
                                 errors='ignore') as f:
                    file_content = f.read()
                print(result.highlights("content", text=file_content, top=10))
                print("\n")
    finally:
        ix.close()
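ColorFormatter above is project code, not part of Whoosh. Custom output styles are built by subclassing highlight.Formatter and overriding format_token; the sketch below is the bracket example from the Whoosh documentation:

from whoosh import highlight

class BracketFormatter(highlight.Formatter):
    """Puts square brackets around the matched terms."""

    def format_token(self, text, token, replace=False):
        # get_text returns the token's text from the original (or
        # analyzed, if replace=True) string
        tokentext = highlight.get_text(text, token, replace)
        return "[%s]" % tokentext

An instance of such a class is then assigned to results.formatter exactly like the built-in formatters in the other examples.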
def query_page(ix, query, page):
    searcher = ix.searcher()
    try:
        parsed_query = MultifieldParser(["content", "title", "correspondent"],
                                        ix.schema).parse(query)
        result_page = searcher.search_page(parsed_query, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()
        yield result_page
    finally:
        searcher.close()
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            # Build a "more like this" query: extract the 20 most
            # characteristic terms of the source document and OR them
            # together, weighted, masking out the source document itself.
            docnum = searcher.document_number(id=more_like_doc_id)
            kts = searcher.key_terms_from_text(
                'content', more_like_doc_content, numterms=20,
                model=classify.Bo1Model, normalize=False)
            more_like_q = query.Or(
                [query.Term('content', word, boost=weight)
                 for word, weight in kts])
            result_page = searcher.search_page(
                more_like_q, page, filter=str_q, mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError(
                "Either querystring or more_like_doc_id is required.")

        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
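Whoosh bundles this key-terms-into-Or-query pattern as a convenience method; a rough sketch of the near-equivalent call (not what the project above uses), assuming the same searcher, docnum, and content:

from whoosh import classify

# Extracts weighted key terms from the supplied text and searches for
# similar documents, excluding the source document from the results.
similar = searcher.more_like(docnum, 'content', text=more_like_doc_content,
                             numterms=20, model=classify.Bo1Model)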
def query_page(ix, querystring, page):
    searcher = ix.searcher()
    try:
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            ix.schema)
        qp.add_plugin(DateParserPlugin())
        q = qp.parse(querystring)
        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
def search(self, q: str):
    qp = MultifieldParser(['body', 'tags'], schema=self._schema)
    q_obj = qp.parse(q)
    ans = []
    with self._index.searcher() as srch:
        results = srch.search(q_obj, terms=True)
        results.fragmenter = highlight.ContextFragmenter(maxchars=100,
                                                         surround=30)
        for hit in results:
            item = dict(hit)
            full_path = os.path.join(self._data_dir, hit['path'])
            with open(full_path) as fr:
                full_text = fr.read()
            matched = dict(hit.matched_terms())
            if 'body' in matched:
                item['highlight'] = '...{} ... '.format(
                    hit.highlights('body', text=full_text))
            else:
                item['highlight'] = None
            item['path'] = item['path'].rsplit('.', 1)[0]
            ans.append(item)
    return ans
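One detail this example depends on: hit.matched_terms() only works when the search was executed with terms=True. A minimal sketch of that dependency (the index and query objects are hypothetical):

with ix.searcher() as srch:
    results = srch.search(q_obj, terms=True)  # record which query terms matched
    for hit in results:
        # Without terms=True, matched_terms() raises NoTermsException.
        # Each entry is a (fieldname, termtext) pair, so dict() maps
        # field names to one matched term each.
        print(dict(hit.matched_terms()))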
def search(query, page):
    ix = index.open_dir(index_dir)
    res = dict()
    with ix.searcher() as se:
        q = MultifieldParser(["filepath", "document"],
                             schema=ix.schema).parse(query)
        p = se.search_page(q, page, pagelen=page_len)
        fg = highlight.ContextFragmenter(maxchars=200, surround=100,
                                         charlimit=100000)
        p.results.fragmenter = fg
        res['cur_page'] = p.pagenum
        res['tot_page'] = p.pagecount
        res['list'] = list()
        for hit in p:
            t = time.localtime(hit['moditime'])
            timestr = time.strftime('%Y-%m-%d %H:%M:%S', t)
            res['list'].append({
                "path": hit['filepath'],
                "time": timestr,
                "path_highlight": hit.highlights('filepath'),
                "highlight": hit.highlights('document')
            })
    return res
def base_query():
    assert request.path == '/index'
    # print(dict(request.form)["query"][0])
    # print(dict(request.form))
    query_sentence = str(dict(request.form)["query"][0])
    logging.info("Query sentence: %s" % query_sentence)
    res = []
    with ix.searcher() as searcher:
        # Parse the query text. If the query asks for a specific field,
        # dispatch to a single-field parser; otherwise default to
        # multifield search.
        # highlight_xy marks whether the college (xueyuan) field should
        # be highlighted; it is on by default.
        highlight_xy = True
        # Default: multifield query
        query = qparser.MultifieldParser(
            ["content", "title", "mtext", "xueyuan"], ix.schema)
        if query_sentence.endswith("$姓名$"):  # query by name
            query = qparser.SimpleParser("title", ix.schema)
            query_sentence = query_sentence[:-len("$姓名$")]
        elif query_sentence.endswith("$学院$"):  # query by college
            query = qparser.SimpleParser("xueyuan", ix.schema)
            query_sentence = query_sentence[:-len("$学院$")]
        elif query_sentence.endswith("$网页$"):  # query by page content
            query = qparser.SimpleParser("content", ix.schema)
            query_sentence = query_sentence[:-len("$网页$")]
        # print(query_sentence)

        # Register query parser plugins
        query.add_plugin(qparser.WildcardPlugin)
        # query.remove_plugin_class(qparser.WildcardPlugin)
        query.add_plugin(qparser.PrefixPlugin())
        query.add_plugin(qparser.OperatorsPlugin)
        query.add_plugin(qparser.RegexPlugin)
        query.add_plugin(qparser.PhrasePlugin)

        # Parse into a query object
        q = query.parse(query_sentence)
        logging.info("Query parse result: %s" % str(q))
        print(q)

        # Run the search
        result = searcher.search(q, limit=20)
        # print(result)

        # Configure the fragmenter; allow larger fragments
        my_cf = highlight.ContextFragmenter(maxchars=200, surround=30)
        hf = highlight.HtmlFormatter(tagname='em', classname='match',
                                     termclass='term')
        hi = highlight.Highlighter(fragmenter=my_cf, formatter=hf)

        for hit in result:
            print(hit["picpath"])
            print(hit["title"])
            print(escape(hi.highlight_hit(hit, "content")))
            if highlight_xy:
                xueyuan = Markup(hi.highlight_hit(hit, "xueyuan"))
            else:
                xueyuan = hit["xueyuan"]
            if hit['picpath'] == '#':
                picpath = '#'
            else:
                picpath = "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                            hit['picpath'].split('/')[-1])
            res.append({
                "title": hit['title'],
                "xueyuan": xueyuan,
                "url": hit["url"],
                'shotpath': hit['shotpath'],
                "content": Markup(hi.highlight_hit(hit, "content")),
                "parenturl": hit["parenturl"],
                "picpath": picpath,
                "pagerank": scores[url_dict[hit["url"]]]
            })

        print(len(result))
        print(res)
        count = len(result)
        if count == 0:
            logging.warning("%d, no matching content found!" % 404)
            return "No matching content found!", 404
        else:
            # Log the query response
            log = "Response: "
            for item in res:
                log = log + " (name:%s,url:%s) " % (item["title"], item["url"])
            logging.info(log)
            # # Sort links by PageRank
            # res.sort(key=lambda k: (k.get("pagerank", 0)), reverse=True)
            # print(res)
            mysession["data"] = res  # pass data to the next view via the session
            return jsonify({"url": "/display/%d&%s" % (count, query_sentence)})
qp.add_plugin(qparser.PlusMinusPlugin)
query = qp.parse(search_input)
# print(query)

if search_type == "BM25":
    w = BM25F(B=0.75, K1=1.5)
elif search_type == "TFIDF":
    w = TF_IDF()
else:
    w = BM25F(B=0.75, K1=1.5)

with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    results.fragmenter = highlight.ContextFragmenter(maxchars=50, surround=50)
    # print(list(searcher.lexicon("content")))
    found_doc_num = results.scored_length()
    run_time = results.runtime

    # ------------------------------- for html use ---------------------------------
    if found_doc_num == 0:
        final_top_output = (
            "<h1> Sorry " + str(found_doc_num) + " Search Results Found.</h1>"
            "<h5>Search Results for " + search_input + " using " + search_type +
            " (" + str(run_time) + " seconds)</h5><br>")
    else:
        final_top_output = (
            "<h1> Top " + str(found_doc_num) + " Search Results </h1>"
            "<h5>Search Results for " + search_input + " using " + search_type +
            " Ranking and " + operation_type +
            " operation to score (" + str(run_time) + " seconds)</h5><br>")
    print(final_top_output)

    if results:
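For reference, both weighting models used above come from whoosh.scoring; a minimal sketch of the wiring, with ix and query as hypothetical stand-ins:

from whoosh.scoring import BM25F, TF_IDF

w = BM25F(B=0.75, K1=1.5)  # or w = TF_IDF()
with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    # scored_length() counts the scored hits; runtime is the search
    # time in seconds
    print(results.scored_length(), "hits in", results.runtime, "seconds")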
print(query.search_terms)
myquery = parser.parse(query.search_terms)
results = []
response = []
rr = {}
with self.queryIndex.searcher() as searcher:
    results = searcher.search(myquery)
    for result in results:
        temp = result['content'].split("\t")
        sugg = temp[0]
        print(result)
        rr['content'] = sugg
    results.fragmenter = highlight.ContextFragmenter(surround=40)
    results.formatter = highlight.UppercaseFormatter()
    response = parse_whoosh_trec('WhooshQueryEngine', query.search_terms,
                                 results)
return response

# ----- The following are Whoosh errors -----
# There's a problem with the Whoosh query created from the user's query
except QueryError as e:
    raise SearchEngineError("Whoosh Query Suggest Engine", e,
                            errorType="Whoosh", query=query)
# Our Whoosh index is empty
except EmptyIndexError as e:
    raise SearchEngineError("Whoosh Query Suggest Engine", e,
                            errorType="Whoosh")
class WhooshTRECNewsEngine(SearchEngine):
    """Whoosh Query log search engine."""

    def __init__(self, service, whoosh_trec_news_index_dir=""):
        super(WhooshTRECNewsEngine, self).__init__(service)
        try:
            self.docIndex = open_dir(whoosh_trec_news_index_dir)
            print("Whoosh Document index open")
            print(self.docIndex.doc_count())
        except Exception:
            print("Could not open Whoosh Document index at: " +
                  whoosh_trec_news_index_dir)

    def search(self, query, pos=0):
        """
        Search service for query log data held in a Whoosh TREC News
        Document index with a Schema()

        Parameters:

        * query (puppy.model.Query)

        Returns:

        * results puppy.model.Response

        Raises:

        * ?
        """

        def parse_whoosh_trec(site, query, results):
            response = Response()
            response.version = 'trec'
            response.feed.setdefault('title', "{0}: {1}".format(site, query))
            response.feed.setdefault('link', '')
            response.feed.setdefault(
                'description',
                "Search results for '{0}' at {1}".format(query, site))
            response.namespaces.setdefault(
                "opensearch", "http://a9.com/-/spec/opensearch/1.1/")
            response.feed.setdefault("opensearch_totalresults",
                                     results.pagecount)
            response.feed.setdefault("opensearch_itemsperpage", pagelen)
            response.feed.setdefault("opensearch_startindex", results.pagenum)
            response.feed.setdefault('query', query)
            try:
                r = 0
                if len(results) > 0:
                    for hit in results:
                        r = r + 1
                        title = hit["title"].strip()
                        if len(title) < 1:
                            title = query
                        rank = ((int(results.pagenum) - 1) *
                                results.pagelen) + r
                        link = ("/treconomics/" + str(hit.docnum) +
                                "?rank=" + str(rank))
                        desc = hit.highlights("content")
                        docid = hit["docid"].strip()
                        source = hit["source"]
                        response.entries.append({
                            'title': title,
                            'link': link,
                            'summary': desc,
                            'docid': docid,
                            'source': source
                        })
                else:
                    print("No hits found for query: " + query)
            except Exception:
                print("Converting results to OpenSearch failed")
            return response
        # end parse_whoosh_trec

        try:
            parser = QueryParser("content", self.docIndex.schema)
            # mparser = MultifieldParser(["title", "content"], schema=self.docIndex.schema)
            print("In WhooshTRECNewsEngine: " + query.search_terms)
            query_terms = parser.parse(query.search_terms)
            page = query.start_page
            pagelen = query.page_len
            # print(query_terms)
            # print("page len" + str(pagelen))
            results = []
            response = []
            with self.docIndex.searcher() as searcher:
                results = searcher.search_page(query_terms, page,
                                               pagelen=pagelen)
                # results = searcher.search(query_terms)
                results.fragmenter = highlight.ContextFragmenter(
                    maxchars=300, surround=300)
                results.formatter = highlight.HtmlFormatter()
                # raise the fragmenter's scan limit for long documents
                results.fragmenter.charlimit = 100000
                print("WhooshTRECNewsEngine found: " + str(len(results)) +
                      " results")
                print("Page %d of %d - PageLength of %d" %
                      (results.pagenum, results.pagecount, results.pagelen))
                response = parse_whoosh_trec('WhooshTRECNewsEngine',
                                             query.search_terms, results)
            return response
        except Exception:
            print("Error in Search Service: Whoosh TREC News search failed")