Example #1
def search(request):
    indexNewsObject = IndexNews()
    ix = indexNewsObject.ix
    if request.method == 'POST':
        inputQuery = request.POST['inputQuerySearchPage']
        request.session['inputQuery'] = inputQuery
        if inputQuery == '':
            context = {
                # Persian: "Please enter your search term."
                'message': 'لطفا عبارت مورد نظر خود را وارد کنید'
            }
            return render(request,'searchPage/searchPage.html',context=context)
        else:
            # queryParser = QueryParser(fieldname='content',schema=ix.schema,group=OrGroup)
            # queryParser = MultifieldParser(['title','content'],schema=ix.schema,group=OrGroup)
            queryParser = MultifieldParser(['title','content','summary'],schema=ix.schema)
            query = queryParser.parse(inputQuery)
            with ix.searcher(weighting=scoring.BM25F()) as searcher:
                results = searcher.search(query,terms=True,limit=None)
                
                # Customize the HTML tag used to highlight matched terms
                htmlFormat = highlight.HtmlFormatter('b')
                results.formatter = htmlFormat
                results.fragmenter.maxchars = 300
                results.fragmenter.surround = 150
                paginator = Paginator(results,15)
                page = request.GET.get('page')
                resultWithPage = paginator.get_page(page)
                context = {
                    'results': resultWithPage,
                    'inputQuery': inputQuery
                }
                return render(request,'searchPage/searchPage.html',context=context)
    else:
        # Assumes a prior POST stored the query in the session; a fresh GET raises KeyError here.
        inputQuery = request.session['inputQuery']
        # queryParser = QueryParser(fieldname='content',schema=ix.schema,group=OrGroup)
        queryParser = MultifieldParser(['title','content','summary'],schema=ix.schema)
        query = queryParser.parse(inputQuery)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            results = searcher.search(query,terms=True,limit=None)

            # Customize the HTML tag used to highlight matched terms
            htmlFormat = highlight.HtmlFormatter('b')
            results.formatter = htmlFormat
            results.fragmenter.maxchars = 300
            results.fragmenter.surround = 150
            paginator = Paginator(results,15)
            page = request.GET.get('page')
            resultWithPage = paginator.get_page(page)
            context = {
                'results': resultWithPage,
                'inputQuery': inputQuery
            }
            return render(request,'searchPage/searchPage.html',context=context)
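The POST and GET branches above duplicate the same search-and-highlight logic. A minimal refactoring sketch (the helper name and the render callback are hypothetical, not part of the original view):

def run_search(ix, input_query, page, render):
    # Parse across the same three fields and rank with BM25F, as the view does.
    query_parser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
    query = query_parser.parse(input_query)
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        results = searcher.search(query, terms=True, limit=None)
        # Customize the HTML tag used to highlight matched terms
        results.formatter = highlight.HtmlFormatter('b')
        results.fragmenter.maxchars = 300
        results.fragmenter.surround = 150
        paginator = Paginator(results, 15)
        # Rendering must happen while the searcher is still open,
        # so the caller passes in a render callback.
        return render(paginator.get_page(page))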
Example #2
def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == '<b class="match t0">alfa</b> <b class="match t1">bravo</b> <b class="match t0">charlie</b>...<b class="match t1">delta</b> <b class="match t0">echo</b> foxtrot'
Example #3
def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == 'alfa <strong class="match term0">bravo</strong> charlie...hotel <strong class="match term1">india</strong> juliet'
Example #4
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext, ix.schema)

    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Example #5
def get_html_correction(searcher, query_str, qp):
    exact_qp = QueryParser('exact', my_index.search_schema)
    exact_qp.add_plugin(DateParserPlugin())
    exact_query = exact_qp.parse(query_str)
    try:
        corrected_query = searcher.correct_query(exact_query, query_str, prefix=1)
    except Exception:
        return ""

    for token in corrected_query.tokens:
        # Possibly a Whoosh bug: the token can report startchar:8, endchar:9 while original is 'tes?'
        if query_str[token.startchar:token.endchar] != token.original:
            return ""
        for variations in (uk_variations, us_variations):
            if token.original in variations and searcher.ixreader.frequency(
                    'exact', variations[token.original]) > 0:
                token.text = variations[token.original]
                break
        # Due to the possible bug above, this branch may never get a chance to run.
        if re.search(r'\W', token.original):
            token.text = token.original
    corrected_query_str = replace_tokens(query_str, corrected_query.tokens)
    corrected_qp = QueryParser('stemmed', my_index.search_schema)
    corrected_qp.add_plugin(DateParserPlugin())
    corrected_query_parsed = corrected_qp.parse(corrected_query_str)
    if corrected_query_parsed == qp:
        return ""

    result = '<h3>Did you mean <a href="{}">{}</a>?</h3>'.format(
        stateful_url_for('search_form', q_query=urlize(corrected_query_str)),
        corrected_query.format_string(
            highlight.HtmlFormatter(classname="change")))
    return result
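replace_tokens is defined elsewhere in that project; a plausible sketch, under the assumption that it splices each token's corrected text over its [startchar, endchar) span in the query string:

def replace_tokens(original, tokens):
    # Work right-to-left so earlier character offsets stay valid
    # after each substitution.
    out = original
    for tok in sorted(tokens, key=lambda t: t.startchar, reverse=True):
        out = out[:tok.startchar] + tok.text + out[tok.endchar:]
    return out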
Example #6
def render_results(s, qs, template):
    # A single-field parser would also work:
    # qp = qparser.QueryParser("content", s.schema)
    qp = qparser.MultifieldParser(["tgrams", "content"], s.schema)

    # Add the DateParserPlugin to the parser
    qp.add_plugin(DateParserPlugin())

    q = qp.parse(qs)

    # Alternative search variants; only the last one is kept:
    # results = s.search(q, limit=100)
    # results = s.search(q, limit=100, sortedby="title", reverse=True)
    results = s.search(q, limit=100, groupedby="chapter")
    q = results.q

    hf = highlight.HtmlFormatter()
    results.highlighter = highlight.Highlighter(formatter=hf)

    qc = None
    if not results:
        corrected = s.correct_query(q, qs, prefix=1)
        if corrected.query != q:
            qc = corrected.format_string(hf)

    def hilite(hit):
        with open(SOURCEDIR + hit["path"], "rb") as hitfile:
            text = hitfile.read().decode("utf-8")
        return hit.highlights("content", text)

    return render_template(template,
                           qs=qs,
                           q=q,
                           results=results,
                           hilite=hilite,
                           corrected=qc,
                           args=request.args)
Example #7
def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext, s.schema)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'
Example #8
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(a=u"alfa bravo charlie delta")
            w.add_document(a=u"delta echo foxtrot golf")
            w.add_document(a=u"golf hotel india juliet")
            w.add_document(a=u"juliet kilo lima mike")

        with ix.searcher() as s:
            qp = QueryParser("a", ix.schema)
            qtext = u'alpha ("brovo november" OR b:dolta) detail'
            q = qp.parse(qtext, ix.schema)

            c = s.correct_query(q, qtext)
            cq = c.query
            assert isinstance(cq, query.And)
            assert cq[0].text == "alfa"
            assert isinstance(cq[1], query.Or)
            assert isinstance(cq[1][0], query.Phrase)
            assert cq[1][0].words == ["bravo", "november"]

            qtext = u'alpha b:("brovo november" a:delta) detail'
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
            assert c.string == 'alfa b:("brovo november" a:delta) detail'

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Example #9
    def create_search_result(self, results):
        # Allow larger fragments
        results.fragmenter.maxchars = 300

        # Show more context before and after
        results.fragmenter.surround = 50

        # Set result formatter
        results.formatter = hl.HtmlFormatter(tagname="mark")

        search_results = []
        for r in results:
            sr = SearchResult()
            sr.score = r.score
            sr.tags = r["tags"]
            sr.path = r["path"]
            sr.content = r["content"]
            highlights = r.highlights("content")
            if not highlights:
                highlights = self.cap(r["content"], 500)
            # Unescape HTML entities (HTMLParser.unescape was removed in
            # Python 3.9; html.unescape is the modern equivalent)
            highlights = self.html_parser.unescape(highlights)
            html = self.markdown(highlights)
            sr.content_highlight = html
            if "headlines" in r:
                sr.headlines = r["headlines"]
            search_results.append(sr)

        return search_results
Example #10
def test_html_escape():
    terms = frozenset(["bravo"])
    sa = analysis.StandardAnalyzer()
    wf = highlight.WholeFragmenter()
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(u('alfa <bravo "charlie"> delta'), terms, sa,
                                wf, hf)
    assert htext == 'alfa &lt;<strong class="match term0">bravo</strong> "charlie"&gt; delta'
Example #11
def test_query_highlight():
    qp = QueryParser("a", None)
    hf = highlight.HtmlFormatter()

    def do(text, terms):
        q = qp.parse(text)
        tks = [tk for tk in q.all_tokens() if tk.text in terms]
        for tk in tks:
            if tk.startchar is None or tk.endchar is None:
                assert False, tk
        fragment = highlight.Fragment(text, tks)
        return hf.format_fragment(fragment)

    assert do("a b c d",
              ["b"]) == 'a <strong class="match term0">b</strong> c d'
    assert do(
        'a (x:b OR y:"c d") e', ("b", "c")
    ) == 'a (x:<strong class="match term0">b</strong> OR y:"<strong class="match term1">c</strong> d") e'
Example #12
def search(query, page=1, per_page=20):
    with index.searcher() as s:
        qp = qparser.MultifieldParser(['title', 'content'], index.schema)
        q = qp.parse(str(query))
        try:
            result_page = s.search_page(q, page, pagelen=per_page)
        except ValueError:
            if page == 1:
                return SearchResultPage(None, page)
            return None
        results = result_page.results
        results.highlighter.fragmenter.maxchars = 512
        results.highlighter.fragmenter.surround = 40
        results.highlighter.formatter = highlight.HtmlFormatter(
            'em',
            classname='search-match',
            termclass='search-term',
            between=u'<span class=ellipsis> … </span>')
        return SearchResultPage(result_page, page)
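For reference, the formatter options used above (tagname, classname, termclass, between) can be exercised outside a full search with whoosh.highlight.highlight, as the test examples earlier do. A small sketch with made-up text and terms:

from whoosh import analysis, highlight

text = u"alfa bravo charlie delta echo foxtrot golf"
terms = frozenset(["bravo", "foxtrot"])
hf = highlight.HtmlFormatter(
    'em',
    classname='search-match',
    termclass='search-term',
    between=u'<span class=ellipsis> … </span>')
# Short fragments with a little context around each match
cf = highlight.ContextFragmenter(maxchars=20, surround=10)
print(highlight.highlight(text, terms, analysis.StandardAnalyzer(), cf, hf))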
Example #13
    def get_search_result(self, kws_query, page=1, page_len=10):
        page -= 1
        res = []

        score_docs = kws_query[1]

        score_docs.fragmenter.maxchars = cfg.max_result_return
        score_docs.fragmenter.surround = cfg.preview_surround_length

        score_docs.formatter = highlight.HtmlFormatter()

        try:
            for i in range(page * page_len, (page + 1) * page_len):
                score_doc = score_docs[i]
                info = self.content_reader.read(score_doc["store_path"])

                if info is None:
                    continue

                title_start = info['content'].find('<title>')
                title_end = info['content'].find('</title>')

                if title_start != -1 and title_end != -1:
                    title = info['content'][title_start + 7:title_end]
                else:
                    title = kws_query[0]

                if len(title) > cfg.max_title_length:
                    title = title[:cfg.max_title_length] + '.....'

                text = helper.remove_html_js(info['content'])
                class_label = self.bayesData.contextTest(text)

                preview = score_doc.highlights("content", text=text)
                res.append({
                    'url': score_doc["url"],
                    'title': title,
                    'preview': preview,
                    'classLable': class_label,
                    'snapshot': ''
                })
        except Exception as e:
            print("Get search result failed", e)

        return res
Example #14
def base_query():
    assert request.path == '/index'
    #print(dict(request.form)["query"][0])
    #print(dict(request.form))
    query_sentence = str(dict(request.form)["query"][0])
    logging.info("Query sentence: %s" % query_sentence)
    res = []
    with ix.searcher() as searcher:
        # Parse the query text: if a field-specific search is requested, use the
        # matching single-field parser; otherwise default to multifield search.
        # highlight_xy marks whether the school ("xueyuan") field should be
        # highlighted; it is on by default.
        highlight_xy = True
        # Default: multifield query
        parser = qparser.MultifieldParser(
            ["content", "title", "mtext", "xueyuan"], ix.schema)
        if query_sentence.endswith("$姓名$"):
            # Search by name; drop the "$姓名$" (name) suffix
            parser = qparser.SimpleParser("title", ix.schema)
            query_sentence = query_sentence[:-len("$姓名$")]
        elif query_sentence.endswith("$学院$"):
            # Search by school; drop the "$学院$" (school) suffix
            parser = qparser.SimpleParser("xueyuan", ix.schema)
            query_sentence = query_sentence[:-len("$学院$")]
        elif query_sentence.endswith("$网页$"):
            # Search by page content; drop the "$网页$" (web page) suffix
            parser = qparser.SimpleParser("content", ix.schema)
            query_sentence = query_sentence[:-len("$网页$")]

        # Register query-parser plugins; add_plugin expects plugin instances,
        # so the classes are instantiated here.
        parser.add_plugin(qparser.WildcardPlugin())
        # parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.add_plugin(qparser.PrefixPlugin())
        parser.add_plugin(qparser.OperatorsPlugin())
        parser.add_plugin(qparser.RegexPlugin())
        parser.add_plugin(qparser.PhrasePlugin())

        # Parse into a query object
        q = parser.parse(query_sentence)
        logging.info("Query parse result: %s" % str(q))
        print(q)
        # Run the search
        result = searcher.search(q, limit=20)
        # print(result)

        # Configure fragment properties: allow larger fragments with more context
        my_cf = highlight.ContextFragmenter(maxchars=200, surround=30)
        hf = highlight.HtmlFormatter(tagname='em',
                                     classname='match',
                                     termclass='term')

        hi = highlight.Highlighter(fragmenter=my_cf, formatter=hf)
        for hit in result:
            print(hit["picpath"])
            print(hit["title"])
            print(escape(hi.highlight_hit(hit, "content")))
            if hit['picpath'] == '#':
                if highlight_xy:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        Markup(hi.highlight_hit(hit, "xueyuan")),
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        '#',
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
                else:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        hit["xueyuan"],
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        '#',
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
            else:
                if highlight_xy:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        Markup(hi.highlight_hit(hit, "xueyuan")),
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                          hit['picpath'].split('/')[-1]),
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
                else:
                    res.append({
                        "title":
                        hit['title'],
                        "xueyuan":
                        hit["xueyuan"],
                        "url":
                        hit["url"],
                        'shotpath':
                        hit['shotpath'],
                        "content":
                        Markup(hi.highlight_hit(hit, "content")),
                        "parenturl":
                        hit["parenturl"],
                        "picpath":
                        "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                          hit['picpath'].split('/')[-1]),
                        "pagerank":
                        scores[url_dict[hit["url"]]]
                    })
        print(len(result))
        print(res)
    count = len(result)

    if count == 0:
        logging.warning("%d, no matching results found!" % 404)
        # User-facing message: "No matching results found!"
        return "没有查询到相关内容!", 404
    else:
        # Log the query response
        log = "Response: "
        for item in res:
            log = log + " (name:%s,url:%s) " % (item["title"], item["url"])
        logging.info(log)

        # # Sort results by PageRank
        # res.sort(key=lambda k: (k.get("pagerank", 0)), reverse=True)
        # print(res)

        mysession["data"] = res  # 使用会话session传递参数
        return jsonify({"url": "/display/%d&%s" % (count, query_sentence)})
Example #15
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_similarity_score
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh import index, searching
from whoosh import qparser
from whoosh import highlight

path1 = "documents"
path = "docs"
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = index.create_in(path, schema)
writer = ix.writer()
dir = os.listdir(path1)
st = set(stopwords.words('english'))
hf = highlight.HtmlFormatter()


def queryformation(qstring, content):
    query_content = []
    i = ""
    wd = word_tokenize(qstring)
    for w in wd:
        if w not in st:
            i = i + " " + w
    query_content.append(i)
    X = CountVectorizer().fit_transform(query_content)
    t = TfidfTransformer(smooth_idf=False).fit_transform(X)
    te = []
    for i in t:
        te.append(list(i.A[0]))
Example #16
class WhooshTRECNewsEngine(SearchEngine):
    """Whoosh Query log search engine."""
    def __init__(self, service, whoosh_trec_news_index_dir=""):
        super(WhooshTRECNewsEngine, self).__init__(service)
        try:
            self.docIndex = open_dir(whoosh_trec_news_index_dir)
            print("Whoosh Document index open")
            print(self.docIndex.doc_count())
        except Exception:
            print("Could not open Whoosh Document index at: " + whoosh_trec_news_index_dir)

    def search(self, query, pos=0):
        """
        Search service for query log data held in a Whoosh TREC News Document index
        with a Schema()

        Parameters:

        * query (puppy.model.Query)

        Returns:

        * results puppy.model.Response

        Raises:

        * ?
        """
        def parse_whoosh_trec(site, query, results):
            response = Response()
            response.version = 'trec'
            response.feed.setdefault('title', "{0}: {1}".format(site, query))
            response.feed.setdefault('link', '')
            response.feed.setdefault(
                'description',
                "Search results for '{0}' at {1}".format(query, site))
            response.namespaces.setdefault(
                "opensearch", "http://a9.com/-/spec/opensearch/1.1/")
            response.feed.setdefault("opensearch_totalresults",
                                     results.pagecount)
            response.feed.setdefault("opensearch_itemsperpage", pagelen)
            response.feed.setdefault("opensearch_startindex", results.pagenum)
            response.feed.setdefault('query', query)
            try:
                r = 0
                # Treat any non-empty result set as having hits
                if len(results) > 0:
                    for hit in results:
                        r = r + 1
                        title = hit["title"]
                        title = title.strip()
                        if len(title) < 1:
                            title = query
                        rank = (
                            (int(results.pagenum) - 1) * results.pagelen) + r
                        link = "/treconomics/" + str(
                            hit.docnum) + "?rank=" + str(rank)
                        desc = hit.highlights("content")
                        docid = hit["docid"]
                        docid = docid.strip()
                        source = hit["source"]
                        response.entries.append({
                            'title': title,
                            'link': link,
                            'summary': desc,
                            'docid': docid,
                            'source': source
                        })
                else:
                    print("No hits found for query: " + query)
            except Exception as e:
                print("Converting results to OpenSearch failed:", e)
            return response
            # end parse_whoosh_trec

        try:
            parser = QueryParser("content", self.docIndex.schema)
            #mparser = MultifieldParser(["title", "content"], schema=self.docIndex.schema)
            print("In WhooshTRECNewsEngine: " + query.search_terms)
            query_terms = parser.parse(query.search_terms)

            page = query.start_page
            pagelen = query.page_len
            #print query_terms
            #print "page len" + str(pagelen)
            results = []
            response = []
            with self.docIndex.searcher() as searcher:
                results = searcher.search_page(query_terms,
                                               page,
                                               pagelen=pagelen)
                #             results = searcher.search( query_terms )

                results.fragmenter = highlight.ContextFragmenter(maxchars=300,
                                                                 surround=300)
                results.formatter = highlight.HtmlFormatter()
                results.fragmenter.charlimit = 100000

                print "WhooshTRECNewsEngine found: " + str(
                    len(results)) + " results"
                print "Page %d of %d - PageLength of %d" % (
                    results.pagenum, results.pagecount, results.pagelen)
                response = parse_whoosh_trec('WhooshTRECNewsEngine',
                                             query.search_terms, results)
            return response
        except Exception:
            print("Error in Search Service: Whoosh TREC News search failed")