def testHighlighting(self):
    """Verify that Highlighter wraps the matched query term in <B> tags."""
    sample = "The quick brown fox jumps over the lazy dog"

    # A single-term query on "field"; the scorer ranks fragments by it.
    termQuery = TermQuery(Term("field", "fox"))
    highlighter = Highlighter(QueryScorer(termQuery))

    stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", StringReader(sample))

    self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                     highlighter.getBestFragment(stream, sample))
Example #2
0
    def testHighlighting(self):
        """Check that the best fragment highlights 'fox' with <B> markup."""
        original = "The quick brown fox jumps over the lazy dog"

        # Build the highlighter from a scorer over a one-term query.
        scorer = QueryScorer(TermQuery(Term("field", "fox")))
        highlighter = Highlighter(scorer)

        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        stream = analyzer.tokenStream("field", StringReader(original))

        expected = "The quick brown <B>fox</B> jumps over the lazy dog"
        self.assertEqual(expected,
                         highlighter.getBestFragment(stream, original))
    def testHits(self):

        searcher = self.getSearcher()
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream("title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)
    
            print fragment
Example #4
0
    def testHits(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
                "title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)

            print fragment
def run(command,pageindex=1,pagesize=15):
    global searcher, analyzer,old_command,old_res_list
    global STORE_DIR,directory,searcher,analyzer
    if command == '':
        return

    print "Searching for:", command  
    
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k,v in command_dict.iteritems():            
        if(k=='site'):
            t = Term('url','*'+v.strip()+'*')
            query = WildcardQuery(t)

        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    
    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    
    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>")
    queryToHigh = QueryParser(Version.LUCENE_CURRENT,"lrc",analyzer).parse(command_dict['content'])
    hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex-1)*pagesize
    end = start+pagesize
    print start,end
    for scoreDoc in scoreDocs[start:end+10]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('url'))
        res.append(doc.get('music_name'))
        res.append(doc.get('artist'))
        res.append(doc.get('album_name'))
        res.append(doc.get('lrc'))
        output = hlter.getBestFragment(analyzer,"lrc",clear(doc.get('lrc')))
        res.append(output)
        res.append(doc.get('musicID'))
        if(res[5]!=None):
            res_list.append(res) 
        if(len(res_list)==8):
            break
    return res_list,len(scoreDocs)
Example #6
0
def run(command, pageindex=1, pagesize=15):
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory, searcher, analyzer
    if command == '':
        return

    print "Searching for:", command  #朱莉与茱莉娅

    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))

    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        if (k == 'site'):
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>",
                                              "</font>")

    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])

    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end + 1]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
Example #7
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command  #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if (k == 'site'):
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs

        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>",
                                                  "</font>")

        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k,v in command_dict.iteritems():            
            if(k=='site'):
                t = Term('url','*'+v.strip()+'*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs
        
        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>")

        queryToHigh = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:',doc.get('title'),
            print 'url:',doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer,"contents",ori_text)
            print output
Example #9
0
def search(request):

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    ret = {}
    maxLength = 38

    search_content = request.GET.get('content')
    if len(search_content) > maxLength:
        pass

    query = QueryParser(Version.LUCENE_CURRENT, "contentKeyword",
                        analyzer).parse(search_content)
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    ret['NumOfDocs'] = str(len(scoreDocs)) + "total matching documents."

    print ret['NumOfDocs']

    conn = pymysql.connect(host='localhost',
                           user=user,
                           password=password,
                           db=db_name,
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)

    rst = ''
    ret['search_list'] = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        _id = str(doc.get("id"))
        print _id
        sql = 'select * from webpage where id=%s'

        with conn.cursor() as cursor:
            cursor.execute(sql, (_id))
            rst = cursor.fetchone()

        titleStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(rst['title']))
        titleFragment = highlighter.getBestFragment(titleStream, rst['title'])
        if titleFragment is None:
            titleFragment = rst['title']

        contentStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "content", StringReader(rst['content']))
        contentFragment = highlighter.getBestFragments(contentStream,
                                                       rst['content'], 5,
                                                       '...')

        ret['search_list'].append({
            'title': titleFragment,
            'url': rst['url'],
            'content': contentFragment
        })
    #searcher.close()
    conn.close()

    return render(request, 'tjut/result.html', {
        'search_list': ret['search_list'],
        'search_content': search_content
    })