Exemple #1
0
def highlighting(analyzer, contents, query):
    """Return highlighted fragments of ``contents`` for ``query``.

    Matching terms are wrapped in ``<b><font color='black'>`` markup.  The
    text is tokenized with ``analyzer`` as field ``'contents'``, split with a
    ``SimpleFragmenter(30)``, and at most three fragments are requested,
    joined by ``'...'``.
    """
    markup = SimpleHTMLFormatter("<b><font color='black'>", "</font></b>")
    fragment_highlighter = Highlighter(markup, QueryScorer(query))
    fragment_highlighter.setTextFragmenter(SimpleFragmenter(30))

    stream = analyzer.tokenStream('contents', contents)
    return fragment_highlighter.getBestFragments(stream, contents, 3, '...')
Exemple #2
0
    def output(self, score_docs, command):
        '''
        Highlight and return the search results.

        Input: `score_docs`: search results from the index
               `command`: query string; it is parsed against the `contents`
                          field to decide which terms are highlighted
        Output: list of documents info found in the index,
                details includes `title`, `url` and `abstract`
                (`abstract` is '' when the document has no stored
                `contents` or no fragment matched the query)
        '''
        query = QueryParser('contents', self.analyzer).parse(command)
        highlighter = Highlighter(self.formatter, QueryScorer(query))
        highlighter.setTextFragmenter(
            SimpleFragmenter(200))  # Limit the max number of characters

        results = []
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            contents = doc.get('contents')

            abstract = None
            if contents is not None:
                stream = self.analyzer.tokenStream("contents", contents)
                # Get the abstract and highlight; this is None when no query
                # term occurs in the text.
                abstract = highlighter.getBestFragment(stream, contents)

            results.append({
                'title': doc.get('title'),
                'url': doc.get('url'),
                # Guard against a missing fragment before stripping spaces.
                'abstract': abstract.replace(' ', '') if abstract else '',
            })
        return results
Exemple #3
0
    def search(self, q, page=1, duplicates=False):
        """Run query string ``q`` and return one page of highlighted hits.

        ``page`` is 1-based with a fixed page size of 10.  Unless
        ``duplicates`` is true, the parsed query is first passed through
        ``self.addDuplicatesQuery``.

        Returns a ``(totalPages, docs)`` tuple; each entry of ``docs`` is a
        dict with keys ``title``, ``url``, ``duplicate`` and ``highlight``.
        """
        parsed = self.parser.parse(q)
        if not duplicates:
            parsed = self.addDuplicatesQuery(parsed)

        page_size = 10
        offset = (page - 1) * page_size

        collector = TopScoreDocCollector.create(1000, True)
        self.searcher.search(parsed, collector)

        snippet_maker = Highlighter(QueryScorer(parsed))
        snippet_maker.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        page_hits = collector.topDocs(offset, page_size).scoreDocs
        for hit in page_hits:
            stored = self.searcher.doc(hit.doc)
            body = stored['contents']
            stream = self.analyzer.tokenStream("contents", StringReader(body))
            entry = {
                'title': stored['title'],
                'url': stored['url'],
                'duplicate': stored['duplicate'],
                'highlight': snippet_maker.getBestFragments(
                    stream, body, 3, "..."),
            }
            docs.append(entry)

        # NOTE(review): this removes the instance attribute after every
        # search; presumably it is re-created elsewhere -- confirm.
        del self.searcher

        page_count = math.ceil(collector.getTotalHits() / float(page_size))
        totalPages = int(page_count)
        return totalPages, docs
def text_search(command):
    """Search the text index for ``command`` and return highlighted hits.

    ``command`` is split by ``parseCommand`` into a ``field -> query text``
    mapping (with ``"contents"`` passed as its second argument).  Every
    entry becomes a MUST clause of one boolean query, and the top 30 hits
    are fetched.

    Returns a list of ``[title, url, key_text]`` lists.  ``key_text`` is the
    concatenation of up to three highlighted fragments of the stored
    ``contents`` field with all whitespace removed.  It is ``""`` when the
    command has no ``"contents"`` part or the document has no stored
    ``contents``.
    """
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs

    # Highlighting is driven by the "contents" part of the command only, so
    # the query is parsed explicitly against the "contents" field.
    myhighlighter = None
    contents_command = command_dict.get("contents")
    if contents_command:
        query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                      envir.analyzer).parse(contents_command)
        myhighlighter = Highlighter(
            SimpleHTMLFormatter(), QueryScorer(query_highlight))
        myhighlighter.setTextFragmenter(SimpleFragmenter(50))

    res = []
    for scoreDoc in scoreDocs:
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")

        key_text = ""
        if myhighlighter is not None and text is not None:
            # find texts which are around the keyword
            fragments = myhighlighter.getBestFragments(
                envir.analyzer, "contents", text, 3)
            key_text = re.sub(r'\s', '', "".join(fragments))

        res.append([doc.get("title"), doc.get('url'), key_text])
    return res
Exemple #5
0
    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result
Exemple #6
0
    def doStandardHighlights(self):

        formatter = TestFormatter(self)

        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
 def search_by(self, **kwargs):
     command = kwargs.get('command', '')
     if command == '':
         return None
     field = kwargs.get('field')
     query_type = kwargs.get('query_type', 'chi')
     if query_type == 'chi':
         if field in ['token_taglist', 'token_content', 'token_title', 'token_author']:
             command = ' '.join(jieba.cut_for_search(command))
         hlt_analyzer = self.analyzer['ChineseAnalyzer']
     else:
         if field in ['token_content', 'token_title']:
             command = ' '.join(map(stem, command.split()))
         hlt_analyzer = self.analyzer['StandardAnalyzer']
     analyzer = self.analyzer['SimpleAnalyzer']
     num = kwargs.get('num', 50)
     attrs = kwargs.get('attrs', ['url', 'title'])
     print "[%s]\tSearching for '%s' in field '%s'" % (query_type, command, field)
     query = QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command)
     if field in ['token_content', 'token_title']:
         getAbs = True
         query_for_highlight = QueryParser(Version.LUCENE_CURRENT, 'content', hlt_analyzer).parse(command)
         scorer = QueryScorer(query_for_highlight)
         formatter = SimpleHTMLFormatter("<strong>", "</strong>")
         # formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
         highlighter = Highlighter(formatter, scorer)
         fragmenter = SimpleFragmenter(20)
         highlighter.setTextFragmenter(fragmenter)
     else:
         getAbs = False
     scoreDocs = self.searcher.search(query, num).scoreDocs
     print "%s total matching documents." % len(scoreDocs)
     articles = []
     for scoreDoc in scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         article = {}
         for attr in attrs:
             article[attr] = doc.get(attr)
         if getAbs is True:
             content = doc.get('content')
             tokenStream = hlt_analyzer.tokenStream("content", StringReader(content))
             article['abstract'] = highlighter.getBestFragments(tokenStream, content, 3, "...")
         articles.append(article)
     return articles
def get_lm_doc_snippets(query,
                        searcher,
                        qparser,
                        analyzer,
                        preprocessor,
                        topk=10):
    """
    Fetch snippets of the top ``topk`` documents matching ``query``.

    For each hit, up to four highlighted fragments of the stored ``raw``
    field are joined by "... ", passed through ``get_parsed_text`` and
    ``preprocess_text``, and the resulting pieces are joined with spaces.

    :param query: query string handed to ``qparser.parse``
    :param searcher: searcher used to run the query and load documents
    :param qparser: parser that turns ``query`` into a query object
    :param analyzer: analyzer used to tokenize the ``raw`` field
    :param preprocessor: forwarded to ``preprocess_text``
    :param topk: maximum number of hits to fetch
    :return: list of ``(did, text)`` pairs, where ``did`` is the stored
             ``id`` field of the document
    """
    parsed_query = qparser.parse(query)
    hits = searcher.search(parsed_query, topk).scoreDocs

    snippet_highlighter = Highlighter(QueryScorer(parsed_query))
    snippet_highlighter.setTextFragmenter(SimpleFragmenter(100))

    def snippet_pair(hit):
        # Build the (did, text) pair for a single hit.
        doc = searcher.doc(hit.doc)
        did = doc.get("id")
        raw_text = doc.get("raw")
        stream = analyzer.tokenStream("raw", StringReader(raw_text))
        fragments = snippet_highlighter.getBestFragments(
            stream, raw_text, 4, "... ")
        pieces = preprocess_text(preprocessor, [get_parsed_text(fragments)])
        return did, " ".join(pieces)

    return [snippet_pair(hit) for hit in hits]
Exemple #9
0
    def run(self, writer=None, analyzer=None):
        """Interactive search loop that writes highlighted hits to disk.

        Repeatedly prompts for a query, searches the ``contents`` field of
        the index under ``self.store_dir`` (top 10 hits), and for every hit
        whose highlighted text is longer than 10 characters writes:

        * the whole highlighted document to ``self.hits_dir/<name>``
        * up to two 200-sized fragments to ``self.frags_dir/<name>``

        where ``<name>`` is the stored ``name`` field.  An empty query ends
        the loop.  The index reader is closed when the method exits.

        ``writer`` and ``analyzer`` default to ``self.writer`` and
        ``self.analyzer``; ``writer`` is not otherwise used by this method.
        """
        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        reader = DirectoryReader.open(
            SimpleFSDirectory.open(File(self.store_dir)))
        searcher = IndexSearcher(reader)
        try:
            while True:
                print()
                print("Hit enter with no input to quit.")
                command = input("Query:")
                if command == '':
                    return

                print("Searching for:", command)
                query = QueryParser(Version.LUCENE_43, "contents",
                                    analyzer).parse(command)

                # We'll just show the top 10 matching documents for now
                scoreDocs = searcher.search(query, 10).scoreDocs
                print("%s total matching documents." % len(scoreDocs))

                # Highlight the matching text in red
                highlighter = Highlighter(
                    SimpleHTMLFormatter('<b><font color="red">',
                                        '</font></b>'),
                    QueryScorer(query))

                # First pass: NullFragmenter keeps the whole document.
                # Second pass: SimpleFragmenter(200) produces hit fragments.
                passes = (
                    (NullFragmenter(), self.hits_dir),
                    (SimpleFragmenter(200), self.frags_dir),
                )
                for fragmenter, out_dir in passes:
                    highlighter.setTextFragmenter(fragmenter)

                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        contents = doc.get("contents")
                        tokenStream = analyzer.tokenStream(
                            "contents", StringReader(contents))

                        # arg 3: the maximum number of fragments
                        # arg 4: the separator used to intersperse the
                        # document fragments (typically "...")
                        # arg 3 and 4 don't really matter with NullFragmenter
                        result = highlighter.getBestFragments(
                            tokenStream, contents, 2, "...")

                        if len(result) > 10:
                            target = out_dir + '/' + doc.get("name")
                            # The context manager closes the file even if
                            # the write fails.
                            with open(target, 'w+') as file_handler:
                                file_handler.write(result)
        finally:
            reader.close()
def run(searcher, analyzer, command, urlclick):
    """Search for ``command`` and return highlighted result rows.

    Returns ``[]`` for an empty command.  Otherwise returns
    ``(results, swxc_res)`` where ``swxc_res`` comes from ``findres`` and
    each entry of ``results`` is the tuple::

        (name, img, content, ingredient, taste, tech, others, url, score)

    Hits come from ``firstsearch`` when it returns anything; otherwise from
    a boolean SHOULD query over every field in ``tag`` (``taste`` and
    ``tech`` boosted by 0.5).  Each text field holds its best highlighted
    fragment, or the stored value when nothing was highlighted.

    ``urlclick`` is indexed by result url; among the first 20 rows, adjacent
    rows whose scores differ by less than 0.1 are swapped so that the one
    with the larger ``urlclick`` value comes first.
    """
    if command == '':
        return []

    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))

    if len(res) == 0:
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            if k in ('taste', 'tech'):
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs
    else:
        scoreDocs = res

    swxc_res = findres(command, scoreDocs, searcher)

    plain_markup = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # The comma stands in for the space inside the tag: name/content
    # fragments have every space removed below and commas are then turned
    # back into spaces.
    comma_markup = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")

    field_markup = (
        ('name', comma_markup),
        ('content', comma_markup),
        ('ingredient', plain_markup),
        ('taste', plain_markup),
        ('tech', plain_markup),
        ('others', plain_markup),
    )
    field_highlighters = []
    for field_name, markup in field_markup:
        field_query = QueryParser(Version.LUCENE_CURRENT, field_name,
                                  analyzer).parse(command)
        field_highlighters.append(
            (field_name, Highlighter(markup, QueryScorer(field_query))))

    fragmenter = SimpleFragmenter(1000)
    for _, field_highlighter in field_highlighters:
        field_highlighter.setTextFragmenter(fragmenter)

    def squeeze_tagged(fragment):
        # Drop all spaces, then restore the one inside the span tag.
        return fragment.replace(' ', '').replace(',', ' ')

    url_at, score_at = 7, 8  # positions inside a result tuple
    hit_count = len(scoreDocs)

    results = []
    for hit in scoreDocs:
        too_weak = hit_count > 200 and hit_count * hit.score < 2
        if too_weak or hit.score < 0.002:
            continue
        doc = searcher.doc(hit.doc)

        (name_frag, content_frag, ingredient_frag,
         taste_frag, tech_frag, others_frag) = [
             field_highlighter.getBestFragment(
                 analyzer, field_name, doc.get(field_name))
             for field_name, field_highlighter in field_highlighters]

        if name_frag:
            name_text = squeeze_tagged(name_frag)
        else:
            name_text = doc.get('name').replace(' ', '')

        if content_frag:
            content_text = squeeze_tagged(content_frag)
        else:
            content_text = doc.get('content').replace(' ', '')

        if ingredient_frag:
            ingredient_text = ingredient_frag.replace(',', '')
        else:
            ingredient_text = doc.get('ingredient').replace(',', '')

        taste_text = taste_frag if taste_frag else doc.get('taste')
        tech_text = tech_frag if tech_frag else doc.get('tech')

        if others_frag:
            others_text = others_frag.replace(',', '')
        else:
            others_text = doc.get('others').replace(',', '')

        results.append(
            (name_text, doc.get('img'), content_text, ingredient_text,
             taste_text, tech_text, others_text, doc.get('url'), hit.score))

        # NOTE(review): this bubble pass over the first 20 rows runs after
        # every append, not once after the loop -- confirm that is intended.
        head = min(20, len(results))
        for settled in range(head - 1):
            swapped = False
            for j in range(head - settled - 1):
                upper, lower = results[j], results[j + 1]
                close_scores = abs(upper[score_at] - lower[score_at]) < 0.1
                if close_scores and (urlclick[upper[url_at]] <
                                     urlclick[lower[url_at]]):
                    swapped = True
                    results[j], results[j + 1] = lower, upper
            if not swapped:
                break

    return results, swxc_res
def superSearch(command, command_dict, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run super search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))
    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    for k, v in (command_dict[0]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    for k, v in (command_dict[1]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command + ' ' + command_dict[0].get("ingredient", ''),
                       scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    if command:
        scorer = QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'name',
                        analyzer).parse(command))
        highlighters = [Highlighter(formatter_name, scorer)]
    else:
        highlighters = ['']
    if command_dict[0].get('ingredient'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                                analyzer).parse(
                                    command_dict[0]['ingredient']))))
    else:
        highlighters.append('')
    if command_dict[0].get('taste'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'taste',
                                analyzer).parse(command_dict[0]['taste']))))
    else:
        highlighters.append('')
    if command_dict[0].get('tech'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'tech',
                                analyzer).parse(command_dict[0]['tech']))))
    else:
        highlighters.append('')
    fragmenter = SimpleFragmenter(1000)
    for h in highlighters:
        if h:
            h.setTextFragmenter(fragmenter)

    results = []
    for scoreDoc in scoreDocs:
        if (scoreDoc.score * len(scoreDocs) < 200
                and len(scoreDocs) > 200) or scoreDoc.score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)
        if command:
            highlighterContent = highlighters[0].getBestFragment(
                analyzer, 'name', doc.get('name'))
        else:
            highlighterContent = ''
        if highlighters[1]:
            highlighterContent2 = highlighters[1].getBestFragment(
                analyzer, 'ingredient', doc.get('ingredient'))
        else:
            highlighterContent2 = ''
        if highlighters[2]:
            highlighterContent3 = highlighters[2].getBestFragment(
                analyzer, 'taste', doc.get('taste'))
        else:
            highlighterContent3 = ''
        if highlighters[3]:
            highlighterContent4 = highlighters[3].getBestFragment(
                analyzer, 'tech', doc.get('tech'))
        else:
            highlighterContent4 = ''

        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(',', '')
        else:
            highlighterContent2 = (doc.get('ingredient')).replace(',', '')
        if highlighterContent3:
            pass
        else:
            highlighterContent3 = doc.get('taste')
        if highlighterContent4:
            pass
        else:
            highlighterContent4 = doc.get('tech')
        results.append(
            (highlighterContent, doc.get('img'),
             doc.get('content').replace(' ', ''),
             highlighterContent2, highlighterContent3, highlighterContent4,
             doc.get('others').replace(',',
                                       ''), doc.get('url'), scoreDoc.score))

        for i in range(0, min(20, len(results)) - 1):
            flag = True
            for j in range(0, min(20, len(results)) - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res