def text_search(command):
    """Search the text index for `command` and return highlighted results.

    Input:  `command`: raw query string, split into per-field clauses by
            `parseCommand` (default field "contents").
    Output: list of [title, url, key_text] triples where `key_text` holds
            up to 3 highlighted fragments around the matched keywords,
            with all whitespace stripped.
    """
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []

    # Build the highlight query explicitly on the "contents" field.
    # (Previously this reused the loop variable `k`, so the field depended
    # on dict iteration order and was undefined for an empty dict.)
    query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(
        SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find texts which are around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join((myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3)))
        # Raw string: '\s' is an invalid escape outside a raw literal.
        key_text = re.sub(r'\s', '', key_text)
        temp = [doc.get("title"), doc.get('url'), key_text]
        res.append(temp)
    return res
Beispiel #2
0
def highlighting(analyzer, contents, query):
    """Return `contents` with query matches wrapped in bold black font tags.

    Joins up to 3 fragments of 30 characters each with '...'.
    """
    scorer = QueryScorer(query)
    html_formatter = SimpleHTMLFormatter("<b><font color='black'>", "</font></b>")
    marker = Highlighter(html_formatter, scorer)
    marker.setTextFragmenter(SimpleFragmenter(30))
    stream = analyzer.tokenStream('contents', contents)
    return marker.getBestFragments(stream, contents, 3, '...')
Beispiel #3
0
    def search(self, q, page=1, duplicates=False):
        """Run query string `q` and return (totalPages, docs) for `page`.

        Each doc dict carries 'title', 'url', 'duplicate' and a
        40-char-fragment 'highlight'. The searcher is discarded after
        each call (original behavior).
        """
        parsed = self.parser.parse(q)
        if not duplicates:
            parsed = self.addDuplicatesQuery(parsed)

        perPage = 10
        offset = (page - 1) * perPage

        collector = TopScoreDocCollector.create(1000, True)
        self.searcher.search(parsed, collector)

        fragment_maker = Highlighter(QueryScorer(parsed))
        fragment_maker.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for hit in collector.topDocs(offset, perPage).scoreDocs:
            record = self.searcher.doc(hit.doc)
            stream = self.analyzer.tokenStream(
                "contents", StringReader(record['contents']))
            snippet = fragment_maker.getBestFragments(
                stream, record['contents'], 3, "...")
            docs.append({
                'title': record['title'],
                'url': record['url'],
                'duplicate': record['duplicate'],
                'highlight': snippet,
            })

        del self.searcher

        totalPages = int(math.ceil(collector.getTotalHits() / float(perPage)))
        return totalPages, docs
Beispiel #4
0
    def search(self, q, page = 1, duplicates = False):
        """Search for `q`, returning (totalPages, docs) for the given page.

        Docs are dicts with 'title', 'url', 'duplicate' and 'highlight'
        (up to three 40-char fragments joined by '...').
        """
        query = self.parser.parse(q)
        if not duplicates:
            query = self.addDuplicatesQuery(query)

        per_page = 10
        first = (page - 1) * per_page

        collector = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, collector)

        hl = Highlighter(QueryScorer(query))
        hl.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for sd in collector.topDocs(first, per_page).scoreDocs:
            d = self.searcher.doc(sd.doc)
            ts = self.analyzer.tokenStream("contents",
                                           StringReader(d['contents']))
            docs.append({
                'title': d['title'],
                'url': d['url'],
                'duplicate': d['duplicate'],
                'highlight': hl.getBestFragments(ts, d['contents'], 3, "..."),
            })

        # Searcher is thrown away after every query (original behavior).
        del self.searcher

        total_pages = int(math.ceil(collector.getTotalHits() / float(per_page)))
        return total_pages, docs
Beispiel #5
0
    def output(self, score_docs, command):
        '''
        Highlight and return the search results.

        Input: `score_docs`: search results from the index
               `command`: raw query string used to score the highlights
        Output: list of documents info found in the index,
                details includes `title`, `url` and `abstract`
        '''
        query = QueryParser('contents', self.analyzer).parse(command)
        highlighter = Highlighter(self.formatter, QueryScorer(query))
        highlighter.setTextFragmenter(
            SimpleFragmenter(200))  # Limit the max number of characters

        results = []
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            contents = doc.get('contents')
            stream = self.analyzer.tokenStream("contents", contents)
            abstract = highlighter.getBestFragment(
                stream, contents)  # Get the abstract and highlight
            # getBestFragment returns None when no term matches this doc;
            # fall back to an empty abstract instead of crashing on
            # None.replace(...).
            if abstract is None:
                abstract = ''
            result = {
                'title': doc.get('title'),
                'url': doc.get('url'),
                'abstract': abstract.replace(' ', '')
            }
            results.append(result)
        return results
    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result
Beispiel #7
0
    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result
Beispiel #8
0
    def doStandardHighlights(self):

        formatter = TestFormatter(self)

        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
    def doStandardHighlights(self):
        
        formatter = TestFormatter(self)
        
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream,
                                                  text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
 def search_by(self, **kwargs):
     """Search the index by a single field and return matching articles.

     Keyword args:
         command: query string; empty string returns None.
         field: index field to search (e.g. 'token_content').
         query_type: 'chi' for Chinese (jieba segmentation +
             ChineseAnalyzer) or anything else for English
             (stemming + StandardAnalyzer).
         num: max number of hits (default 50).
         attrs: document attributes to copy into each result
             (default ['url', 'title']).

     Returns: list of dicts with the requested attrs, plus an
     'abstract' of highlighted fragments for content/title searches.
     """
     command = kwargs.get('command', '')
     if command == '':
         return None
     field = kwargs.get('field')
     query_type = kwargs.get('query_type', 'chi')
     if query_type == 'chi':
         # Chinese text must be segmented before Lucene can tokenize it.
         if field in ['token_taglist', 'token_content', 'token_title', 'token_author']:
             command = ' '.join(jieba.cut_for_search(command))
         hlt_analyzer = self.analyzer['ChineseAnalyzer']
     else:
         # English terms are stemmed so the query matches indexed stems.
         if field in ['token_content', 'token_title']:
             command = ' '.join(map(stem, command.split()))
         hlt_analyzer = self.analyzer['StandardAnalyzer']
     analyzer = self.analyzer['SimpleAnalyzer']
     num = kwargs.get('num', 50)
     attrs = kwargs.get('attrs', ['url', 'title'])
     print "[%s]\tSearching for '%s' in field '%s'" % (query_type, command, field)
     query = QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command)
     if field in ['token_content', 'token_title']:
         # Only content/title searches get a highlighted abstract; the
         # highlight query runs against the raw 'content' field.
         getAbs = True
         query_for_highlight = QueryParser(Version.LUCENE_CURRENT, 'content', hlt_analyzer).parse(command)
         scorer = QueryScorer(query_for_highlight)
         formatter = SimpleHTMLFormatter("<strong>", "</strong>")
         # formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
         highlighter = Highlighter(formatter, scorer)
         fragmenter = SimpleFragmenter(20)
         highlighter.setTextFragmenter(fragmenter)
     else:
         getAbs = False
     scoreDocs = self.searcher.search(query, num).scoreDocs
     print "%s total matching documents." % len(scoreDocs)
     articles = []
     for scoreDoc in scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         article = {}
         for attr in attrs:
             article[attr] = doc.get(attr)
         if getAbs is True:
             # Up to 3 fragments of the doc content, joined by "...".
             content = doc.get('content')
             tokenStream = hlt_analyzer.tokenStream("content", StringReader(content))
             article['abstract'] = highlighter.getBestFragments(tokenStream, content, 3, "...")
         articles.append(article)
     return articles
Beispiel #11
0
    def get_highlighted_hits(self):
        """Collect (doc_id, best_fragments) pairs for every current hit.

        Fragments come from the 'contents' field, scored against
        self.query; each fragment is also printed as it is found.
        """
        scorer = QueryScorer(self.query)
        highlighter = Highlighter(self.formatter, scorer)
        highlighter.setTextFragmenter(SimpleSpanFragmenter(scorer, 10))

        collected = []
        for hit in self.hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            stream = TokenSources.getAnyTokenStream(self.index_reader, hit.doc,
                                                    'contents', self.analyzer)
            fragments = highlighter.getBestFragments(
                stream, document.get('contents'), 10)
            for piece in fragments:
                print('fragment: ', piece)
            collected.append((hit.doc, fragments))

        return collected
Beispiel #12
0
def run(searcher, analyzer, command):
    command_dict = parseCommand(command)
    seg_list = jieba.cut(command_dict['contents'])
    command_dict['contents'] = (" ".join(seg_list))
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    scorer = QueryScorer(query)
    fragmenter = SimpleSpanFragmenter(scorer, 250)
    simpleHTMLFormatter = SimpleHTMLFormatter("<b>", "</b>")
    highlighter = Highlighter(simpleHTMLFormatter, scorer)
    highlighter.setTextFragmenter(fragmenter)

    results = []

    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        contents = doc.get("contents")
        if contents:
            tkStream = analyzer.tokenStream("contents", contents)
            highlight = highlighter.getBestFragment(tkStream, contents)
            highlightseg = highlight.split()
            highlight = ''.join(highlightseg)
            results.append(
                (doc.get("title").strip(), doc.get("url"), highlight))
        '''
        print 'path:', doc.get("path"), \
            '\nname:', doc.get("name"), \
            '\ntitle:', doc.get("title"), \
            "url:",doc.get("url"), \
            "\nsite:",doc.get("site"),\
            "\ncontent:",highlight,"\n"
        '''
        # print 'explain:', searcher.explain(query, scoreDoc.doc)
    return results
Beispiel #13
0
def get_lm_doc_snippets(query,
                        searcher,
                        qparser,
                        analyzer,
                        preprocessor,
                        topk=10):
    """
    Fetches the topk document snippets given query, searcher and qparser and
    returns (did, text) pair list
    :param query: raw query string
    :param searcher: Lucene searcher over the index
    :param qparser: parser used to turn `query` into a Lucene query
    :param analyzer: analyzer used to tokenize the 'raw' field
    :param preprocessor: passed through to preprocess_text
    :param topk: number of documents to fetch
    :return: list of (doc_id, snippet_text) pairs
    """
    parsed = qparser.parse(query)
    hits = searcher.search(parsed, topk).scoreDocs

    snipper = Highlighter(QueryScorer(parsed))
    snipper.setTextFragmenter(SimpleFragmenter(100))

    dids_text = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        raw = doc.get("raw")
        stream = analyzer.tokenStream("raw", StringReader(raw))
        # up to 4 fragments of 100 chars, separated by "... "
        fragments = snipper.getBestFragments(stream, raw, 4, "... ")
        parsed_text = get_parsed_text(fragments)
        cleaned = " ".join(preprocess_text(preprocessor, [parsed_text]))
        dids_text.append((doc.get("id"), cleaned))

    return dids_text
Beispiel #14
0
def lucene_search(query, MAX, showHighlight):
    """Search the index in the current directory and print titles.

    Input: `query`: raw query string for the "text" field.
           `MAX`: maximum number of hits to fetch.
           `showHighlight`: when true, also print up to 3 highlighted
           50-char fragments per hit, wrapped in simple HTML.
    Output: none (prints results to stdout).
    """
    dir = os.getcwd()
    # NOTE(review): initVM() is called on every invocation; PyLucene only
    # allows one VM per process, so repeated calls may fail -- confirm
    # callers invoke this at most once.
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(dir))
    index_reader = DirectoryReader.open(index_dir)
    lucene_searcher = IndexSearcher(index_reader)
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)
    my_query = QueryParser(Version.LUCENE_48, "text",
                           lucene_analyzer).parse(query)
    #We can define the MAX number of results (default 10)
    total_hits = lucene_searcher.search(my_query, MAX)

    query_scorer = QueryScorer(my_query)
    formatter = SimpleHTMLFormatter()
    highlighter = Highlighter(formatter, query_scorer)
    # Set the fragment size. We break text in to fragment of 50 characters
    fragmenter = SimpleSpanFragmenter(query_scorer, 50)
    highlighter.setTextFragmenter(fragmenter)

    print "Only shows at most %s documents" % MAX
    if showHighlight:
        print "<br>"

    for hit in total_hits.scoreDocs:

        doc = lucene_searcher.doc(hit.doc)
        text = doc.get("text")
        ts = lucene_analyzer.tokenStream("text", StringReader(text))
        
        if showHighlight:
            print "<p>"

        print doc.get("title")

        if showHighlight:
            print "<br>"
            print highlighter.getBestFragments(ts, text, 3, "...")
            print "</p>"
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(\
        SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return

            print "Searching for:", command
            query = QueryParser(Version.LUCENE_43, "contents",
                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print "%s total matching documents." % len(scoreDocs)

            # Highlight the matching text in red
            highlighter = Highlighter(SimpleHTMLFormatter('<b><font color\
            ="red">', '</font></b>'), QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    file_handler = open(self.hits_dir + '/' + doc.get("name"),
                        'w+')
                    file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    file_handler = open(self.frags_dir + '/' + doc.get("name"),
                        'w+')
                    file_handler.write(result)
def run(searcher, analyzer, command, urlclick):
    """Recipe search: query all tag fields, highlight, and rank results.

    Input: `searcher`, `analyzer`: Lucene searcher / analyzer.
           `command`: raw query string (segmented with two jieba passes).
           `urlclick`: mapping from url to click count, used to nudge
           ordering of near-equal-score results.
    Output: (results, swxc_res) where each result is a tuple of
            (name, img, content, ingredient, taste, tech, others, url,
            score) with matches wrapped in red <span> tags.
    """
    if command == '':
        return []
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))
    if len(res) > 0:
        # Exact/first-pass search succeeded; reuse those hits.
        scoreDocs = res
    else:
        # Fall back to an OR query over every tag field, with taste/tech
        # down-weighted.
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            if k == 'taste' or k == 'tech':
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs

    swxc_res = findres(command, scoreDocs, searcher)
    # One highlighter per field, each scoring only that field's query.
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    scorer = QueryScorer(
        QueryParser(Version.LUCENE_CURRENT, 'name', analyzer).parse(command))
    highlighter1 = Highlighter(formatter_name, scorer)
    highlighter2 = Highlighter(
        formatter_name,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(command)))
    highlighter3 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                        analyzer).parse(command)))
    highlighter4 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'taste',
                        analyzer).parse(command)))
    highlighter5 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'tech',
                        analyzer).parse(command)))
    highlighter6 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'others',
                        analyzer).parse(command)))

    fragmenter = SimpleFragmenter(1000)
    highlighter1.setTextFragmenter(fragmenter)
    highlighter2.setTextFragmenter(fragmenter)
    highlighter3.setTextFragmenter(fragmenter)
    highlighter4.setTextFragmenter(fragmenter)
    highlighter5.setTextFragmenter(fragmenter)
    highlighter6.setTextFragmenter(fragmenter)

    results = []
    for scoreDoc in scoreDocs:
        # Drop low-relevance hits: relative cutoff for large result sets,
        # absolute score floor otherwise.
        if (len(scoreDocs) > 200 and
                len(scoreDocs) * scoreDoc.score < 2) or scoreDoc.score < 0.002:
            continue
        doc = searcher.doc(scoreDoc.doc)

        highlighterContent = highlighter1.getBestFragment(
            analyzer, 'name', doc.get('name'))
        highlighterContent2 = highlighter2.getBestFragment(
            analyzer, 'content', doc.get('content'))
        highlighterContent3 = highlighter3.getBestFragment(
            analyzer, 'ingredient', doc.get('ingredient'))
        highlighterContent4 = highlighter4.getBestFragment(
            analyzer, 'taste', doc.get('taste'))
        highlighterContent5 = highlighter5.getBestFragment(
            analyzer, 'tech', doc.get('tech'))
        highlighterContent6 = highlighter6.getBestFragment(
            analyzer, 'others', doc.get('others'))

        # getBestFragment returns None when a field has no match; fall
        # back to the stored field value with the same cleanup applied.
        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')

        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(' ', '')
            highlighterContent2 = highlighterContent2.replace(',', ' ')
        else:
            highlighterContent2 = doc.get('content').replace(' ', '')
        if highlighterContent3:
            highlighterContent3 = highlighterContent3.replace(',', '')
        else:
            highlighterContent3 = (doc.get('ingredient')).replace(',', '')
        if highlighterContent4:
            pass
        else:
            highlighterContent4 = doc.get('taste')
        if highlighterContent5:
            pass
        else:
            highlighterContent5 = doc.get('tech')
        if highlighterContent6:
            highlighterContent6 = highlighterContent6.replace(',', '')
        else:
            highlighterContent6 = (doc.get('others')).replace(',', '')

        results.append(
            (highlighterContent, doc.get('img'), highlighterContent2,
             highlighterContent3, highlighterContent4, highlighterContent5,
             highlighterContent6, doc.get('url'), scoreDoc.score))

        # NOTE(review): this bubble-style re-rank (promote more-clicked
        # urls among near-equal scores in the top 20) runs inside the
        # scoreDoc loop, so it executes after every append -- confirm it
        # was not meant to run once after the loop.
        for i in range(0, min(20, len(results)) - 1):
            flag = True
            for j in range(0, min(20, len(results)) - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res
Beispiel #17
0
    def run(self, writer=None, analyzer=None):
        """Interactive query loop: search, highlight, and dump results.

        Prompts for queries until an empty line is entered.  For each
        query the top 10 hits are highlighted in red; whole-document
        highlights are written to `self.hits_dir` and 200-char fragments
        to `self.frags_dir`, one file per document name.

        Input: `writer`/`analyzer` default to self.writer/self.analyzer.
        """
        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(\
        SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print()
            print("Hit enter with no input to quit.")
            command = input("Query:")
            if command == '':
                return

            print("Searching for:", command)
            query = QueryParser(Version.LUCENE_43, "contents",
                                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print("%s total matching documents." % len(scoreDocs))

            # Highlight the matching text in red
            highlighter = Highlighter(
                SimpleHTMLFormatter('<b><font color\
            ="red">', '</font></b>'), QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    # `with` closes the handle even on write errors
                    # (the original leaked the file handle).
                    with open(self.hits_dir + '/' + doc.get("name"),
                              'w+') as file_handler:
                        file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.frags_dir + '/' + doc.get("name"),
                              'w+') as file_handler:
                        file_handler.write(result)
def build_highlighter(parsed_query):
    """Build a Highlighter over the 'content' field for `parsed_query`,
    producing FRAGMENT_SIZE-sized span fragments with default HTML tags."""
    scorer = QueryScorer(parsed_query, 'content')
    result = Highlighter(SimpleHTMLFormatter(), scorer)
    result.setTextFragmenter(SimpleSpanFragmenter(scorer, FRAGMENT_SIZE))
    return result