def text_search(command):
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []

    query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(
        SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find texts which are around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join((myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3)))
        key_text = re.sub('\s', '', key_text)
        temp = [doc.get("title"), doc.get('url'), key_text]
        res.append(temp)
    return res
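parseCommand is not shown in this snippet (Example #16 calls a one-argument variant of it); a minimal sketch of what it might do, assuming a "field:value" query syntax where bare words fall under the default field:

def parseCommand(command, default_field="contents"):
    # Hypothetical sketch; the real helper is not part of this listing.
    # Split "field:value" tokens out of the query string; bare words are
    # collected under the default field.
    result = {}
    for token in command.split():
        field, sep, value = token.partition(':')
        if not sep:
            field, value = default_field, token
        result[field] = (result.get(field, '') + ' ' + value).strip()
    return result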
Example #2
    def search_synonym(self, query):
        self.hits_dict = {}
        self.hits = []
        similar_terms = self.w2v_model.most_similar(query)
        parser = QueryParser('text', self.analyzer)
        query = parser.parse(query)

        simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
        highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
        for s_term in similar_terms[:20]:
            s_term_query = parser.parse(s_term[0])
            hits = self.searcher.search(s_term_query, 1000).scoreDocs
            hit_count = 0
            for hit in hits:
                doc = self.searcher.doc(hit.doc)
                text = doc.get('text')
                terms = text.split()
                sentence = ''.join(terms)
                highLightText = highlighter.getBestFragment(
                    self.analyzer, 'text', sentence)
                if highLightText is not None:
                    self.hits.append(highLightText)
                    hit_count += 1
                if hit_count >= 3:
                    break

            if len(self.hits) > 0:
                # most_similar returns (term, score) pairs; key by the term
                self.hits_dict[s_term[0]] = self.hits
                self.hits = []

        return self.hits_dict
Example #3
    def search(self, query_str, restriction=2):
        self.attachCurrentThread()

        # parse the query
        result_contexts = []
        # a '/' in the query signals attached part-of-speech tags
        if '/' in query_str:
            # with POS tags, dispatch to search_phrases
            result_contexts = self.search_phrases(query_str)
        else:
            # without POS tags, dispatch to search_terms
            result_contexts = self.search_terms(
                QueryParser("context", self.analyzer).parse(query_str))

        # restore the search results to full articles
        self.recover_to_article(query_str, result_contexts, restriction)

        final_result = []
        # highlight the query-related passages in the search results
        simpleHTMLFormatter = SimpleHTMLFormatter(u"<b><font color='red'>",
                                                  u"</font></b>")
        for index, recovered_query in enumerate(self.recovered_queries):
            # highlight with our own position-constrained query rather than the raw user query
            recovered_query = recovered_query.replace("/", ",")
            highlighter = Highlighter(
                simpleHTMLFormatter,
                QueryScorer(
                    QueryParser("context",
                                self.analyzer).parse(recovered_query)))
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'context', self.recovered_contexts[index])
            if highLightText is not None:
                final_result.append(highLightText)

        return final_result
Example #4
def run(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(analysis(command))
    HighlightFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(HighlightFormatter, QueryScorer(query))
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get(
            "name"), 'url:', doc.get("url"), 'title:', doc.get("title")
        text = doc.get('contents')
        highLightText = highlighter.getBestFragment(analyzer, "contents", text)
        if highLightText is not None:
            highLightText = ''.join(highLightText.split(' '))
        data = {}
        data['url'] = doc.get("url")
        data['title'] = doc.get('title')
        data['highlight'] = highLightText
        result.append(data)
    return result
Example #5
def highlighting(analyzer, contents, query):
    formatter = SimpleHTMLFormatter("<b><font color='black'>", "</font></b>")
    highlighter = Highlighter(formatter, QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(30))
    tokenStream = analyzer.tokenStream('contents', contents)
    light_content = highlighter.getBestFragments(tokenStream, contents, 3, '...')
    return light_content
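A hedged usage sketch for highlighting(); the searcher, the analyzer, and the 'contents' field are assumptions carried over from the neighboring examples:

# Hypothetical usage, assuming an initialized JVM plus an open searcher
# and analyzer as in the other examples on this page.
query = QueryParser(Version.LUCENE_CURRENT, 'contents',
                    analyzer).parse('lucene')
for scoreDoc in searcher.search(query, 5).scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    snippet = highlighting(analyzer, doc.get('contents'), query)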
Example #6
    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream(
                "contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream,
                                                     doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))

        return totalPages, docs
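addDuplicatesQuery is not defined in this snippet; a plausible stand-in that filters out documents flagged as duplicates, assuming the stored 'duplicate' field seen above holds 'true' for copies (the value and logic are assumptions):

    def addDuplicatesQuery(self, query):
        # Hypothetical helper: wrap the parsed query so documents whose
        # 'duplicate' field is 'true' are excluded from the results.
        filtered = BooleanQuery()
        filtered.add(query, BooleanClause.Occur.MUST)
        filtered.add(TermQuery(Term('duplicate', 'true')),
                     BooleanClause.Occur.MUST_NOT)
        return filtered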
Example #7
    def output(self, score_docs, command):
        '''
        Highlight and return the search results.

        Input: `score_docs`: search results from the index
        Output: list of documents info found in the index,
                details includes `title`, `url` and `abstract`
        '''
        query = QueryParser('contents', self.analyzer).parse(command)
        highlighter = Highlighter(self.formatter, QueryScorer(query))
        highlighter.setTextFragmenter(
            SimpleFragmenter(200))  # limit each fragment to 200 characters

        results = []
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            contents = doc.get('contents')
            stream = self.analyzer.tokenStream("contents", contents)
            abstract = highlighter.getBestFragment(
                stream, contents)  # get the highlighted abstract
            result = {
                'title': doc.get('title'),
                'url': doc.get('url'),
                'abstract': abstract.replace(' ', '') if abstract else abstract
            }
            results.append(result)
        return results
Example #8
    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result
Example #9
 def search_multi_terms(self, query):
     print('Multiterms search')
     hits = self.searcher.search(query, 100).scoreDocs
     simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
     highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
     for hit in hits:
         doc = self.searcher.doc(hit.doc)
         text = doc.get("text")
         terms = text.split()
         sentence = ''.join(terms)
         highLightText = highlighter.getBestFragment(
             self.analyzer, 'text', sentence)
         if highLightText is not None:
             self.hits.append(highLightText)
Example #10
    def doStandardHighlights(self):

        formatter = TestFormatter(self)

        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
Example #11
 def search_by(self, **kwargs):
     command = kwargs.get('command', '')
     if command == '':
         return None
     field = kwargs.get('field')
     query_type = kwargs.get('query_type', 'chi')
     if query_type == 'chi':
         if field in ['token_taglist', 'token_content', 'token_title', 'token_author']:
             command = ' '.join(jieba.cut_for_search(command))
         hlt_analyzer = self.analyzer['ChineseAnalyzer']
     else:
         if field in ['token_content', 'token_title']:
             command = ' '.join(map(stem, command.split()))
         hlt_analyzer = self.analyzer['StandardAnalyzer']
     analyzer = self.analyzer['SimpleAnalyzer']
     num = kwargs.get('num', 50)
     attrs = kwargs.get('attrs', ['url', 'title'])
     print "[%s]\tSearching for '%s' in field '%s'" % (query_type, command, field)
     query = QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command)
     if field in ['token_content', 'token_title']:
         getAbs = True
         query_for_highlight = QueryParser(Version.LUCENE_CURRENT, 'content', hlt_analyzer).parse(command)
         scorer = QueryScorer(query_for_highlight)
         formatter = SimpleHTMLFormatter("<strong>", "</strong>")
         # formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
         highlighter = Highlighter(formatter, scorer)
         fragmenter = SimpleFragmenter(20)
         highlighter.setTextFragmenter(fragmenter)
     else:
         getAbs = False
     scoreDocs = self.searcher.search(query, num).scoreDocs
     print "%s total matching documents." % len(scoreDocs)
     articles = []
     for scoreDoc in scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         article = {}
         for attr in attrs:
             article[attr] = doc.get(attr)
         if getAbs:
             content = doc.get('content')
             tokenStream = hlt_analyzer.tokenStream("content", StringReader(content))
             article['abstract'] = highlighter.getBestFragments(tokenStream, content, 3, "...")
         articles.append(article)
     return articles
Example #12
    def search_phrase(self, term, phrase):
        print('Phrase search')
        self.hits = []
        index_list = []
        parser = QueryParser('text', self.analyzer)
        query = parser.parse(term)

        hits = self.searcher.search(query, 1000).scoreDocs
        if hits is None:
            return

        for hit in hits:
            index = []
            doc = self.searcher.doc(hit.doc)
            text = doc.get("text")
            phrases = doc.get("phrase")

            # processing with saved text and phrase
            terms = text.split()
            phrases = phrases.split()
            flag = 1  # cleared if any occurrence of the term has a mismatched phrase tag
            for i in range(len(terms)):
                if term == terms[i]:
                    index.append(i)
                    if not phrase == phrases[i]:
                        flag = 0
                        break
            if flag == 1:
                self.hits.append(text)
                index_list.append(index)
        self.recover_sentence(index_list)
        hits_copy = self.hits
        self.hits = []
        # add font tags around matched terms
        simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
        highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
        for hit in hits_copy:
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'text', hit)
            if highLightText is not None:
                self.hits.append(highLightText)

        return self.hits[:40]
Example #13
    def search(self, terms, n_hits=5):
        """
        Run search query.
        """
        # TODO: support date range queries

        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND) # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(
            parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'),
                                  QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]
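_process_search_result is likewise not shown; a minimal sketch of a plausible stand-in, assuming the index stores the 'fullpath' and 'body' fields named in the parser above:

    def _process_search_result(self, result, highlighter):
        # Hypothetical helper: fetch the stored fields and highlight the body.
        doc = self.searcher.doc(result.doc)
        body = doc.get('body')
        stream = self.analyzer.tokenStream('body', StringReader(body))
        return doc.get('fullpath'), highlighter.getBestFragments(
            stream, body, 3, '...')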
Example #14
    def get_highlighted_hits(self):
        extracted_fragments = []

        scorer = QueryScorer(self.query)
        fragmenter = SimpleSpanFragmenter(scorer, 10)
        highlighter = Highlighter(self.formatter, scorer)
        highlighter.setTextFragmenter(fragmenter)

        for hit in self.hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            stream = TokenSources.getAnyTokenStream(self.index_reader, hit.doc,
                                                    'contents', self.analyzer)
            best_fragments = highlighter.getBestFragments(
                stream, document.get('contents'), 10)

            for fragment in best_fragments:
                print('fragment: ', fragment)

            extracted_fragments.append((hit.doc, best_fragments))

        return extracted_fragments
Example #15
    def search(self, term, window=2):
        self.hits = []
        index_list = []
        sort_para = term

        parser = QueryParser('text', self.analyzer)
        query = parser.parse(term)
        print(query)

        # Jump to multi-terms search if there are several words
        if self.multi_terms(query):
            self.search_multi_terms(query)
            return self.hits[:40]

        hits = self.searcher.search(query, 1000).scoreDocs

        for hit in hits:
            index = []
            doc = self.searcher.doc(hit.doc)
            text = doc.get("text")
            self.hits.append(text)
            # save indexes of target term in each document
            terms = text.split()
            for i in range(len(terms)):
                if term == terms[i]:
                    index.append(i)
            index_list.append(index)

        self.recover_sentence(index_list, window)
        hits_copy = self.hits
        self.hits = []
        simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
        highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
        for hit in hits_copy:
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'text', hit)
            if highLightText is not None:
                self.hits.append(highLightText)
        print('search over')
        return self.hits[:40]
Example #16
def run(searcher, analyzer, command):
    command_dict = parseCommand(command)
    seg_list = jieba.cut(command_dict['contents'])
    command_dict['contents'] = (" ".join(seg_list))
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    scorer = QueryScorer(querys)  # score against the combined boolean query
    fragmenter = SimpleSpanFragmenter(scorer, 250)
    simpleHTMLFormatter = SimpleHTMLFormatter("<b>", "</b>")
    highlighter = Highlighter(simpleHTMLFormatter, scorer)
    highlighter.setTextFragmenter(fragmenter)

    results = []

    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        contents = doc.get("contents")
        if contents:
            tkStream = analyzer.tokenStream("contents", contents)
            highlight = highlighter.getBestFragment(tkStream, contents)
            if highlight:
                # strip the whitespace separating CJK tokens
                highlight = ''.join(highlight.split())
            results.append(
                (doc.get("title").strip(), doc.get("url"), highlight))
        '''
        print 'path:', doc.get("path"), \
            '\nname:', doc.get("name"), \
            '\ntitle:', doc.get("title"), \
            "url:",doc.get("url"), \
            "\nsite:",doc.get("site"),\
            "\ncontent:",highlight,"\n"
        '''
        # print 'explain:', searcher.explain(query, scoreDoc.doc)
    return results
Example #17
def get_lm_doc_snippets(query,
                        searcher,
                        qparser,
                        analyzer,
                        preprocessor,
                        topk=10):
    """
    Fetches the topk document snippets given query, searcher and qparser and
    returns (did, text) pair list
    :param query:
    :param searcher:
    :param qparser:
    :param topk:
    :return:
    """

    dids_text = []

    query = qparser.parse(query)
    scoreDocs = searcher.search(query, topk).scoreDocs

    highlighter = Highlighter(QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(100))

    for scoreDoc in scoreDocs:

        doc = searcher.doc(scoreDoc.doc)
        did = doc.get("id")

        text = doc.get("raw")
        token_stream = analyzer.tokenStream("raw", StringReader(text))
        result = highlighter.getBestFragments(token_stream, text, 4, "... ")
        text = get_parsed_text(result)
        text = preprocess_text(preprocessor, [text])
        text = " ".join(text)

        dids_text.append((did, text))

    return dids_text
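get_parsed_text and preprocess_text are external helpers; minimal sketches of plausible stand-ins follow (their real behavior is an assumption):

import re

def get_parsed_text(fragment):
    # Hypothetical stand-in: strip the <B>...</B> tags that the default
    # SimpleHTMLFormatter wraps around matched terms.
    return re.sub(r'</?B>', '', fragment)

def preprocess_text(preprocessor, texts):
    # Hypothetical stand-in: apply a callable preprocessor to each text.
    return [preprocessor(t) for t in texts]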
Example #18
def work(searcher, analyzer, command, low=None, high=None):
    global prefixHTML
    global suffixHTML
    if command == '':
        return 0, []
    command = ' '.join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
    highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
    result = []
    match_count = len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        content = highlighter.getBestFragment(analyzer, "contents", text)
        price = float(doc.get('price'))
        if low is not None and price < float(low):
            continue
        if high is not None and price > float(high):
            continue
        result.append({
            "url": doc.get('url'),
            "Content": content,
            "pic_url": doc.get('pic_url'),
            "title": doc.get('title'),
            "price": doc.get('price'),
            "description": doc.get('description')
        })
    result = sorted(result, key=lambda x: float(x["price"]), reverse=True)

    return match_count, result
Example #19
def run(searcher, analyzer, command, urlclick):

    if command == '':
        return []
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))
    if len(res) > 0:
        scoreDocs = res
    else:
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            if k == 'taste' or k == 'tech':
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs

    swxc_res = findres(command, scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # the comma inside the tag stands in for a space: highlighted fragments
    # later have all spaces stripped and commas converted back to spaces
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    scorer = QueryScorer(
        QueryParser(Version.LUCENE_CURRENT, 'name', analyzer).parse(command))
    highlighter1 = Highlighter(formatter_name, scorer)
    highlighter2 = Highlighter(
        formatter_name,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(command)))
    highlighter3 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                        analyzer).parse(command)))
    highlighter4 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'taste',
                        analyzer).parse(command)))
    highlighter5 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'tech',
                        analyzer).parse(command)))
    highlighter6 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'others',
                        analyzer).parse(command)))

    fragmenter = SimpleFragmenter(1000)
    highlighter1.setTextFragmenter(fragmenter)
    highlighter2.setTextFragmenter(fragmenter)
    highlighter3.setTextFragmenter(fragmenter)
    highlighter4.setTextFragmenter(fragmenter)
    highlighter5.setTextFragmenter(fragmenter)
    highlighter6.setTextFragmenter(fragmenter)

    results = []
    for scoreDoc in scoreDocs:
        if (len(scoreDocs) > 200 and
                len(scoreDocs) * scoreDoc.score < 2) or scoreDoc.score < 0.002:
            continue
        doc = searcher.doc(scoreDoc.doc)

        highlighterContent = highlighter1.getBestFragment(
            analyzer, 'name', doc.get('name'))
        highlighterContent2 = highlighter2.getBestFragment(
            analyzer, 'content', doc.get('content'))
        highlighterContent3 = highlighter3.getBestFragment(
            analyzer, 'ingredient', doc.get('ingredient'))
        highlighterContent4 = highlighter4.getBestFragment(
            analyzer, 'taste', doc.get('taste'))
        highlighterContent5 = highlighter5.getBestFragment(
            analyzer, 'tech', doc.get('tech'))
        highlighterContent6 = highlighter6.getBestFragment(
            analyzer, 'others', doc.get('others'))

        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')

        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(' ', '')
            highlighterContent2 = highlighterContent2.replace(',', ' ')
        else:
            highlighterContent2 = doc.get('content').replace(' ', '')
        if highlighterContent3:
            highlighterContent3 = highlighterContent3.replace(',', '')
        else:
            highlighterContent3 = (doc.get('ingredient')).replace(',', '')
        if not highlighterContent4:
            highlighterContent4 = doc.get('taste')
        if not highlighterContent5:
            highlighterContent5 = doc.get('tech')
        if highlighterContent6:
            highlighterContent6 = highlighterContent6.replace(',', '')
        else:
            highlighterContent6 = (doc.get('others')).replace(',', '')

        results.append(
            (highlighterContent, doc.get('img'), highlighterContent2,
             highlighterContent3, highlighterContent4, highlighterContent5,
             highlighterContent6, doc.get('url'), scoreDoc.score))

        for i in range(0, min(20, len(results)) - 1):
            flag = True
            for j in range(0, min(20, len(results)) - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res
Example #20
def build_highlighter(parsed_query):
    scorer = QueryScorer(parsed_query, 'content')
    highlighter = Highlighter(SimpleHTMLFormatter(), scorer)
    fragmenter = SimpleSpanFragmenter(scorer, FRAGMENT_SIZE)
    highlighter.setTextFragmenter(fragmenter)
    return highlighter
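A hedged usage sketch for build_highlighter; the searcher, the analyzer, and the 'content' field are assumptions consistent with the QueryScorer above:

# Hypothetical usage: highlight the best fragment of each top hit.
parsed_query = QueryParser('content', analyzer).parse('example query')
highlighter = build_highlighter(parsed_query)
for scoreDoc in searcher.search(parsed_query, 10).scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    stream = analyzer.tokenStream('content',
                                  StringReader(doc.get('content')))
    fragment = highlighter.getBestFragment(stream, doc.get('content'))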
Example #21
def superSearch(command, command_dict, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run super search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))
    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    for k, v in (command_dict[0]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    for k, v in (command_dict[1]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command + ' ' + command_dict[0].get("ingredient", ''),
                       scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    if command:
        scorer = QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'name',
                        analyzer).parse(command))
        highlighters = [Highlighter(formatter_name, scorer)]
    else:
        highlighters = ['']
    if command_dict[0].get('ingredient'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                                analyzer).parse(
                                    command_dict[0]['ingredient']))))
    else:
        highlighters.append('')
    if command_dict[0].get('taste'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'taste',
                                analyzer).parse(command_dict[0]['taste']))))
    else:
        highlighters.append('')
    if command_dict[0].get('tech'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'tech',
                                analyzer).parse(command_dict[0]['tech']))))
    else:
        highlighters.append('')
    fragmenter = SimpleFragmenter(1000)
    for h in highlighters:
        if h:
            h.setTextFragmenter(fragmenter)

    results = []
    for scoreDoc in scoreDocs:
        if (scoreDoc.score * len(scoreDocs) < 200
                and len(scoreDocs) > 200) or scoreDoc.score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)
        if command:
            highlighterContent = highlighters[0].getBestFragment(
                analyzer, 'name', doc.get('name'))
        else:
            highlighterContent = ''
        if highlighters[1]:
            highlighterContent2 = highlighters[1].getBestFragment(
                analyzer, 'ingredient', doc.get('ingredient'))
        else:
            highlighterContent2 = ''
        if highlighters[2]:
            highlighterContent3 = highlighters[2].getBestFragment(
                analyzer, 'taste', doc.get('taste'))
        else:
            highlighterContent3 = ''
        if highlighters[3]:
            highlighterContent4 = highlighters[3].getBestFragment(
                analyzer, 'tech', doc.get('tech'))
        else:
            highlighterContent4 = ''

        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(',', '')
        else:
            highlighterContent2 = (doc.get('ingredient')).replace(',', '')
        if not highlighterContent3:
            highlighterContent3 = doc.get('taste')
        if not highlighterContent4:
            highlighterContent4 = doc.get('tech')
        results.append(
            (highlighterContent, doc.get('img'),
             doc.get('content').replace(' ', ''),
             highlighterContent2, highlighterContent3, highlighterContent4,
             doc.get('others').replace(',', ''), doc.get('url'),
             scoreDoc.score))

        for i in range(0, min(20, len(results)) - 1):
            flag = True
            for j in range(0, min(20, len(results)) - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res
Example #22
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(
            SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print()
            print("Hit enter with no input to quit.")
            command = input("Query:")
            if command == '':
                return

            print("Searching for:", command)
            query = QueryParser(Version.LUCENE_43, "contents",
                                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print("%s total matching documents." % len(scoreDocs))

            # Highlight the matching text in red
            highlighter = Highlighter(
                SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
                QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.hits_dir + '/' + doc.get("name"),
                              'w+') as file_handler:
                        file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.frags_dir + '/' + doc.get("name"),
                              'w+') as file_handler:
                        file_handler.write(result)