Esempio n. 1
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
    def search(self, topic):
        """Search for topic.title, then re-search with an expanded query.

        The initial hits' title/heading/text terms are scored as
        (field, term) pairs; the 25 best become SHOULD clauses alongside
        the original query, which is boosted heavily so direct matches
        still dominate.  Returns the TopDocs of the expanded search.
        """
        seed_query = self.query_parser.parse(topic.title)
        seed_hits = self.searcher.search(seed_query, self.top_n)

        # Accumulate per-(field, term) scores over the seed results.
        pairs = {}
        for hit in seed_hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            for field_name in ["title", "heading", "text"]:
                for token in document.get(field_name).split():
                    key = (field_name, token)
                    if key in pairs:
                        pairs[key].increment()
                    else:
                        pairs[key] = ScorePair(
                            self.reader, field_name, token)  # XXX

        # Keep the 25 highest-scoring expansion terms.
        best = sorted(pairs.values(), key=lambda p: p.score(),
                      reverse=True)[:25]

        # print([term.term for term in best])

        expanded = BooleanQuery()
        seed_query.setBoost(float(10000000))
        expanded.add(seed_query, BooleanClause.Occur.SHOULD)
        for pair in best:
            expanded.add(TermQuery(pair.to_term()), BooleanClause.Occur.SHOULD)

        return self.searcher.search(expanded, 5000)
def run(command, pageindex,pagesize):
    global searcher,analyzer

    print "Searching for:", command 
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k,v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            analyzer).parse(v)

        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 6000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    res = []
    for scoreDoc in scoreDocs[start:end+1]:
        doc = searcher.doc(scoreDoc.doc)
        r = []
        r.append(doc.get('title'))
        r.append(doc.get('url'))
        r.append(doc.get('src'))
        r.append(doc.get('alt').replace(' ',''))
        res.append(r)
    return res,len(scoreDocs)
Esempio n. 4
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        #command = 'Christian author:mark twain title:autobiography language:English'
        command = unicode(command, 'GBK')
        if command == '':
            return

        print
        print "Searching for:", command

        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            ##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'author:', doc.get('author')
            print 'language:', doc.get('language')
Esempio n. 5
0
    def testToString(self):
        """String rendering of a BooleanQuery mixing MUST and SHOULD clauses."""
        bq = BooleanQuery()
        fuzzy = FuzzyQuery(Term("field", "kountry"))
        bq.add(fuzzy, BooleanClause.Occur.MUST)
        western = TermQuery(Term("title", "western"))
        bq.add(western, BooleanClause.Occur.SHOULD)

        self.assertEqual("+kountry~0.5 title:western", bq.toString("field"),
                         "both kinds")
Esempio n. 6
0
def search_image(command):
    """Search the image index for `command`; return a list of result dicts.

    Each dict has keys url, imgurl, urltitle and contents (an HTML
    fragment with matched terms wrapped in a red <font> tag).

    `command` is split on spaces; tokens of the form "opt:value" with opt
    in `allowed_opt` target that field, all other tokens are segmented
    with jieba and searched under the current field (default "contents").
    """
    # NOTE(review): this only short-circuits on a single space; an empty
    # string or other whitespace still runs a search -- confirm intended.
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # Build {field: query-text}; an "opt:value" token switches the field.
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            # Site filter: match the value as a trailing wildcard term.
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # BUG FIX: Analyzer.tokenStream(fieldName, reader) takes the
        # *field name* as its first argument; the old code passed the
        # field's text (doc.get("contents")) instead.
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Esempio n. 7
0
    def testAnd(self):
        """Conjunction of a subject term query with a numeric pubmonth range."""
        subjectQuery = TermQuery(Term("subject", "search"))
        monthRange = NumericRangeQuery.newIntRange("pubmonth", Integer(200401),
                                                   Integer(200412), True, True)

        conjunction = BooleanQuery()
        conjunction.add(subjectQuery, BooleanClause.Occur.MUST)
        conjunction.add(monthRange, BooleanClause.Occur.MUST)

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(conjunction, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")
Esempio n. 8
0
    def docsLike(self, id, doc, max):

        authors = doc.getValues("author")
        authorQuery = BooleanQuery()
        for author in authors:
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        authorQuery.setBoost(2.0)

        vector = self.reader.getTermFreqVector(id, "subject")

        subjectQuery = BooleanQuery()
        for term in vector.getTerms():
            tq = TermQuery(Term("subject", term))
            subjectQuery.add(tq, BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print "  Query:", likeThisQuery.toString("contents")
        scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            if len(docs) < max:
                docs.append(doc)
            else:
                break

        return docs
Esempio n. 9
0
    def testAgainstOR(self):
        """A sloppy 'quick fox' phrase OR an exact 'fast fox' phrase hits both docs."""
        sloppyPhrase = PhraseQuery()
        sloppyPhrase.setSlop(1)
        sloppyPhrase.add(Term("field", "quick"))
        sloppyPhrase.add(Term("field", "fox"))

        exactPhrase = PhraseQuery()
        exactPhrase.add(Term("field", "fast"))
        exactPhrase.add(Term("field", "fox"))

        disjunction = BooleanQuery()
        disjunction.add(sloppyPhrase, BooleanClause.Occur.SHOULD)
        disjunction.add(exactPhrase, BooleanClause.Occur.SHOULD)
        topDocs = self.searcher.search(disjunction, 10)
        self.assertEqual(2, topDocs.totalHits)
Esempio n. 10
0
def run(searcher, analyzer):
    """Interactive search loop with result highlighting.

    Reads UTF-8 queries from stdin until a blank line is entered.
    parseCommand() splits the query into a {field: text} map; a 'site'
    entry becomes a wildcard match on the url field, everything else is
    parsed with QueryParser.  All clauses are required (MUST).  The top
    10 hits are printed with their 'contents' field highlighted in red.
    """
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command  # original comment was a sample query (Chinese title "Julie & Julia")

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if (k == 'site'):
                # Site filter: substring wildcard match on the stored url.
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs

        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>",
                                                  "</font>")

        # NOTE(review): assumes parseCommand always yields a 'contents'
        # key; a query with only a site: clause would raise KeyError here.
        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
Esempio n. 11
0
def run(command, pageindex=1, pagesize=15):
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory, searcher, analyzer
    if command == '':
        return

    print "Searching for:", command  #朱莉与茱莉娅

    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))

    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        if (k == 'site'):
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>",
                                              "</font>")

    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])

    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end + 1]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
Esempio n. 12
0
    def testFilteredQuery(self):

        isbns = ["0854402624"]  # Steiner

        accessor = TestSpecialsAccessor(isbns)
        filter = SpecialsFilter(accessor)

        educationBooks = WildcardQuery(Term("category", "*education*"))
        edBooksOnSpecial = FilteredQuery(educationBooks, filter)

        logoBooks = TermQuery(Term("subject", "logo"))

        logoOrEdBooks = BooleanQuery()
        logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD)
        logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD)

        topDocs = self.searcher.search(logoOrEdBooks, 50)
        print logoOrEdBooks
        self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
Esempio n. 13
0
    def testOr(self):

        methodologyBooks = TermQuery(
            Term("category", "/technology/computers/programming/methodology"))
        easternPhilosophyBooks = TermQuery(
            Term("category", "/philosophy/eastern"))

        enlightenmentBooks = BooleanQuery()
        enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD)
        enlightenmentBooks.add(easternPhilosophyBooks,
                               BooleanClause.Occur.SHOULD)

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs
        print "or =", enlightenmentBooks

        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    "Extreme Programming Explained")
        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    u"Tao Te Ching \u9053\u5FB7\u7D93")
Esempio n. 14
0
def run(searcher, analyzer, command, prior):
    """Search questions matching `command`, sorted by `prior`.

    Deduplicates hits by question number, skips hits without a name,
    and returns storesort()'s ordering of
    [name, detail, topic, browse, follow, answers, number] records.
    Returns None for an empty command.
    """
    if command == '':
        return

    store = []
    seen_nums = []

    querys = BooleanQuery()
    for field, text in parseCommand(command).iteritems():
        clause = QueryParser(Version.LUCENE_CURRENT, field,
                             analyzer).parse(text)
        querys.add(clause, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs

    for scoreDoc in scoreDocs:
        record = searcher.doc(scoreDoc.doc)

        num = record.get("qst_num")
        if num in seen_nums:
            continue  # already collected this question
        if not record.get("qst_name"):
            continue  # skip entries without a name
        seen_nums.append(num)

        entry = []
        entry.append(record.get("qst_name").replace(' ', ''))
        entry.append(record.get("qst_detail").replace(' ', ''))
        entry.append(record.get("qst_topic_accu"))
        entry.append(int(record.get("qst_browse")))
        entry.append(int(record.get("qst_follow")))
        entry.append(int(record.get("qst_ans")))
        entry.append(int(record.get("qst_num")))

        store.append(entry)

    return storesort(store, prior)
Esempio n. 15
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        command_dict = parseCommand(command)
        sep_command = " ".join(jieba.cut(command_dict['contents']))
        command_dict['contents'] = sep_command
        #print command_dict
        if not command_dict.has_key('site'):
            command = command_dict['contents']
        else:
            command = command_dict['contents'] + " site:" + command_dict['site']
        print
        print "Searching for:", command

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            #print 'site:', doc.get("site")
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
Esempio n. 16
0
def run(searcher, analyzer, command):
    while True:
        if command == '':
            return
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 300).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        text = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            temptext = [
                doc.get("url"),
                doc.get('title'),
                doc.get("imgurl"),
                doc.get("price"),
                doc.get("kind")
            ]
            text.append(temptext)
        return text
    def searchDocuments(self, view, version, query=None, attribute=None):
        """Full-text search over the store; lazily yield matching items.

        Parameters:
            view      -- repository view used for transactions and value checks
            version   -- only documents with doc['version'] <= version qualify
            query     -- query string parsed against the "contents" field,
                         or None to match all documents
            attribute -- optional; restricts matches to one attribute

        Returns an object whose __iter__ runs the search and yields
        (item UUID, attribute UUID) pairs in descending score order.
        The search transaction is started in __iter__ and cleaned up in
        the iterator's __del__.
        """
        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)

        if attribute:
            # AND a term query on the attribute's string form to narrow hits.
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        class _collector(PythonHitCollector):
            # Accumulates (-score, id) tuples so that heap-popping in
            # __iter__ yields the highest scores first.
            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits = []

            def collect(_self, id, score):

                _self.hits.append((-score, id))

        class _iterator(object):
            # Lazy result iterator; owns the searcher and transaction.
            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                # Best-effort cleanup: close the searcher and abort the
                # transaction opened in __iter__; never raise from __del__.
                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    # Min-heap over (-score, id): pop best scores first.
                    heapify(hits)
                    while hits:
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])

                        # Only yield values visible at the requested version
                        # and still considered valid by the item store.
                        if long(doc['version']) <= version:
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()