def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------------' print 'title:', doc.get('title') print 'url:', doc.get('url') print 'src:', doc.get('src')
def search(self, topic):
    """Search for *topic.title*, expanded with the top co-occurring terms.

    Runs the parsed title query, collects every (field, term) pair from the
    "title"/"heading"/"text" fields of the top hits, scores the pairs, and
    re-queries with the heavily-boosted original query OR'ed with the 25
    best expansion terms.

    Returns the TopDocs of the expanded query (up to 5000 hits).
    """
    query = self.query_parser.parse(topic.title)
    results = self.searcher.search(query, self.top_n)
    score_pairs = {}
    for hit in results.scoreDocs:
        doc = self.searcher.doc(hit.doc)
        for field in ["title", "heading", "text"]:
            stored = doc.get(field)
            # FIX: doc.get() returns None when the field is absent from a
            # document; the old code crashed on None.split().
            if stored is None:
                continue
            for term in stored.split():
                key = (field, term)
                if key in score_pairs:
                    score_pairs[key].increment()
                else:
                    score_pairs[key] = ScorePair(self.reader, field, term)
    # Keep only the 25 highest-scoring expansion terms.
    top_terms = sorted(score_pairs.values(),
                       key=lambda x: x.score(), reverse=True)[:25]
    bq = BooleanQuery()
    # Huge boost keeps the original query dominant over the expansions.
    query.setBoost(float(10000000))
    bq.add(query, BooleanClause.Occur.SHOULD)
    for score_pair in top_terms:
        bq.add(TermQuery(score_pair.to_term()), BooleanClause.Occur.SHOULD)
    return self.searcher.search(bq, 5000)
def run(command, pageindex,pagesize): global searcher,analyzer print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 6000).scoreDocs print "%s total matching documents." % len(scoreDocs) start = (pageindex - 1) * pagesize end = start + pagesize res = [] for scoreDoc in scoreDocs[start:end+1]: doc = searcher.doc(scoreDoc.doc) r = [] r.append(doc.get('title')) r.append(doc.get('url')) r.append(doc.get('src')) r.append(doc.get('alt').replace(' ','')) res.append(r) return res,len(scoreDocs)
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") #command = 'Christian author:mark twain title:autobiography language:English' command = unicode(command, 'GBK') if command == '': return print print "Searching for:", command command_dict = parseCommand(command) querys = BooleanQuery() for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) ## explanation = searcher.explain(query, scoreDoc.doc) print "------------------------" print 'path:', doc.get("path") print 'name:', doc.get("name") print 'title:', doc.get('title') print 'author:', doc.get('author') print 'language:', doc.get('language')
def testToString(self):
    """toString(defaultField) omits the default field's prefix in the output."""
    q = BooleanQuery()
    q.add(FuzzyQuery(Term("field", "kountry")), BooleanClause.Occur.MUST)
    q.add(TermQuery(Term("title", "western")), BooleanClause.Occur.SHOULD)
    self.assertEqual("+kountry~0.5 title:western",
                     q.toString("field"),
                     "both kinds")
def search_image(command):
    """Search the image index and return hit dicts with highlighted snippets.

    *command* terms of the form 'opt:value' (only 'site' is honoured) are
    collected per option; plain terms are segmented with jieba and appended
    to the current option bucket.  All buckets are AND'ed.

    Returns a list of dicts with keys url, imgurl, urltitle, contents
    (contents is a highlighted fragment string).
    """
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            # NOTE(review): a disallowed 'opt:' still switches the current
            # bucket for subsequent plain terms — preserved as-is; confirm
            # this is intentional.
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            query = WildcardQuery(Term(k, '*' + v))
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        # FIX: tolerate a missing 'contents' field instead of crashing.
        text = doc.get("contents") or ''
        # FIX: tokenStream() takes the *field name* as its first argument;
        # the old code passed the field's text.
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
def testAnd(self):
    """MUST + MUST: books about search that were published during 2004."""
    conjunction = BooleanQuery()
    conjunction.add(TermQuery(Term("subject", "search")),
                    BooleanClause.Occur.MUST)
    conjunction.add(
        NumericRangeQuery.newIntRange("pubmonth", Integer(200401),
                                      Integer(200412), True, True),
        BooleanClause.Occur.MUST)
    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(conjunction, 50).scoreDocs
    self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")
def docsLike(self, id, doc, max): authors = doc.getValues("author") authorQuery = BooleanQuery() for author in authors: authorQuery.add(TermQuery(Term("author", author)), BooleanClause.Occur.SHOULD) authorQuery.setBoost(2.0) vector = self.reader.getTermFreqVector(id, "subject") subjectQuery = BooleanQuery() for term in vector.getTerms(): tq = TermQuery(Term("subject", term)) subjectQuery.add(tq, BooleanClause.Occur.SHOULD) likeThisQuery = BooleanQuery() likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD) likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD) # exclude myself likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))), BooleanClause.Occur.MUST_NOT) print " Query:", likeThisQuery.toString("contents") scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs docs = [] for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) if len(docs) < max: docs.append(doc) else: break return docs
def testAgainstOR(self):
    """OR of a sloppy phrase and an exact phrase matches both documents."""
    quick = PhraseQuery()
    quick.setSlop(1)
    for word in ("quick", "fox"):
        quick.add(Term("field", word))
    fast = PhraseQuery()
    for word in ("fast", "fox"):
        fast.add(Term("field", word))
    either = BooleanQuery()
    either.add(quick, BooleanClause.Occur.SHOULD)
    either.add(fast, BooleanClause.Occur.SHOULD)
    topDocs = self.searcher.search(either, 10)
    self.assertEqual(2, topDocs.totalHits)
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): if (k == 'site'): t = Term('url', '*' + v.strip() + '*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 10).scoreDocs print "%s total matching documents." % len(scoreDocs) simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(500)) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------' #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site') print 'title:', doc.get('title'), print 'url:', doc.get('url') ori_text = clear(doc.get('contents')) output = hlter.getBestFragment(analyzer, "contents", ori_text) print output
def run(command, pageindex=1, pagesize=15): global searcher, analyzer, old_command, old_res_list global STORE_DIR, directory, searcher, analyzer if command == '': return print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): if (k == 'site'): t = Term('url', '*' + v.strip() + '*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 4000).scoreDocs print "%s total matching documents." % len(scoreDocs) res_list = [] simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(200)) start = (pageindex - 1) * pagesize end = start + pagesize for scoreDoc in scoreDocs[start:end + 1]: doc = searcher.doc(scoreDoc.doc) res = [] res.append(doc.get('title')) res.append(doc.get('url')) output = hlter.getBestFragment(analyzer, "contents", clear(doc.get('contents'))) res.append(output) res_list.append(res) return res_list, len(scoreDocs)
def testFilteredQuery(self): isbns = ["0854402624"] # Steiner accessor = TestSpecialsAccessor(isbns) filter = SpecialsFilter(accessor) educationBooks = WildcardQuery(Term("category", "*education*")) edBooksOnSpecial = FilteredQuery(educationBooks, filter) logoBooks = TermQuery(Term("subject", "logo")) logoOrEdBooks = BooleanQuery() logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD) logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD) topDocs = self.searcher.search(logoOrEdBooks, 50) print logoOrEdBooks self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
def testOr(self): methodologyBooks = TermQuery( Term("category", "/technology/computers/programming/methodology")) easternPhilosophyBooks = TermQuery( Term("category", "/philosophy/eastern")) enlightenmentBooks = BooleanQuery() enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD) enlightenmentBooks.add(easternPhilosophyBooks, BooleanClause.Occur.SHOULD) searcher = IndexSearcher(self.directory, True) scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs print "or =", enlightenmentBooks self.assertHitsIncludeTitle(searcher, scoreDocs, "Extreme Programming Explained") self.assertHitsIncludeTitle(searcher, scoreDocs, u"Tao Te Ching \u9053\u5FB7\u7D93")
def run(searcher, analyzer, command, prior):
    """Search questions, de-duplicate by question number, sort by *prior*.

    Each row is [name, detail, topic, browse, follow, answers, number];
    rows are sorted via storesort(store, prior).  Returns None for an
    empty command.
    """
    if command == '':
        return
    store = []
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs
    # FIX: membership tests were against a list -- O(n) per hit, O(n^2)
    # overall on up to 500000 hits.  A set gives O(1) lookups.
    seen = set()
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        num = doc.get("qst_num")
        if num in seen:
            continue
        if not doc.get("qst_name"):
            continue
        seen.add(num)
        store.append([
            doc.get("qst_name").replace(' ', ''),
            # FIX: tolerate a missing 'qst_detail' field.
            (doc.get("qst_detail") or '').replace(' ', ''),
            doc.get("qst_topic_accu"),
            int(doc.get("qst_browse")),
            int(doc.get("qst_follow")),
            int(doc.get("qst_ans")),
            int(num),
        ])
    return storesort(store, prior)
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'GBK') if command == '': return command_dict = parseCommand(command) sep_command = " ".join(jieba.cut(command_dict['contents'])) command_dict['contents'] = sep_command #print command_dict if not command_dict.has_key('site'): command = command_dict['contents'] else: command = command_dict['contents'] + " site:" + command_dict['site'] print print "Searching for:", command querys = BooleanQuery() for k, v in command_dict.iteritems(): print k, v query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print "------------------------" #print 'site:', doc.get("site") print 'path:', doc.get("path") print 'title:', doc.get("title") print 'url:', doc.get("url") print 'name:', doc.get("name")
def run(searcher, analyzer, command): while True: if command == '': return command_dict = parseCommand(command) querys = BooleanQuery() for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 300).scoreDocs print "%s total matching documents." % len(scoreDocs) text = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) temptext = [ doc.get("url"), doc.get('title'), doc.get("imgurl"), doc.get("price"), doc.get("kind") ] text.append(temptext) return text
def searchDocuments(self, view, version, query=None, attribute=None):
    """Search the full-text index, lazily yielding (item UUID, attribute UUID).

    query     -- query string parsed against the "contents" field, or None
                 to match all documents.
    attribute -- if given, results are additionally restricted to this
                 attribute (AND'ed in as a TermQuery on "attribute").
    version   -- only hits whose stored 'version' is <= this value, and
                 which store._items.isValue(...) confirms, are yielded.

    Returns an object whose __iter__ runs the search inside a store
    transaction; the transaction is aborted and the searcher closed when
    the iterator object is garbage-collected (__del__).
    """
    store = self.store
    if query is None:
        query = MatchAllDocsQuery()
    else:
        query = QueryParser("contents", StandardAnalyzer()).parse(query)
    if attribute:
        # Narrow the parsed query to the requested attribute.
        combinedQuery = BooleanQuery()
        combinedQuery.add(query, BooleanClause.Occur.MUST)
        combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                          BooleanClause.Occur.MUST)
        query = combinedQuery

    class _collector(PythonHitCollector):
        # Collects (-score, id) pairs so that a min-heap pops best-score first.

        def __init__(_self):
            super(_collector, _self).__init__()
            _self.hits = []

        def collect(_self, id, score):
            # Negate the score: heapq is a min-heap, we want highest first.
            _self.hits.append((-score, id))

    class _iterator(object):
        # Holds the transaction status and searcher so cleanup can happen
        # in __del__ even if iteration is abandoned midway.

        def __init__(_self):
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __del__(_self):
            try:
                if _self.searcher is not None:
                    _self.searcher.close()
                store.abortTransaction(view, _self.txnStatus)
            except:
                # NOTE(review): bare except is deliberate here -- __del__
                # must never raise; the failure is logged instead.
                store.repository.logger.exception("in __del__")
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __iter__(_self):
            _self.txnStatus = store.startTransaction(view)
            _self.searcher = searcher = self.getIndexSearcher()
            _self.collector = _collector()
            searcher.search(query, _self.collector)
            hits = _self.collector.hits
            if hits:
                # Pop hits best-score-first (scores were stored negated).
                heapify(hits)
                while hits:
                    score, id = heappop(hits)
                    doc = searcher.doc(id)
                    uItem = UUID(doc['item'])
                    # Filter to the requested version and confirm the value
                    # is still current in the item store.
                    if long(doc['version']) <= version:
                        if store._items.isValue(view, version, uItem,
                                                UUID(doc['value'])):
                            yield uItem, UUID(doc['attribute'])

    return _iterator()