def search(self, topic):
    """Pseudo-relevance-feedback search for *topic*.

    Searches once with the parsed topic title, harvests terms from the
    stored title/heading/text fields of the top hits, scores them, and
    re-searches with the original query (heavily boosted) OR'ed with the
    25 best expansion terms.  Returns the TopDocs of the expanded search
    (up to 5000 hits).
    """
    query = self.query_parser.parse(topic.title)
    results = self.searcher.search(query, self.top_n)
    # (field, term) -> ScorePair accumulating counts across the top hits.
    score_pairs = {}
    for hit in results.scoreDocs:
        doc = self.searcher.doc(hit.doc)
        for field in ["title","heading", "text"]:
            # NOTE(review): doc.get(field) returning None (field missing
            # from a document) would make .split() raise — assumes all
            # three fields are always stored; TODO confirm.
            terms = doc.get(field).split()
            for term in terms:
                if (field, term) in score_pairs:
                    score_pairs[(field,term)].increment()
                else:
                    score_pairs[(field,term)] = ScorePair(self.reader, field, term)  # XXX
    # Rank candidate expansion terms and keep the best 25.
    top_terms = score_pairs.values()  # Python 2: .values() is a list
    top_terms.sort(key=lambda x: x.score(), reverse=True)
    top_terms = top_terms[:25]
    # print([term.term for term in top_terms])
    bq = BooleanQuery()
    # Large boost keeps the original query dominant over expansion terms.
    query.setBoost(float(10000000))
    bq.add(query, BooleanClause.Occur.SHOULD)
    for score_pair in top_terms:
        term = score_pair.to_term()
        bq.add(TermQuery(term), BooleanClause.Occur.SHOULD)
    return self.searcher.search(bq, 5000)
def run1(searcher, analyzer, target, distance, comein):
    """Combined geo + text place search.

    target:   (lat-like, lon-like) pair scaled below by 1e5 into the
              integer grid of the calx/caly index fields.
    distance: search radius; multiplied by module-level constant A
              (presumably degrees-per-distance-unit — TODO confirm).
    comein:   request tuple; comein[1] is the UTF-8 query text.
    Returns (scoreDocs, query) for up to 500 hits.
    """
    print '&&&&&&&&&&&&&&&&&&&&&&&&&&'
    print target
    print [comein[1]]
    # kind_fliter decides whether the text is a category ("kind") query
    # (flit[0] truthy) or free text to be segmented and searched.
    flit = kind_fliter.kind_fliter(comein[1].decode('utf8'))
    if (not flit[0]):
        segment = jieba.cut(flit[1])
        command = " ".join(segment).replace("\n","").decode('utf-8')
        #print "Searching for:", command
        query_s = QueryParser(Version.LUCENE_CURRENT, "search", analyzer).parse(command)
    else:
        query_s = QueryParser(Version.LUCENE_CURRENT, "kind", analyzer).parse(flit[1])
    #scoreDocs = searcher.search(query_s, 50).scoreDocs
    #print target
    # Scale coordinates into the pre-scaled integer grid used at index time.
    cal_tar_x = int(target[0]*100000)
    cal_tar_y = int(target[1]*100000)
    #print cal_tar_x
    radius = distance*A*100000
    #print radius
    cal_radius = int(radius)
    # Inclusive bounding box on both axes.
    # NOTE(review): newFloatRange is used with int endpoints — verify the
    # calx/caly fields were indexed as floats, else the filter matches nothing.
    query1 = NumericRangeQuery.newFloatRange("calx",cal_tar_x-cal_radius,cal_tar_x+cal_radius,True,True)
    query2 = NumericRangeQuery.newFloatRange("caly",cal_tar_y-cal_radius,cal_tar_y+cal_radius,True,True)
    query = BooleanQuery()
    query.add(query1, BooleanClause.Occur.MUST)
    query.add(query2, BooleanClause.Occur.MUST)
    query.add(query_s, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(query, 500).scoreDocs
    return scoreDocs,query
def run(searcher, analyzer): while True: querys=BooleanQuery() print print "Hit enter with no input to quit." ## for i in range(searcher.maxDoc()): ## doc=searcher.doc(i) ## print doc.get('singername') command = raw_input("Query:") command = unicode(command, 'GBK') if command == '': return print ## command=" ".join(jieba.cut(command)) print "Searching for:", command query = QueryParser(Version.LUCENE_CURRENT,"singername",analyzer).parse(command) querys.add(query,BooleanClause.Occur.SHOULD) ## query = QueryParser(Version.LUCENE_CURRENT,"albumname",analyzer).parse(command) ## querys.add(query,BooleanClause.Occur.SHOULD) scoreDocs = searcher.search(querys, 3).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'singername:', doc.get("singername"), '\n','singerplace:',doc.get('singerplace'), '\n',\ 'singerintro:',doc.get('singerintro'),'\n','singeralbums:', doc.get("singeralbums"),'\n',\ 'singeralbumURLs:',doc.get('singeralbumURLs'),'\n','singerpicURL:', doc.get("singerpicURL"),'\n'
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") #command = 'Christian author:mark twain title:autobiography language:English' command = unicode(command, 'GBK') if command == '': return print print "Searching for:", command command_dict = parseCommand(command) querys = BooleanQuery() for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) ## explanation = searcher.explain(query, scoreDoc.doc) print "------------------------" print 'path:', doc.get("path") print 'name:', doc.get("name") print 'title:', doc.get('title') print 'author:', doc.get('author') print 'language:', doc.get('language')
def main(cls, argv):
    """Demonstrate result sorting: relevance, index order, field and compound sorts."""
    # Match everything, with an optional preference for "java" or "action" docs.
    query = BooleanQuery()
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
    parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)
    directory = SimpleFSDirectory(File(System.getProperty("index.dir")))
    example = SortingExample(directory)
    # Built-in sorts.
    example.displayResults(query, Sort.RELEVANCE)
    example.displayResults(query, Sort.INDEXORDER)
    # Single-field sorts (pubmonth descending).
    example.displayResults(query, Sort(SortField("category", SortField.STRING)))
    example.displayResults(query, Sort(SortField("pubmonth", SortField.INT, True)))
    # Compound sorts.
    example.displayResults(
        query,
        Sort([SortField("category", SortField.STRING),
              SortField.FIELD_SCORE,
              SortField("pubmonth", SortField.INT, True)]))
    example.displayResults(
        query,
        Sort([SortField.FIELD_SCORE,
              SortField("category", SortField.STRING)]))
    directory.close()
def run(command, pageindex,pagesize): global searcher,analyzer print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 6000).scoreDocs print "%s total matching documents." % len(scoreDocs) start = (pageindex - 1) * pagesize end = start + pagesize res = [] for scoreDoc in scoreDocs[start:end+1]: doc = searcher.doc(scoreDoc.doc) r = [] r.append(doc.get('title')) r.append(doc.get('url')) r.append(doc.get('src')) r.append(doc.get('alt').replace(' ','')) res.append(r) return res,len(scoreDocs)
def run(searcher, analyzer): for i in range(searcher.maxDoc()): doc=searcher.doc(i) print doc.get('songname') while True: querys=BooleanQuery() print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'GBK') if command == '': return print print "Searching for:", command query = QueryParser(Version.LUCENE_CURRENT,"songname",analyzer).parse(command) querys.add(query,BooleanClause.Occur.SHOULD) ## query = QueryParser(Version.LUCENE_CURRENT,"albumname",analyzer).parse(command) ## querys.add(query,BooleanClause.Occur.SHOULD) scoreDocs = searcher.search(querys, 3).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'songname:', doc.get("songname")
def run(command, searcher, analyzer):
    """Segment *command* with jieba, AND all tokens over 'contents'.

    Returns a list of matching Documents (up to 50), or None for an
    empty query.
    """
    command = command.decode('utf-8')
    if command == '':
        return
    conjunction = BooleanQuery()
    # Every jieba token must appear in the 'contents' field.
    for token in jieba.cut(command):
        parsed = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(token)
        conjunction.add(parsed, BooleanClause.Occur.MUST)
    hits = searcher.search(conjunction, 50).scoreDocs
    return [searcher.doc(hit.doc) for hit in hits]
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------------' print 'title:',doc.get('title') print 'url:',doc.get('url') print 'src:',doc.get('src')
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------------' print 'title:', doc.get('title') print 'url:', doc.get('url') print 'src:', doc.get('src')
def testFilterAlternative(self):
    """Constrain allBooks with a required category TermQuery instead of a Filter."""
    category = TermQuery(Term("category", "/philosophy/eastern"))
    combined = BooleanQuery()
    combined.add(self.allBooks, BooleanClause.Occur.MUST)
    combined.add(category, BooleanClause.Occur.MUST)
    hits = self.searcher.search(combined, 50).scoreDocs
    self.assertEqual(1, len(hits), "only tao te ching")
def testAnd(self):
    """AND a subject term with a pubmonth numeric range covering 2004."""
    subject_query = TermQuery(Term("subject", "search"))
    year_2004 = NumericRangeQuery.newIntRange(
        "pubmonth", Integer(200401), Integer(200412), True, True)
    conjunction = BooleanQuery()
    conjunction.add(subject_query, BooleanClause.Occur.MUST)
    conjunction.add(year_2004, BooleanClause.Occur.MUST)
    searcher = self.getSearcher()
    hits = searcher.search(conjunction, 50).scoreDocs
    self.assertHitsIncludeTitle(searcher, hits, "Lucene in Action")
def search_image(command):
    """Search the image index for *command* and return result dicts.

    Each dict carries url, imgurl, urltitle and a highlighted 'contents'
    snippet.  A 'site:' prefix in the query restricts matches by URL.
    Returns [] for the single-space sentinel query.
    """
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    # Split the raw query into per-field text; 'contents' is the default field.
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            query = WildcardQuery(Term(k, '*' + v))
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        # Guard: doc.get() returns None for a missing field.
        text = doc.get("contents") or ''
        # BUGFIX: Analyzer.tokenStream() takes the FIELD NAME as its first
        # argument; the original passed the field's text instead.
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
def testOr(self): methodologyBooks = TermQuery(Term("category", "/technology/computers/programming/methodology")) easternPhilosophyBooks = TermQuery(Term("category", "/philosophy/eastern")) enlightenmentBooks = BooleanQuery() enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD) enlightenmentBooks.add(easternPhilosophyBooks, BooleanClause.Occur.SHOULD) searcher = self.getSearcher() scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs print "or =", enlightenmentBooks self.assertHitsIncludeTitle(searcher, scoreDocs, "Extreme Programming Explained") self.assertHitsIncludeTitle(searcher, scoreDocs, u"Tao Te Ching \u9053\u5FB7\u7D93")
def testAgainstOR(self):
    """A sloppy 'quick fox' phrase OR'ed with an exact 'fast fox' phrase."""
    sloppy_phrase = PhraseQuery()
    sloppy_phrase.setSlop(1)
    for word in ("quick", "fox"):
        sloppy_phrase.add(Term("field", word))
    exact_phrase = PhraseQuery()
    for word in ("fast", "fox"):
        exact_phrase.add(Term("field", word))
    either = BooleanQuery()
    either.add(sloppy_phrase, BooleanClause.Occur.SHOULD)
    either.add(exact_phrase, BooleanClause.Occur.SHOULD)
    topDocs = self.searcher.search(either, 10)
    self.assertEqual(2, topDocs.totalHits)
def run(command,pageindex=1,pagesize=15):
    """Paged lyric/music search.

    Returns ([url, music_name, artist, album_name, lrc, highlighted-lrc,
    musicID] rows, total hit count).  Rows without a highlight fragment
    are skipped, at most 8 rows are returned, and the slice end+10
    over-fetches to compensate for skipped rows.
    NOTE(review): command_dict['content'] raises KeyError if
    parseCommand() produced no 'content' key — confirm callers always
    supply a content part.
    """
    global searcher, analyzer,old_command,old_res_list
    global STORE_DIR,directory,searcher,analyzer
    if command == '':
        return
    print "Searching for:", command
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    # 'site' terms become URL wildcards; other fields go through the parser.
    for k,v in command_dict.iteritems():
        if(k=='site'):
            t = Term('url','*'+v.strip()+'*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    res_list = []
    # Highlighter wraps lyric matches in the custom font tag below.
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>")
    queryToHigh = QueryParser(Version.LUCENE_CURRENT,"lrc",analyzer).parse(command_dict['content'])
    hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex-1)*pagesize
    end = start+pagesize
    print start,end
    for scoreDoc in scoreDocs[start:end+10]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('url'))
        res.append(doc.get('music_name'))
        res.append(doc.get('artist'))
        res.append(doc.get('album_name'))
        res.append(doc.get('lrc'))
        output = hlter.getBestFragment(analyzer,"lrc",clear(doc.get('lrc')))
        res.append(output)
        res.append(doc.get('musicID'))
        # res[5] is the highlight fragment; None means no lyric match.
        if(res[5]!=None):
            res_list.append(res)
        if(len(res_list)==8):
            break
    return res_list,len(scoreDocs)
def run(command, pageindex=1, pagesize=15):
    """Paged web-page search with highlighted content snippets.

    Returns ([title, url, highlighted-contents] rows, total hit count).
    NOTE(review): the slice end+1 yields pagesize+1 rows per page, and
    command_dict['contents'] raises KeyError when the query has only a
    'site' part — confirm both against the calling UI before changing.
    """
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory, searcher, analyzer
    if command == '':
        return
    print "Searching for:", command
    # Example query: 朱莉与茱莉娅 ("Julie & Julia").
    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    # 'site' terms become URL wildcards; other fields go through the parser.
    for k, v in command_dict.iteritems():
        if (k == 'site'):
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>")
    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])
    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end + 1]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): if (k == 'site'): t = Term('url', '*' + v.strip() + '*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 10).scoreDocs print "%s total matching documents." % len(scoreDocs) simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(500)) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------' #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site') print 'title:', doc.get('title'), print 'url:', doc.get('url') ori_text = clear(doc.get('contents')) output = hlter.getBestFragment(analyzer, "contents", ori_text) print output
def testFilteredQuery(self): isbns = ["0854402624"] # Steiner accessor = TestSpecialsAccessor(isbns) filter = SpecialsFilter(accessor) educationBooks = WildcardQuery(Term("category", "*education*")) edBooksOnSpecial = FilteredQuery(educationBooks, filter) logoBooks = TermQuery(Term("subject", "logo")) logoOrEdBooks = BooleanQuery() logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD) logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD) topDocs = self.searcher.search(logoOrEdBooks, 50) print logoOrEdBooks self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): if(k=='site'): t = Term('url','*'+v.strip()+'*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 10).scoreDocs print "%s total matching documents." % len(scoreDocs) simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(500)) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------' #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site') print 'title:',doc.get('title'), print 'url:',doc.get('url') ori_text = clear(doc.get('contents')) output = hlter.getBestFragment(analyzer,"contents",ori_text) print output
def run3(searcher, analyzer,command): querys=BooleanQuery() if command == '': return False query = QueryParser(Version.LUCENE_CURRENT,"singername",analyzer).parse(command) querys.add(query,BooleanClause.Occur.SHOULD) doc=None scoreDocs = searcher.search(querys, 1).scoreDocs if len(scoreDocs)==1: doc = searcher.doc(scoreDocs[0].doc) print 12345,doc.get('singername') if doc.get('singername')!=command: return False else: return False return doc
def run2(searcher, analyzer, target, distance, comein):
    """Text-only variant of the place search (no geo constraint).

    Returns (scoreDocs, query) for up to 500 hits.
    """
    print [comein[1]]
    flit = kind_fliter.kind_fliter(comein[1])
    # flit[0] falsy -> free-text search; truthy -> category ('kind') search.
    if not flit[0]:
        tokens = jieba.cut(flit[1])
        command = " ".join(tokens).replace("\n","")
        query_s = QueryParser(Version.LUCENE_CURRENT, "search", analyzer).parse(command)
    else:
        query_s = QueryParser(Version.LUCENE_CURRENT, "kind", analyzer).parse(flit[1])
    query = BooleanQuery()
    query.add(query_s, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(query, 500).scoreDocs
    return scoreDocs, query
def run3(searcher, analyzer, target, distance, comein):
    """Geo-only search: all documents inside a bounding box around *target*.

    Returns (scoreDocs, query).  NOTE(review): the *distance* argument is
    immediately overwritten with 5000 below, and *analyzer*/*comein* are
    unused — presumably intentional for this variant; confirm with callers.
    """
    #print '&&&&&&&&&&&&&&&&&&&&&&&&&&'
    #print target
    print [comein[1]]
    distance = 5000
    # Scale coordinates to the integer grid used by the calx/caly fields.
    cal_tar_x = int(target[0]*100000)
    cal_tar_y = int(target[1]*100000)
    #print cal_tar_x
    # A is a module-level constant (presumably unit conversion — TODO confirm).
    radius = distance*A*100000
    #print radius
    cal_radius = int(radius)
    # Inclusive bounding box on both axes.
    query1 = NumericRangeQuery.newFloatRange("calx",cal_tar_x-cal_radius,cal_tar_x+cal_radius,True,True)
    query2 = NumericRangeQuery.newFloatRange("caly",cal_tar_y-cal_radius,cal_tar_y+cal_radius,True,True)
    query = BooleanQuery()
    query.add(query1, BooleanClause.Occur.MUST)
    query.add(query2, BooleanClause.Occur.MUST)
    #query.add(query_s, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(query, 500).scoreDocs
    return scoreDocs,query
def testToString(self):
    """BooleanQuery.toString() renders required fuzzy + optional term clauses."""
    query = BooleanQuery()
    fuzzy = FuzzyQuery(Term("field", "kountry"))
    query.add(fuzzy, BooleanClause.Occur.MUST)
    optional = TermQuery(Term("title", "western"))
    query.add(optional, BooleanClause.Occur.SHOULD)
    self.assertEqual("+kountry~0.5 title:western",
                     query.toString("field"), "both kinds")
def run(searcher, analyzer, command, prior):
    """Search questions, dedupe by question number, and sort by *prior*.

    Returns a list of [name, detail, topic, browse, follow, answers, num]
    rows ordered by storesort(); None for an empty query.
    """
    if command == '':
        return
    combined = BooleanQuery()
    for field, text in parseCommand(command).iteritems():
        combined.add(
            QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(text),
            BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(combined, 500000).scoreDocs
    store = []
    seen_nums = []  # question numbers already emitted (dedupe)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        if doc.get("qst_num") in seen_nums:
            continue
        if not doc.get("qst_name"):
            continue
        seen_nums.append(doc.get("qst_num"))
        row = [
            doc.get("qst_name").replace(' ', ''),
            doc.get("qst_detail").replace(' ', ''),
            doc.get("qst_topic_accu"),
            int(doc.get("qst_browse")),
            int(doc.get("qst_follow")),
            int(doc.get("qst_ans")),
            int(doc.get("qst_num")),
        ]
        store.append(row)
    return storesort(store, prior)
def search(self, topic):
    """Pseudo-relevance-feedback search for *topic*.

    Searches once with the parsed topic title, harvests terms from the
    stored title/heading/text fields of the top hits, scores them, and
    re-searches with the original query (heavily boosted) OR'ed with the
    25 best expansion terms.  Returns the TopDocs of the expanded search
    (up to 5000 hits).
    """
    query = self.query_parser.parse(topic.title)
    results = self.searcher.search(query, self.top_n)
    # (field, term) -> ScorePair accumulating counts across the top hits.
    score_pairs = {}
    for hit in results.scoreDocs:
        doc = self.searcher.doc(hit.doc)
        for field in ["title", "heading", "text"]:
            # BUGFIX: doc.get() returns None when the field is absent from
            # a document; the original crashed on .split() in that case.
            stored = doc.get(field)
            terms = stored.split() if stored else []
            for term in terms:
                if (field, term) in score_pairs:
                    score_pairs[(field, term)].increment()
                else:
                    score_pairs[(field, term)] = ScorePair(
                        self.reader, field, term)  # XXX
    # Keep the 25 best-scoring expansion terms; sorted() also works when
    # .values() is a view (Python 3) rather than a list.
    top_terms = sorted(score_pairs.values(),
                       key=lambda x: x.score(), reverse=True)[:25]
    bq = BooleanQuery()
    # Large boost keeps the original query dominant over expansion terms.
    query.setBoost(float(10000000))
    bq.add(query, BooleanClause.Occur.SHOULD)
    for score_pair in top_terms:
        bq.add(TermQuery(score_pair.to_term()), BooleanClause.Occur.SHOULD)
    return self.searcher.search(bq, 5000)
def run(searcher, analyzer):
    """Interactive search that first word-segments the query with the
    ICTCLAS native library, then ANDs the parsed fields.

    NOTE(review): the DLL paths are hard-coded (F:\\ICTCLAS50_Windows_32_C)
    and the library is loaded and initialised on every query — Windows-only
    and expensive per iteration; confirm before reuse.
    """
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if len(command) > 0:
            # Segment the console input via ICTCLAS (GBK in/out), then
            # re-encode the segmented text as UTF-8.
            dll=cdll.LoadLibrary("F:\\ICTCLAS50_Windows_32_C\ICTCLAS50.dll")
            dll.ICTCLAS_Init(c_char_p("F:\\ICTCLAS50_Windows_32_C"))
            strlen = len(c_char_p(command).value)
            # Output buffer: segmentation expands the text (6x margin).
            t =c_buffer(strlen*6)
            bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(command),c_int(strlen),t,c_int(0),0)
            command=t.value.decode('gbk').encode('utf8')
            ##list=t.value.split()
            ##print ' '.join(list)
            dll.ICTCLAS_Exit()
        command=command.decode('utf8')
        if command == '':
            return
        print
        print "Searching for:", command
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        # Every parsed field restricts the result set (MUST clauses).
        for k,v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            ## explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
def search(self, query, category_id=None):
    """Search summary and title fields; optionally filter by category.

    Both fields are parsed with AND as the default operator and combined
    as SHOULD clauses, so a match in either field qualifies.
    """
    SHOULD = BooleanClause.Occur.SHOULD
    summary_parser = QueryParser('summary', self.analyzer)
    title_parser = QueryParser('title', self.analyzer)
    summary_parser.setDefaultOperator(QueryParser.AND_OPERATOR)
    title_parser.setDefaultOperator(QueryParser.AND_OPERATOR)
    combined = BooleanQuery()
    combined.add(summary_parser.parse(query), SHOULD)
    combined.add(title_parser.parse(query), SHOULD)
    if category_id:
        # Route through the category filter when a category is requested.
        self.catfilter.query = query
        self.catfilter.category_id = category_id
        return self.searcher.search(combined, self.catfilter)
    return self.searcher.search(combined)
def run2(searcher, analyzer,command,num): querys=BooleanQuery() if command == '': return False query = QueryParser(Version.LUCENE_CURRENT,"albumnum",analyzer).parse(command) querys.add(query,BooleanClause.Occur.SHOULD) query = QueryParser(Version.LUCENE_CURRENT,"albumname",analyzer).parse(command) querys.add(query,BooleanClause.Occur.SHOULD) doc=None scoreDocs = searcher.search(querys, num).scoreDocs if num>1: return scoreDocs if len(scoreDocs)==1: doc = searcher.doc(scoreDocs[0].doc) print 555,doc.get('albumname') if doc.get('albumname')!=command and doc.get('albumnum')!=command: return False else: return False return doc
def run(searcher, analyzer, command): while True: if command == '': return command_dict = parseCommand(command) querys = BooleanQuery() for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 300).scoreDocs print "%s total matching documents." % len(scoreDocs) text = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) temptext = [ doc.get("url"), doc.get('title'), doc.get("imgurl"), doc.get("price"), doc.get("kind") ] text.append(temptext) return text
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'GBK') if command == '': return command_dict = parseCommand(command) sep_command = " ".join(jieba.cut(command_dict['contents'])) command_dict['contents'] = sep_command #print command_dict if not command_dict.has_key('site'): command = command_dict['contents'] else: command = command_dict['contents'] + " site:" + command_dict['site'] print print "Searching for:", command querys = BooleanQuery() for k, v in command_dict.iteritems(): print k, v query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print "------------------------" #print 'site:', doc.get("site") print 'path:', doc.get("path") print 'title:', doc.get("title") print 'url:', doc.get("url") print 'name:', doc.get("name")
def run(searcher, analyzer, command):
    """Match *command* against artist, name or album; return top 20 scoreDocs.

    Returns False for an empty query.
    """
    if command == '':
        return False
    any_field = BooleanQuery()
    # A hit in any one of the three song fields qualifies.
    for field in ("songartist", "songname", "songalbum"):
        any_field.add(
            QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command),
            BooleanClause.Occur.SHOULD)
    return searcher.search(any_field, 20).scoreDocs
def testAnd(self):
    """Require both the 'search' subject term and a 2004 pubmonth."""
    by_subject = TermQuery(Term("subject", "search"))
    in_2004 = NumericRangeQuery.newIntRange(
        "pubmonth", Integer(200401), Integer(200412), True, True)
    both = BooleanQuery()
    both.add(by_subject, BooleanClause.Occur.MUST)
    both.add(in_2004, BooleanClause.Occur.MUST)
    searcher = IndexSearcher(self.directory, True)
    hits = searcher.search(both, 50).scoreDocs
    self.assertHitsIncludeTitle(searcher, hits, "Lucene in Action")
def testOr(self): methodologyBooks = TermQuery( Term("category", "/technology/computers/programming/methodology")) easternPhilosophyBooks = TermQuery( Term("category", "/philosophy/eastern")) enlightenmentBooks = BooleanQuery() enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD) enlightenmentBooks.add(easternPhilosophyBooks, BooleanClause.Occur.SHOULD) searcher = IndexSearcher(self.directory, True) scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs print "or =", enlightenmentBooks self.assertHitsIncludeTitle(searcher, scoreDocs, "Extreme Programming Explained") self.assertHitsIncludeTitle(searcher, scoreDocs, u"Tao Te Ching \u9053\u5FB7\u7D93")
def docsLike(self, id, doc, max): authors = doc.getValues("author") authorQuery = BooleanQuery() for author in authors: authorQuery.add(TermQuery(Term("author", author)), BooleanClause.Occur.SHOULD) authorQuery.setBoost(2.0) vector = self.reader.getTermFreqVector(id, "subject") subjectQuery = BooleanQuery() for term in vector.getTerms(): tq = TermQuery(Term("subject", term)) subjectQuery.add(tq, BooleanClause.Occur.SHOULD) likeThisQuery = BooleanQuery() likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD) likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD) # exclude myself likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))), BooleanClause.Occur.MUST_NOT) print " Query:", likeThisQuery.toString("contents") scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs docs = [] for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) if len(docs) < max: docs.append(doc) else: break return docs
def searchDocuments(self, view, version, query=None, attribute=None):
    """Search the Lucene index for items visible at *version*.

    query:     optional query string (default: match all documents).
    attribute: if given, restrict hits to that attribute's UUID.
    Returns a lazy iterator of (item UUID, attribute UUID) pairs in
    descending score order; the Lucene transaction and searcher are held
    open for the iterator's lifetime and released in its __del__.
    """
    store = self.store

    if query is None:
        query = MatchAllDocsQuery()
    else:
        query = QueryParser("contents", StandardAnalyzer()).parse(query)

    if attribute:
        # AND the text query with an exact attribute-UUID term.
        combinedQuery = BooleanQuery()
        combinedQuery.add(query, BooleanClause.Occur.MUST)
        combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                          BooleanClause.Occur.MUST)
        query = combinedQuery

    class _collector(PythonHitCollector):
        # Collects (-score, id) so that a min-heap pops best scores first.

        def __init__(_self):
            super(_collector, _self).__init__()
            _self.hits = []

        def collect(_self, id, score):
            _self.hits.append((-score, id))

    class _iterator(object):
        # Defers the search until iteration and owns the searcher/txn.

        def __init__(_self):
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __del__(_self):
            # Best-effort cleanup: close the searcher and end the txn.
            try:
                if _self.searcher is not None:
                    _self.searcher.close()
                store.abortTransaction(view, _self.txnStatus)
            except:
                store.repository.logger.exception("in __del__")
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __iter__(_self):
            _self.txnStatus = store.startTransaction(view)
            _self.searcher = searcher = self.getIndexSearcher()
            _self.collector = _collector()
            searcher.search(query, _self.collector)
            hits = _self.collector.hits

            if hits:
                heapify(hits)
                while hits:
                    score, id = heappop(hits)
                    doc = searcher.doc(id)
                    uItem = UUID(doc['item'])
                    # Skip entries newer than the requested version or
                    # whose value is no longer current for the item.
                    if long(doc['version']) <= version:
                        if store._items.isValue(view, version, uItem,
                                                UUID(doc['value'])):
                            yield uItem, UUID(doc['attribute'])

    return _iterator()
def searchDocuments(self, view, version, query=None, attribute=None):
    """Find (item UUID, attribute UUID) pairs matching *query* at *version*.

    When *query* is None every document matches; when *attribute* is
    given, only that attribute's UUID qualifies.  The result is a lazy
    iterator ordered by descending score; it keeps the Lucene searcher
    and store transaction alive until garbage-collected.
    """
    store = self.store

    if query is None:
        query = MatchAllDocsQuery()
    else:
        query = QueryParser("contents", StandardAnalyzer()).parse(query)

    if attribute:
        # Require both the text query and the attribute-UUID term.
        combinedQuery = BooleanQuery()
        combinedQuery.add(query, BooleanClause.Occur.MUST)
        combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                          BooleanClause.Occur.MUST)
        query = combinedQuery

    class _collector(PythonHitCollector):
        # Accumulates (-score, id) pairs; negation turns the min-heap
        # used below into a best-score-first queue.

        def __init__(_self):
            super(_collector, _self).__init__()
            _self.hits=[]

        def collect(_self, id, score):
            _self.hits.append((-score, id))

    class _iterator(object):
        # Runs the search lazily on first iteration; owns searcher + txn.

        def __init__(_self):
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __del__(_self):
            # Best-effort cleanup; exceptions are logged, never raised.
            try:
                if _self.searcher is not None:
                    _self.searcher.close()
                store.abortTransaction(view, _self.txnStatus)
            except:
                store.repository.logger.exception("in __del__")
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __iter__(_self):
            _self.txnStatus = store.startTransaction(view)
            _self.searcher = searcher = self.getIndexSearcher()
            _self.collector = _collector()
            searcher.search(query, _self.collector)
            hits = _self.collector.hits

            if hits:
                heapify(hits)
                while hits:
                    score, id = heappop(hits)
                    doc = searcher.doc(id)
                    uItem = UUID(doc['item'])
                    # Only yield entries visible at the requested version
                    # whose stored value is still current for the item.
                    if long(doc['version']) <= version:
                        if store._items.isValue(view, version, uItem,
                                                UUID(doc['value'])):
                            yield uItem, UUID(doc['attribute'])

    return _iterator()
def getBaselineStatistics(searcher, analyzer):
    """Collect per-day tweet statistics (total, emoticon, http, US, JP)
    for 1830 days starting 2005-01-01 and dump them as text and JSON.

    NOTE(review): day_ctr is incremented *before* it is used as the hash
    key, so day N's counts are stored under key N+1 (keys 1..1830) —
    confirm downstream consumers expect this offset.
    """
    baseline_stats_hash = {}
    day_one = time.strptime("01 01 2005", "%d %m %Y")
    day_one_ts = int(time.mktime(day_one))
    max_day_ctr = 1830
    day_ctr = 0
    while day_ctr < max_day_ctr:
        if day_ctr%100 == 0:
            print "on day ctr: ", day_ctr, " at time: ", time.time()
        # One-day timestamp window (86400 s per day), inclusive bounds.
        curr_day_ts = day_one_ts + 86400*day_ctr
        next_day_ts = day_one_ts + 86400*(day_ctr+1)
        day_ctr+=1
        range_filter = NumericRangeFilter.newIntRange("timestamp", Integer(curr_day_ts), Integer(next_day_ts), True, True)
        #all tweets in day range
        all_docs_query = MatchAllDocsQuery()
        tweets_in_range_search = searcher.search(all_docs_query, range_filter)
        num_tweets_in_range = tweets_in_range_search.length()
        #all tweets in day range US
        US_tweets_base_query = MatchAllDocsQuery()
        #us_escape_one = QueryParser("country", analyzer).escape("United")
        #us_escape_two =
        us_query = TermQuery(Term("country", "United States"))
        #us_query.add(Term("country","United"))
        #us_query.add(Term("country","States"))
        US_tweets_country_query = us_query
        #US_tweets_country_query = QueryParser("country", analyzer).parse(us_query)
        US_tweets_query_filter = QueryFilter(US_tweets_country_query)
        compound_filter_US_tweets = BooleanFilter()
        compound_filter_US_tweets.add(FilterClause(range_filter, BooleanClause.Occur.MUST))
        compound_filter_US_tweets.add(FilterClause(US_tweets_query_filter, BooleanClause.Occur.MUST))
        US_tweets_in_range_search = searcher.search(US_tweets_base_query, compound_filter_US_tweets)
        num_US_tweets_in_range = US_tweets_in_range_search.length()
        #all tweets in day range japan
        JP_tweets_base_query = MatchAllDocsQuery()
        JP_tweets_country_query = QueryParser("country", analyzer).parse("Japan")
        JP_tweets_query_filter = QueryFilter(JP_tweets_country_query)
        compound_filter_JP_tweets = BooleanFilter()
        compound_filter_JP_tweets.add(FilterClause(range_filter, BooleanClause.Occur.MUST))
        compound_filter_JP_tweets.add(FilterClause(JP_tweets_query_filter, BooleanClause.Occur.MUST))
        JP_tweets_in_range_search = searcher.search(JP_tweets_base_query, compound_filter_JP_tweets)
        num_JP_tweets_in_range = JP_tweets_in_range_search.length()
        #day_ctr%10 == 0:
        print "US tweets: ", num_US_tweets_in_range, " JP tweets: ", num_JP_tweets_in_range
        #all tweets containing emoticons
        empty_term = Term("emoticons")
        empty_term_prefix = PrefixQuery(empty_term)
        all_emoticons_docs_query_filter = QueryFilter(empty_term_prefix)
        compound_filter = BooleanFilter()
        compound_filter.add(FilterClause(range_filter, BooleanClause.Occur.MUST))
        compound_filter.add(FilterClause(all_emoticons_docs_query_filter, BooleanClause.Occur.MUST))
        emoticon_tweets_in_range_search = searcher.search(all_docs_query, compound_filter)
        num_emoticon_tweets_in_range = emoticon_tweets_in_range_search.length()
        #all tweets containing "http" or "https"
        bq = BooleanQuery()
        http_str = QueryParser.escape("http://")
        http_query = QueryParser("emoticons", analyzer).parse(http_str)
        https_str = QueryParser.escape("https://")
        https_query = QueryParser("emoticons", analyzer).parse(https_str)
        bq.add(http_query, BooleanClause.Occur.SHOULD)
        bq.add(https_query, BooleanClause.Occur.SHOULD)
        bq_search = searcher.search(bq, range_filter)
        num_http_emoticons = bq_search.length()
        baseline_stats_hash[day_ctr] = {'total tweets':num_tweets_in_range, 'emoticons':num_emoticon_tweets_in_range, 'http':num_http_emoticons, 'US tweets':num_US_tweets_in_range, \
                                        'JP tweets':num_JP_tweets_in_range}
    # Dump a whitespace-separated text table sorted by day number...
    baseline_stats_text_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_stats.txt","w")
    raw_stats_list = sorted(baseline_stats_hash.items(), key = lambda x: int(x[0]))
    baseline_stats_text_file.write("day total emoticons http US JP\n")
    for rs in raw_stats_list:
        baseline_stats_text_file.write("%s %s %s %s %s %s\n" %(rs[0], rs[1]["total tweets"], rs[1]["emoticons"], rs[1]["http"], rs[1]['US tweets'], \
                                                               rs[1]['JP tweets']))
    baseline_stats_text_file.close()
    # ...and the same data as JSON.
    baseline_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_stats.json","w")
    baseline_stats_file.write(json.dumps(baseline_stats_hash))
    baseline_stats_file.close()