def testLowercasing(self):
    """Expanded terms are lowercased by default; the parser flag disables it."""
    parser = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    parsed = parser.parse("PrefixQuery*")
    self.assertEqual("prefixquery*", parsed.toString("field"), "lowercased")

    parser = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    parser.setLowercaseExpandedTerms(False)
    parsed = parser.parse("PrefixQuery*")
    self.assertEqual("PrefixQuery*", parsed.toString("field"), "not lowercased")
def testPhraseQuery(self):
    """Phrase analysis drops stop words; a one-term phrase collapses to TermQuery."""
    legacy = StandardAnalyzer(Version.LUCENE_24)
    parsed = QueryParser(Version.LUCENE_24, "field",
                         legacy).parse('"This is Some Phrase*"')
    self.assertEqual("\"some phrase\"", parsed.toString("field"), "analyzed")

    parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                         self.analyzer).parse('"term"')
    self.assert_(TermQuery.instance_(parsed), "reduced to TermQuery")
def testSlop(self):
    """Phrase slop defaults to zero and can be set on the parser."""
    parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                         self.analyzer).parse('"exact phrase"')
    self.assertEqual("\"exact phrase\"", parsed.toString("field"), "zero slop")

    parser = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    parser.setPhraseSlop(5)
    parsed = parser.parse('"sloppy phrase"')
    self.assertEqual("\"sloppy phrase\"~5", parsed.toString("field"),
                     "sloppy, implicitly")
def testWithQueryParser(self):
    """Phrase search through the synonym analyzer and a plain StandardAnalyzer."""
    parsed = QueryParser(Version.LUCENE_CURRENT, "content",
                         self.synonymAnalyzer).parse('"fox jumps"')
    topDocs = self.searcher.search(parsed, 50)
    # in Lucene 1.9, position increments are no longer ignored
    self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

    parsed = QueryParser(Version.LUCENE_CURRENT, "content",
                         StandardAnalyzer(Version.LUCENE_CURRENT)).parse('"fox jumps"')
    topDocs = self.searcher.search(parsed, 50)
    self.assertEqual(1, topDocs.totalHits, "*whew*")
def testStems(self):
    """Porter-stemmed query terms still match the indexed documents."""
    searcher = IndexSearcher(self.directory)

    stemmed = QueryParser(Version.LUCENE_CURRENT, "contents",
                          self.porterAnalyzer).parse("laziness")
    topDocs = searcher.search(stemmed, 50)
    self.assertEqual(1, topDocs.totalHits, "lazi")

    phrase = QueryParser(Version.LUCENE_CURRENT, "contents",
                         self.porterAnalyzer).parse('"fox jumped"')
    topDocs = searcher.search(phrase, 50)
    self.assertEqual(1, topDocs.totalHits, "jump jumps jumped jumping")
def testTermRangeQuery(self):
    """Range syntax yields a TermRangeQuery; curly braces exclude the endpoints."""
    inclusive = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:[K TO N]")
    self.assert_(TermRangeQuery.instance_(inclusive))
    scoreDocs = self.searcher.search(inclusive, 10).scoreDocs
    self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms")

    exclusive = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:{K TO Mindstorms}")
    scoreDocs = self.searcher.search(exclusive, 10).scoreDocs
    self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms", True)
def testQueryParser(self):
    """Boolean operator syntax: required/prohibited terms, then OR."""
    searcher = IndexSearcher(self.directory, True)

    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         SimpleAnalyzer()).parse("+JUNIT +ANT -MOCK")
    scoreDocs = searcher.search(parsed, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
    doc = searcher.doc(scoreDocs[0].doc)
    self.assertEqual("Java Development with Ant", doc.get("title"))

    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         SimpleAnalyzer()).parse("mock OR junit")
    scoreDocs = searcher.search(parsed, 50).scoreDocs
    self.assertEqual(2, len(scoreDocs), "JDwA and JIA")
def __init__(self, location):
    """Open a read-only searcher and a whitespace query parser over *location*."""
    lucene.initVM()
    store = SimpleFSDirectory(File(location))
    self.reader = IndexReader.open(store, True)
    self.searcher = IndexSearcher(self.reader)
    self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                    WhitespaceAnalyzer())
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------------' print 'title:', doc.get('title') print 'url:', doc.get('url') print 'src:', doc.get('src')
def luceneRetriver(query): lucene.initVM() indir = SimpleFSDirectory(File(INDEXDIR)) lucene_analyzer = StandardAnalyzer(Version.LUCENE_30) lucene_searcher = IndexSearcher(indir) my_query= QueryParser(Version.LUCENE_30,"text",\ lucene_analyzer).parse(query) MAX = 1000 total_hits = lucene_searcher.search(my_query, MAX) print "Hits: ", total_hits.totalHits for hit in total_hits.scoreDocs: print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString( ) doc = lucene_searcher.doc(hit.doc) print doc.get("text").encode("utf-8")
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") #command = 'Christian author:mark twain title:autobiography language:English' command = unicode(command, 'GBK') if command == '': return print print "Searching for:", command command_dict = parseCommand(command) querys = BooleanQuery() for k, v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) ## explanation = searcher.explain(query, scoreDoc.doc) print "------------------------" print 'path:', doc.get("path") print 'name:', doc.get("name") print 'title:', doc.get('title') print 'author:', doc.get('author') print 'language:', doc.get('language')
def retrieveDocs(q):
    """Search the on-disk index for *q*, print every hit, and collect results.

    Returns a 3-tuple:
      docsToScores -- maps html-file numbers (int) to Lucene scores
      rQ           -- html-file numbers of all hits, in hit order
      nonDiverse   -- URLs of the first 10 hits

    NOTE(review): relies on module-level globals `new_urls` (doc id -> URL)
    and `inv_map` (URL -> html-file number) — confirm against callers.
    """
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        # NOTE(review): `result` is built but never used — dead code kept as-is
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and append
        # to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
def run(command, pageindex,pagesize): global searcher,analyzer print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 6000).scoreDocs print "%s total matching documents." % len(scoreDocs) start = (pageindex - 1) * pagesize end = start + pagesize res = [] for scoreDoc in scoreDocs[start:end+1]: doc = searcher.doc(scoreDoc.doc) r = [] r.append(doc.get('title')) r.append(doc.get('url')) r.append(doc.get('src')) r.append(doc.get('alt').replace(' ','')) res.append(r) return res,len(scoreDocs)
def run(searcher, analyzer):
    """Interactive loop: read a GBK query, segment it with jieba, search 'contents'."""
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")  # read the query string
        # Convert the query to Unicode (the index was also built from Unicode).
        # The console encoding is GBK under Python IDLE and UTF-8 under
        # PyScripter-Portable (see Q.ppt).
        command = unicode(command, 'GBK')
        if command == '':
            return
        # word-segment the query so terms match the segmented index
        command = " ".join(jieba.cut(command))
        print
        print "Searching for:", command
        # The analyzer tokenizes/normalizes the query text; QueryParser then
        # builds the query syntax tree (a Query object) from it.
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        # IndexSearcher evaluates the query tree and collects the matches.
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs), '\n'
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
def testPrefixQuery(self): parser = QueryParser(Version.LUCENE_CURRENT, "category", StandardAnalyzer(Version.LUCENE_CURRENT)) parser.setLowercaseExpandedTerms(False) print parser.parse("/Computers/technology*").toString("category")
def post(self): q= self.get_argument("query") # self.write(key) # def query(query): # query = self.get_argument("q") lucene.initVM() indexDir = "index" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_30) searcher = IndexSearcher(dir) query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q) MAX = 10 hits = searcher.search(query, MAX) print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query) items = [] rQ = [] #for key, value in doc_urls.iteritems() # print (key, value) for hit in hits.scoreDocs: #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]}) print hit.score, hit.doc, hit.toString() print(len(doc_urls)) items.append(doc_urls[str(hit.doc)]) doc = searcher.doc(hit.doc) print(hit.doc) self.render("index.html", title="Results", items=items, query=q)
def run(searcher, analyzer):
    """Interactive loop: AND-combine query fields, highlight 'contents' matches.

    A ``site:`` field becomes a wildcard match on the 'url' field; every
    other field goes through the regular QueryParser.

    NOTE(review): the highlighter parses command_dict['contents'] directly,
    so a query with no 'contents' field would raise KeyError — confirm that
    parseCommand always supplies one.
    """
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return
        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if (k == 'site'):
                # site: filter — wildcard match anywhere in the URL
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        # highlight matched terms in red within a 500-char fragment
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>")
        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])
        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
def testExactPhrase(self):
    """A phrase containing the stop word 'the' has no exact match in the index."""
    searcher = IndexSearcher(self.directory, True)
    phrase = QueryParser(Version.LUCENE_24, "contents",
                         self.porterAnalyzer).parse('"over the lazy"')
    results = searcher.search(phrase, 50)
    self.assertEqual(0, results.totalHits, "exact match not found!")
def testWildcard(self):
    """A leading wildcard must be rejected by the parser.

    FIX: the original wrapped ``self.fail`` inside the ``try`` with a bare
    ``except:``, which swallowed the AssertionError raised by ``fail`` —
    the test could never fail.  The try/except/else form keeps ``fail``
    outside the handled region (same pattern as testParseException).
    """
    try:
        QueryParser(Version.LUCENE_CURRENT, "field",
                    self.analyzer).parse("*xyz")
    except:
        # parse rejected the leading wildcard, as expected
        self.assert_(True)
    else:
        self.fail("Leading wildcard character should not be allowed")
def run(command, pageindex=1, pagesize=15): global searcher, analyzer, old_command, old_res_list global STORE_DIR, directory, searcher, analyzer if command == '': return print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): if (k == 'site'): t = Term('url', '*' + v.strip() + '*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 4000).scoreDocs print "%s total matching documents." % len(scoreDocs) res_list = [] simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(200)) start = (pageindex - 1) * pagesize end = start + pagesize for scoreDoc in scoreDocs[start:end + 1]: doc = searcher.doc(scoreDoc.doc) res = [] res.append(doc.get('title')) res.append(doc.get('url')) output = hlter.getBestFragment(analyzer, "contents", clear(doc.get('contents'))) res.append(output) res_list.append(res) return res_list, len(scoreDocs)
def main(cls): query = QueryParser(Version.LUCENE_CURRENT, "content", cls.synonymAnalyzer).parse('"fox jumps"') print "\"fox jumps\" parses to ", query.toString("content") print "From AnalyzerUtils.tokensFromAnalysis: " AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"") print ''
def query(self, title):
    """Return title and content of the first hit for *title*, or "None"."""
    self._th.attachCurrentThread()
    searcher = IndexSearcher(self._dir)
    parsed = QueryParser(Version.LUCENE_30, "title",
                         self._analyzer).parse(title)
    total_hits = searcher.search(parsed, 10)
    for hit in total_hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        # only the first hit is ever returned
        return doc.get("title") + "\n" + doc.get("content") + "--------------------------------"
    return "None"
def extractFeatureQueryWords(query): import string from lucene import Document, TermQuery, Term # create analyzer aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) try: file = open('../features.txt', 'r') featurelist = [] for line in file.readlines(): words_in_line = line.split() featurelist += words_in_line querywordlist = query.split() featureQueryList = [] productQueryList = [] for word in querywordlist: if word in featurelist: featureQueryList.append(word) else: # create parser for word aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer) aux_query = aux_parser.parse(word) scoreDocs = searcher.search(aux_query, 50).scoreDocs if scoreDocs: productQueryList.append(word) featureQuery = "" if featureQueryList: featureQuery = "(" for i in range(len(featureQueryList)): if i == len(featureQueryList) - 1: featureQuery += featureQueryList[i] + ")" else: featureQuery += featureQueryList[i] + " AND " print featureQuery productQuery = "" if productQueryList: productQuery = "(" for i in range(len(productQueryList)): if i == len(productQueryList) - 1: productQuery += productQueryList[i] + ")" else: productQuery += productQueryList[i] + " AND " return (featureQuery, productQuery, featureQueryList, productQueryList) except Exception, ex: print "Could not separate feature query words. Reason: ", ex return ("", "(" + query + ")", [], querywordlist)
def testBasicQueryParser(self):
    """SimpleAnalyzer lowercases 'Q36', so the exact part number can't match."""
    analyzer = SimpleAnalyzer()
    parsed = QueryParser(Version.LUCENE_CURRENT, "description",
                         analyzer).parse("partnum:Q36 AND SPACE")
    scoreDocs = self.searcher.search(parsed, 50).scoreDocs
    self.assertEqual("+partnum:q +space", parsed.toString("description"),
                     "note Q36 -> q")
    self.assertEqual(0, len(scoreDocs), "doc not found :(")
def testParseException(self):
    """An unparsable expression must raise ParseException."""
    try:
        QueryParser(Version.LUCENE_CURRENT, "contents",
                    self.analyzer).parse("^&#")
    except:
        # the expression is invalid and was rejected, as expected
        self.assert_(True)
    else:
        self.fail("ParseException expected, but not thrown")
def testPerFieldAnalyzer(self):
    """A KeywordAnalyzer on 'partnum' keeps Q36 intact, so the doc is found."""
    analyzer = PerFieldAnalyzerWrapper(SimpleAnalyzer())
    analyzer.addAnalyzer("partnum", KeywordAnalyzer())
    parsed = QueryParser(Version.LUCENE_CURRENT, "description",
                         analyzer).parse("partnum:Q36 AND SPACE")
    hits = self.searcher.search(parsed, 50).scoreDocs
    self.assertEqual(1, len(hits), "doc found!")
def getCrowds(self, query, field=CrowdFields.text):
    """Return the ids of crowds whose *field* matches *query*."""
    searcher = IndexSearcher(self.index, True)
    parsed = QueryParser(Version.LUCENE_CURRENT, field,
                         self.analyzer).parse(query)
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(parsed, collector)
    ids = []
    for scoreDoc in collector.topDocs().scoreDocs:
        ids.append(searcher.doc(scoreDoc.doc).get(CrowdFields.id))
    return ids
def testGrouping(self):
    """A parenthesized OR group combined with AND matches both titles."""
    parsed = QueryParser(Version.LUCENE_CURRENT, "subject",
                         self.analyzer).parse("(agile OR extreme) AND methodology")
    scoreDocs = self.searcher.search(parsed, 50).scoreDocs
    self.assertHitsIncludeTitle(self.searcher, scoreDocs,
                                "Extreme Programming Explained")
    self.assertHitsIncludeTitle(self.searcher, scoreDocs,
                                "The Pragmatic Programmer")
def testAnalyzer(self):
    """Path-like terms: StandardAnalyzer splits them, WhitespaceAnalyzer doesn't."""
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryString = "category:/philosophy/eastern"

    parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
    parser.setAutoGeneratePhraseQueries(True)
    parsed = parser.parse(queryString)
    self.assertEqual("category:\"philosophy eastern\"",
                     parsed.toString("contents"), "path got split, yikes!")

    perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
    perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         perFieldAnalyzer).parse(queryString)
    self.assertEqual("category:/philosophy/eastern",
                     parsed.toString("contents"), "leave category field alone")
def TotalSearch(self, keyWord): try: searcher = IndexSearcher(self.indexDir) keyWord = keyWord.encode('utf8') query = QueryParser(Version.LUCENE_30, "title", self.analyzer).parse(keyWord) hits = searcher.search(query, 1000) return self.__MakeResultFormat(hits, searcher) except Exception, err: sys.stderr.write("ERROR: %s\n" % str(err))