def testSpanNearQuery(self):
    quick_brown_dog = [self.quick, self.brown, self.dog]
    snq = SpanNearQuery(quick_brown_dog, 0, True)
    self.assertNoMatches(snq)
    self.dumpSpans(snq)

    snq = SpanNearQuery(quick_brown_dog, 4, True)
    self.assertNoMatches(snq)
    self.dumpSpans(snq)

    snq = SpanNearQuery(quick_brown_dog, 5, True)
    self.assertOnlyBrownFox(snq)
    self.dumpSpans(snq)

    # interesting - even a sloppy phrase query would require
    # more slop to match
    snq = SpanNearQuery([self.lazy, self.fox], 3, False)
    self.assertOnlyBrownFox(snq)
    self.dumpSpans(snq)

    pq = PhraseQuery()
    pq.add(Term("f", "lazy"))
    pq.add(Term("f", "fox"))
    pq.setSlop(4)
    self.assertNoMatches(pq)

    pq.setSlop(5)
    self.assertOnlyBrownFox(pq)
def testToString(self):
    query = BooleanQuery()
    query.add(FuzzyQuery(Term("field", "kountry")),
              BooleanClause.Occur.MUST)
    query.add(TermQuery(Term("title", "western")),
              BooleanClause.Occur.SHOULD)

    self.assertEqual("+kountry~0.5 title:western",
                     query.toString("field"), "both kinds")
def testSearchByAPI(self):
    tq = TermQuery(Term("content", "hops"))
    topDocs = self.searcher.search(tq, 50)
    self.assertEqual(1, topDocs.totalHits)

    pq = PhraseQuery()
    pq.add(Term("content", "fox"))
    pq.add(Term("content", "hops"))
    topDocs = self.searcher.search(pq, 50)
    self.assertEqual(1, topDocs.totalHits)
def testTerm(self):
    searcher = IndexSearcher(self.directory, True)

    t = Term("subject", "ant")
    query = TermQuery(t)
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "JDwA")

    t = Term("subject", "junit")
    scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
    self.assertEqual(2, len(scoreDocs))

    searcher.close()
def testBasic(self):
    query = MultiPhraseQuery()
    query.add([Term("field", "quick"),
               Term("field", "fast")])
    query.add(Term("field", "fox"))
    print query

    topDocs = self.searcher.search(query, 10)
    self.assertEqual(1, topDocs.totalHits, "fast fox match")

    query.setSlop(1)
    topDocs = self.searcher.search(query, 10)
    self.assertEqual(2, topDocs.totalHits, "both match")
def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))
def testSecurityFilter(self):
    query = TermQuery(Term("keywords", "info"))

    searcher = IndexSearcher(self.directory, True)
    topDocs = searcher.search(query, 50)
    self.assertEqual(2, topDocs.totalHits, "Both documents match")

    jakeFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))

    scoreDocs = searcher.search(query, jakeFilter, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
    self.assertEqual("jakes sensitive info",
                     searcher.doc(scoreDocs[0].doc).get("keywords"),
                     "elwood is safe")
def testKeyword(self):
    searcher = IndexSearcher(self.directory, True)

    t = Term("isbn", "1930110995")
    query = TermQuery(t)
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "JUnit in Action")
def testChinese(self):
    searcher = IndexSearcher(self.directory, True)

    query = TermQuery(Term("contents", "道"))
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "tao")
def __init__(self, reader, field, term):
    self.count = 1
    # squared inverse document frequency, with docFreq + 1 in the
    # denominator so a term absent from the index cannot divide by zero
    self.idf = (1 + math.log(
        reader.numDocs() / (float(reader.docFreq(Term(field, term))) + 1))) ** 2
    self.field = field
    self.term = term
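# A minimal worked example of what the weight above evaluates to, under
# assumed numbers (not taken from any index here): with 1000 documents
# and a term occurring in 9 of them,
#
#   >>> import math
#   >>> (1 + math.log(1000 / (9.0 + 1))) ** 2
#   31.41...
#
# Squaring the classic 1 + ln(N / (df + 1)) term mirrors how Lucene's
# tf-idf scoring effectively applies idf on both the query and the
# document side.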
def _searchIndex(self, searcher, row, colname, id_col):
    """
    Searches text in the lucene index. Iterates over inputDF row by
    row and collects the matched candidates with a match score.
    By default a WhitespaceAnalyzer is used; other analyzers are
    also available.
    """
    text = row[colname]
    idd = row[id_col]

    # search word by word for a text containing multiple keywords
    words = text.split()
    results = []
    seen = set()
    for word in words:
        query = FuzzyQuery(Term("field", word))
        scoreDocs = searcher.search(query, 50).scoreDocs

        # matches: value + score, read directly from each ScoreDoc
        # instead of parsing the string representation of the hit list
        for scoreDoc in scoreDocs:
            score = scoreDoc.score
            value = searcher.doc(scoreDoc.doc).get("field")
            if value in seen:
                continue
            seen.add(value)
            results.append((idd, text, value))

    return results
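# A hypothetical usage sketch (the row keys 'doc_id' and 'name' are
# illustrative, not part of the code above): each row is a mapping with
# an id column and a text column, and the hits come back as
# (id, text, matched_value) tuples.
#
#   row = {'doc_id': 42, 'name': 'lucene in action'}
#   matches = self._searchIndex(searcher, row, 'name', 'doc_id')
#   for idd, text, value in matches:
#       print idd, text, '->', value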
def purgeDocuments(self, txn, counter, indexSearcher, indexReader,
                   uItem, toVersion=None):
    term = Term("item", uItem.str64())

    if toVersion is None:
        counter.documentCount += indexReader.deleteDocuments(term)
    else:
        x, keep = self.store._items.findValues(None, toVersion, uItem,
                                               None, True)
        keep = set(keep)

        for hit in indexSearcher.search(TermQuery(term)):
            hit = Hit.cast_(hit)
            doc = hit.getDocument()
            ver = long(doc['version'])
            if ver <= toVersion and UUID(doc['value']) not in keep:
                indexReader.deleteDocument(hit.getId())
                counter.documentCount += 1
def searchWithDrillDown(cls, indexReader, taxoReader):
    """
    Search an index with facets drill-down.
    returns a List<FacetResult>
    """
    # base query the user is interested in
    baseQuery = TermQuery(Term(TEXT, "white"))

    # facet of interest
    facetRequest = CountFacetRequest(createCategoryPath(["root", "a"]), 10)

    # initial search - all docs matching the base query will contribute
    # to the accumulation
    res1 = cls.searchWithRequest(indexReader, taxoReader, None, facetRequest)

    # a single result (because there was a single request)
    fres = res1.get(0)

    # assume the user is interested in the second sub-result
    # (just take the second sub-result returned by the iterator -
    #  we know there are 3 results!)
    subResults = fres.getFacetResultNode().getSubResults()

    # NOTE: getSubResults() yields an Iterable<? extends FacetResultNode>;
    # the elements of this iterator are of type Object and need to be
    # cast to FacetResultNode by calling FacetResultNode.cast_(obj) first
    resIterator = subResults.iterator()
    resIterator.next()  # skip the first result
    resultNode = FacetResultNode.cast_(resIterator.next())
    categoryOfInterest = resultNode.getLabel()

    # drill-down preparation: turn the base query into a drill-down query
    # for the category of interest
    query2 = DrillDown.query(baseQuery, [categoryOfInterest])

    # that's it - search with the new query and we're done!
    # only documents both matching the base query AND containing the
    # category of interest will contribute to the new accumulation
    return cls.searchWithRequestAndQuery(query2, indexReader, taxoReader,
                                         None, facetRequest)
def testAgainstOR(self):
    quickFox = PhraseQuery()
    quickFox.setSlop(1)
    quickFox.add(Term("field", "quick"))
    quickFox.add(Term("field", "fox"))

    fastFox = PhraseQuery()
    fastFox.add(Term("field", "fast"))
    fastFox.add(Term("field", "fox"))

    query = BooleanQuery()
    query.add(quickFox, BooleanClause.Occur.SHOULD)
    query.add(fastFox, BooleanClause.Occur.SHOULD)

    topDocs = self.searcher.search(query, 10)
    self.assertEqual(2, topDocs.totalHits)
def remove(self, id):
    print "removing index for", id
    spec.remove(id)

    # delete old versions
    WorkflowIndexer.writer.deleteDocuments([Term('workflow_id', id)])

    print "done removing", id
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1",
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia",
                  Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def main(cls, argv):
    query = TermQuery(Term("f", "ipsum"))
    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">",
                                    "</span>")
    highlighter = Highlighter(formatter, scorer)

    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
    result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

    stdout.write("<html>")
    stdout.write("<style>\n")
    stdout.write(".highlight {\n")
    stdout.write(" background: yellow\n")
    stdout.write("}\n")
    stdout.write("</style>")
    stdout.write("<body>")
    stdout.write(result)
    stdout.write("</body></html>\n")
    stdout.flush()
def undoDocuments(self, indexSearcher, indexReader, uItem, version):
    term = Term("item", uItem.str64())

    for hit in indexSearcher.search(TermQuery(term)):
        hit = Hit.cast_(hit)
        if long(hit.getDocument()['version']) == version:
            indexReader.deleteDocument(hit.getId())
def main(cls, argv):
    if len(argv) != 2:
        print "Usage: BerkeleyDbSearcher <index dir>"
        return

    dbHome = argv[1]

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)

    env.open(dbHome, (DB_THREAD | DB_INIT_MPOOL |
                      DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        searcher = IndexSearcher(directory, True)

        topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
        print topDocs.totalHits, "document(s) found"
        searcher.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.abort()

    index.close()
    blocks.close()
    env.close()
def getHitCount(self, fieldName, searchString):
    searcher = IndexSearcher(self.dir, True)
    t = Term(fieldName, searchString)
    query = TermQuery(t)
    hitCount = len(searcher.search(query, 50).scoreDocs)
    searcher.close()
    return hitCount
def searchWithRequest(cls, indexReader, taxoReader, indexingParams,
                      facetRequest):
    """
    Search an index with facets for given facet requests.
    returns a List<FacetResult>
    """
    query = TermQuery(Term(TEXT, "white"))
    return cls.searchWithRequestAndQuery(query, indexReader, taxoReader,
                                         indexingParams, facetRequest)
def addCrowd(self, id, text):
    doc = Document()
    doc.add(Field(CrowdFields.id, id,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field(CrowdFields.text, text,
                  Field.Store.YES, Field.Index.ANALYZED))
    self.writer.updateDocument(Term(CrowdFields.id, id), doc)
def testFilteredQuery(self):
    isbns = ["0854402624"]  # Steiner

    accessor = TestSpecialsAccessor(isbns)
    filter = SpecialsFilter(accessor)

    educationBooks = WildcardQuery(Term("category", "*education*"))
    edBooksOnSpecial = FilteredQuery(educationBooks, filter)

    logoBooks = TermQuery(Term("subject", "logo"))

    logoOrEdBooks = BooleanQuery()
    logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD)
    logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD)

    topDocs = self.searcher.search(logoOrEdBooks, 50)
    print logoOrEdBooks
    self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
def matched(self, phrase, slop):
    query = PhraseQuery()
    query.setSlop(slop)

    for word in phrase:
        query.add(Term("field", word))

    topDocs = self.searcher.search(query, 50)
    return topDocs.totalHits > 0
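# A minimal usage sketch, assuming an index whose 'field' contains the
# text "the quick brown fox jumped over the lazy dog": in-order terms
# separated by one word need slop 1, while reversing two such terms
# costs extra edit moves and so needs more slop.
#
#   self.assert_(self.matched(["quick", "fox"], 1))
#   self.assert_(not self.matched(["fox", "quick"], 1))
#   self.assert_(self.matched(["fox", "quick"], 3))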
def getSynonyms(self, word):
    synList = []

    topDocs = self.searcher.search(TermQuery(Term("word", word)), 50)
    for scoreDoc in topDocs.scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        for value in doc.getValues("syn"):
            synList.append(value)

    return synList
def testOr(self):
    methodologyBooks = TermQuery(
        Term("category", "/technology/computers/programming/methodology"))
    easternPhilosophyBooks = TermQuery(
        Term("category", "/philosophy/eastern"))

    enlightenmentBooks = BooleanQuery()
    enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD)
    enlightenmentBooks.add(easternPhilosophyBooks,
                           BooleanClause.Occur.SHOULD)

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs
    print "or =", enlightenmentBooks

    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Extreme Programming Explained")
    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                u"Tao Te Ching \u9053\u5FB7\u7D93")
def testPlay(self):
    orQ = SpanOrQuery([self.quick, self.fox])
    self.dumpSpans(orQ)

    quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
    sfq = SpanFirstQuery(quick_fox, 4)
    self.dumpSpans(sfq)

    self.dumpSpans(SpanTermQuery(Term("f", "the")))

    quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
    self.dumpSpans(quick_brown)
def testHighlighting(self):
    text = "The quick brown fox jumps over the lazy dog"

    query = TermQuery(Term("field", "fox"))
    scorer = QueryScorer(query)
    highlighter = Highlighter(scorer)

    tokenStream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", StringReader(text))

    self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                     highlighter.getBestFragment(tokenStream, text))
def search_image(command):
    if command == ' ':
        return []

    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()

    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # split the command into per-field search options; anything without
    # an explicit "opt:value" prefix is segmented with jieba and
    # searched in the 'contents' field
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = \
                command_dict.get(opt, '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the field value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)

    searcher.close()
    return Docs
def testAnd(self):
    searchingBooks = TermQuery(Term("subject", "search"))
    books2004 = NumericRangeQuery.newIntRange("pubmonth",
                                              Integer(200401),
                                              Integer(200412),
                                              True, True)

    searchingBooks2004 = BooleanQuery()
    searchingBooks2004.add(searchingBooks, BooleanClause.Occur.MUST)
    searchingBooks2004.add(books2004, BooleanClause.Occur.MUST)

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(searchingBooks2004, 50).scoreDocs

    self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")