def testDistanceAsEditsSearching(self):
    """Edit-distance 2 matches 'foobar' from near misses; a maxEdits of 3
    is rejected by FuzzyQuery's constructor."""
    writer = self.getWriter()
    for text in ("foobar", "test", "working"):
        self._addDoc(text, writer)
    reader = writer.getReader()
    searcher = self.getSearcher(reader=reader)
    writer.close()

    # both misspellings are within two edits of 'foobar'
    for needle in ("fouba", "foubara"):
        query = FuzzyQuery(Term("field", needle), 2)
        scoreDocs = searcher.search(query, 10).scoreDocs
        self.assertEqual(1, len(scoreDocs))
        self.assertEqual("foobar", searcher.doc(scoreDocs[0].doc).get("field"))

    # maxEdits beyond the supported maximum must raise
    try:
        FuzzyQuery(Term("field", "t"), 3)
        self.fail()
    except JavaError:
        pass  # expected
def test_removeDocument(self):
    """After deleting the document with docid 1, searching for it finds nothing."""
    self.test_indexDocument()

    store = self.openStore()
    writer = None
    try:
        searcher = self.getSearcher(store)
        topDocs = searcher.search(TermQuery(Term("docid", str(1))), 50)
        self.assertEqual(topDocs.totalHits, 1)
        # be careful with ids, they are ephemeral
        docid = topDocs.scoreDocs[0].doc
        writer = self.getWriter(store)
        writer.deleteDocuments(Term("docid", str(1)))
    finally:
        self.closeStore(store, writer)

    # reopen and verify the document is gone
    store = self.openStore()
    try:
        searcher = self.getSearcher(store)
        topDocs = searcher.search(TermQuery(Term("docid", str(1))), 50)
        self.assertEqual(topDocs.totalHits, 0)
    finally:
        self.closeStore(store)
def testFilteredQuery(self):
    """FilteredQuery intersects a query with self.filter, for both the
    plain and the sorted search paths."""
    filtered = FilteredQuery(self.query, self.filter)
    topDocs = self.searcher.search(filtered, 50)
    self.assertEqual(1, topDocs.totalHits)
    self.assertEqual(1, topDocs.scoreDocs[0].doc)

    # same query through the sorted search signature
    sort = Sort(SortField("sorter", SortField.Type.STRING))
    topDocs = self.searcher.search(filtered, None, 50, sort)
    self.assertEqual(1, topDocs.totalHits)
    self.assertEqual(1, topDocs.scoreDocs[0].doc)

    filtered = FilteredQuery(TermQuery(Term("field", "one")), self.filter)
    topDocs = self.searcher.search(filtered, 50)
    self.assertEqual(2, topDocs.totalHits)

    filtered = FilteredQuery(TermQuery(Term("field", "x")), self.filter)
    topDocs = self.searcher.search(filtered, 50)
    self.assertEqual(1, topDocs.totalHits)
    self.assertEqual(3, topDocs.scoreDocs[0].doc)

    filtered = FilteredQuery(TermQuery(Term("field", "y")), self.filter)
    topDocs = self.searcher.search(filtered, 50)
    self.assertEqual(0, topDocs.totalHits)
def testNotCloseEnough(self):
    """A slop of 2 is not enough for 'one ... five' to match."""
    self.builder.setSlop(2)
    for word in ("one", "five"):
        self.builder.add(Term("field", word))
    hits = self.searcher.search(self.builder.build(), 50)
    self.assertEqual(0, hits.totalHits.value)
def testDiacriticsShouldBeNormalizedNFC(self):
    # Input containing a combining acute accent should be NFC-normalized
    # before conversion.
    # NOTE(review): Python 2 only -- '\xcc\x81' is the UTF-8 byte pair of
    # the combining accent and `unicode` is the Python 2 builtin; this
    # needs porting before it can run on Python 3.
    pq = PhraseQuery()
    pq.add(Term("title", "more"))
    pq.add(Term("title", "e"))
    self.assertConversion(pq, 'title=More\xcc\x81e')  # Combined `
    from unicodedata import normalize
    self.assertConversion(TermQuery(Term('title', 'moree')), normalize('NFC', unicode('title=More\xcc\x81e')))
def lucene_sample_query_parse(sampleq, ftypes):
    """Build a Lucene BooleanQuery from sample-query strings of the form
    'field OP value'.

    Numeric fields become range queries, whitespace-containing values
    become exact phrase queries (slop 0), everything else a term query.
    Exits the process on an unknown operator (preserved behavior).

    Fixes: removed unused `fields`/`queries`/`booleans` locals and
    replaced a side-effect-only list comprehension with a plain loop.
    """
    bq = BooleanQuery()
    for query_tuple in sampleq:
        (field, op_, value) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        if m is None or field is None:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query,exiting\n" % (str(op)))
            sys.exit(-1)
        field_w_type = snapconf.SAMPLE_HEADER_FIELDS_TYPE_MAP[field]
        (fieldtypechar, ftype_method) = ftypes[field_w_type]
        if fieldtypechar == 'i' or fieldtypechar == 'f':
            # range query over a numeric field
            bq.add(lucene_range_query_parse(field_w_type, op, value,
                                            fieldtypechar, ftype_method),
                   BOOLEAN_OCCUR)
        elif ' ' in value or '\t' in value:
            # phrase query; slop 0 forces exact phrase matching only
            pquery = PhraseQuery()
            for v in re.split(r'\s+', value):
                pquery.add(Term(field_w_type, v.lower()))
            pquery.setSlop(0)
            bq.add(pquery, BOOLEAN_OCCUR)
        else:
            # term query
            bq.add(TermQuery(Term(field_w_type, value.lower())), BOOLEAN_OCCUR)
            sys.stderr.write("value + fields: %s %s\n" % (value.lower(), field_w_type))
    return bq
def testNotCloseEnough(self):
    """A slop of 2 is not enough for 'one ... five' to match."""
    self.query.setSlop(2)
    for word in ("one", "five"):
        self.query.add(Term("field", word))
    hits = self.searcher.search(self.query, 50)
    self.assertEqual(0, hits.totalHits)
def testIndexRelationTermOutput(self):
    """'field=value' converts to a TermQuery; a quoted multi-word value
    converts to a PhraseQuery, case-insensitively."""
    self.assertConversion(TermQuery(Term("animal", "cats")), 'animal=cats')

    phrase = PhraseQuery()
    for word in ("cats", "dogs"):
        phrase.add(Term("animal", word))
    self.assertConversion(phrase, 'animal="cats dogs"')
    self.assertConversion(phrase, 'animal="catS Dogs"')
def testBarelyCloseEnough(self):
    """A slop of 3 just allows 'one ... five' to match."""
    self.builder.setSlop(3)
    for word in ("one", "five"):
        self.builder.add(Term("field", word))
    hits = self.searcher.search(self.builder.build(), 50)
    self.assertEqual(1, hits.totalHits)
def testGet(self):
    """LruCache returns the same filter for equal queries and a distinct
    one for a different query."""
    cache = LruCache(
        keyEqualsFunction=lambda q1, q2: q1.equals(q2),
        createFunction=lambda q: QueryWrapperFilter(q))
    first = cache.get(TermQuery(Term("field1", "value1")))
    second = cache.get(TermQuery(Term("field1", "value1")))
    self.assertEquals(first, second)
    other = cache.get(TermQuery(Term("field1", "value2")))
    self.assertNotEquals(first, other)
def cascadeSearch(self, searcher, reader, phrase, ratio):
    """Search progressively looser variants of *phrase*, weighting the
    stricter matches higher.

    Tiers (score = weight * ratio): exact (100), case/punctuation
    normalized (90), alpha-sorted normalized (80), stemmed normalized
    (70), stemmed + sorted (60).  Tiers after the first run only while
    no results have been found, unless self._includeAllResults is set.

    Returns a list of (score, document) tuples.

    Bug fixed: the original tested `hits > 0` where `hits` is a list of
    ScoreDoc -- always truthy under Python 2 and a TypeError under
    Python 3; the check now simply tests whether any hits came back.
    """
    normPhrase = self.normalizeCasePunct(phrase)
    sortedPhrase = self.sortWords(normPhrase)
    stemmedPhrase = self.stemWords(normPhrase)
    stemmedSortedPhrase = self.stemWords(sortedPhrase)

    # (weight, index field, phrase variant) from strictest to loosest
    tiers = [
        (100, "str", phrase),
        (90, "str_norm", normPhrase),
        (80, "str_sorted", sortedPhrase),
        (70, "str_stemmed", stemmedPhrase),
        (60, "str_stemmedSorted", stemmedSortedPhrase),
    ]

    results = []
    for weight, fieldName, text in tiers:
        # the exact tier always runs; later tiers only as fallback
        # (or always, when all results were requested)
        if weight != 100 and results and not self._includeAllResults:
            continue
        hits = searcher.search(TermQuery(Term(fieldName, text)),
                               self._maxHits).scoreDocs
        for hit in hits:
            results.append((weight * ratio, self.getDocument(reader, hit)))
    return results
def doc_search(self, keywords):
    """Sum the total term frequency of *keywords* across the Title, Body
    and Tags fields; store the sum on self.term and return it."""
    total = 0
    for fieldName in ('Title', 'Body', 'Tags'):
        total += self.lReader.totalTermFreq(Term(fieldName, keywords))
    self.term = total
    return total
def testBraces(self):
    """Parentheses group boolean sub-expressions in the CQL input."""
    self.assertConversion(TermQuery(Term('unqualified', 'cats')), '(cats)')

    conjunction = BooleanQuery()
    for animal in ('cats', 'dogs'):
        conjunction.add(TermQuery(Term('unqualified', animal)),
                        BooleanClause.Occur.MUST)

    disjunction = BooleanQuery()
    disjunction.add(conjunction, BooleanClause.Occur.SHOULD)
    disjunction.add(TermQuery(Term('unqualified', 'mice')),
                    BooleanClause.Occur.SHOULD)
    self.assertConversion(disjunction, '(cats AND dogs) OR mice')
def delete(indexDir: str, id: str):
    """Delete the document whose id equals *id*, plus any sub-documents
    whose ids extend it with a dot-separated suffix (e.g. 'id.0.1').

    Fixes: the regex suffix is now a raw string ('\\..*' in a plain
    string literal is an invalid escape on modern Python), and the
    writer is closed in a finally block so it is released even when
    deletion or commit raises.
    """
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())
    index_writer = IndexWriter(index_dir, config)
    try:
        index_writer.deleteDocuments(RegexpQuery(Term('id', id)))
        index_writer.deleteDocuments(RegexpQuery(Term('id', id + r'\..*')))
        index_writer.commit()
    finally:
        index_writer.close()
def __init__(self, *args):
    """Prepare term queries against the title (T) and contents (C) fields."""
    super(BooleanOrTestCase, self).__init__(*args)
    self.FIELD_T = "T"
    self.FIELD_C = "C"
    # two title terms and two contents terms used by the OR tests
    title = lambda text: TermQuery(Term(self.FIELD_T, text))
    contents = lambda text: TermQuery(Term(self.FIELD_C, text))
    self.t1 = title("files")
    self.t2 = title("deleting")
    self.c1 = contents("production")
    self.c2 = contents("optimize")
    self.searcher = None
def testEquality(self):
    """Two BooleanQueries built identically (including a nested clause)
    compare equal."""
    def buildQuery():
        # two SHOULD terms plus a nested two-term SHOULD clause
        query = BooleanQuery()
        query.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        query.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
        nested = BooleanQuery()
        nested.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
        nested.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
        query.add(nested, BooleanClause.Occur.SHOULD)
        return query

    self.assert_(buildQuery().equals(buildQuery()))
def testEquality(self):
    """Two BooleanQueries built identically via Builder (including a
    nested clause) compare equal.

    Fixes: the original called b1.build() twice and discarded the first
    result; the redundant intermediate build is removed.
    """
    def buildQuery():
        nestedBuilder = BooleanQuery.Builder()
        nestedBuilder.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
        nestedBuilder.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)

        builder = BooleanQuery.Builder()
        builder.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        builder.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
        builder.add(nestedBuilder.build(), BooleanClause.Occur.SHOULD)
        return builder.build()

    self.assert_(buildQuery().equals(buildQuery()))
def testWildcards(self):
    """A trailing '*' converts to a (lowercased) PrefixQuery; a '*' in any
    other position is treated as a literal term character."""
    prefixUnqualified = PrefixQuery(Term('unqualified', 'prefix'))
    for inputString in ('prefix*', 'PREfix*'):
        self.assertConversion(prefixUnqualified, inputString)

    prefixField = PrefixQuery(Term('field', 'prefix'))
    for inputString in ('field="PREfix*"', 'field=prefix*'):
        self.assertConversion(prefixField, inputString)

    self.assertConversion(PrefixQuery(Term('field', 'oc-0123')), 'field="oc-0123*"')
    self.assertConversion(TermQuery(Term('field', 'p')), 'field="P*"')

    # only prefix queries for now
    self.assertConversion(TermQuery(Term('field', 'post')), 'field="*post"')
    self.assertConversion(TermQuery(Term('field', 'prefix')), 'field=prefix**')

    # unqualified prefix expands to one boosted clause per configured field
    result = LuceneQueryComposer(
        unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
        luceneSettings=LuceneSettings()).compose(parseCql("prefix*"))
    query = BooleanQuery()
    for fieldName, boost in (("field0", 0.2), ("field1", 2.0)):
        clause = PrefixQuery(Term(fieldName, "prefix"))
        clause.setBoost(boost)
        query.add(clause, BooleanClause.Occur.SHOULD)
    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def getLastStampId(self, prefix='oai_dc', setSpec=None):
    """Return the stamp of the newest record for the given prefix/set, or
    None when nothing matches.

    A non-None prefix takes precedence over setSpec; with both None all
    documents are considered.
    """
    searcher = self._getSearcher()
    # newest first: descending sort on the numeric stamp
    sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG, True))
    if prefix is not None:
        query = TermQuery(Term(PREFIX_FIELD, prefix))
    elif setSpec is not None:
        query = TermQuery(Term(SETS_FIELD, setSpec))
    else:
        query = MatchAllDocsQuery()
    results = searcher.search(query, 1, sort)
    if results.totalHits.value < 1:
        return None
    return _stampFromDocument(searcher.doc(results.scoreDocs[0].doc))
def query_doc(self):
    """Collect the document with id self._id together with all of its
    sections ('id.N') and sentences ('id.N.M') into self._resDict,
    keyed by id; only non-empty fields are copied.  Returns self.

    Fixes: regex patterns are raw strings ('\\.' inside a plain string
    is an invalid escape on modern Python); the ten-fold copy-pasted
    per-field copying is replaced by a loop; locals no longer shadow
    the `id` and `type` builtins.
    """
    searcher = self._searcher
    document_id = str(self._id)

    top_docs_doc = searcher.search(RegexpQuery(Term('id', document_id)), 1)
    top_docs_section = searcher.search(
        RegexpQuery(Term('id', document_id + r'\.[0-9]+')), 99999)
    top_docs_sentence = searcher.search(
        RegexpQuery(Term('id', document_id + r'\.[0-9]+\.[0-9]+')), 99999)

    top_docs = top_docs_doc.merge(
        1000000, [top_docs_section, top_docs_doc, top_docs_sentence])

    fieldNames = ('document', 'section', 'author', 'dynasty', 'type',
                  'text', 'color', 'area', 'zhujie', 'detail')
    res_dict = {}
    for hit in top_docs.scoreDocs:
        doc = searcher.doc(hit.doc)
        doc_id = doc.get('id')
        res_dict[doc_id] = {}
        for fieldName in fieldNames:
            value = doc.get(fieldName)
            if value:
                res_dict[doc_id][fieldName] = value
    self._resDict = res_dict
    return self
def testUnqualifiedTermFields(self):
    """An unqualified term expands to one boosted SHOULD clause per
    configured field."""
    composer = LuceneQueryComposer(
        unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
        luceneSettings=LuceneSettings())
    result = composer.compose(parseCql("value"))

    query = BooleanQuery()
    for fieldName, boost in (("field0", 0.2), ("field1", 2.0)):
        clause = TermQuery(Term(fieldName, "value"))
        clause.setBoost(boost)
        query.add(clause, BooleanClause.Occur.SHOULD)
    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def query_section(self, section):
    """Collect every sub-document of self._id whose 'section' field equals
    *section* into self._resDict; the root id itself maps to the section
    name.  Only non-empty fields are copied.  Returns self.

    Fixes: the original loop re-bound `section` to each hit's field
    value, shadowing the parameter so the final root entry used the
    last hit's value instead of the argument; the repetitive per-field
    copying is replaced by a loop and builtin shadowing removed.
    """
    searcher = self._searcher
    sub_ids = RegexpQuery(Term('id', self._id + '\\..+'))
    in_section = TermQuery(Term('section', section))
    builder = BooleanQuery.Builder()
    builder.add(BooleanClause(sub_ids, BooleanClause.Occur.MUST))
    builder.add(BooleanClause(in_section, BooleanClause.Occur.MUST))

    hits = searcher.search(builder.build(), 1000000).scoreDocs
    fieldNames = ('document', 'section', 'author', 'dynasty', 'type',
                  'text', 'color', 'area', 'zhujie', 'detail')
    res_dict = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        doc_id = doc.get('id')
        res_dict[doc_id] = {}
        for fieldName in fieldNames:
            value = doc.get(fieldName)
            if value:
                res_dict[doc_id][fieldName] = value
    res_dict[self._id] = {'document': section}
    self._resDict = res_dict
    return self
def _getDocId(self, identifier):
    """Return the Lucene doc id for *identifier*, or None when it is not
    in the index."""
    searcher = self._getSearcher(identifier)
    query = TermQuery(Term(IDENTIFIER_FIELD, identifier))
    hits = searcher.search(query, 1)
    return None if hits.totalHits.value == 0 else hits.scoreDocs[0].doc
def getNrOfRecords(self, prefix='oai_dc', setSpec=None, continueAfter=None, oaiFrom=None, oaiUntil=None, partition=None):
    """Count records matching the given prefix/set/partition and time
    range; returns {'total': ..., 'deletes': ...} where 'deletes' counts
    only tombstoned records."""
    queryBuilder = self._luceneQueryBuilder(
        prefix=prefix,
        sets=[setSpec] if setSpec else None,
        partition=partition)

    def countHits():
        collector = self._search(queryBuilder.build(), continueAfter,
                                 oaiFrom, oaiUntil, batchSize=1,
                                 shouldCountHits=True)
        return collector.totalHits()

    total = countHits()
    # narrow the same builder to tombstones only for the delete count
    queryBuilder.add(TermQuery(Term(TOMBSTONE_FIELD, TOMBSTONE_VALUE)),
                     BooleanClause.Occur.MUST)
    deletes = countHits()
    return {"total": total, "deletes": deletes}
def search(self, q_ids, limit):
    """Look up the indexed accepted answer for each question id, ranked
    by the id's position in *q_ids* (earlier ids score higher).

    Question ids that were never indexed are skipped.  Stops after
    *limit* ids have produced hits, then releases the searcher and
    closes the directory.  Returns a list of ResultItem.
    """
    docs = []
    found = 0
    for rank, q_id in enumerate(q_ids):
        query = TermQuery(Term("question_id", str(q_id)))
        # only the accepted answer is indexed, so one hit suffices
        scoreDocs = self.searcher.search(query, 1).scoreDocs
        for hit in scoreDocs:
            doc = self.searcher.doc(hit.doc)
            docs.append(ResultItem(doc, len(q_ids) - rank,
                                   doc.get("title"),
                                   doc.get("question_id")))
        if scoreDocs:
            found += 1
        if found >= limit:
            break

    # self.searchermgr.decRef(self.searcher)
    self.searchermgr.release(self.searcher)
    self.searcher = None
    self.directory.close()
    self.directory = None
    return docs
def testGiga(self):
    """FuzzyQuery with maxEdits=0 behaves as an exact term match."""
    writer = self.getWriter(analyzer=StandardAnalyzer())
    titles = [
        "Lucene in Action", "Lucene for Dummies", "Giga byte",
        "ManagingGigabytesManagingGigabyte",
        "ManagingGigabytesManagingGigabytes",
        "The Art of Computer Science", "J. K. Rowling", "JK Rowling",
        "Joanne K Roling", "Bruce Willis", "Willis bruce",
        "Brute willis", "B. willis",
    ]
    for title in titles:
        self._addDoc(title, writer)
    reader = writer.getReader()
    writer.close()

    query = FuzzyQuery(Term("field", "giga"), 0)
    searcher = self.getSearcher(reader=reader)
    scoreDocs = searcher.search(query, 10).scoreDocs
    self.assertEqual(1, len(scoreDocs))
    self.assertEqual("Giga byte", searcher.doc(scoreDocs[0].doc).get("field"))
def test2(self):
    """FuzzyQuery('WEBER', maxEdits=2, prefixLength=1) matches exactly the
    eight names within two edits that share the leading 'W'."""
    writer = self.getWriter()
    names = ["LANGE", "LUETH", "PIRSING", "RIEGEL", "TRZECZIAK",
             "WALKER", "WBR", "WE", "WEB", "WEBE", "WEBER", "WEBERE",
             "WEBREE", "WEBEREI", "WBRE", "WITTKOPF", "WOJNAROWSKI",
             "WRICKE"]
    for name in names:
        self._addDoc(name, writer)
    reader = writer.getReader()
    searcher = self.getSearcher(reader=reader)
    writer.close()

    query = FuzzyQuery(Term("field", "WEBER"), 2, 1)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    self.assertEqual(8, len(scoreDocs))
def search(self, query, limit=1):
    """Google-search Stack Overflow for *query*, then look each returned
    question id up in the local index.

    Returns SOResultItems ranked by Google position; stops once *limit*
    question ids have produced indexed answers.

    Bugs fixed: the original reused `i` as both the enumerate index and
    the found-question counter, so `i += 1` was clobbered on every
    iteration and the limit check counted Google rank instead of found
    questions; a separate counter is used now.  The trailing Python 2
    `print` statement is rewritten as a function call, consistent with
    the other prints.
    """
    docs = []
    query += " java site:stackoverflow.com"
    g = GoogleSearcher()
    q_ids = g.search(query)  # Stack Overflow question ids
    found = 0
    for rank, q_id in enumerate(q_ids):
        term_query = TermQuery(Term("id", q_id))
        # top answers attached to this question id
        topdocs = self.searcher.search(term_query, 10).scoreDocs
        for hit in topdocs:
            doc = self.searcher.doc(hit.doc)
            docs.append(
                SOResultItem(doc, len(q_ids) - rank, doc.get("title"),
                             doc.get("id"), doc.get("description")))
            print("%s, Question Id: %s, Answer Id: %s" %
                  (doc.get("title"), doc.get("id"), doc.get("answer_id")))
        if len(topdocs) > 0:
            found += 1
            if found >= limit:
                break
        else:
            print("Stackoverflow id %s is not in our index" % q_id)
    return docs
def getSpanNearQuery(analyzer, s, field="title", slop=100, inOrder=True):
    """Build a SpanNearQuery of fuzzy span terms, one per token of *s*."""
    clauses = []
    for keyword in tokenize_string(analyzer, s):
        clauses.append(
            SpanMultiTermQueryWrapper(FuzzyQuery(Term(field, keyword))))
    return SpanNearQuery(clauses, slop, inOrder)
def testExact(self):
    """Slop 0 (the builder default) matches the exact phrase but not its
    reversal."""
    for word in ("four", "five"):
        self.builder.add(Term("field", word))
    hits = self.searcher.search(self.builder.build(), 50)
    self.assertEqual(1, hits.totalHits, "exact match")

    # fresh builder for the reversed word order
    self.builder = PhraseQuery.Builder()
    for word in ("two", "one"):
        self.builder.add(Term("field", word))
    hits = self.searcher.search(self.builder.build(), 50)
    self.assertEqual(0, hits.totalHits, "reverse not exact")