def get_phrase_query(self, query, field):
    """Build a phrase query that matches the words of `query` as an exact phrase.

    `query` is split on whitespace and each resulting word is added, in
    order, as a term of `field`. Returns the built PhraseQuery.
    """
    # NOTE: configuring a "slop" on the phrase query would allow fuzzy
    # (non-exact) matching; none is set here.
    builder = PhraseQuery.Builder()
    for word in query.split():
        term = Term(field, word)
        builder.add(term)
    return builder.build()
def setUp(self):
    """Index one five-word document, then prepare a searcher and a builder."""
    super(PhraseQueryTestCase, self).setUp()

    document = Document()
    document.add(
        Field("field", "one two three four five", TextField.TYPE_STORED))

    index_writer = self.getWriter()
    index_writer.addDocument(document)
    index_writer.close()

    # Shared fixtures used by the individual tests.
    self.searcher = self.getSearcher()
    self.builder = PhraseQuery.Builder()
def testExact(self):
    """
    With a slop of 0 a phrase matches in order, but not when reversed
    """
    # no slop is set: it is zero by default
    for word in ("four", "five"):
        self.builder.add(Term("field", word))
    in_order = self.builder.build()
    hits = self.searcher.search(in_order, 50)
    self.assertEqual(1, hits.totalHits, "exact match")

    self.builder = PhraseQuery.Builder()
    for word in ("two", "one"):
        self.builder.add(Term("field", word))
    reversed_order = self.builder.build()
    hits = self.searcher.search(reversed_order, 50)
    self.assertEqual(0, hits.totalHits, "reverse not exact")
def testPhraseQueryWithStopAnalyzer(self):
    """
    The phrase "stop words" is found in text indexed through a StopAnalyzer
    """
    stop_analyzer = StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
    index_writer = self.getWriter(analyzer=stop_analyzer)

    document = Document()
    document.add(
        Field("field", "the stop words are here", TextField.TYPE_STORED))
    index_writer.addDocument(document)
    index_writer.close()

    index_searcher = self.getSearcher()

    # valid exact phrase query
    phrase = PhraseQuery.Builder()
    for word in ("stop", "words"):
        phrase.add(Term("field", word))
    matches = index_searcher.search(phrase.build(), 50).scoreDocs
    self.assertEqual(1, len(matches))
def testOrderDoesntMatter(self):
    """
    Reversed terms still match once the slop is at least 2
    """
    # a reverse-order match needs a slop of two or more
    self.builder.setSlop(2)
    for word in ("two", "one"):
        self.builder.add(Term("field", word))
    adjacent_reversed = self.builder.build()
    hits = self.searcher.search(adjacent_reversed, 50)
    self.assertEqual(1, hits.totalHits, "just sloppy enough")

    self.builder = PhraseQuery.Builder()
    self.builder.setSlop(2)
    for word in ("three", "one"):
        self.builder.add(Term("field", word))
    distant_reversed = self.builder.build()
    hits = self.searcher.search(distant_reversed, 50)
    self.assertEqual(0, hits.totalHits, "not sloppy enough")
def testSlop1(self):
    """
    A slop of 1 matches terms in order but not a reversed phrase
    """
    # In-order terms are found with a slop of 1.
    self.builder.setSlop(1)
    for word in ("one", "two"):
        self.builder.add(Term("field", word))
    forward = self.builder.build()
    hits = self.searcher.search(forward, 50)
    self.assertEqual(1, hits.totalHits, "in order")

    # Out-of-order terms are not found with a slop of 1;
    # that requires a slop of at least 2.
    self.builder = PhraseQuery.Builder()
    self.builder.setSlop(1)
    for word in ("two", "one"):
        self.builder.add(Term("field", word))
    backward = self.builder.build()
    hits = self.searcher.search(backward, 50)
    self.assertEqual(0, hits.totalHits, "reversed, slop not 2 or more")
def _parse_query(self, field_name, query):
    """Analyze `query` and build a boolean OR of per-token phrase queries.

    The query text is run through `self.analyzer`. Every token the analyzer
    emits is split on single spaces, and the resulting words become a
    PhraseQuery on `field_name` with explicit positions 0, 1, 2, ... The
    phrase queries are combined as SHOULD clauses of a BooleanQuery.

    Args:
        field_name: name of the field the generated terms are bound to.
        query: raw query text to analyze.

    Returns:
        The built BooleanQuery. If the analyzer emits no tokens, it has no
        clauses.
    """
    # NOTE(review): the analyzer is invoked with the literal field name
    # "dummy" rather than `field_name`; kept as in the original.
    ts = self.analyzer.tokenStream("dummy", StringReader(query))
    try:
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
    finally:
        # Always release the stream, even when analysis raises, so the
        # stream is not left open after a failure.
        ts.close()

    booleanQuery = BooleanQuery.Builder()
    for token in tokens:
        builder = PhraseQuery.Builder()
        # A single token may contain several space-separated words;
        # each one gets its own position within the phrase.
        for i, word in enumerate(token.split(' ')):
            builder.add(Term(field_name, word), i)
        booleanQuery.add(builder.build(), BooleanClause.Occur.SHOULD)
    return booleanQuery.build()
def testMultipleTerms(self):
    """
    slop counts the total positional moves permitted to line up a phrase
    """
    self.builder.setSlop(2)
    for word in ("one", "three", "five"):
        self.builder.add(Term("field", word))
    spread_out = self.builder.build()
    hits = self.searcher.search(spread_out, 50)
    self.assertEqual(1, hits.totalHits, "two total moves")

    self.builder = PhraseQuery.Builder()
    # matching this phrase takes six moves, so five is one short
    self.builder.setSlop(5)
    for word in ("five", "three", "one"):
        self.builder.add(Term("field", word))
    too_tight = self.builder.build()
    hits = self.searcher.search(too_tight, 50)
    self.assertEqual(0, hits.totalHits, "slop of 5 not close enough")

    # Same builder and terms, rebuilt with one more move allowed.
    self.builder.setSlop(6)
    loose_enough = self.builder.build()
    hits = self.searcher.search(loose_enough, 50)
    self.assertEqual(1, hits.totalHits, "slop of 6 just right")
def testPhraseQueryInConjunctionScorer(self):
    """
    A phrase query behaves the same alone and as a MUST clause of a
    boolean query, whichever order the clauses are added in
    """
    # --- first index: two docs sharing the "source" phrase ---------------
    index_writer = self.getWriter()

    document = Document()
    document.add(Field("source", "marketing info", TextField.TYPE_STORED))
    index_writer.addDocument(document)

    document = Document()
    document.add(Field("contents", "foobar", TextField.TYPE_STORED))
    document.add(Field("source", "marketing info", TextField.TYPE_STORED))
    index_writer.addDocument(document)

    index_writer.close()

    index_searcher = self.getSearcher()

    phrase_builder = PhraseQuery.Builder()
    phrase_builder.add(Term("source", "marketing"))
    phrase_builder.add(Term("source", "info"))
    phrase = phrase_builder.build()
    hits = index_searcher.search(phrase, 50)
    self.assertEqual(2, hits.totalHits)

    term = TermQuery(Term("contents", "foobar"))
    conjunction = BooleanQuery.Builder()
    conjunction.add(term, BooleanClause.Occur.MUST)
    conjunction.add(phrase, BooleanClause.Occur.MUST)
    both = conjunction.build()
    hits = index_searcher.search(both, 50)
    self.assertEqual(1, hits.totalHits)

    # --- second index: three docs in the "contents" field ----------------
    index_writer = self.getWriter()
    for text in ("map entry woo",
                 "woo map entry",
                 "map foobarword entry woo"):
        document = Document()
        document.add(Field("contents", text, TextField.TYPE_STORED))
        index_writer.addDocument(document)
    index_writer.close()

    index_searcher = self.getSearcher()

    term = TermQuery(Term("contents", "woo"))
    phrase_builder = PhraseQuery.Builder()
    phrase_builder.add(Term("contents", "map"))
    phrase_builder.add(Term("contents", "entry"))

    hits = index_searcher.search(term, 50)
    self.assertEqual(3, hits.totalHits)

    phrase = phrase_builder.build()
    hits = index_searcher.search(phrase, 50)
    self.assertEqual(2, hits.totalHits)

    # term clause first, phrase clause second
    conjunction = BooleanQuery.Builder()
    conjunction.add(term, BooleanClause.Occur.MUST)
    conjunction.add(phrase, BooleanClause.Occur.MUST)
    both = conjunction.build()
    hits = index_searcher.search(both, 50)
    self.assertEqual(2, hits.totalHits)

    # phrase clause first, term clause second
    conjunction = BooleanQuery.Builder()
    conjunction.add(phrase, BooleanClause.Occur.MUST)
    conjunction.add(term, BooleanClause.Occur.MUST)
    both = conjunction.build()
    hits = index_searcher.search(both, 50)
    self.assertEqual(2, hits.totalHits)
def testSetPosition(self):
    """
    Phrase queries honour the position increments emitted by the tokenizer
    """

    class _tokenizer(PythonTokenizer):
        # Emits fixed tokens "1".."5" with custom position increments,
        # regardless of the text being analyzed.

        def __init__(_self):
            super(_tokenizer, _self).__init__()
            _self.TOKENS = ["1", "2", "3", "4", "5"]
            _self.INCREMENTS = [1, 2, 1, 0, 1]
            _self.i = 0
            _self.posIncrAtt = _self.addAttribute(
                PositionIncrementAttribute.class_)
            _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
            _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

        def incrementToken(_self):
            index = _self.i
            if index == len(_self.TOKENS):
                return False
            _self.clearAttributes()
            _self.termAtt.append(_self.TOKENS[index])
            _self.offsetAtt.setOffset(index, index)
            _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[index])
            _self.i += 1
            return True

        def reset(_self):
            super(_tokenizer, _self).reset()
            _self.i = 0

    class _analyzer(PythonAnalyzer):
        def createComponents(_self, fieldName):
            return Analyzer.TokenStreamComponents(_tokenizer())

        def initReader(_self, fieldName, reader):
            return reader

    index_writer = self.getWriter(analyzer=_analyzer())
    document = Document()
    document.add(Field("field", "bogus", TextField.TYPE_STORED))
    index_writer.addDocument(document)
    index_writer.commit()
    index_writer.close()

    index_searcher = self.getSearcher()
    index_reader = index_searcher.getIndexReader()

    def count_hits(query):
        # Number of documents returned for `query` (capped at 1000).
        return len(index_searcher.search(query, 1000).scoreDocs)

    postings = MultiFields.getTermPositionsEnum(
        index_reader, "field", BytesRef("1"))
    postings.nextDoc()
    # first token should be at position 0
    self.assertEqual(0, postings.nextPosition())

    postings = MultiFields.getTermPositionsEnum(
        index_reader, "field", BytesRef("2"))
    postings.nextDoc()
    # second token should be at position 2
    self.assertEqual(2, postings.nextPosition())

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "1"))
    phrase.add(Term("field", "2"))
    self.assertEqual(0, count_hits(phrase.build()))

    # same as previous, just specify positions explicitly.
    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "1"), 0)
    phrase.add(Term("field", "2"), 1)
    self.assertEqual(0, count_hits(phrase.build()))

    # specifying correct positions should find the phrase.
    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "1"), 0)
    phrase.add(Term("field", "2"), 2)
    self.assertEqual(1, count_hits(phrase.build()))

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "2"))
    phrase.add(Term("field", "3"))
    self.assertEqual(1, count_hits(phrase.build()))

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "3"))
    phrase.add(Term("field", "4"))
    self.assertEqual(0, count_hits(phrase.build()))

    # phrase query would find it when correct positions are specified.
    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "3"), 0)
    phrase.add(Term("field", "4"), 0)
    self.assertEqual(1, count_hits(phrase.build()))

    # phrase query should fail for a non-existing searched term
    # even if other searched terms exist at the same searched
    # position.
    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "3"), 0)
    phrase.add(Term("field", "9"), 0)
    self.assertEqual(0, count_hits(phrase.build()))

    # multi-phrase query should succeed for a non-existing searched term
    # because other searched terms exist at the same searched
    # position.
    multi_phrase = MultiPhraseQuery.Builder()
    multi_phrase.add([Term("field", "3"), Term("field", "9")], 0)
    self.assertEqual(1, count_hits(multi_phrase.build()))

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "2"))
    phrase.add(Term("field", "4"))
    self.assertEqual(1, count_hits(phrase.build()))

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "3"))
    phrase.add(Term("field", "5"))
    self.assertEqual(1, count_hits(phrase.build()))

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "4"))
    phrase.add(Term("field", "5"))
    self.assertEqual(1, count_hits(phrase.build()))

    phrase = PhraseQuery.Builder()
    phrase.add(Term("field", "2"))
    phrase.add(Term("field", "5"))
    self.assertEqual(0, count_hits(phrase.build()))
def testSimilarity(self):
    """
    Term, boolean and (sloppy) phrase queries yield the scores asserted
    below when SimpleSimilarity is used for indexing and searching
    """
    index_writer = self.getWriter(
        analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
        similarity=SimpleSimilarity())

    first_doc = Document()
    first_doc.add(Field("field", "a c", TextField.TYPE_STORED))
    second_doc = Document()
    second_doc.add(Field("field", "a b c", TextField.TYPE_STORED))

    index_writer.addDocument(first_doc)
    index_writer.addDocument(second_doc)
    index_writer.commit()
    index_writer.close()

    index_searcher = self.getSearcher()
    index_searcher.setSimilarity(SimpleSimilarity())

    term_a = Term("field", "a")
    term_b = Term("field", "b")
    term_c = Term("field", "c")

    # In the collectors below `_self` is the collector instance, while
    # `self` is this test case, captured from the enclosing scope.

    class collector1(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)

        def doSetNextReader(_self, context):
            pass

        def needsScores(_self):
            return True

    index_searcher.search(TermQuery(term_b), collector1())

    either = BooleanQuery.Builder()
    either.add(TermQuery(term_a), BooleanClause.Occur.SHOULD)
    either.add(TermQuery(term_b), BooleanClause.Occur.SHOULD)
    boolean_query = either.build()

    class collector2(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)

        def doSetNextReader(_self, context):
            # remember the doc base of the reader being collected
            _self.base = context.docBase

        def needsScores(_self):
            return True

    index_searcher.search(boolean_query, collector2())

    phrase_builder = PhraseQuery.Builder()
    phrase_builder.add(term_a)
    phrase_builder.add(term_c)
    phrase_query = phrase_builder.build()

    class collector3(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)

        def doSetNextReader(_self, context):
            pass

        def needsScores(_self):
            return True

    index_searcher.search(phrase_query, collector3())

    # Same terms, rebuilt from the same builder with a slop of 2.
    phrase_builder.setSlop(2)
    phrase_query = phrase_builder.build()

    class collector4(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(2.0, score)

        def doSetNextReader(_self, context):
            pass

        def needsScores(_self):
            return True

    index_searcher.search(phrase_query, collector4())