Exemple #1
0
 def get_phrase_query(self, query, field):
     """Build a phrase query out of the whitespace-separated words of ``query``.

     Every word becomes a ``Term`` on ``field`` and is appended to the
     phrase in the order it appears in ``query``.
     """
     # NOTE: no slop is set on the builder here; configuring one would
     # allow inexact (sloppy) phrase matching instead.
     builder = PhraseQuery.Builder()
     for word in query.split():
         term = Term(field, word)
         builder.add(term)
     return builder.build()
Exemple #2
0
    def setUp(self):
        """Index a single five-word document, then expose a searcher and an empty phrase builder."""
        super(PhraseQueryTestCase, self).setUp()

        # The only document every slop test in this case searches against.
        fixture = Document()
        fixture.add(
            Field("field", "one two three four five", TextField.TYPE_STORED))

        index_writer = self.getWriter()
        index_writer.addDocument(fixture)
        index_writer.close()

        self.searcher = self.getSearcher()
        self.builder = PhraseQuery.Builder()
    def testExact(self):
        """
        With the builder's default slop left untouched, a phrase matches
        only when its terms appear in indexed order.
        """

        # "four five" occurs verbatim in the fixture text
        for word in ("four", "five"):
            self.builder.add(Term("field", word))
        in_order = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(1, in_order.totalHits, "exact match")

        # "two one" is the reverse of the indexed order and must not match
        self.builder = PhraseQuery.Builder()
        for word in ("two", "one"):
            self.builder.add(Term("field", word))
        out_of_order = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(0, out_of_order.totalHits, "reverse not exact")
Exemple #4
0
    def testPhraseQueryWithStopAnalyzer(self):
        """An exact two-term phrase is found in an index written through a StopAnalyzer."""

        stop_analyzer = StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
        index_writer = self.getWriter(analyzer=stop_analyzer)

        document = Document()
        document.add(
            Field("field", "the stop words are here", TextField.TYPE_STORED))
        index_writer.addDocument(document)
        index_writer.close()

        searcher = self.getSearcher()

        # "stop words" is a valid exact phrase in the indexed text
        phrase = PhraseQuery.Builder()
        for word in ("stop", "words"):
            phrase.add(Term("field", word))
        top_docs = searcher.search(phrase.build(), 50)
        self.assertEqual(1, len(top_docs.scoreDocs))
    def testOrderDoesntMatter(self):
        """
        A slop of 2 is enough to match two adjacent terms given in
        reverse order, but not terms that sit further apart.
        """

        # reversed neighbours: needs a slop of at least two to match
        self.builder.setSlop(2)
        for word in ("two", "one"):
            self.builder.add(Term("field", word))
        neighbours = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(1, neighbours.totalHits, "just sloppy enough")

        # same slop, but "three ... one" are too far apart for it
        self.builder = PhraseQuery.Builder()
        self.builder.setSlop(2)
        for word in ("three", "one"):
            self.builder.add(Term("field", word))
        distant = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(0, distant.totalHits, "not sloppy enough")
    def testSlop1(self):
        """A slop of 1 matches in-order terms but not the same terms reversed."""

        # In-order terms still match when a slop of 1 is allowed.
        self.builder.setSlop(1)
        for word in ("one", "two"):
            self.builder.add(Term("field", word))
        forward = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(1, forward.totalHits, "in order")

        # Reversed terms need a slop of 2 or more, so a slop of 1
        # must produce no hits.
        self.builder = PhraseQuery.Builder()
        self.builder.setSlop(1)
        for word in ("two", "one"):
            self.builder.add(Term("field", word))
        backward = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(0, backward.totalHits, "reversed, slop not 2 or more")
Exemple #7
0
    def _parse_query(self, field_name, query):
        """Analyze ``query`` and OR together one phrase query per analyzed token.

        The text is run through ``self.analyzer``; every token it emits is
        split on single spaces, and the pieces are added to a ``PhraseQuery``
        on ``field_name`` at explicit positions 0, 1, 2, ...  Each phrase
        query is then attached to a ``BooleanQuery`` as a SHOULD clause.

        Args:
            field_name: field the generated terms are created on.
            query: raw query text handed to the analyzer.

        Returns:
            The built ``BooleanQuery``.
        """
        # NOTE(review): the analyzer is invoked with the literal field name
        # "dummy" rather than ``field_name`` -- confirm this is intended if
        # the analyzer behaves differently per field.
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        try:
            termAtt = ts.getAttribute(CharTermAttribute.class_)
            ts.reset()
            tokens = []
            while ts.incrementToken():
                tokens.append(termAtt.toString())
            ts.end()
        finally:
            # Close the stream even when analysis raises, so the stream is
            # not left open after a failed parse.
            ts.close()

        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            # A token may itself contain spaces; each piece gets its own
            # explicit position inside the phrase.
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            booleanQuery.add(builder.build(), BooleanClause.Occur.SHOULD)
        return booleanQuery.build()
    def testMultipleTerms(self):
        """
        Slop counts the total number of positional moves needed to
        line the phrase terms up with the indexed text.
        """

        # "one three five": two moves in total
        self.builder.setSlop(2)
        for word in ("one", "three", "five"):
            self.builder.add(Term("field", word))
        forward = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(1, forward.totalHits, "two total moves")

        # fully reversed phrase: six moves are needed, so five is too few
        self.builder = PhraseQuery.Builder()
        self.builder.setSlop(5)
        for word in ("five", "three", "one"):
            self.builder.add(Term("field", word))
        too_tight = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(0, too_tight.totalHits, "slop of 5 not close enough")

        # same builder and terms, slop raised to six
        self.builder.setSlop(6)
        just_enough = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(1, just_enough.totalHits, "slop of 6 just right")
    def testPhraseQueryInConjunctionScorer(self):
        """Phrase queries give the expected hit counts alone and as MUST clauses next to a term query."""

        # --- first index: two docs sharing "source", one also has "contents"
        index_writer = self.getWriter()

        source_only = Document()
        source_only.add(Field("source", "marketing info", TextField.TYPE_STORED))
        index_writer.addDocument(source_only)

        with_contents = Document()
        with_contents.add(Field("contents", "foobar", TextField.TYPE_STORED))
        with_contents.add(Field("source", "marketing info", TextField.TYPE_STORED))
        index_writer.addDocument(with_contents)

        index_writer.close()

        searcher = self.getSearcher()

        phrase_builder = PhraseQuery.Builder()
        for word in ("marketing", "info"):
            phrase_builder.add(Term("source", word))
        marketing_info = phrase_builder.build()
        hits = searcher.search(marketing_info, 50)
        self.assertEqual(2, hits.totalHits)

        # requiring contents:foobar as well leaves a single document
        foobar = TermQuery(Term("contents", "foobar"))
        conjunction = BooleanQuery.Builder()
        for clause in (foobar, marketing_info):
            conjunction.add(clause, BooleanClause.Occur.MUST)
        hits = searcher.search(conjunction.build(), 50)
        self.assertEqual(1, hits.totalHits)

        # --- second index: three docs that all contain "woo"
        index_writer = self.getWriter()

        for text in ("map entry woo",
                     "woo map entry",
                     "map foobarword entry woo"):
            entry = Document()
            entry.add(Field("contents", text, TextField.TYPE_STORED))
            index_writer.addDocument(entry)

        index_writer.close()

        searcher = self.getSearcher()

        woo = TermQuery(Term("contents", "woo"))
        phrase_builder = PhraseQuery.Builder()
        for word in ("map", "entry"):
            phrase_builder.add(Term("contents", word))

        hits = searcher.search(woo, 50)
        self.assertEqual(3, hits.totalHits)
        map_entry = phrase_builder.build()
        hits = searcher.search(map_entry, 50)
        self.assertEqual(2, hits.totalHits)

        # the conjunction must give the same count whichever clause comes first
        for ordered_clauses in ((woo, map_entry), (map_entry, woo)):
            conjunction = BooleanQuery.Builder()
            for clause in ordered_clauses:
                conjunction.add(clause, BooleanClause.Occur.MUST)
            hits = searcher.search(conjunction.build(), 50)
            self.assertEqual(2, hits.totalHits)
Exemple #10
0
    def testSetPosition(self):
        """Check phrase matching against tokens indexed with custom position increments.

        A tokenizer that ignores its input and emits the fixed tokens
        "1".."5" with position increments 1, 2, 1, 0, 1 is used to index a
        single document; phrase queries with implicit and explicit term
        positions are then run against it.
        """

        class _tokenizer(PythonTokenizer):
            # Emits a hard-coded token sequence; the text being analyzed is
            # never read.
            def __init__(_self):
                super(_tokenizer, _self).__init__()

                _self.TOKENS = ["1", "2", "3", "4", "5"]
                # position increment applied to the token at the same index
                _self.INCREMENTS = [1, 2, 1, 0, 1]
                # index of the next token to emit
                _self.i = 0
                _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
                _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
                _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

            def incrementToken(_self):
                # stop once every fixed token has been emitted
                if _self.i == len(_self.TOKENS):
                    return False

                _self.clearAttributes()
                _self.termAtt.append(_self.TOKENS[_self.i])
                # start and end offset are both set to the token index
                _self.offsetAtt.setOffset(_self.i, _self.i)
                _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
                _self.i += 1

                return True

            def reset(_self):
                super(_tokenizer, _self).reset()
                # rewind so the same sequence is emitted again
                _self.i = 0

        class _analyzer(PythonAnalyzer):
            # Analyzer whose components consist of a fresh _tokenizer only.
            def createComponents(_self, fieldName):
                return Analyzer.TokenStreamComponents(_tokenizer())
            def initReader(_self, fieldName, reader):
                # the reader is handed back unchanged
                return reader

        writer = self.getWriter(analyzer=_analyzer())

        # The field text is irrelevant: the tokenizer above supplies the tokens.
        d = Document()
        d.add(Field("field", "bogus", TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        reader = searcher.getIndexReader()
        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
        pos.nextDoc()
        # first token should be at position 0
        self.assertEqual(0, pos.nextPosition())

        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
        pos.nextDoc()
        # second token should be at position 2
        self.assertEqual(2, pos.nextPosition())

        # Given the increments, the remaining tokens are expected at
        # positions 3 ("3"), 3 ("4") and 4 ("5"); only the first two
        # positions are asserted directly above.

        # "1" and "2" are not adjacent, so the default positions do not match.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"))
        b.add(Term("field", "2"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # same as previous, just specify positions explicitly.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"), 0)
        b.add(Term("field", "2"), 1)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # specifying correct positions should find the phrase.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"), 0)
        b.add(Term("field", "2"), 2)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "3" directly follows "2" (increment of 1), so this matches.
        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "3"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "4" was emitted with an increment of 0, so it does not follow "3"
        # and consecutive default positions do not match.
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"))
        b.add(Term("field", "4"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # phrase query would find it when correct positions are specified.
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"), 0)
        b.add(Term("field", "4"), 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # phrase query should fail for a non-existing searched term
        # even if another searched term exists at the same searched
        # position.
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"), 0)
        b.add(Term("field", "9"), 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # multi-phrase query should succeed for a non-existing searched term
        # because another searched term exists at the same searched
        # position.

        b = MultiPhraseQuery.Builder()
        b.add([Term("field", "3"), Term("field", "9")], 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # The remaining cases use default (consecutive) query positions.
        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "4"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "3"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "4"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2" and "5" are expected to be more than one position apart.
        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))
Exemple #11
0
    def testSimilarity(self):
        """Verify the scores passed to collectors for term, boolean and phrase queries when SimpleSimilarity is in use."""

        # NOTE(review): SimpleAnalyzer is constructed with a Version
        # argument -- confirm the Lucene release in use accepts it.
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        index_writer = self.getWriter(analyzer=analyzer,
                                      similarity=SimpleSimilarity())

        first_doc = Document()
        first_doc.add(Field("field", "a c", TextField.TYPE_STORED))

        second_doc = Document()
        second_doc.add(Field("field", "a b c", TextField.TYPE_STORED))

        for document in (first_doc, second_doc):
            index_writer.addDocument(document)
        index_writer.commit()
        index_writer.close()

        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        term_a = Term("field", "a")
        term_b = Term("field", "b")
        term_c = Term("field", "c")

        # Single term query: every collected hit must score exactly 1.0.
        class _TermScoreCheck(PythonSimpleCollector):
            def needsScores(_self):
                return True

            def doSetNextReader(_self, context):
                pass

            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

        searcher.search(TermQuery(term_b), _TermScoreCheck())

        # Boolean OR of "a" and "b".
        should_builder = BooleanQuery.Builder()
        for term in (term_a, term_b):
            should_builder.add(TermQuery(term), BooleanClause.Occur.SHOULD)
        either_query = should_builder.build()

        # The expected score is the doc id (offset by the reader's docBase) plus one.
        class _BooleanScoreCheck(PythonSimpleCollector):
            def needsScores(_self):
                return True

            def doSetNextReader(_self, context):
                _self.base = context.docBase

            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)

        searcher.search(either_query, _BooleanScoreCheck())

        # Phrase "a c" with the builder's default slop.
        phrase_builder = PhraseQuery.Builder()
        for term in (term_a, term_c):
            phrase_builder.add(term)
        exact_phrase = phrase_builder.build()

        class _ExactPhraseScoreCheck(PythonSimpleCollector):
            def needsScores(_self):
                return True

            def doSetNextReader(_self, context):
                pass

            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

        searcher.search(exact_phrase, _ExactPhraseScoreCheck())

        # Same builder and terms, now allowing a slop of 2.
        phrase_builder.setSlop(2)
        sloppy_phrase = phrase_builder.build()

        class _SloppyPhraseScoreCheck(PythonSimpleCollector):
            def needsScores(_self):
                return True

            def doSetNextReader(_self, context):
                pass

            def collect(_self, doc, score):
                self.assertEqual(2.0, score)

        searcher.search(sloppy_phrase, _SloppyPhraseScoreCheck())