Example #1
0
 def testIndexRelationTermOutput(self):
     """A single term converts to index=term; multiple terms become a quoted phrase.

     The second conversion of the same expected query shows that
     mixed-case input ('catS Dogs') is normalized to lowercase terms.
     """
     self.assertConversion(TermQuery(Term("animal", "cats")), 'animal=cats')
     query = PhraseQuery()
     query.add(Term("animal", "cats"))
     query.add(Term("animal", "dogs"))
     self.assertConversion(query, 'animal="cats dogs"')
     # same expected query: case is normalized during analysis
     self.assertConversion(query, 'animal="catS Dogs"')
Example #2
0
 def testExcludeUnqualifiedFieldForWhichNoPhraseQueryIsPossibleInCaseOfPhraseQuery(self):
     """A field registered without term frequencies is excluded from phrase queries.

     Both 'unqualified' and 'noTermFreqField' are configured as default
     (unqualified) fields, but only 'unqualified' appears in the expected
     query: 'noTermFreqField' is registered with
     NO_TERMS_FREQUENCY_FIELDTYPE and so cannot take part in a phrase query.
     """
     fieldRegistry = FieldRegistry()
     fieldRegistry.register('noTermFreqField', NO_TERMS_FREQUENCY_FIELDTYPE)
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0), ('noTermFreqField', 2.0)], luceneSettings=LuceneSettings(fieldRegistry=fieldRegistry))
     expected = PhraseQuery()
     expected.add(Term("unqualified", "phrase query"))
     self.assertConversion(expected, '"phrase query"')
Example #3
0
def lucene_sample_query_parse(sampleq, ftypes):
    """Convert a list of 'field<op>value' query strings into one BooleanQuery.

    Each entry of ``sampleq`` is split on snapconf.RANGE_QUERY_OPS into
    (field, operator, value).  Numeric fields (type char 'i' or 'f') become
    range queries, whitespace-containing values become exact phrase queries,
    and everything else becomes a single term query.  All clauses are added
    with BOOLEAN_OCCUR.  Malformed entries are skipped; an unknown operator
    aborts the process.
    """
    bq = BooleanQuery()
    for query_tuple in sampleq:
        (field, op_, value) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        # skip malformed entries instead of failing the whole parse
        if m is None or field is None:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query,exiting\n" % (str(op)))
            sys.exit(-1)
        field_w_type = snapconf.SAMPLE_HEADER_FIELDS_TYPE_MAP[field]
        (fieldtypechar, ftype_method) = ftypes[field_w_type]
        if fieldtypechar == 'i' or fieldtypechar == 'f':
            # range query over a numeric (int/float) field
            bq.add(lucene_range_query_parse(field_w_type, op, value, fieldtypechar, ftype_method), BOOLEAN_OCCUR)
        elif ' ' in value or '\t' in value:
            # phrase query: one lowercased term per whitespace-separated token
            pquery = PhraseQuery()
            for v in re.split(r'\s+', value):
                pquery.add(Term(field_w_type, v.lower()))
            # force exact phrase matching only
            pquery.setSlop(0)
            bq.add(pquery, BOOLEAN_OCCUR)
        else:
            # term query on the single, lowercased value
            bq.add(TermQuery(Term(field_w_type, value.lower())), BOOLEAN_OCCUR)
        sys.stderr.write("value + fields: %s %s\n" % (value.lower(), field_w_type))
    return bq
 def _termOrPhraseQuery(self, index, termString):
     """Build a prefix/term query for a single analyzed token, or a phrase query for several."""
     tokens = self._analyzeToken(termString)
     if len(tokens) == 1:
         term = self._createTerm(index, tokens[0])
         if prefixRegexp.match(termString):
             return PrefixQuery(term)
         return TermQuery(term)
     phrase = PhraseQuery()
     for token in tokens:
         phrase.add(self._createTerm(index, token))
     return phrase
Example #5
0
 def _termOrPhraseQuery(self, index, termString):
     """Return a PhraseQuery when analysis yields several tokens, otherwise a prefix or term query."""
     analyzed = self._analyzeToken(termString)
     if len(analyzed) > 1:
         phrase = PhraseQuery()
         for token in analyzed:
             phrase.add(self._createTerm(index, token))
         return phrase
     only = self._createTerm(index, analyzed[0])
     if prefixRegexp.match(termString):
         return PrefixQuery(only)
     return TermQuery(only)
Example #6
0
    def setUp(self):
        """Index one five-word document and prepare a searcher plus an empty phrase query."""
        super(PhraseQueryTestCase, self).setUp()

        document = Document()
        document.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        indexWriter = self.getWriter()
        indexWriter.addDocument(document)
        indexWriter.close()

        self.searcher = self.getSearcher()
        self.query = PhraseQuery()
Example #7
0
    def runSearch(self, runCount, mainThread=False):
        """ search for runCount number of times """

        # problem: if there are any assertion errors in the child
        #   thread, the calling thread is not notified and may still
        #   consider the test case pass. We are using self.totalQueries
        #   to double check that work has actually been done.

        # Worker threads must attach to the JVM before touching any Java
        # objects; the main thread is assumed to be attached already.
        if not mainThread:
            getVMEnv().attachCurrentThread()
        time.sleep(0.5)

        searcher = self.getSearcher()
        try:
            # NOTE(review): self.query is assigned but never used below —
            # the searches are plain TermQuerys; confirm this is intended.
            self.query = PhraseQuery()
            for word, count in self.testData[0:runCount]:
                query = TermQuery(Term("field", word))
                topDocs = searcher.search(query, 50)
                self.assertEqual(topDocs.totalHits, count)

                # guard the shared counter: several threads run this method
                self.lock.acquire()
                self.totalQueries += 1
                self.lock.release()
        finally:
            del searcher
Example #8
0
    def testExact(self):
        """
        Ensures slop of 0 works for exact matches, but not reversed
        """

        # slop is zero by default: "four five" occurs verbatim in the doc
        for word in ("four", "five"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(1, hits.totalHits, "exact match")

        # reversed order must not match at slop 0
        self.query = PhraseQuery()
        for word in ("two", "one"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(0, hits.totalHits, "reverse not exact")
Example #9
0
    def testOrderDoesntMatter(self):
        """
        As long as slop is at least 2, terms can be reversed
        """

        # slop 2 is the minimum for a reverse-order match
        self.query.setSlop(2)
        for word in ("two", "one"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(1, hits.totalHits, "just sloppy enough")

        # "three ... one" reversed needs more than 2 moves
        self.query = PhraseQuery()
        self.query.setSlop(2)
        for word in ("three", "one"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(0, hits.totalHits, "not sloppy enough")
Example #10
0
    def testSlop1(self):
        """Slop 1 matches in-order terms but cannot reverse a pair."""

        # slop of 1 works with terms in order
        self.query.setSlop(1)
        for word in ("one", "two"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(1, hits.totalHits, "in order")

        # slop of 1 does not work for phrases out of order;
        # reversing adjacent terms needs at least 2.
        self.query = PhraseQuery()
        self.query.setSlop(1)
        for word in ("two", "one"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(0, hits.totalHits, "reversed, slop not 2 or more")
Example #11
0
 def get_phrase_query(self, query, field):
     """Creates phrase query for searching exact phrase."""
     # NOTE: a slop argument on PhraseQuery would allow fuzzy matching;
     # the default builder produces an exact (slop 0) phrase.
     builder = PhraseQuery.Builder()
     for token in query.split():
         builder.add(Term(field, token))
     return builder.build()
Example #12
0
 def testDiacriticsShouldBeNormalizedNFC(self):
     """Input with a combining diacritic should be NFC-normalized before analysis.

     '\\xcc\\x81' is the UTF-8 encoding of U+0301 COMBINING ACUTE ACCENT.
     Un-normalized input yields two terms; NFC-normalized input yields
     the single term 'moree'.
     """
     pq = PhraseQuery()
     pq.add(Term("title", "more"))
     pq.add(Term("title", "e"))
     self.assertConversion(pq, 'title=More\xcc\x81e') # combining acute accent (U+0301)
     from unicodedata import normalize
     # after NFC normalization the accent composes with 'e' and the
     # analyzer produces one term
     self.assertConversion(TermQuery(Term('title', 'moree')), normalize('NFC', unicode('title=More\xcc\x81e')))
Example #13
0
    def testMultipleTerms(self):
        """
        slop is the total number of positional moves allowed
        to line up a phrase
        """

        # "one three five" needs two moves against "one two three four five"
        self.query.setSlop(2)
        for word in ("one", "three", "five"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(1, hits.totalHits, "two total moves")

        # fully reversed, it takes six moves to match this phrase
        self.query = PhraseQuery()
        self.query.setSlop(5)
        for word in ("five", "three", "one"):
            self.query.add(Term("field", word))
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(0, hits.totalHits, "slop of 5 not close enough")

        self.query.setSlop(6)
        hits = self.searcher.search(self.query, 50)
        self.assertEqual(1, hits.totalHits, "slop of 6 just right")
Example #14
0
    def testPhraseQueryWithStopAnalyzer(self):
        """An exact phrase of non-stop words is found in a StopAnalyzer-indexed doc."""
        writer = self.getWriter(analyzer=StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        builder = PhraseQuery.Builder()
        for word in ("stop", "words"):
            builder.add(Term("field", word))
        scoreDocs = searcher.search(builder.build(), 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
 def get_phrase_query(self, query, field):
     """Creates phrase query for searching exact phrase.

     Characters with code points >= 256 (e.g. CJK) are treated as
     individual tokens by appending a space after each one; the rest of
     the text is split on whitespace as usual.  Returns a PhraseQuery
     built from ``field`` and the resulting term list.
     """
     # Build the transformed query with join() instead of repeated
     # string concatenation, and without the dead commented-out code.
     pieces = []
     for ch in query:
         # isolate each non-Latin-1 character as its own token
         pieces.append('%s ' % ch if ord(ch) >= 256 else ch)
     terms = ''.join(pieces).split()
     return PhraseQuery(field, terms)
Example #16
0
    def _parse_query(self, field_name, query):
        """Analyze *query* and build a BooleanQuery of SHOULD phrase clauses.

        The query string is run through ``self.analyzer``; each resulting
        token becomes one PhraseQuery against ``field_name`` (a token that
        contains spaces turns into a multi-term phrase with explicit
        positions).
        """
        # standard token-stream protocol: reset -> incrementToken* -> end -> close
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
        ts.close()

        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            # explicit position i keeps the words of the token in order
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            pq = builder.build()
            # any one phrase clause may match
            booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        final_query = booleanQuery.build()
        return final_query
Example #17
0
    def testPhraseQueryWithStopAnalyzer(self):
        """An exact phrase of non-stop words is found in a StopAnalyzer-indexed doc."""
        writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        query = PhraseQuery()
        for word in ("stop", "words"):
            query.add(Term("field", word))
        scoreDocs = searcher.search(query, None, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
Example #18
0
    def search(self,
               words,
               words_orig,
               stopwords=None,
               min_length=0,
               slop=2,
               remove_digits=False,
               any_one_word_occur=False):
        """Search the wiki-name index with progressively looser queries.

        Stages, each tried only if the previous one found nothing:
          1. sloppy PhraseQuerys over three analyzed name fields (SHOULD);
          2. every word MUST occur in one of the no-punctuation fields;
          3. every original word MUST occur in the plain analyzed field;
          4. (only when stopwords were given and any_one_word_occur is set)
             any single original word may match (SHOULD).

        Returns the scoreDocs of the first stage that produced hits.
        """
        # avoid the mutable-default-argument pitfall; None means "no stopwords"
        if stopwords is None:
            stopwords = []
        words_without_digits = re.sub(r'\w*\d\w*', '',
                                      " ".join(words)).strip().split(" ")
        if remove_digits and len(words_without_digits) > 0:
            words = words_without_digits
        # drop stopwords and too-short words from both word lists
        words = [
            x for x in words
            if x.lower() not in stopwords and len(x) > min_length
        ]
        words_orig = [
            x for x in words_orig
            if x.lower() not in stopwords and len(x) > min_length
        ]

        if len(words) == 0:
            return []
        # stage 1: sloppy phrase match on any of the three analyzed fields
        query = BooleanQuery()
        query1 = PhraseQuery()
        query1.setSlop(slop)
        query2 = PhraseQuery()
        query2.setSlop(slop)
        query3 = PhraseQuery()
        query3.setSlop(slop)
        for word in words:
            query2.add(Term("wiki_name_analyzed_nopunct", word))
            query3.add(Term("wiki_name_analyzed_nopunct_nostop", word))
        for word in words_orig:
            query1.add(Term("wiki_name_analyzed", word))
        query.add(query1, BooleanClause.Occur.SHOULD)
        query.add(query2, BooleanClause.Occur.SHOULD)
        query.add(query3, BooleanClause.Occur.SHOULD)
        scoreDocs = self.searcher.search(query,
                                         self.num_docs_to_return).scoreDocs
        if len(scoreDocs) > 0:
            return scoreDocs
        # stage 2: each word must appear in at least one nopunct field
        query = BooleanQuery()
        for word in words:
            query_word = BooleanQuery()
            query_word.add(TermQuery(Term("wiki_name_analyzed_nopunct", word)),
                           BooleanClause.Occur.SHOULD)
            query_word.add(
                TermQuery(Term("wiki_name_analyzed_nopunct_nostop", word)),
                BooleanClause.Occur.SHOULD)
            query.add(query_word, BooleanClause.Occur.MUST)
        scoreDocs = self.searcher.search(query,
                                         self.num_docs_to_return).scoreDocs
        if len(scoreDocs) > 0:
            return scoreDocs
        # stage 3: all original words required in the analyzed field
        query = BooleanQuery()
        for word in words_orig:
            query.add(TermQuery(Term("wiki_name_analyzed", word)),
                      BooleanClause.Occur.MUST)
        scoreDocs = self.searcher.search(query,
                                         self.num_docs_to_return).scoreDocs
        # stage 4: last resort — any single word may match
        if len(stopwords) > 0 and any_one_word_occur:
            query = BooleanQuery()
            for word in words_orig:
                query.add(TermQuery(Term("wiki_name_analyzed", word)),
                          BooleanClause.Occur.SHOULD)
            # BUG FIX: this query was built but never executed, so the
            # relaxed any-word results were silently discarded.
            scoreDocs = self.searcher.search(query,
                                             self.num_docs_to_return).scoreDocs
        return scoreDocs
    def testSimilarity(self):
        """Exercise a custom SimpleSimilarity through term, boolean and phrase queries.

        Indexes two docs ("a c" and "a b c") and checks the scores that
        PythonCollector subclasses observe for each query type.
        """

        writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                                similarity=SimpleSimilarity())
    
        d1 = Document()
        d1.add(Field("field", "a c", TextField.TYPE_STORED))

        d2 = Document()
        d2.add(Field("field", "a b c", TextField.TYPE_STORED))
    
        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        a = Term("field", "a")
        b = Term("field", "b")
        c = Term("field", "c")

        # plain term query: every hit is expected to score 1.0
        class collector1(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)
            def setNextReader(_self, context):
                pass
            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(TermQuery(b), collector1())

        bq = BooleanQuery()
        bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
        bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

        # boolean query: expected score equals global doc id + 1
        class collector2(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)
            def setNextReader(_self, context):
                _self.base = context.docBase
            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(bq, collector2())

        # exact phrase "a c": expected score 1.0
        pq = PhraseQuery()
        pq.add(a)
        pq.add(c)

        class collector3(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)
            def setNextReader(_self, context):
                pass
            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(pq, collector3())

        # sloppy phrase (slop 2): expected score 2.0
        pq.setSlop(2)

        class collector4(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(2.0, score)
            def setNextReader(_self, context):
                pass
            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(pq, collector4())
    def testSimilarity(self):
        """Exercise a custom SimpleSimilarity through term, boolean and phrase queries.

        Builder-API variant: indexes two docs ("a c" and "a c b") and
        checks the scores observed by PythonSimpleCollector subclasses.
        """

        writer = self.getWriter(analyzer=SimpleAnalyzer(
            Version.LUCENE_CURRENT),
                                similarity=SimpleSimilarity())

        d1 = Document()
        d1.add(Field("field", "a c", TextField.TYPE_STORED))

        d2 = Document()
        d2.add(Field("field", "a c b", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        a = Term("field", "a")
        b = Term("field", "b")
        c = Term("field", "c")

        # plain term query: every hit is expected to score 1.0
        class collector1(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(TermQuery(b), collector1())

        builder = BooleanQuery.Builder()
        builder.add(TermQuery(a), BooleanClause.Occur.SHOULD)
        builder.add(TermQuery(b), BooleanClause.Occur.SHOULD)
        bq = builder.build()

        # boolean query: expected score equals global doc id + 1
        class collector2(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)

            def doSetNextReader(_self, context):
                _self.base = context.docBase

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(bq, collector2())

        # exact phrase "a c": expected score 1.0
        pq = PhraseQuery(a.field(), [a.bytes(), c.bytes()])

        class collector3(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(pq, collector3())

        # sloppy phrase "a b" with slop 2: expected score 0.5
        pq = PhraseQuery(2, a.field(), [a.bytes(), b.bytes()])

        class collector4(PythonSimpleCollector):
            def collect(_self, doc, score):
                self.assertEqual(0.5, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(pq, collector4())
Example #21
0
 def testPhraseOutputDutchStemming(self):
     """The Dutch stemming analyzer reduces plural phrase terms to their stems.

     '"katten honden"' (cats, dogs) is expected to stem to 'kat' / 'hond'.
     """
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=LuceneSettings(analyzer=MerescoDutchStemmingAnalyzer()))
     query = PhraseQuery()
     query.add(Term("unqualified", "kat"))
     query.add(Term("unqualified", "hond"))
     self.assertConversion(query, '"katten honden"')
Example #22
0
 def testOneTermPhraseQueryUsesStandardAnalyzed(self):
     """A single input token that the analyzer splits ('aap:noot') becomes a phrase query.

     The colon-joined input is one term to the caller, but standard
     analysis yields two terms, hence the expected PhraseQuery.
     """
     expected = PhraseQuery()
     expected.add(Term('unqualified', 'aap'))
     expected.add(Term('unqualified', 'noot'))
     self.assertConversion(expected, 'aap:noot')
Example #23
0
 def testPhraseOutput(self):
     """A quoted pair of words converts to a two-term phrase query."""
     query = PhraseQuery()
     for word in ("cats", "dogs"):
         query.add(Term("unqualified", word))
     self.assertConversion(query, '"cats dogs"')
    def testSetPosition(self):
        """Verify phrase/multi-phrase matching against custom token positions.

        A custom tokenizer emits tokens "1".."5" with position increments
        [1, 2, 1, 0, 1], i.e. a gap before "3" and tokens "3"/"4" sharing
        one position.  The assertions then check which (positioned) phrase
        queries can or cannot match.
        """

        class _tokenizer(PythonTokenizer):
            def __init__(_self, reader):
                super(_tokenizer, _self).__init__(reader)

                _self.TOKENS = ["1", "2", "3", "4", "5"]
                _self.INCREMENTS = [1, 2, 1, 0, 1]
                _self.i = 0
                _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
                _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
                _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

            def incrementToken(_self):
                if _self.i == len(_self.TOKENS):
                    return False

                _self.clearAttributes()
                _self.termAtt.append(_self.TOKENS[_self.i])
                _self.offsetAtt.setOffset(_self.i, _self.i)
                _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
                _self.i += 1

                return True

            def end(_self):
                pass
            def reset(_self):
                pass
            def close(_self):
                pass

        class _analyzer(PythonAnalyzer):
            def createComponents(_self, fieldName, reader):
                return Analyzer.TokenStreamComponents(_tokenizer(reader))

        writer = self.getWriter(analyzer=_analyzer())

        d = Document()
        d.add(Field("field", "bogus", TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        reader = searcher.getIndexReader()
        pos = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "field", BytesRef("1"))
        pos.nextDoc()
        # first token should be at position 0
        self.assertEqual(0, pos.nextPosition())
    
        pos = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "field", BytesRef("2"))
        pos.nextDoc()
        # second token should be at position 2 (increment of 2 left a gap)
        self.assertEqual(2, pos.nextPosition())
    
        # "1 2" as adjacent phrase does not match (gap between them)
        q = PhraseQuery()
        q.add(Term("field", "1"))
        q.add(Term("field", "2"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # same as previous, just specify positions explicitly.
        q = PhraseQuery() 
        q.add(Term("field", "1"), 0)
        q.add(Term("field", "2"), 1)
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # specifying correct positions should find the phrase.
        q = PhraseQuery()
        q.add(Term("field", "1"), 0)
        q.add(Term("field", "2"), 2)
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2 3" are adjacent, so the default positions match
        q = PhraseQuery()
        q.add(Term("field", "2"))
        q.add(Term("field", "3"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "3" and "4" share a position, so they are not adjacent
        q = PhraseQuery()
        q.add(Term("field", "3"))
        q.add(Term("field", "4"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # phrase query would find it when correct positions are specified. 
        q = PhraseQuery()
        q.add(Term("field", "3"), 0)
        q.add(Term("field", "4"), 0)
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # phrase query should fail for non existing searched term 
        # even if there exist another searched terms in the same searched
        # position.
        q = PhraseQuery()
        q.add(Term("field", "3"), 0)
        q.add(Term("field", "9"), 0)
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # multi-phrase query should succeed for non existing searched term
        # because there exist another searched terms in the same searched
        # position.

        mq = MultiPhraseQuery()
        mq.add([Term("field", "3"), Term("field", "9")], 0)
        hits = searcher.search(mq, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2 4": "4" shares "3"'s position, directly after "2"
        q = PhraseQuery()
        q.add(Term("field", "2"))
        q.add(Term("field", "4"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        q = PhraseQuery()
        q.add(Term("field", "3"))
        q.add(Term("field", "5"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        q = PhraseQuery()
        q.add(Term("field", "4"))
        q.add(Term("field", "5"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2 5" are two positions apart — no adjacent match
        q = PhraseQuery()
        q.add(Term("field", "2"))
        q.add(Term("field", "5"))
        hits = searcher.search(q, None, 1000).scoreDocs
        self.assertEqual(0, len(hits))
Example #25
0
 def testPhraseQueryIsStandardAnalyzed(self):
     """Quoted input is tokenized by the standard analyzer into phrase terms.

     Per the expected terms: dotted tokens like 'vol.118' stay single
     terms, parentheses are stripped, and text is lowercased.
     """
     expected = PhraseQuery()
     for term in ["vol.118", "2008", "nr.3", "march", "p.435-444"]:
         expected.add(Term("unqualified", term))
     input = '"vol.118 (2008) nr.3 (March) p.435-444"'
     self.assertConversion(expected, input)
 def get_phrase_query(self, query, field):
     """Creates phrase query for searching exact phrase."""
     phrase = PhraseQuery()
     for token in query.split():
         phrase.add(Term(field, token))
     return phrase
Example #27
0
 def testStandardAnalyserWithoutStopWords(self):
     """The configured analyzer performs no stop-word removal.

     Common words like 'no', 'is', 'the' all appear in the expected
     phrase query.
     """
     expected = PhraseQuery()
     for term in ["no", "is", "the", "only", "option"]:
         expected.add(Term("unqualified", term))
     self.assertConversion(expected, '"no is the only option"')
    def testPhraseQueryInConjunctionScorer(self):
        """Phrase queries combined with term queries under MUST clauses.

        First round: two docs share the phrase "marketing info", one also
        has contents 'foobar'.  Second round: three docs around the phrase
        "map entry", combined both ways with a 'woo' term query.
        """

        writer = self.getWriter()
    
        doc = Document()
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)
    
        doc = Document()
        doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)
    
        writer.close()
        
        searcher = self.getSearcher()
    
        # the phrase alone matches both docs
        phraseQuery = PhraseQuery()
        phraseQuery.add(Term("source", "marketing"))
        phraseQuery.add(Term("source", "info"))
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
    
        # conjunction with the term query narrows it to one doc
        termQuery = TermQuery(Term("contents","foobar"))
        booleanQuery = BooleanQuery()
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(1, topDocs.totalHits)
    
        writer = self.getWriter()
        
        doc = Document()
        doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "map foobarword entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()
        
        searcher = self.getSearcher()
    
        termQuery = TermQuery(Term("contents", "woo"))
        phraseQuery = PhraseQuery()
        phraseQuery.add(Term("contents", "map"))
        phraseQuery.add(Term("contents", "entry"))
    
        # 'woo' occurs in all three docs, the exact phrase in only two
        topDocs = searcher.search(termQuery, 50)
        self.assertEqual(3, topDocs.totalHits)
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
    
        # conjunction result is the same regardless of clause order
        booleanQuery = BooleanQuery()
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
    
        booleanQuery = BooleanQuery()
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
Example #29
0
    def testSetPosition(self):
        """Verify phrase/multi-phrase matching against custom token positions.

        Builder-API variant: a custom tokenizer emits tokens "1".."5" with
        position increments [1, 2, 1, 0, 1] — a gap before "3" and tokens
        "3"/"4" sharing one position — then the assertions check which
        (positioned) phrase queries can or cannot match.
        """

        class _tokenizer(PythonTokenizer):
            def __init__(_self):
                super(_tokenizer, _self).__init__()

                _self.TOKENS = ["1", "2", "3", "4", "5"]
                _self.INCREMENTS = [1, 2, 1, 0, 1]
                _self.i = 0
                _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
                _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
                _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

            def incrementToken(_self):
                if _self.i == len(_self.TOKENS):
                    return False

                _self.clearAttributes()
                _self.termAtt.append(_self.TOKENS[_self.i])
                _self.offsetAtt.setOffset(_self.i, _self.i)
                _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
                _self.i += 1

                return True

            def reset(_self):
                super(_tokenizer, _self).reset()
                _self.i = 0

        class _analyzer(PythonAnalyzer):
            def createComponents(_self, fieldName):
                return Analyzer.TokenStreamComponents(_tokenizer())
            def initReader(_self, fieldName, reader):
                return reader

        writer = self.getWriter(analyzer=_analyzer())

        d = Document()
        d.add(Field("field", "bogus", TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        reader = searcher.getIndexReader()
        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
        pos.nextDoc()
        # first token should be at position 0
        self.assertEqual(0, pos.nextPosition())

        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
        pos.nextDoc()
        # second token should be at position 2 (increment of 2 left a gap)
        self.assertEqual(2, pos.nextPosition())

        # "1 2" as adjacent phrase does not match (gap between them)
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"))
        b.add(Term("field", "2"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # same as previous, just specify positions explicitly.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"), 0)
        b.add(Term("field", "2"), 1)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # specifying correct positions should find the phrase.
        b = PhraseQuery.Builder()
        b.add(Term("field", "1"), 0)
        b.add(Term("field", "2"), 2)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2 3" are adjacent, so the default positions match
        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "3"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "3" and "4" share a position, so they are not adjacent
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"))
        b.add(Term("field", "4"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # phrase query would find it when correct positions are specified. 
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"), 0)
        b.add(Term("field", "4"), 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # phrase query should fail for non existing searched term 
        # even if there exist another searched terms in the same searched
        # position.
        b = PhraseQuery.Builder()
        b.add(Term("field", "3"), 0)
        b.add(Term("field", "9"), 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))

        # multi-phrase query should succeed for non existing searched term
        # because there exist another searched terms in the same searched
        # position.

        b = MultiPhraseQuery.Builder()
        b.add([Term("field", "3"), Term("field", "9")], 0)
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2 4": "4" shares "3"'s position, directly after "2"
        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "4"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "3"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        b = PhraseQuery.Builder()
        b.add(Term("field", "4"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(1, len(hits))

        # "2 5" are two positions apart — no adjacent match
        b = PhraseQuery.Builder()
        b.add(Term("field", "2"))
        b.add(Term("field", "5"))
        hits = searcher.search(b.build(), 1000).scoreDocs
        self.assertEqual(0, len(hits))
# Fuzzy match on 'shakespeare' over the capital_html field.
query1b = Term("capital_html","shakespeare")

fuzzy_query_b = FuzzyQuery(query1b)

get_query_results(reader,fuzzy_query_b,n_docs,"capital")

#Results:
#Found 4 hits:
#1. https://en.wikipedia.org/wiki/London
#2. https://en.wikipedia.org/wiki/Prague
#3. https://en.wikipedia.org/wiki/Cairo
#4. https://en.wikipedia.org/wiki/Washington,_D.C.

#### part(c)

# Phrase search for "located below sea level", allowing up to 10
# positional moves between the terms (setSlop(10)).
phrase_c = PhraseQuery()
phrase_c.setSlop(10)

term_phrase_c = 'located below sea level'
token_phrase_c = lucene_english_normalizer(term_phrase_c)

def get_phrase(token_phrase):
    # NOTE(review): mutates the global phrase_c instead of taking the
    # query as a parameter; encode('ascii', 'ignore') silently drops
    # any non-ASCII characters from the tokens.
    for word in token_phrase:
        term = Term('capital_html', word.encode('ascii', 'ignore'))
        phrase_c.add(term)

get_phrase(token_phrase_c)
get_query_results(reader,phrase_c,n_docs,'capital')

#Found 1 hits:
#Results:
Example #31
0
    # Regex query: terms that look like (possibly decimal) numbers.
    q_regex = RegexpQuery(Term("contents", "[0-9]+\.?[0-9]*"))
    print(f'regex results: {searcher.search(q_regex,1000000).totalHits}')

    # Span query: a number within 20 positions before "tiger" (in order).
    # SpanMultiTermQueryWrapper lets multi-term queries participate in spans.
    span1 = SpanMultiTermQueryWrapper(q_regex)
    span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
    spannearquery = SpanNearQuery([span1, span2], 20, True)
    print(
        f'spanquery results: {searcher.search(spannearquery, 1000000).totalHits}'
    )

    # Exact phrase via the query parser (slop 0 by default).
    parser = QueryParser('contents', StandardAnalyzer())
    q = parser.parse('"tiger leopard"')
    print(q)  # prints contents:"tiger leopard"
    print(searcher.search(q, 10000000).totalHits)

    # Same phrase built programmatically; first argument is presumably the
    # slop (10) -- TODO confirm against this PhraseQuery constructor.
    phrase_query = PhraseQuery(10, 'contents', 'tiger leopard')
    print(phrase_query)
    print(searcher.search(phrase_query, 10000000).totalHits)

    # Equivalent sloppy phrase via parser syntax: "..."~10.
    parser = QueryParser('contents', StandardAnalyzer())
    q = parser.parse('"tiger leopard"~10')
    print(q)  # prints contents:"tiger leopard"~10
    print(searcher.search(q, 10000000).totalHits)

    # Walk every stored document to post-process its contents.
    for i in range(0, reader.numDocs()):
        doc = reader.document(i)
        text = doc.get("contents")
        articleID = doc.get("articleID")
        # Do your pattern matching and record patterns for document
    # NOTE(review): the bare expression below is a no-op; it looks like a
    # truncation artifact of the original snippet.
    articleID
Example #32
0
class PhraseQueryTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene.

    Exercises PhraseQuery semantics -- exact matching, slop (the total
    number of positional moves allowed), term order, stop-word handling,
    and interaction with BooleanQuery -- against a tiny index.
    """

    def setUp(self):
        super(PhraseQueryTestCase, self).setUp()

        # One document whose terms sit at positions 0..4, so every
        # positional distance asserted below is easy to reason about.
        doc = Document()
        doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        writer = self.getWriter()
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
        self.query = PhraseQuery()

    def testNotCloseEnough(self):
        # "one" and "five" are 4 positions apart; slop 2 cannot bridge that.
        self.query.setSlop(2)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits)

    def testBarelyCloseEnough(self):
        # Slop 3 is exactly the number of moves needed to line up
        # "one ... five" (positions 0 and 4, expected gap 1).
        self.query.setSlop(3)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits)

    def testExact(self):
        """
        Ensures slop of 0 works for exact matches, but not reversed
        """

        # slop is zero by default
        self.query.add(Term("field", "four"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "exact match")

        # Reversed order is not an exact match at slop 0.
        self.query = PhraseQuery()
        self.query.add(Term("field", "two"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "reverse not exact")

    def testSlop1(self):

        # Ensures slop of 1 works with terms in order.
        self.query.setSlop(1)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "two"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "in order")

        # Ensures slop of 1 does not work for phrases out of order;
        # a transposition costs 2 moves, so slop must be at least 2.
        self.query = PhraseQuery()
        self.query.setSlop(1)
        self.query.add(Term("field", "two"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "reversed, slop not 2 or more")

    def testOrderDoesntMatter(self):
        """
        As long as slop is at least 2, terms can be reversed
        """

        self.query.setSlop(2)  # must be at least two for reverse order match
        self.query.add(Term("field", "two"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "just sloppy enough")

        # "three ... one" reversed spans a wider gap; slop 2 is not enough.
        self.query = PhraseQuery()
        self.query.setSlop(2)
        self.query.add(Term("field", "three"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "not sloppy enough")

    def testMultipleTerms(self):
        """
        slop is the total number of positional moves allowed
        to line up a phrase
        """

        # "one three five" needs one move per gap: two total.
        self.query.setSlop(2)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "three"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "two total moves")

        self.query = PhraseQuery()
        self.query.setSlop(5)  # it takes six moves to match this phrase
        self.query.add(Term("field", "five"))
        self.query.add(Term("field", "three"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "slop of 5 not close enough")

        self.query.setSlop(6)
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "slop of 6 just right")

    def testPhraseQueryWithStopAnalyzer(self):

        # Index with a StopAnalyzer so stop words are dropped at index time.
        writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        query = PhraseQuery()
        query.add(Term("field", "stop"))
        query.add(Term("field", "words"))
        # Use the two-argument search(query, n) overload for consistency
        # with the rest of this class; the legacy (query, filter, n) form
        # with a None filter was removed in later Lucene versions.
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))

    def testPhraseQueryInConjunctionScorer(self):

        writer = self.getWriter()

        # Two docs share "marketing info" in "source"; only the second
        # also has "foobar" in "contents".
        doc = Document()
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()

        searcher = self.getSearcher()

        phraseQuery = PhraseQuery()
        phraseQuery.add(Term("source", "marketing"))
        phraseQuery.add(Term("source", "info"))
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        # Conjunction of a TermQuery and the PhraseQuery narrows to one doc.
        termQuery = TermQuery(Term("contents", "foobar"))
        booleanQuery = BooleanQuery()
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(1, topDocs.totalHits)

        # Second batch: three "contents" docs to test phrase + term
        # conjunctions in both clause orders.
        writer = self.getWriter()

        doc = Document()
        doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "map foobarword entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()

        searcher = self.getSearcher()

        termQuery = TermQuery(Term("contents", "woo"))
        phraseQuery = PhraseQuery()
        phraseQuery.add(Term("contents", "map"))
        phraseQuery.add(Term("contents", "entry"))

        topDocs = searcher.search(termQuery, 50)
        self.assertEqual(3, topDocs.totalHits)
        # "map entry" is adjacent only in the first two docs.
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        booleanQuery = BooleanQuery()
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        # Clause order must not affect the result.
        booleanQuery = BooleanQuery()
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
Example #33
0
    def testPhraseQueryInConjunctionScorer(self):
        """Phrase queries must work both standalone and as MUST clauses
        inside a BooleanQuery, regardless of clause order."""

        # First batch: two docs share "marketing info"; only the second
        # also carries "foobar".
        writer = self.getWriter()
        for field_values in (
            (("source", "marketing info"),),
            (("contents", "foobar"), ("source", "marketing info")),
        ):
            doc = Document()
            for field_name, field_text in field_values:
                doc.add(Field(field_name, field_text, TextField.TYPE_STORED))
            writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        pq = PhraseQuery.Builder()
        pq.add(Term("source", "marketing"))
        pq.add(Term("source", "info"))
        phrase = pq.build()
        self.assertEqual(2, searcher.search(phrase, 50).totalHits)

        # Conjunction with the "foobar" term narrows the result to one doc.
        foobar = TermQuery(Term("contents", "foobar"))
        conj = BooleanQuery.Builder()
        conj.add(foobar, BooleanClause.Occur.MUST)
        conj.add(phrase, BooleanClause.Occur.MUST)
        self.assertEqual(1, searcher.search(conj.build(), 50).totalHits)

        # Second batch: "map entry" is adjacent in two of the three docs,
        # while "woo" appears in all three.
        writer = self.getWriter()
        for contents_text in (
            "map entry woo",
            "woo map entry",
            "map foobarword entry woo",
        ):
            doc = Document()
            doc.add(Field("contents", contents_text, TextField.TYPE_STORED))
            writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        woo = TermQuery(Term("contents", "woo"))
        pq = PhraseQuery.Builder()
        pq.add(Term("contents", "map"))
        pq.add(Term("contents", "entry"))

        self.assertEqual(3, searcher.search(woo, 50).totalHits)
        phrase = pq.build()
        self.assertEqual(2, searcher.search(phrase, 50).totalHits)

        # The conjunction result must be independent of clause order.
        conj = BooleanQuery.Builder()
        conj.add(woo, BooleanClause.Occur.MUST)
        conj.add(phrase, BooleanClause.Occur.MUST)
        self.assertEqual(2, searcher.search(conj.build(), 50).totalHits)

        conj = BooleanQuery.Builder()
        conj.add(phrase, BooleanClause.Occur.MUST)
        conj.add(woo, BooleanClause.Occur.MUST)
        self.assertEqual(2, searcher.search(conj.build(), 50).totalHits)
Example #34
0
 def search(self, value, stopwords=(), min_length=0):
     """Search the wiki-name index for *value*.

     Tokenizes *value*, dropping tokens in *stopwords* or not longer than
     *min_length*, then tries a sloppy phrase match (slop=2) against the
     three analyzed name fields.  If that yields nothing, falls back to a
     bag-of-words query requiring every token in at least one field.

     Note: the default for *stopwords* is an immutable tuple rather than a
     mutable list (mutable default arguments are shared across calls).

     Returns the (possibly empty) list of scoreDocs.
     """
     words = [
         token for token in nltk.word_tokenize(value)
         if token not in stopwords and len(token) > min_length
     ]

     fields = (
         "wiki_name_analyzed",
         "wiki_name_analyzed_nopunct",
         "wiki_name_analyzed_nopunct_nostop",
     )

     # First pass: a sloppy phrase per field, OR-ed together.
     query = BooleanQuery()
     for field in fields:
         phrase = PhraseQuery()
         phrase.setSlop(2)
         for word in words:
             phrase.add(Term(field, word))
         query.add(phrase, BooleanClause.Occur.SHOULD)
     scoreDocs = self.searcher.search(query,
                                      self.num_docs_to_return).scoreDocs
     if scoreDocs:
         #self.printDocs(scoreDocs)
         return scoreDocs

     # Fallback: every word MUST appear in at least one of the fields.
     query = BooleanQuery()
     for word in words:
         query_word = BooleanQuery()
         for field in fields:
             query_word.add(TermQuery(Term(field, word)),
                            BooleanClause.Occur.SHOULD)
         query.add(query_word, BooleanClause.Occur.MUST)
     return self.searcher.search(query,
                                 self.num_docs_to_return).scoreDocs
Example #35
0
 def get_phrase_query(self, query, field):
     """Build a PhraseQuery that matches the exact token sequence of
     *query* (whitespace-split) in *field*."""
     phrase = PhraseQuery()
     for token in query.split():
         phrase.add(Term(field, token))
     return phrase
Example #36
0
    def testSimilarity(self):
        """Check that a custom SimpleSimilarity drives scoring for term,
        boolean, exact-phrase and sloppy-phrase queries, observing the
        scores through PythonCollector callbacks."""

        # Index with the custom similarity so stored norms match it.
        writer = self.getWriter(analyzer=SimpleAnalyzer(
            Version.LUCENE_CURRENT),
                                similarity=SimpleSimilarity())

        # d1 contains "a c"; d2 contains "a b c".
        d1 = Document()
        d1.add(Field("field", "a c", TextField.TYPE_STORED))

        d2 = Document()
        d2.add(Field("field", "a b c", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.commit()
        writer.close()

        # The searcher must use the same similarity as the writer.
        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        a = Term("field", "a")
        b = Term("field", "b")
        c = Term("field", "c")

        # TermQuery(b) matches only d2; SimpleSimilarity presumably yields
        # a flat score of 1.0 per matching term -- confirm against its
        # implementation elsewhere in the suite.
        class collector1(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def setNextReader(_self, context):
                pass

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(TermQuery(b), collector1())

        bq = BooleanQuery()
        bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
        bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

        # Expected score equals the segment-global doc id plus one:
        # presumably d1 (doc 0) matches one SHOULD clause, d2 (doc 1)
        # matches two -- verify against SimpleSimilarity's coordination.
        class collector2(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)

            def setNextReader(_self, context):
                # Remember the segment's doc-id base to globalize `doc`.
                _self.base = context.docBase

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(bq, collector2())

        # Exact phrase "a c": adjacent only in d1.
        pq = PhraseQuery()
        pq.add(a)
        pq.add(c)

        class collector3(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def setNextReader(_self, context):
                pass

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(pq, collector3())

        # With slop 2 the phrase also matches d2 ("a b c"); the expected
        # score of 2.0 presumably reflects both phrase positions under
        # SimpleSimilarity -- confirm against its sloppyFreq.
        pq.setSlop(2)

        class collector4(PythonCollector):
            def collect(_self, doc, score):
                self.assertEqual(2.0, score)

            def setNextReader(_self, context):
                pass

            def acceptsDocsOutOfOrder(_self):
                return True

        searcher.search(pq, collector4())