コード例 #1
0
    def testTerm(self):
        """Single-term searches return the expected hit counts."""

        searcher = IndexSearcher(self.directory, True)

        # "ant" appears in exactly one indexed subject.
        antQuery = TermQuery(Term("subject", "ant"))
        antHits = searcher.search(antQuery, 50).scoreDocs
        self.assertEqual(1, len(antHits), "JDwA")

        # "junit" appears in two.
        junitQuery = TermQuery(Term("subject", "junit"))
        junitHits = searcher.search(junitQuery, 50).scoreDocs
        self.assertEqual(2, len(junitHits))

        searcher.close()
コード例 #2
0
    def testSecurityFilter(self):
        """A QueryWrapperFilter restricts results to one document owner."""

        searcher = IndexSearcher(self.directory, True)
        infoQuery = TermQuery(Term("keywords", "info"))

        # Unfiltered: both documents match the keyword query.
        self.assertEqual(2, searcher.search(infoQuery, 50).totalHits,
                         "Both documents match")

        # Wrap an owner query as a filter and search again.
        ownerFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))
        hits = searcher.search(infoQuery, ownerFilter, 50).scoreDocs

        self.assertEqual(1, len(hits))
        self.assertEqual("jakes sensitive info",
                         searcher.doc(hits[0].doc).get("keywords"),
                         "elwood is safe")
コード例 #3
0
    def search(self, topic):
        """Expand a topic query with top-scoring index terms and re-search.

        Parses the topic title, harvests terms from the matching
        documents' title/heading/text fields, keeps the 25 best-scoring
        (field, term) pairs, and ORs them onto the heavily boosted
        original query before running the final search.

        topic -- object with a .title string to parse
        Returns the TopDocs of the expanded search (up to 5000 hits).
        """
        query = self.query_parser.parse(topic.title)
        results = self.searcher.search(query, self.top_n)

        # Accumulate one ScorePair per distinct (field, term) seen in hits.
        score_pairs = {}
        for hit in results.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            for field in ["title", "heading", "text"]:
                # doc.get() returns None for a missing field; treat that
                # as "no terms" instead of raising AttributeError on split().
                terms = (doc.get(field) or "").split()
                for term in terms:
                    if (field, term) in score_pairs:
                        score_pairs[(field, term)].increment()
                    else:
                        score_pairs[(field, term)] = ScorePair(
                            self.reader, field, term)

        # Keep the 25 highest-scoring pairs.
        top_terms = sorted(score_pairs.values(),
                           key=lambda x: x.score(), reverse=True)[:25]

        # Boost the original query so it dominates the expansion terms.
        bq = BooleanQuery()
        query.setBoost(float(10000000))
        bq.add(query, BooleanClause.Occur.SHOULD)
        for score_pair in top_terms:
            bq.add(TermQuery(score_pair.to_term()),
                   BooleanClause.Occur.SHOULD)

        return self.searcher.search(bq, 5000)
コード例 #4
0
ファイル: ChineseTest.py プロジェクト: lauromoraes/pylucene
    def testChinese(self):
        """A single CJK character is searchable as one term."""

        searcher = IndexSearcher(self.directory, True)
        taoQuery = TermQuery(Term("contents", "道"))
        hits = searcher.search(taoQuery, 50).scoreDocs

        self.assertEqual(1, len(hits), "tao")
コード例 #5
0
    def purgeDocuments(self,
                       txn,
                       counter,
                       indexSearcher,
                       indexReader,
                       uItem,
                       toVersion=None):
        """Delete index documents belonging to item *uItem*.

        With toVersion=None every document for the item is removed in one
        call.  Otherwise only documents whose version is <= *toVersion*
        and whose value is no longer referenced are deleted one by one.
        counter.documentCount is incremented by the number of deletions.
        """
        term = Term("item", uItem.str64())

        if toVersion is None:
            # Fast path: drop every document indexed for this item.
            counter.documentCount += indexReader.deleteDocuments(term)

        else:
            # Values still referenced at toVersion must be kept.
            x, keep = self.store._items.findValues(None, toVersion, uItem,
                                                   None, True)
            keep = set(keep)

            for hit in indexSearcher.search(TermQuery(term)):
                hit = Hit.cast_(hit)

                doc = hit.getDocument()
                ver = long(doc['version'])
                # Delete only obsolete versions whose value is unreferenced.
                if ver <= toVersion and UUID(doc['value']) not in keep:
                    indexReader.deleteDocument(hit.getId())
                    counter.documentCount += 1
コード例 #6
0
ファイル: FacetExample.py プロジェクト: Riolu/Project_Set
 def searchWithDrillDown(cls, indexReader, taxoReader):
     """
     Search an index with facets drill-down.

     indexReader -- reader over the main document index
     taxoReader  -- reader over the facet taxonomy index
     returns a List<FacetResult>
     """
     # base query the user is interested in
     baseQuery = TermQuery(Term(TEXT, "white"))
     # facet of interest
     facetRequest = CountFacetRequest(createCategoryPath(["root", "a"]), 10)
     # initial search - all docs matching the base query will contribute to the accumulation
     res1 = cls.searchWithRequest(indexReader, taxoReader, None,
                                  facetRequest)
     # a single result (because there was a single request)
     fres = res1.get(0)
     # assume the user is interested in the second sub-result
     # (just take the second sub-result returned by the iterator - we know there are 3 results!)
     subResults = fres.getFacetResultNode().getSubResults()
     # NOTE: .getSubResults() yields an "Iterable<? extends FacetResultNode>:"
     #  the elements of this iterator are of type Object and need to be casted to
     #  FacetResultNode by calling FacetResultNode.cast_(obj) first
     resIterator = subResults.iterator()
     resIterator.next()  # skip first result
     resultNode = resIterator.next()
     resultNode = FacetResultNode.cast_(resultNode)
     categoryOfInterest = resultNode.getLabel()
     # drill-down preparation: turn the base query into a drill-down query for the category of interest
     query2 = DrillDown.query(baseQuery, [
         categoryOfInterest,
     ])
     # that's it - search with the new query and we're done!
     # only documents both matching the base query AND containing the
     # category of interest will contribute to the new accumulation
     return cls.searchWithRequestAndQuery(query2, indexReader, taxoReader,
                                          None, facetRequest)
コード例 #7
0
ファイル: HighlightIt.py プロジェクト: lauromoraes/pylucene
    def main(cls, argv):
        """Highlight occurrences of 'ipsum' in cls.text and emit a small
        HTML page (with inline CSS) to stdout."""

        scorer = QueryScorer(TermQuery(Term("f", "ipsum")))
        highlighter = Highlighter(
            SimpleHTMLFormatter("<span class=\"highlight\">", "</span>"),
            scorer)
        highlighter.setTextFragmenter(SimpleFragmenter(50))

        # Re-analyze the raw text so the highlighter can find term offsets.
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        stream = analyzer.tokenStream("f", StringReader(cls.text))
        fragments = highlighter.getBestFragments(stream, cls.text, 5, "...")

        # Emit the page piece by piece, exactly as the original layout.
        for piece in ("<html>",
                      "<style>\n",
                      ".highlight {\n",
                      " background: yellow\n",
                      "}\n",
                      "</style>",
                      "<body>",
                      fragments,
                      "</body></html>\n"):
            stdout.write(piece)
        stdout.flush()
コード例 #8
0
    def testKeyword(self):
        """An ISBN keyword field matches as one exact term."""

        searcher = IndexSearcher(self.directory, True)
        isbnQuery = TermQuery(Term("isbn", "1930110995"))
        hits = searcher.search(isbnQuery, 50).scoreDocs
        self.assertEqual(1, len(hits), "JUnit in Action")
コード例 #9
0
    def undoDocuments(self, indexSearcher, indexReader, uItem, version):
        """Delete the index documents recorded for item *uItem* at exactly
        *version*, undoing that version's indexing."""

        term = Term("item", uItem.str64())

        for hit in indexSearcher.search(TermQuery(term)):
            # Hits come back as plain Objects; cast to access Hit methods.
            hit = Hit.cast_(hit)
            if long(hit.getDocument()['version']) == version:
                indexReader.deleteDocument(hit.getId())
コード例 #10
0
    def testPhraseQuery(self):
        """QueryParser analyzes phrase text and simplifies trivial phrases."""

        # Stop words and case are stripped by the analyzer inside a phrase.
        parser24 = QueryParser(Version.LUCENE_24, "field",
                               StandardAnalyzer(Version.LUCENE_24))
        parsed = parser24.parse('"This is Some Phrase*"')
        self.assertEqual('"some phrase"', parsed.toString("field"),
                         "analyzed")

        # A one-term "phrase" collapses to a plain TermQuery.
        parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                             self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(parsed), "reduced to TermQuery")
コード例 #11
0
    def main(cls, argv):
        """Open a Berkeley DB backed Lucene directory and count the
        documents whose contents contain "fox".

        argv -- [program, dbHome]; dbHome is the DB environment directory.
        Prints a usage line and returns when the argument count is wrong.
        """
        if len(argv) != 2:
            print "Usage: BerkeleyDbSearcher <index dir>"
            return

        dbHome = argv[1]

        env = DBEnv()
        # Keep transaction logs in memory; this read-only example never
        # needs recovery.
        env.set_flags(DB_LOG_INMEMORY, 1);
        # 64MB cache on Windows and Linux only.
        if os.name == 'nt':
            env.set_cachesize(0, 0x4000000, 1)
        elif os.name == 'posix':
            from commands import getstatusoutput
            if getstatusoutput('uname') == (0, 'Linux'):
                env.set_cachesize(0, 0x4000000, 1)

        env.open(dbHome, (DB_THREAD |
                          DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

        index = DB(env)
        blocks = DB(env)
        txn = None

        # First transaction: open the two BTree databases backing the
        # Lucene directory.  Commit on success, abort and re-raise on error.
        try:
            txn = env.txn_begin(None)
            index.open(filename = '__index__', dbtype = DB_BTREE,
                       flags = DB_THREAD, txn = txn)
            blocks.open(filename = '__blocks__', dbtype = DB_BTREE,
                        flags = DB_THREAD, txn = txn)
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            txn = None

        # Second transaction: run the search.  It is aborted even on the
        # success path because the search modifies nothing.
        try:
            txn = env.txn_begin(None)
            directory = DbDirectory(txn, index, blocks, 0)
            searcher = IndexSearcher(directory, True)

            topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
            print topDocs.totalHits, "document(s) found"
            searcher.close()
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.abort()

            index.close()
            blocks.close()
            env.close()
コード例 #12
0
    def getHitCount(self, fieldName, searchString):
        """Return how many documents contain *searchString* in *fieldName*
        (capped at 50)."""

        searcher = IndexSearcher(self.dir, True)
        matches = searcher.search(
            TermQuery(Term(fieldName, searchString)), 50).scoreDocs
        count = len(matches)
        searcher.close()

        return count
コード例 #13
0
    def testPhraseQuery(self):
        """CustomQueryParser turns quoted phrases into SpanNearQuery."""

        parser = CustomQueryParser("field", self.analyzer)

        # A lone term stays a TermQuery.
        self.assert_(TermQuery.instance_(parser.parse("singleTerm")),
                     "TermQuery")

        # A quoted phrase becomes a SpanNearQuery.
        self.assert_(SpanNearQuery.instance_(parser.parse("\"a phrase\"")),
                     "SpanNearQuery")
コード例 #14
0
ファイル: FacetExample.py プロジェクト: Riolu/Project_Set
 def searchWithRequest(cls, indexReader, taxoReader, indexingParams,
                       facetRequest):
     """
     Search an index with facets for given facet requests.

     Uses a fixed base query matching documents containing "white".
     returns a List<FacetResult>
     """
     baseQuery = TermQuery(Term(TEXT, "white"))
     return cls.searchWithRequestAndQuery(baseQuery, indexReader, taxoReader,
                                          indexingParams, facetRequest)
コード例 #15
0
    def testToString(self):
        """BooleanQuery.toString marks MUST with '+' and prefixes only
        terms from non-default fields."""

        compound = BooleanQuery()
        compound.add(FuzzyQuery(Term("field", "kountry")),
                     BooleanClause.Occur.MUST)
        compound.add(TermQuery(Term("title", "western")),
                     BooleanClause.Occur.SHOULD)

        self.assertEqual("+kountry~0.5 title:western",
                         compound.toString("field"), "both kinds")
コード例 #16
0
    def testPhraseQuery(self):
        """Phrase parsing analyzes its terms and collapses trivial phrases."""

        lucene24Analyzer = StandardAnalyzer(Version.LUCENE_24)
        parsed = QueryParser(Version.LUCENE_24, "field",
                             lucene24Analyzer).parse('"This is Some Phrase*"')
        # Stop words and case disappear inside the phrase.
        self.assertEqual("\"some phrase\"", parsed.toString("field"),
                         "analyzed")

        parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                             self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(parsed), "reduced to TermQuery")
コード例 #17
0
    def testSearchByAPI(self):
        """Build queries programmatically: a TermQuery and a PhraseQuery."""

        tq = TermQuery(Term("content", "hops"))
        topDocs = self.searcher.search(tq, 50)
        self.assertEqual(1, topDocs.totalHits)

        pq = PhraseQuery()
        pq.add(Term("content", "fox"))
        pq.add(Term("content", "hops"))
        topDocs = self.searcher.search(pq, 50)
        # was assertEquals: a deprecated alias; use assertEqual consistently
        self.assertEqual(1, topDocs.totalHits)
コード例 #18
0
    def testOr(self):
        """OR two category queries via SHOULD clauses; documents matching
        either category are returned."""

        methodologyBooks = TermQuery(
            Term("category", "/technology/computers/programming/methodology"))
        easternPhilosophyBooks = TermQuery(
            Term("category", "/philosophy/eastern"))

        # SHOULD + SHOULD means "match at least one clause".
        enlightenmentBooks = BooleanQuery()
        enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD)
        enlightenmentBooks.add(easternPhilosophyBooks,
                               BooleanClause.Occur.SHOULD)

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs
        print "or =", enlightenmentBooks

        # Expect one title from each category among the hits.
        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    "Extreme Programming Explained")
        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    u"Tao Te Ching \u9053\u5FB7\u7D93")
コード例 #19
0
    def getSynonyms(self, word):
        """Return every "syn" value stored on documents whose "word" field
        equals *word* (up to 50 matching documents)."""

        topDocs = self.searcher.search(TermQuery(Term("word", word)), 50)

        return [value
                for scoreDoc in topDocs.scoreDocs
                for value in self.searcher.doc(scoreDoc.doc).getValues("syn")]
コード例 #20
0
    def testHighlighting(self):
        """The default formatter wraps matched terms in <B> tags."""

        sample = "The quick brown fox jumps over the lazy dog"

        foxQuery = TermQuery(Term("field", "fox"))
        highlighter = Highlighter(QueryScorer(foxQuery))

        # Re-analyze the text so the highlighter can locate term offsets.
        stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "field", StringReader(sample))

        self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                         highlighter.getBestFragment(stream, sample))
コード例 #21
0
    def testAnd(self):
        """AND two queries together by adding both as MUST clauses."""

        subjectQuery = TermQuery(Term("subject", "search"))
        pubmonth2004 = NumericRangeQuery.newIntRange(
            "pubmonth", Integer(200401), Integer(200412), True, True)

        # MUST + MUST: only documents satisfying both clauses match.
        conjunction = BooleanQuery()
        conjunction.add(subjectQuery, BooleanClause.Occur.MUST)
        conjunction.add(pubmonth2004, BooleanClause.Occur.MUST)

        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(conjunction, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, hits, "Lucene in Action")
コード例 #22
0
    def setUp(self):
        """Index four restaurants with (x, y) locations, then open a
        searcher and a query matching all of them."""

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # name, x, y for each test point; all share the "restaurant" type.
        for name, x, y in (("El Charro", 1, 2),
                           ("Cafe Poca Cosa", 5, 9),
                           ("Los Betos", 9, 6),
                           ("Nico's Taco Shop", 3, 8)):
            self.addPoint(writer, name, "restaurant", x, y)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.query = TermQuery(Term("type", "restaurant"))
コード例 #23
0
ファイル: BooksLikeThis.py プロジェクト: lauromoraes/pylucene
    def docsLike(self, id, doc, max):
        """Return up to *max* documents similar to *doc* (doc number *id*).

        Similarity: shared author (boosted 2x) or overlapping "subject"
        terms; the source document itself is excluded by ISBN.
        NOTE(review): parameters 'id' and 'max' shadow builtins; kept
        as-is for caller compatibility.
        """
        authors = doc.getValues("author")
        authorQuery = BooleanQuery()
        for author in authors:
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        # Author matches count double relative to subject matches.
        authorQuery.setBoost(2.0)

        # Term vector yields the distinct subject terms of document *id*.
        vector = self.reader.getTermFreqVector(id, "subject")

        subjectQuery = BooleanQuery()
        for term in vector.getTerms():
            tq = TermQuery(Term("subject", term))
            subjectQuery.add(tq, BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print "  Query:", likeThisQuery.toString("contents")
        scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs

        # Collect at most *max* hit documents, in score order.
        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            if len(docs) < max:
                docs.append(doc)
            else:
                break

        return docs
コード例 #24
0
ファイル: CollectorTest.py プロジェクト: lauromoraes/pylucene
    def testCollecting(self):
        """A custom collector gathers link data; the same query also works
        through the normal TopDocs API."""

        junitQuery = TermQuery(Term("contents", "junit"))
        searcher = IndexSearcher(self.directory, True)

        # Drive the custom collector over every match.
        collector = BookLinkCollector(searcher)
        searcher.search(junitQuery, collector)

        links = collector.getLinks()
        self.assertEqual("java development with ant",
                         links["http://www.manning.com/antbook"])

        # Standard search path still returns the hits.
        hits = searcher.search(junitQuery, 10).scoreDocs
        self.dumpHits(searcher, hits)

        searcher.close()
コード例 #25
0
    def testPrefix(self):
        """PrefixQuery matches a category and everything below it, so it
        finds strictly more than an exact TermQuery on the same text."""

        searcher = IndexSearcher(self.directory, True)
        programming = Term("category", "/technology/computers/programming")

        # Prefix search: the category itself plus all subcategories.
        withSubcategories = searcher.search(PrefixQuery(programming),
                                            50).totalHits

        # Exact term search: the category only.
        exactOnly = searcher.search(TermQuery(programming), 50).totalHits

        self.assert_(withSubcategories > exactOnly)
コード例 #26
0
    def testHits(self):
        """Highlight the matched term in each hit's title and print the
        resulting fragment."""

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        # Default Highlighter wraps matches of the query's terms in <B>.
        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            # Re-analyze the stored title so the highlighter can locate
            # the term offsets.
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
                "title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)

            print fragment
コード例 #27
0
    def testFilteredQuery(self):
        """OR a FilteredQuery with a plain TermQuery.

        Education books are filtered down to those on special (only the
        Steiner ISBN below), so the union is that one plus the "logo"
        book: two hits total.
        """
        isbns = ["0854402624"]  # Steiner

        accessor = TestSpecialsAccessor(isbns)
        filter = SpecialsFilter(accessor)

        educationBooks = WildcardQuery(Term("category", "*education*"))
        # Education books restricted to the specials list above.
        edBooksOnSpecial = FilteredQuery(educationBooks, filter)

        logoBooks = TermQuery(Term("subject", "logo"))

        logoOrEdBooks = BooleanQuery()
        logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD)
        logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD)

        topDocs = self.searcher.search(logoOrEdBooks, 50)
        print logoOrEdBooks
        self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
コード例 #28
0
ファイル: ScoreTest.py プロジェクト: lauromoraes/pylucene
    def testSimple(self):
        """With a similarity where every factor is a constant 1.0 and
        tf == raw frequency, a single one-term match scores exactly 1.0."""

        class SimpleSimilarity(PythonSimilarity):
            # Every factor returns a constant so the final score reduces
            # to the raw term frequency (1.0 for a single occurrence).
            # '_self' avoids shadowing the test method's 'self'.

            def lengthNorm(_self, field, numTerms):
                return 1.0

            def queryNorm(_self, sumOfSquaredWeights):
                return 1.0

            def tf(_self, freq):
                return freq

            def sloppyFreq(_self, distance):
                return 2.0

            def idfTerms(_self, terms, searcher):
                return 1.0

            def idf(_self, docFreq, numDocs):
                return 1.0

            def coord(_self, overlap, maxOverlap):
                return 1.0

            def scorePayload(_self, docId, fieldName, start, end, payload,
                             offset, length):
                return 1.0

        # Index one document containing the single term "x".
        self.indexSingleFieldDocs([Field("contents", "x", Field.Store.YES,
                                         Field.Index.ANALYZED)])
        searcher = IndexSearcher(self.directory)
        searcher.setSimilarity(SimpleSimilarity())

        query = TermQuery(Term("contents", "x"))
        explanation = searcher.explain(query, 0)
        print explanation

        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))

        self.assertEqual(scoreDocs[0].score, 1.0)
        searcher.close()
コード例 #29
0
    def testTermQuery(self):
        """Exact match on the untokenized "partnum" field."""

        partQuery = TermQuery(Term("partnum", "Q36"))
        hits = self.searcher.search(partQuery, 50).scoreDocs
        self.assertEqual(1, len(hits))
コード例 #30
0
    def searchDocuments(self, view, version, query=None, attribute=None):
        """Search the full-text index, yielding (item UUID, attribute UUID)
        pairs for hits visible at *version*.

        query     -- query string parsed against "contents", or None to
                     match all documents
        attribute -- if given, restrict hits to this attribute
        Returns a lazy iterator object; the search runs inside a store
        transaction that is opened on iteration and released when the
        iterator is garbage collected.
        """
        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)

        if attribute:
            # AND the attribute restriction onto the parsed query.
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        # Collects (-score, id) pairs so a min-heap pops best scores first.
        class _collector(PythonHitCollector):
            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits = []

            def collect(_self, id, score):

                _self.hits.append((-score, id))

        # Owns the transaction and searcher for the iteration's lifetime;
        # __del__ releases both even if iteration is abandoned early.
        class _iterator(object):
            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    # Never propagate from __del__; just log the failure.
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    # Yield hits best-score-first via the negated-score heap.
                    heapify(hits)
                    while hits:
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])

                        if long(doc['version']) <= version:
                            # Skip values no longer current at *version*.
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()