Ejemplo n.º 1
0
    def testSpanNearQuery(self):
        """
        Exercise SpanNearQuery at several slop values and compare it with a
        PhraseQuery over the same two terms.

        Every span query is first checked with one of the assertion helpers
        and then passed to ``dumpSpans``.
        """
        clauses = [self.quick, self.brown, self.dog]

        # In-order quick/brown/dog: slop 0 and 4 go through assertNoMatches,
        # slop 5 goes through assertOnlyBrownFox.
        expectations = (
            (0, self.assertNoMatches),
            (4, self.assertNoMatches),
            (5, self.assertOnlyBrownFox),
        )
        for slop, check in expectations:
            ordered = SpanNearQuery(clauses, slop, True)
            check(ordered)
            self.dumpSpans(ordered)

        # interesting - even a sloppy phrase query would require
        # more slop to match
        unordered = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(unordered)
        self.dumpSpans(unordered)

        phrase = PhraseQuery()
        for word in ("lazy", "fox"):
            phrase.add(Term("f", word))

        phrase.setSlop(4)
        self.assertNoMatches(phrase)

        phrase.setSlop(5)
        self.assertOnlyBrownFox(phrase)
Ejemplo n.º 2
0
    def testToString(self):
        """
        Check the string rendering of a BooleanQuery that combines a required
        fuzzy clause with an optional term clause, rendered against the
        default field "field".
        """
        fuzzyClause = FuzzyQuery(Term("field", "kountry"))
        titleClause = TermQuery(Term("title", "western"))

        query = BooleanQuery()
        query.add(fuzzyClause, BooleanClause.Occur.MUST)
        query.add(titleClause, BooleanClause.Occur.SHOULD)

        rendered = query.toString("field")
        self.assertEqual("+kountry~0.5 title:western", rendered, "both kinds")
Ejemplo n.º 3
0
    def testSearchByAPI(self):
        """
        Search the "content" field through the query API, first with a
        TermQuery and then with a PhraseQuery, expecting exactly one hit from
        each search.
        """
        tq = TermQuery(Term("content", "hops"))
        topDocs = self.searcher.search(tq, 50)
        self.assertEqual(1, topDocs.totalHits)

        pq = PhraseQuery()
        pq.add(Term("content", "fox"))
        pq.add(Term("content", "hops"))
        topDocs = self.searcher.search(pq, 50)
        # Use assertEqual like the check above; assertEquals is a deprecated
        # unittest alias.
        self.assertEqual(1, topDocs.totalHits)
Ejemplo n.º 4
0
    def testTerm(self):
        """
        Run two TermQuery searches on the "subject" field and check the number
        of returned score docs: one for "ant", two for "junit".
        """
        searcher = IndexSearcher(self.directory, True)
        # Close the searcher even when an assertion fails, so a failing test
        # does not leave the index open.
        try:
            t = Term("subject", "ant")
            query = TermQuery(t)
            scoreDocs = searcher.search(query, 50).scoreDocs
            self.assertEqual(1, len(scoreDocs), "JDwA")

            t = Term("subject", "junit")
            scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
            self.assertEqual(2, len(scoreDocs))
        finally:
            searcher.close()
Ejemplo n.º 5
0
    def testBasic(self):
        """
        Build a MultiPhraseQuery whose first position accepts "quick" or
        "fast" and whose second position is "fox", then check the hit count
        with the default slop and again with slop 1.
        """
        firstPosition = [Term("field", "quick"), Term("field", "fast")]

        query = MultiPhraseQuery()
        query.add(firstPosition)
        query.add(Term("field", "fox"))
        print(query)

        exactHits = self.searcher.search(query, 10).totalHits
        self.assertEqual(1, exactHits, "fast fox match")

        query.setSlop(1)
        sloppyHits = self.searcher.search(query, 10).totalHits
        self.assertEqual(2, sloppyHits, "both match")
Ejemplo n.º 6
0
    def setUp(self):
        """
        Build a two-document in-memory index on field "f" and prepare the
        searcher, reader and one SpanTermQuery attribute per word used by the
        tests (self.quick, self.brown, self.red, self.fox, self.lazy,
        self.sleepy, self.dog, self.cat).
        """
        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        sentences = (
            "the quick brown fox jumps over the lazy dog",
            "the quick red fox jumps over the sleepy cat",
        )

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        for sentence in sentences:
            doc = Document()
            doc.add(Field("f", sentence,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        # One span term query per word, stored under the word's own name.
        for word in ("quick", "brown", "red", "fox",
                     "lazy", "sleepy", "dog", "cat"):
            setattr(self, word, SpanTermQuery(Term("f", word)))
Ejemplo n.º 7
0
    def testSecurityFilter(self):
        """
        Search for keywords:info without and with a filter restricting the
        owner to "jake"; the unfiltered search should report two hits and the
        filtered one a single document.
        """
        infoQuery = TermQuery(Term("keywords", "info"))
        searcher = IndexSearcher(self.directory, True)

        unfiltered = searcher.search(infoQuery, 50)
        self.assertEqual(2, unfiltered.totalHits, "Both documents match")

        ownedByJake = TermQuery(Term("owner", "jake"))
        jakeFilter = QueryWrapperFilter(ownedByJake)
        hits = searcher.search(infoQuery, jakeFilter, 50).scoreDocs
        self.assertEqual(1, len(hits))

        onlyDoc = searcher.doc(hits[0].doc)
        self.assertEqual("jakes sensitive info", onlyDoc.get("keywords"),
                         "elwood is safe")
Ejemplo n.º 8
0
    def testKeyword(self):
        """
        Look up a single document by the exact value of its "isbn" field.
        """
        isbnQuery = TermQuery(Term("isbn", "1930110995"))
        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(isbnQuery, 50).scoreDocs
        self.assertEqual(1, len(hits), "JUnit in Action")
Ejemplo n.º 9
0
    def testChinese(self):
        """
        Search the "contents" field for a single Chinese character and expect
        exactly one score doc.
        """
        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(TermQuery(Term("contents", "道")), 50).scoreDocs
        self.assertEqual(1, len(hits), "tao")
Ejemplo n.º 10
0
 def __init__(self, reader, field, term):
     """
     Record a term of a field together with an occurrence count of 1 and a
     squared inverse-document-frequency weight computed from ``reader``.

     reader -- object providing numDocs() and docFreq(Term)
     field  -- field name the term belongs to
     term   -- term text
     """
     self.count = 1

     docCount = reader.numDocs()
     # float() keeps the ratio a true division; 1 is added to the document
     # frequency before dividing.
     adjustedFreq = float(reader.docFreq(Term(field, term))) + 1
     weight = 1 + math.log(docCount / adjustedFreq)
     self.idf = weight ** 2

     self.field = field
     self.term = term
Ejemplo n.º 11
0
    def _searchIndex(self, searcher, row, colname, id_col):
        """
		function to search text in the lucene index, iterates over inputDF row 
		by row, and search the matched candidates with a match score

		By default - WhitespaceAnalyzer is used, other Analyzers are also available.
		"""

        text = row[colname]
        idd = row[id_col]

        # Search word by word of a text containing multiple keywords
        words = text.split()
        results = []
        for word in words:
            query = FuzzyQuery(Term("field", word))
            scoreDocs = searcher.search(query, 50).scoreDocs

            candidates = str(scoreDocs).split("[")[1].split("]")[0].split(",")
            for i, candidate in enumerate(candidates):
                if not candidate:
                    continue

                # Matches : Value + Score
                score = float(
                    candidate.split("score=")[1].replace(">", "").strip())

                value = searcher.doc(scoreDocs[i].doc).get("field")
                if value in results:
                    continue
                results.append((idd, text, value))

        return results
    def purgeDocuments(self,
                       txn,
                       counter,
                       indexSearcher,
                       indexReader,
                       uItem,
                       toVersion=None):
        """
        Delete index documents belonging to ``uItem`` and add the number of
        deletions to ``counter.documentCount``.

        Without ``toVersion`` every document whose "item" term matches is
        deleted through the reader in one call. With ``toVersion`` only
        documents whose version is not greater than ``toVersion`` and whose
        value is not among those returned by ``self.store._items.findValues``
        are deleted, one by one. ``txn`` is not used here.
        """
        term = Term("item", uItem.str64())

        if toVersion is None:
            counter.documentCount += indexReader.deleteDocuments(term)
            return

        _, keep = self.store._items.findValues(None, toVersion, uItem,
                                               None, True)
        keep = set(keep)

        for rawHit in indexSearcher.search(TermQuery(term)):
            hit = Hit.cast_(rawHit)
            doc = hit.getDocument()

            # Newer than the purge horizon: leave it alone.
            if long(doc['version']) > toVersion:
                continue
            # Still referenced at toVersion: leave it alone.
            if UUID(doc['value']) in keep:
                continue

            indexReader.deleteDocument(hit.getId())
            counter.documentCount += 1
Ejemplo n.º 13
0
 def searchWithDrillDown(cls, indexReader, taxoReader):
     """
     Run a faceted search, then drill down into one of the returned
     categories and search again.

     Returns the result of ``cls.searchWithRequestAndQuery`` for the
     drill-down query (a List<FacetResult> according to the original
     author).
     """
     # Base query the user is interested in.
     baseQuery = TermQuery(Term(TEXT, "white"))
     # Facet of interest.
     facetRequest = CountFacetRequest(createCategoryPath(["root", "a"]), 10)

     # Initial search. There was a single request, so take the single result.
     initialResults = cls.searchWithRequest(indexReader, taxoReader, None,
                                            facetRequest)
     firstResult = initialResults.get(0)

     # Assume the user is interested in the second sub-result (the original
     # author notes that three results are known to exist).
     # getSubResults() yields an Iterable whose elements arrive as plain
     # Objects, hence the FacetResultNode.cast_() below.
     nodes = firstResult.getFacetResultNode().getSubResults().iterator()
     nodes.next()  # skip first result
     chosenNode = FacetResultNode.cast_(nodes.next())
     categoryOfInterest = chosenNode.getLabel()

     # Turn the base query into a drill-down query for that category, so
     # only documents matching the base query AND containing the category
     # contribute to the new accumulation.
     drillDownQuery = DrillDown.query(baseQuery, [categoryOfInterest])
     return cls.searchWithRequestAndQuery(drillDownQuery, indexReader,
                                          taxoReader, None, facetRequest)
Ejemplo n.º 14
0
    def testAgainstOR(self):
        """
        OR together two phrase queries, "quick fox" with slop 1 and
        "fast fox" with the default slop, and expect two hits in total.
        """
        quickFox = PhraseQuery()
        quickFox.setSlop(1)
        for word in ("quick", "fox"):
            quickFox.add(Term("field", word))

        fastFox = PhraseQuery()
        for word in ("fast", "fox"):
            fastFox.add(Term("field", word))

        either = BooleanQuery()
        for phrase in (quickFox, fastFox):
            either.add(phrase, BooleanClause.Occur.SHOULD)

        hitCount = self.searcher.search(either, 10).totalHits
        self.assertEqual(2, hitCount)
Ejemplo n.º 15
0
 def remove(self, id):
     """
     Remove ``id`` from ``spec`` and delete every indexed document whose
     "workflow_id" term equals ``id``, printing a line before and after.
     """
     print("removing index for %s" % (id,))
     spec.remove(id)

     # Delete old versions
     staleTerms = [Term('workflow_id', id)]
     WorkflowIndexer.writer.deleteDocuments(staleTerms)
     print("done removing %s" % (id,))
Ejemplo n.º 16
0
    def testUpdate(self):
        """
        Replace a document: delete everything with city "Amsterdam" through
        a reader, add a St. Petersburg document through a writer, and check
        the hit counts before and after.
        """
        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        # Step 1: delete through a writable reader.
        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        # Step 2: add the replacement document to the existing index.
        replacementFields = (
            Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED),
            Field("country", "Russia", Field.Store.YES, Field.Index.NO),
            Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED),
            Field("city", "St. Petersburg", Field.Store.YES,
                  Field.Index.ANALYZED),
        )
        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        for field in replacementFields:
            doc.add(field)
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
Ejemplo n.º 17
0
    def main(cls, argv):
        """
        Highlight occurrences of the term "ipsum" in ``cls.text`` and write
        the result to stdout as a small HTML page. ``argv`` is not used.
        """
        formatter = SimpleHTMLFormatter("<span class=\"highlight\">",
                                        "</span>")
        scorer = QueryScorer(TermQuery(Term("f", "ipsum")))

        highlighter = Highlighter(formatter, scorer)
        highlighter.setTextFragmenter(SimpleFragmenter(50))

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
        # Up to 5 fragments, joined by "...".
        result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

        pageParts = (
            "<html>",
            "<style>\n",
            ".highlight {\n",
            " background: yellow\n",
            "}\n",
            "</style>",
            "<body>",
            result,
            "</body></html>\n",
        )
        for part in pageParts:
            stdout.write(part)
        stdout.flush()
    def undoDocuments(self, indexSearcher, indexReader, uItem, version):
        """
        Delete, through ``indexReader``, every document of ``uItem`` whose
        stored "version" equals ``version``.
        """
        itemQuery = TermQuery(Term("item", uItem.str64()))

        for rawHit in indexSearcher.search(itemQuery):
            hit = Hit.cast_(rawHit)
            docVersion = long(hit.getDocument()['version'])
            if docVersion != version:
                continue
            indexReader.deleteDocument(hit.getId())
Ejemplo n.º 19
0
    def main(cls, argv):
        """
        Open the Berkeley DB backed Lucene index found in ``argv[1]`` and
        print how many documents contain the term "fox" in "contents".

        argv -- command line; must have exactly two entries, the second being
                the database home directory. Otherwise a usage line is
                printed and the method returns.

        Any exception is propagated after the open transaction has been
        aborted. The searcher, both databases and the environment are closed
        on every path.
        """
        if len(argv) != 2:
            print("Usage: BerkeleyDbSearcher <index dir>")
            return

        dbHome = argv[1]

        env = DBEnv()
        index = None
        blocks = None
        try:
            env.set_flags(DB_LOG_INMEMORY, 1)
            # Same cache size on Windows and Linux; other platforms keep the
            # default. os.uname() replaces shelling out to `uname`.
            if os.name == 'nt':
                env.set_cachesize(0, 0x4000000, 1)
            elif os.name == 'posix' and os.uname()[0] == 'Linux':
                env.set_cachesize(0, 0x4000000, 1)

            env.open(dbHome, (DB_THREAD |
                              DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

            index = DB(env)
            blocks = DB(env)

            # Open both databases inside one transaction: commit on success,
            # abort and re-raise on any failure.
            txn = env.txn_begin(None)
            try:
                index.open(filename = '__index__', dbtype = DB_BTREE,
                           flags = DB_THREAD, txn = txn)
                blocks.open(filename = '__blocks__', dbtype = DB_BTREE,
                            flags = DB_THREAD, txn = txn)
            except BaseException:
                txn.abort()
                raise
            else:
                txn.commit()

            # The search transaction is aborted on success and on failure
            # alike, exactly as before.
            txn = env.txn_begin(None)
            try:
                directory = DbDirectory(txn, index, blocks, 0)
                searcher = IndexSearcher(directory, True)
                try:
                    topDocs = searcher.search(
                        TermQuery(Term("contents", "fox")), 50)
                    print("%s document(s) found" % (topDocs.totalHits,))
                finally:
                    searcher.close()
            finally:
                txn.abort()
        finally:
            # Release the databases before the environment that owns them.
            for handle in (index, blocks, env):
                if handle is not None:
                    handle.close()
Ejemplo n.º 20
0
    def getHitCount(self, fieldName, searchString):
        """
        Return how many score docs a TermQuery for ``searchString`` in
        ``fieldName`` produces against ``self.dir`` (at most 50).

        A fresh read-only searcher is opened for the call and always closed,
        even if the search raises.
        """
        searcher = IndexSearcher(self.dir, True)
        try:
            query = TermQuery(Term(fieldName, searchString))
            return len(searcher.search(query, 50).scoreDocs)
        finally:
            searcher.close()
Ejemplo n.º 21
0
 def searchWithRequest(cls, indexReader, taxoReader, indexingParams,
                       facetRequest):
     """
     Run the fixed term query TEXT:"white" with the given facet request.

     Delegates to ``cls.searchWithRequestAndQuery`` and returns its result
     (a List<FacetResult> according to the original author).
     """
     whiteQuery = TermQuery(Term(TEXT, "white"))
     results = cls.searchWithRequestAndQuery(whiteQuery, indexReader,
                                             taxoReader, indexingParams,
                                             facetRequest)
     return results
Ejemplo n.º 22
0
 def addCrowd(self, id, text):
     """
     Index a crowd as a two-field document (id and text), replacing any
     document already indexed under the same id term.
     """
     idField = Field(CrowdFields.id, id, Field.Store.YES,
                     Field.Index.NOT_ANALYZED)
     textField = Field(CrowdFields.text, text, Field.Store.YES,
                       Field.Index.ANALYZED)

     doc = Document()
     doc.add(idField)
     doc.add(textField)

     # updateDocument is keyed by the id term.
     self.writer.updateDocument(Term(CrowdFields.id, id), doc)
Ejemplo n.º 23
0
    def testFilteredQuery(self):
        """
        Combine a plain term query with a FilteredQuery inside a
        BooleanQuery: books about "logo" OR education books that pass the
        specials filter. Two hits are expected.
        """
        specialIsbns = ["0854402624"]  # Steiner
        specialsFilter = SpecialsFilter(TestSpecialsAccessor(specialIsbns))

        educationBooks = WildcardQuery(Term("category", "*education*"))
        edBooksOnSpecial = FilteredQuery(educationBooks, specialsFilter)
        logoBooks = TermQuery(Term("subject", "logo"))

        logoOrEdBooks = BooleanQuery()
        for clause in (logoBooks, edBooksOnSpecial):
            logoOrEdBooks.add(clause, BooleanClause.Occur.SHOULD)

        topDocs = self.searcher.search(logoOrEdBooks, 50)
        print(logoOrEdBooks)
        self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
Ejemplo n.º 24
0
    def matched(self, phrase, slop):
        """
        Return True when a PhraseQuery built from the words of ``phrase``
        (in order, on field "field") with the given ``slop`` has any hit.
        """
        phraseQuery = PhraseQuery()
        phraseQuery.setSlop(slop)
        for word in phrase:
            phraseQuery.add(Term("field", word))

        hitCount = self.searcher.search(phraseQuery, 50).totalHits
        return hitCount > 0
Ejemplo n.º 25
0
    def getSynonyms(self, word):
        """
        Return the "syn" values of every document (at most 50) whose "word"
        field matches ``word``, as one flat list in hit order.
        """
        wordQuery = TermQuery(Term("word", word))
        hits = self.searcher.search(wordQuery, 50).scoreDocs

        return [synonym
                for hit in hits
                for synonym in self.searcher.doc(hit.doc).getValues("syn")]
Ejemplo n.º 26
0
    def testOr(self):
        """
        OR two category term queries and check, via
        ``assertHitsIncludeTitle``, that a title from each category is among
        the hits.
        """
        categories = (
            "/technology/computers/programming/methodology",
            "/philosophy/eastern",
        )

        enlightenmentBooks = BooleanQuery()
        for category in categories:
            enlightenmentBooks.add(TermQuery(Term("category", category)),
                                   BooleanClause.Occur.SHOULD)

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs
        print("or = %s" % (enlightenmentBooks,))

        for title in ("Extreme Programming Explained",
                      u"Tao Te Ching \u9053\u5FB7\u7D93"):
            self.assertHitsIncludeTitle(searcher, scoreDocs, title)
Ejemplo n.º 27
0
    def testPlay(self):
        """
        Dump the spans of a handful of span queries; nothing is asserted.
        """
        # quick OR fox
        self.dumpSpans(SpanOrQuery([self.quick, self.fox]))

        # quick followed by fox (slop 1, in order), limited by SpanFirstQuery
        # with end position 4
        nearPair = SpanNearQuery([self.quick, self.fox], 1, True)
        self.dumpSpans(SpanFirstQuery(nearPair, 4))

        # every occurrence of "the"
        self.dumpSpans(SpanTermQuery(Term("f", "the")))

        # quick next to brown, any order
        adjacentPair = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(adjacentPair)
Ejemplo n.º 28
0
    def testHighlighting(self):
        """
        Highlight the term "fox" in a sentence with the default Highlighter
        formatting and compare against the expected <B>-tagged fragment.
        """
        text = "The quick brown fox jumps over the lazy dog"

        foxScorer = QueryScorer(TermQuery(Term("field", "fox")))
        highlighter = Highlighter(foxScorer)

        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        tokenStream = analyzer.tokenStream("field", StringReader(text))
        fragment = highlighter.getBestFragment(tokenStream, text)

        self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                         fragment)
Ejemplo n.º 29
0
def search_image(command):
    """
    Search the "index_img" Lucene index and return highlighted image hits.

    ``command`` is a space-separated query. A token of the form
    ``site:value`` adds a wildcard restriction on the "site" field; other
    tokens are segmented with jieba and parsed against the current field
    (initially "contents"). All per-field queries are combined with MUST.

    Returns a list of dicts with the keys "url", "imgurl", "urltitle" and
    "contents" (the highlighted fragments of the stored contents), one per
    hit, at most 10000. A missing or blank command returns an empty list
    without touching the index.
    """
    # Treat None, "" and any all-whitespace command as "nothing to search".
    if not command or not command.strip():
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    # Close the searcher on every path, including query-parse failures.
    try:
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

        command_dict = {}
        allowed_opt = ['site']
        opt = 'contents'
        # NOTE(review): the most recent "name:" prefix stays in effect for the
        # bare words that follow it, even when that name is not in
        # allowed_opt - confirm this is intended.
        for i in command.split(' '):
            if ':' in i:
                opt, value = i.split(':')[:2]
                opt = opt.lower()
                if opt in allowed_opt and value != '':
                    command_dict[opt] = command_dict.get(opt, '') + ' ' + value
            else:
                seg_list = jieba.cut(i)
                command_dict[opt] = (command_dict.get(opt, '') + ' ' +
                                     " ".join(seg_list))

        querys = BooleanQuery()
        for k, v in command_dict.items():
            if k == 'site':
                # Values are accumulated with a leading space; strip it so
                # the wildcard is "*value" rather than "* value".
                query = WildcardQuery(Term(k, '*' + v.strip()))
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 10000).scoreDocs
        formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
        highlighter = Highlighter(formatter, QueryScorer(querys))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            text = doc.get("contents")
            # tokenStream takes the field name first, not the field's text.
            ts = analyzer.tokenStream("contents", StringReader(text))
            Docs.append({
                "url": doc.get("url"),
                "imgurl": doc.get("imgurl"),
                "urltitle": doc.get("urltitle"),
                "contents": highlighter.getBestFragments(ts, text, 2, "..."),
            })
    finally:
        searcher.close()
    return Docs
Ejemplo n.º 30
0
    def testAnd(self):
        """
        AND a subject term query with a numeric "pubmonth" range covering
        200401 through 200412 (both ends inclusive) and check, via
        ``assertHitsIncludeTitle``, that "Lucene in Action" is among the hits.
        """
        aboutSearch = TermQuery(Term("subject", "search"))
        publishedIn2004 = NumericRangeQuery.newIntRange(
            "pubmonth", Integer(200401), Integer(200412), True, True)

        searchingBooks2004 = BooleanQuery()
        for clause in (aboutSearch, publishedIn2004):
            searchingBooks2004.add(clause, BooleanClause.Occur.MUST)

        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(searchingBooks2004, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, hits, "Lucene in Action")