def testQueryParser(self):
        """Check boolean operators in QueryParser expressions on "contents".

        "+JUNIT +ANT -MOCK" must produce exactly one hit whose title is
        "Java Development with Ant"; "mock OR junit" must produce two hits.
        """
        searcher = IndexSearcher(self.directory, True)

        def hitsFor(expression):
            # A fresh parser and analyzer are built for every expression.
            parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                                 SimpleAnalyzer())
            return searcher.search(parser.parse(expression), 50).scoreDocs

        required = hitsFor("+JUNIT +ANT -MOCK")
        self.assertEqual(1, len(required))
        match = searcher.doc(required[0].doc)
        self.assertEqual("Java Development with Ant", match.get("title"))

        either = hitsFor("mock OR junit")
        self.assertEqual(2, len(either), "JDwA and JIA")
Beispiel #2
0
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return

        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        dir = FSDirectory.getDirectory(indexDir, True)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = System.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(
                Field("fieldname", "Bibamus", Field.Store.YES,
                      Field.Index.TOKENIZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
Beispiel #3
0
    def addDocuments(self, dir):
        """Create a new index in ``dir`` with one document per self.docs entry.

        Each word is stored in four fields with different store/index
        settings: "keyword" (stored, not analyzed), "unindexed" (stored, not
        indexed), "unstored" (not stored, analyzed) and "text" (stored,
        analyzed).  The index is optimized before the writer is closed.

        The writer is closed in a ``finally`` clause so that it is released
        even when adding a document or optimizing raises.
        """
        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        #
        # change to adjust performance of indexing with FSDirectory
        # writer.mergeFactor = writer.mergeFactor
        # writer.maxMergeDocs = writer.maxMergeDocs
        # writer.minMergeDocs = writer.minMergeDocs
        #

        try:
            for word in self.docs:
                doc = Document()
                doc.add(
                    Field("keyword", word, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(
                    Field("unindexed", word, Field.Store.YES,
                          Field.Index.NO))
                doc.add(
                    Field("unstored", word, Field.Store.NO,
                          Field.Index.ANALYZED))
                doc.add(
                    Field("text", word, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)

            writer.optimize()
        finally:
            writer.close()
Beispiel #4
0
    def testWriteLock(self):
        """Verify that a second IndexWriter cannot be opened on self.dir.

        While ``writer1`` is open, constructing another writer on the same
        directory is expected to raise.  If it unexpectedly succeeds, the
        extra writer is closed again and the test fails.
        """
        writer1 = IndexWriter(self.dir, SimpleAnalyzer(),
                              IndexWriter.MaxFieldLength.UNLIMITED)
        writer2 = None

        try:
            try:
                writer2 = IndexWriter(self.dir, SimpleAnalyzer(),
                                      IndexWriter.MaxFieldLength.UNLIMITED)
            except Exception:
                # Expected path: the second writer could not be opened.
                # Only the constructor call is guarded, so the failure
                # reported below can never be swallowed by this handler.
                pass
            else:
                # Unexpected success: do not leak the second writer.
                writer2.close()
        finally:
            writer1.close()

        self.assertTrue(writer2 is None, "We should never reach this point")
    def testBasicQueryParser(self):
        """Parse "partnum:Q36 AND SPACE" with a plain SimpleAnalyzer.

        The parsed query is expected to print as "+partnum:q +space" and the
        search through self.searcher is expected to return no hits.
        """
        parser = QueryParser(Version.LUCENE_CURRENT, "description",
                             SimpleAnalyzer())
        query = parser.parse("partnum:Q36 AND SPACE")
        hits = self.searcher.search(query, 50).scoreDocs

        parsedForm = query.toString("description")
        self.assertEqual("+partnum:q +space", parsedForm, "note Q36 -> q")
        self.assertEqual(0, len(hits), "doc not found :(")
Beispiel #6
0
    def main(cls, argv):

        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(
            StandardAnalyzer(Version.LUCENE_CURRENT),
            "I'll e-mail you at [email protected]")
    def testPerFieldAnalyzer(self):
        """Search "partnum:Q36 AND SPACE" using a per-field analyzer.

        The analyzer is a PerFieldAnalyzerWrapper around SimpleAnalyzer with
        a KeywordAnalyzer registered for the "partnum" field; exactly one hit
        is expected.
        """
        perField = PerFieldAnalyzerWrapper(SimpleAnalyzer())
        perField.addAnalyzer("partnum", KeywordAnalyzer())

        parser = QueryParser(Version.LUCENE_CURRENT, "description", perField)
        query = parser.parse("partnum:Q36 AND SPACE")
        hits = self.searcher.search(query, 50).scoreDocs

        # Check on the parsed form of the query, currently disabled:
        #self.assertEqual("+partnum:Q36 +space", query.toString("description"))
        self.assertEqual(1, len(hits), "doc found!")
Beispiel #8
0
    def testHighlighting(self):
        """Highlight the term "fox" in a fixed sentence.

        The best fragment returned by the highlighter is expected to be the
        whole sentence with "fox" wrapped in <B>...</B>.
        """
        text = "The quick brown fox jumps over the lazy dog"

        foxQuery = TermQuery(Term("field", "fox"))
        highlighter = Highlighter(QueryScorer(foxQuery))

        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        tokens = analyzer.tokenStream("field", StringReader(text))

        expected = "The quick brown <B>fox</B> jumps over the lazy dog"
        self.assertEqual(expected, highlighter.getBestFragment(tokens, text))
Beispiel #9
0
    def testSpecifiedOperator(self):
        """Search "development" in title and subject with both marked MUST.

        Exactly one hit is expected, and the hits must include the title
        "Java Development with Ant".
        """
        required = BooleanClause.Occur.MUST
        fields = ["title", "subject"]
        flags = [required, required]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                            "development", fields, flags,
                                            SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(query, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, hits,
                                    "Java Development with Ant")
        self.assertEqual(1, len(hits), "one and only one")
    def setUp(self):
        """Build a one-document RAMDirectory index and open a searcher on it.

        The document has a stored, not-analyzed "partnum" field ("Q36") and
        a stored, analyzed "description" field.  The directory is kept as
        self.directory and the searcher as self.searcher.
        """
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        partnum = Field("partnum", "Q36",
                        Field.Store.YES, Field.Index.NOT_ANALYZED)
        doc.add(partnum)
        description = Field("description", "Illidium Space Modulator",
                            Field.Store.YES, Field.Index.ANALYZED)
        doc.add(description)

        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
Beispiel #11
0
    def testDefaultOperator(self):
        """Search "development" in title and subject with both marked SHOULD.

        The hits must include "Java Development with Ant" as well as
        "Extreme Programming Explained".
        """
        optional = BooleanClause.Occur.SHOULD
        fields = ["title", "subject"]
        flags = [optional, optional]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                            "development", fields, flags,
                                            SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(query, 50).scoreDocs

        expectedTitles = (
            "Java Development with Ant",
            # has "development" in the subject field
            "Extreme Programming Explained",
        )
        for expectedTitle in expectedTitles:
            self.assertHitsIncludeTitle(searcher, hits, expectedTitle)
Beispiel #12
0
    def index(self):
        """Index 100 identical documents with the writer's info stream on.

        The index is created in the "verbose-index" directory below the
        path named by the Java "java.io.tmpdir" property (falling back to
        "tmp").  The writer's info stream is set to System.out, and the
        index is optimized before the writer is closed.

        The writer is closed in a ``finally`` clause so that it is released
        even when indexing or optimizing raises.
        """
        dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                               "verbose-index")
        # named 'directory' rather than 'dir' to avoid shadowing the builtin
        directory = FSDirectory.getDirectory(dirPath, True)
        writer = IndexWriter(directory, SimpleAnalyzer(), True)

        try:
            writer.setInfoStream(System.out)

            for _ in xrange(100):
                doc = Document()
                doc.add(Field("keyword", "goober",
                              Field.Store.YES, Field.Index.UN_TOKENIZED))
                writer.addDocument(doc)

            writer.optimize()
        finally:
            writer.close()
Beispiel #13
0
    def testHits(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
                "title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)

            print fragment
Beispiel #14
0
    def addDocuments(self, dir, maxFieldLength):
        """Create a new index in ``dir`` from the parallel fixture lists.

        self.keywords, self.unindexed, self.unstored and self.text are
        walked in lockstep; each tuple becomes one document with the fields
        "id" (stored, not analyzed), "country" (stored, not indexed),
        "contents" (not stored, analyzed) and "city" (stored, analyzed).
        ``maxFieldLength`` is wrapped in IndexWriter.MaxFieldLength and
        passed to the writer.  The index is optimized before closing.

        The writer is closed in a ``finally`` clause so that it is released
        even when adding a document or optimizing raises.
        """
        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength(maxFieldLength))

        try:
            for keyword, unindexed, unstored, text in \
                    izip(self.keywords, self.unindexed, self.unstored,
                         self.text):
                doc = Document()
                doc.add(Field("id", keyword,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("country", unindexed,
                              Field.Store.YES, Field.Index.NO))
                doc.add(Field("contents", unstored,
                              Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("city", text,
                              Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)

            writer.optimize()
        finally:
            writer.close()
Beispiel #15
0
class AnalyzerDemo(object):

    examples = [
        "The quick brown fox jumped over the lazy dogs",
        "XY&Z Corporation - [email protected]"
    ]

    analyzers = [
        WhitespaceAnalyzer(),
        SimpleAnalyzer(),
        StopAnalyzer(Version.LUCENE_CURRENT),
        StandardAnalyzer(Version.LUCENE_CURRENT)
    ]

    def main(cls, argv):

        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):

        print 'Analyzing "%s"' % (text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" % (name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
Beispiel #16
0
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: Explainer <index dir> <query>"

        else:
            indexDir = argv[1]
            queryExpression = argv[2]

            directory = SimpleFSDirectory(File(indexDir))
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                SimpleAnalyzer()).parse(queryExpression)

            print "Query:", queryExpression

            searcher = IndexSearcher(directory)
            scoreDocs = searcher.search(query, 50).scoreDocs

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                explanation = searcher.explain(query, scoreDoc.doc)
                print "----------"
                print doc["title"].encode('utf-8')
                print explanation
Beispiel #17
0
class AnalyzerDemo(object):

    examples = ["http://www.baidu.com/ www.baidu.com",
                "联系 本站 版权 所有 上海 交通 大学BBS 饮水思源 站 沪ICP备020861".decode('gbk')]
    
    analyzers = [WhitespaceAnalyzer(),
                 SimpleAnalyzer(),
                 StopAnalyzer(Version.LUCENE_CURRENT),
                 StandardAnalyzer(Version.LUCENE_CURRENT),
                 CJKAnalyzer(Version.LUCENE_CURRENT)]

    def main(cls, argv):

        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples

        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):

        print 'Analyzing "%s"' %(text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" %(name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
    def getAnalyzer(self):
        """Return a newly constructed SimpleAnalyzer."""
        analyzer = SimpleAnalyzer()
        return analyzer