Example #1
 def testTwoElementPipeline(self):
     lexicon = Lexicon(Splitter(),
                       StupidPipelineElement('cats', 'fish'),
                       WackyReversePipelineElement('fish'))
     wids = lexicon.sourceToWordIds('cats and dogs')
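     # The first call only populates the lexicon; its result is discarded.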
     wids = lexicon.termToWordIds('hsif')
     self.assertEqual(wids, [1])
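
The two pipeline helpers used above are test doubles. A Lexicon pipeline element just needs a process() method that maps a list of words to a new list of words; a minimal sketch of what they are assumed to look like (the real test-suite versions may differ):

class StupidPipelineElement:
    # Replaces one specific word with another ('cats' -> 'fish' above).
    def __init__(self, fromword, toword):
        self._fromword = fromword
        self._toword = toword

    def process(self, seq):
        return [self._toword if w == self._fromword else w for w in seq]

class WackyReversePipelineElement:
    # Reverses the spelling of one specific word ('fish' -> 'hsif' above).
    def __init__(self, revword):
        self._revword = revword

    def process(self, seq):
        return [w[::-1] if w == self._revword else w for w in seq]

So 'cats and dogs' is stored as 'hsif', 'and', 'dogs' (wids 1-3), and the query for 'hsif' finds wid 1.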
Example #2
 def testThreeElementPipeline(self):
     lexicon = Lexicon(Splitter(), StopWordPipelineElement({'and': 1}),
                       StupidPipelineElement('dogs', 'fish'),
                       WackyReversePipelineElement('fish'))
     wids = lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('hsif')
     self.assertEqual(wids, [2])
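
StopWordPipelineElement is assumed to drop its stop words from the word stream before the later elements run, roughly:

class StopWordPipelineElement:
    # Drops any word found in the stop-word mapping ({'and': 1} above).
    def __init__(self, stopdict):
        self._stopdict = stopdict

    def process(self, seq):
        return [w for w in seq if w not in self._stopdict]

With 'and' removed, 'dogs' rewritten to 'fish', and 'fish' reversed, the lexicon holds 'cats' (wid 1) and 'hsif' (wid 2), which is why the query expects [2].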
Example #3
def make_old_index():
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon  import Lexicon
    from Products.ZCTextIndex.StopDict import get_stopdict

    l = Lexicon(get_stopdict())
    l.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=l)
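
MySplitter is not shown here; the old TextIndex Lexicon calls the object stored as SplitterFunc to turn raw text into words. A hypothetical stand-in (the old splitter's exact calling convention may differ):

import re

class MySplitter:
    # Minimal splitter: lowercase the text and return alphanumeric runs.
    def __call__(self, text):
        return re.findall(r'\w+', text.lower())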
Example #4
    def testTermToWordIdsWithProcess_post_glob(self):
        """This test is for added process_post_glob"""
        class AddedSplitter(Splitter):
            def process_post_glob(self, lst):
                assert lst == ['dogs']
                return ['dogs']

        lexicon = Lexicon(AddedSplitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [3])
Example #5
 def setUp(self):
     from Products.ZCTextIndex.QueryParser import QueryParser
     from Products.ZCTextIndex.Lexicon import Lexicon
     from Products.ZCTextIndex.Lexicon import Splitter
     # Only 'stop' is a stopword (but 'and' is still an operator)
     self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
     self.parser = QueryParser(self.lexicon)
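
FakeStopWordRemover is assumed to be a one-off pipeline element matching the comment above, along these lines:

class FakeStopWordRemover:
    # Drops only the word 'stop'; 'and' survives as a query operator.
    def process(self, seq):
        return [w for w in seq if w != 'stop']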
Example #6
    def test_reindex_doc_conflict(self):
        self.index = OkapiIndex(Lexicon())
        self.index.index_doc(0, 'Sometimes change is good')
        self.index.index_doc(1, 'Then again, who asked')
        self.openDB()
        r1 = self.db.open().root()
        r1['i'] = self.index
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['i']
        # Make sure the data is loaded
        list(copy._docweight.items())
        list(copy._docwords.items())
        list(copy._wordinfo.items())
        list(copy._lexicon._wids.items())
        list(copy._lexicon._words.items())

        self.assertEqual(self.index._p_serial, copy._p_serial)

        self.index.index_doc(0, 'Sometimes change isn\'t bad')
        transaction.commit()

        copy.index_doc(1, 'Then again, who asked you?')
        transaction.commit()
Example #7
 def test_upgrade_totaldoclen(self):
     self.index1 = OkapiIndex(Lexicon())
     self.index2 = OkapiIndex(Lexicon())
     self.index1.index_doc(0, 'The quiet of night')
     self.index2.index_doc(0, 'The quiet of night')
     # Revert index1 back to a long to simulate an older index instance
     self.index1._totaldoclen = int(self.index1._totaldoclen())
     self.index1.index_doc(1, 'gazes upon my shadow')
     self.index2.index_doc(1, 'gazes upon my shadow')
     self.assertEqual(
         self.index1._totaldoclen(), self.index2._totaldoclen())
     self.index1._totaldoclen = int(self.index1._totaldoclen())
     self.index1.unindex_doc(0)
     self.index2.unindex_doc(0)
     self.assertEqual(
         self.index1._totaldoclen(), self.index2._totaldoclen())
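
The test relies on OkapiIndex lazily migrating _totaldoclen from the plain integer kept by old instances to a conflict-reducing BTrees.Length counter; the same pattern appears for document_count in the next example. A sketch of that upgrade idiom (an assumption, not the actual OkapiIndex source):

from BTrees.Length import Length

def _upgraded_totaldoclen(index):
    # Hypothetical helper: convert a plain-int attribute left over from an
    # old instance into a Length object before reading it.
    if not isinstance(index._totaldoclen, Length):
        index._totaldoclen = Length(int(index._totaldoclen))
    return index._totaldoclen()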
Example #8
 def test_upgrade_document_count(self):
     self.index1 = OkapiIndex(Lexicon())
     self.index2 = OkapiIndex(Lexicon())
     self.index1.index_doc(0, 'The quiet of night')
     self.index2.index_doc(0, 'The quiet of night')
     # Revert index1 back to simulate an older index instance
     del self.index1.document_count
     self.index1.index_doc(1, 'gazes upon my shadow')
     self.index2.index_doc(1, 'gazes upon my shadow')
     self.assertIs(self.index1.document_count.__class__, Length)
     self.assertEqual(
         self.index1.document_count(), self.index2.document_count())
     del self.index1.document_count
     self.index1.unindex_doc(0)
     self.index2.unindex_doc(0)
     self.assertIs(self.index1.document_count.__class__, Length)
     self.assertEqual(
         self.index1.document_count(), self.index2.document_count())
Example #9
def index(rt, mboxfile, db, profiler):
    global NUM
    idx_time = 0
    pack_time = 0
    start_time = time.time()

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    extra = Extra()
    extra.lexicon_id = 'lexicon'
    extra.doc_attr = 'text'
    extra.index_type = 'Okapi BM25 Rank'
    caller = Extra()
    caller.lexicon = lexicon
    rt["index"] = idx = ZCTextIndex("index", extra, caller)
    if not EXCLUDE_TEXT:
        rt["documents"] = docs = IOBTree()
    else:
        docs = None
    transaction.commit()

    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
    if VERBOSE:
        print "opened", mboxfile
    if not NUM:
        NUM = sys.maxint

    if profiler:
        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
    else:
        itime, ptime, i = indexmbox(mbox, idx, docs, db)
    idx_time += itime
    pack_time += ptime

    transaction.commit()

    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
        if VERBOSE >= 2:
            print "packing one last time..."
        p0 = time.clock()
        db.pack(time.time())
        p1 = time.clock()
        if VERBOSE:
            print "pack took %s sec" % (p1 - p0)
        pack_time += p1 - p0

    if VERBOSE:
        finish_time = time.time()
        print
        print "Index time", round(idx_time / 60, 3), "minutes"
        print "Pack time", round(pack_time / 60, 3), "minutes"
        print "Index bytes", Message.total_bytes
        rate = (Message.total_bytes / idx_time) / 1024
        print "Index rate %.2f KB/sec" % rate
        print "Indexing began", time.ctime(start_time)
        print "Indexing ended", time.ctime(finish_time)
        print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
Example #10
def make_zc_index():
    # there's an elaborate dance necessary to construct an index
    class Struct:
        pass
    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    return ZCTextIndex("read", extra, caller)
Example #11
class TestLexiconConflict(unittest.TestCase):

    db = None

    def tearDown(self):
        if self.db is not None:
            self.db.close()
            self.storage.cleanup()

    def openDB(self):
        from ZODB.FileStorage import FileStorage
        from ZODB.DB import DB
        n = 'fs_tmp__%s' % os.getpid()
        self.storage = FileStorage(n)
        self.db = DB(self.storage)

    def testAddWordConflict(self):
        self.l = Lexicon(Splitter())
        self.openDB()
        r1 = self.db.open().root()
        r1['l'] = self.l
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['l']
        # Make sure the data is loaded
        list(copy._wids.items())
        list(copy._words.items())
        copy.length()

        self.assertEqual(self.l._p_serial, copy._p_serial)

        self.l.sourceToWordIds('mary had a little lamb')
        transaction.commit()

        copy.sourceToWordIds('whose fleece was')
        copy.sourceToWordIds('white as snow')
        transaction.commit()
        self.assertEqual(copy.length(), 11)
        self.assertEqual(copy.length(), len(copy._words))
Example #12
    def testAddWordConflict(self):
        self.l = Lexicon(Splitter())
        self.openDB()
        r1 = self.db.open().root()
        r1['l'] = self.l
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['l']
        # Make sure the data is loaded
        list(copy._wids.items())
        list(copy._words.items())
        copy.length()

        self.assertEqual(self.l._p_serial, copy._p_serial)

        self.l.sourceToWordIds('mary had a little lamb')
        transaction.commit()

        copy.sourceToWordIds('whose fleece was')
        copy.sourceToWordIds('white as snow')
        transaction.commit()
        self.assertEqual(copy.length(), 11)
        self.assertEqual(copy.length(), len(copy._words))
Example #13
    def test_index_doc_conflict(self):
        self.index = OkapiIndex(Lexicon())
        self.openDB()
        r1 = self.db.open().root()
        r1['i'] = self.index
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['i']
        # Make sure the data is loaded
        list(copy._docweight.items())
        list(copy._docwords.items())
        list(copy._wordinfo.items())
        list(copy._lexicon._wids.items())
        list(copy._lexicon._words.items())

        self.assertEqual(self.index._p_serial, copy._p_serial)

        self.index.index_doc(0, 'The time has come')
        transaction.commit()

        copy.index_doc(1, 'That time has gone')
        transaction.commit()
Example #14
 def setUp(self):
     app = Application()
     catalog = ZCatalog('Catalog')
     app._setObject('Catalog', catalog)
     self.catalog = catalog = app._getOb('Catalog')
     install_products(app, 'ManagableIndex')
     # field
     self.fi = self._createIndex('id', FieldIndex)
     # keyword
     self.ki = self._createIndex('kw', KeywordIndex)
     # range
     self.ri = self._createIndex(
         'ri',
         RangeIndex,
         dict(CombineType='aggregate',
              ValueProviders=[
                  dict(id='rlow', type='AttributeLookup'),
                  dict(id='rhigh', type='AttributeLookup'),
              ]),
     )
     # word
     lexicon = Lexicon(Splitter())
     app._setObject('lexicon', lexicon)
     self.wi = self._createIndex('wi', WordIndex, dict(Lexicon='lexicon'))
     # simple text
     self.sti = self._createIndex('sti', SimpleTextIndex,
                                  dict(Lexicon='lexicon'))
     # path
     self.pi = self._createIndex('pi', PathIndex)
     # create objects
     self.obj1 = obj1 = _Object()
     obj1.kw = (1, 2)
     obj1.fkw = _Caller(lambda obj: obj.kw)
     obj1.fid = _Caller(lambda obj: obj.id)
     self.obj2 = obj2 = _Object().__of__(obj1)
     obj2.id = 'id'
Example #15
 def testSplitterAdaptorNofold(self):
     lexicon = Lexicon(Splitter())
     wids = lexicon.sourceToWordIds('CATS and dogs')
     wids = lexicon.termToWordIds('cats and dogs')
     self.assertEqual(wids, [0, 2, 3])
Example #16
 def setUp(self):
     self.lexicon = Lexicon(Splitter())
     self.index = self.IndexFactory(self.lexicon)
Example #17
 def testOnePipelineElement(self):
     lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
     wids = lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('fish')
     self.assertEqual(wids, [3])
Example #18
class IndexTest(object):
    # Subclasses must set a class variable IndexFactory to the appropriate
    # index object constructor.
    IndexFactory = None

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, docid=1):
        doc = 'simple document contains five words'
        self.assertFalse(self.index.has_doc(docid))
        self.index.index_doc(docid, doc)
        self.assertTrue(self.index.has_doc(docid))
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(docid)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assertIn(docid, map)

    def test_unindex_document(self):
        docid = 1
        self.test_index_document(docid)
        self.index.unindex_doc(docid)
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = 'another document just four'
        docid = 2
        self.index.index_doc(docid, doc)
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(docid)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds('document')
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(map), 2)
                self.assertIn(1, map)
                self.assertIn(docid, map)
            else:
                self.assertEqual(len(map), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        docid = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(docid)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assertIn(docid, map)

    def test_index_duplicated_words(self, docid=1):
        doc = 'very simple repeat repeat repeat document test'
        self.index.index_doc(docid, doc)
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(docid)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        wids = self.lexicon.termToWordIds('repeat')
        self.assertEqual(len(wids), 1)
        for wid, map in self.index._wordinfo.items():
            self.assertEqual(len(map), 1)
            self.assertIn(docid, map)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search('document')
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search('frobnicate')
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search('document')
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, 'the quick brown fox jumps over the lazy dog')
        self.index.index_doc(2, 'the quick fox jumps lazy over the brown dog')
        results = self.index.search_phrase('quick brown fox')
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, 'how now brown cow')
        self.index.index_doc(2, 'hough nough browne cough')
        self.index.index_doc(3, 'bar brawl')
        results = self.index.search_glob('bro*')
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob('b*')
        self.assertEqual(list(results.keys()), [1, 2, 3])
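
IndexFactory is deliberately left unset above; a concrete test case is assumed to bind the suite to a real index implementation along these lines:

import unittest

from Products.ZCTextIndex.OkapiIndex import OkapiIndex

class OkapiIndexTest(IndexTest, unittest.TestCase):
    # Reuse every test in IndexTest against a concrete index class.
    IndexFactory = OkapiIndex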
Example #19
class IndexTest(TestCase):

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assert_(not self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index.has_doc(DOCID))
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(map), 2)
                self.assert_(map.has_key(1))
                self.assert_(map.has_key(DOCID))
            else:
                self.assertEqual(len(map), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        repetitive_wid = wids[0]  # wid of the repeated word (unused below)
        for wid, map in self.index._wordinfo.items():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
Example #20
 def testTermToWordIds(self):
     lexicon = Lexicon(Splitter())
     wids = lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('dogs')
     self.assertEqual(wids, [3])
Example #21
 def testUpgradeLength(self):
     from BTrees.Length import Length
     lexicon = Lexicon(Splitter())
     del lexicon.length  # Older instances don't override length
     lexicon.sourceToWordIds('how now brown cow')
     self.assert_(lexicon.length.__class__ is Length)
Example #22
 def setUp(self):
     # Only 'stop' is a stopword (but 'and' is still an operator)
     self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
     self.parser = QueryParser(self.lexicon)
Example #23
class IndexTest(TestCase):

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assert_(not self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index.has_doc(DOCID))
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        for wid, map in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(map), 2)
                self.assert_(map.has_key(1))
                self.assert_(map.has_key(DOCID))
            else:
                self.assertEqual(len(map), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        repetitive_wid = wids[0]  # wid of the repeated word (unused below)
        for wid, map in self.index._wordinfo.items():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
Example #24
 def setUp(self):
     self.lexicon = Lexicon(Splitter())
     self.parser = QueryParser(self.lexicon)
Example #25
 def testMissingTermToWordIds(self):
     lexicon = Lexicon(Splitter())
     wids = lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('boxes')
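     # wid 0 is the lexicon's "unknown word" marker, so a miss yields [0].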
     self.assertEqual(wids, [0])
Example #26
 def setUp(self):
     from Products.ZCTextIndex.QueryParser import QueryParser
     from Products.ZCTextIndex.Lexicon import Lexicon
     from Products.ZCTextIndex.Lexicon import Splitter
     self.lexicon = Lexicon(Splitter())
     self.parser = QueryParser(self.lexicon)
Example #27
 def __init__(self):
     self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
     self.index = OkapiIndex(self.lexicon)
Example #28
 def test_query_before_document_count_upgrade(self):
     self.index1 = OkapiIndex(Lexicon(Splitter()))
     self.index1.index_doc(0, 'The quiet of night')
     # Remove document_count to simulate an older index instance
     del self.index1.document_count
     self.assertEqual(len(self.index1.search('night')), 1)
Example #29
 def setUp(self):
     self.lexicon = Lexicon(Splitter())
     self.parser = QueryParser(self.lexicon)
     self.index = FauxIndex()