Ejemplo n.º 1
0
 def enumerateLexicons(self):
     """Return the lexicon definitions as a tuple of tuples.

     Each inner tuple starts with the lexicon id and is followed by newly
     constructed pipeline elements: a word splitter, a ``CaseNormalizer``
     and a ``StopWordRemover``.
     """
     plaintext = ('plaintext_lexicon', Splitter(), CaseNormalizer(),
                  StopWordRemover())
     htmltext = ('htmltext_lexicon', HTMLWordSplitter(), CaseNormalizer(),
                 StopWordRemover())
     return (plaintext, htmltext)
Ejemplo n.º 2
0
def index():
    os.environ['STUPID_LOG_FILE'] = ''
    os.environ['STUPID_LOG_SEVERITY'] = '-111'
    import Zope2, Products.ZCatalog.ZCatalog
    import AccessControl.SecurityManagement, AccessControl.SpecialUsers
    app = Zope2.app()
    Products.ZCatalog.ZCatalog.manage_addZCatalog(app, 'cat', '')
    try:
        app.cat.threshold = atoi(sys.argv[2])
    except IndexError:
        app.cat.threashold = 1000

    from Products.ZCTextIndex.ZCTextIndex \
         import PLexicon
    from Products.ZCTextIndex.Lexicon \
         import Splitter, CaseNormalizer

    app.cat._setObject('lex', PLexicon('lex', '', Splitter(),
                                       CaseNormalizer()))

    class extra:
        doc_attr = 'PrincipiaSearchSource'
        lexicon_id = 'lex'
        index_type = 'Okapi BM25 Rank'

    app.cat.addIndex('PrincipiaSearchSource', 'ZCTextIndex', extra)

    transaction.commit()
    system = AccessControl.SpecialUsers.system
    AccessControl.SecurityManagement.newSecurityManager(None, system)
    r = RE()
    r.PARENTS = [app.cat, app]
    print do(Zope2.DB, indexf, (app, ))
    #hist(sys.argv[2])
    Zope2.DB.close()
Ejemplo n.º 3
0
 def setUp(self):
     """Create the lexicon and ZCTextIndex fixtures shared by the tests.

     Sets ``self.lexicon``, ``self.zc_index`` and ``self.index`` (the
     ``index`` attribute of the ZCTextIndex).
     """
     pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
     self.lexicon = PLexicon('lexicon', '', *pipeline)
     holder = LexiconHolder(self.lexicon)
     zc_index = ZCTextIndex('name', None, holder, self.IndexFactory,
                            'text', 'lexicon')
     self.zc_index = zc_index
     self.index = zc_index.index
Ejemplo n.º 4
0
def setup_catalog(context):
    """Ensure the Marginalia catalog, its lexicon and its indexes exist.

    The catalog tool ``marginalia_catalog`` is looked up on the site
    returned by ``context.getSite()`` and created there if the lookup raises
    ``AttributeError``.  A ``plaintext_lexicon`` is installed when the
    catalog has no attribute of that name, and every index listed below is
    added unless the catalog already reports it.

    The index check runs on every call, not only when the lexicon has just
    been installed, so a catalog that is missing an index is repaired.
    """
    portal = context.getSite()

    catalog_name = 'marginalia_catalog'
    try:
        catalog = cmfutils.getToolByName(portal, catalog_name)
    except AttributeError:
        # register catalog
        catalog = ZCatalog(catalog_name, u'Marginalia catalog', None, portal)
        portal._setObject(catalog_name, catalog)

    # install lexicon (must exist before the ZCTextIndex that refers to it)
    lexicon_id = 'plaintext_lexicon'
    if not hasattr(catalog, lexicon_id):
        lexicon = PLexicon(lexicon_id, '', Splitter(), CaseNormalizer(),
                           StopWordRemover())
        catalog._setObject(lexicon_id, lexicon)

    # add indexes
    plaintext_extra = SimpleRecord(lexicon_id=lexicon_id,
                                   index_type='Okapi BM25 Rank')
    wanted_indexes = (
        ('edit_type', 'FieldIndex', None),
        ('note', 'ZCTextIndex', plaintext_extra),
        ('link_title', 'FieldIndex', None),
    )

    existing = catalog.indexes()
    for index_name, index_type, extra in wanted_indexes:
        if index_name not in existing:
            catalog.addIndex(index_name, index_type, extra=extra)
Ejemplo n.º 5
0
    def _initSite(self, foo=2):
        """Build a site folder and an emptied, registered CatalogTool.

        The tool is registered as the ``ICatalogTool`` utility and stripped
        of all sub-objects, indexes and columns.  When ``foo`` is greater
        than zero a ``foo_plexicon`` lexicon, a ``foo_zctext`` ZCTextIndex
        and a ``foo_zctext`` column are added.

        Returns the ``(site, ctool)`` pair.
        """
        site = Folder(id='site').__of__(self.app)
        ctool = CatalogTool()
        getSiteManager().registerUtility(ctool, ICatalogTool)

        # Remove whatever the tool contains right after construction.
        for object_id in ctool.objectIds():
            ctool._delObject(object_id)
        for index_id in ctool.indexes():
            ctool.delIndex(index_id)
        for column in list(ctool.schema()):
            ctool.delColumn(column)

        if not foo > 0:
            return site, ctool

        ctool._setObject('foo_plexicon', PLexicon('foo_plexicon'))
        lexicon = ctool.foo_plexicon
        lexicon._pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())

        index_extra = _extra()
        index_extra.lexicon_id = 'foo_plexicon'
        index_extra.index_type = 'Okapi BM25 Rank'
        ctool.addIndex('foo_zctext', 'ZCTextIndex', index_extra)

        ctool.addColumn('foo_zctext')

        return site, ctool
Ejemplo n.º 6
0
 def testReindex(self):
     """Re-indexing a document replaces the words stored for it.

     Document 1 is indexed three times with different text; after each
     pass only the words of the latest text may be found.
     """
     lexicon = PLexicon('lexicon', '',
                         Splitter(),
                         CaseNormalizer(),
                         StopWordRemover())
     # Use the lexicon created here so the test does not depend on
     # fixture state.
     caller = LexiconHolder(lexicon)
     zc_index = ZCTextIndex('name',
                             None,
                             caller,
                             self.IndexFactory,
                            'text',
                            'lexicon')
     doc = Indexable('Hello Tim')
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('glorious')
     self.assertEqual(len(nbest), 0)
     nbest, total = zc_index.query('Tim')
     self.assertEqual(len(nbest), 1)
     # reindex with another value
     doc.text = 'Goodbye George'
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('Tim')
     self.assertEqual(len(nbest), 0)
     nbest, total = zc_index.query('Goodbye')
     self.assertEqual(len(nbest), 1)
     # reindex with an empty value
     doc.text = ''
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('George')
     self.assertEqual(len(nbest), 0)
Ejemplo n.º 7
0
 def updateIndexes(self):
     """Install the audit lexicon if needed and mirror portal_catalog indexes.

     When ``audit_lexicon`` is absent (or falsy) the audit indexes,
     metadata columns and the lexicon itself are created.  Afterwards every
     index of ``portal_catalog`` that this catalog lacks is added with the
     same meta type, except ``DateRecurringIndex`` indexes, which are
     skipped.  ZCTextIndex indexes are bound to ``audit_lexicon``.
     """
     if not getattr(self, 'audit_lexicon', None):
         # installing, add lexicon, indexes and metadata
         self.addIndex('last_audited_date', 'DateIndex')
         self.addIndex('audited_action', 'KeywordIndex')
         for column in ('Title', 'id', 'UID', 'last_audited_date',
                        'audited_action'):
             self.addColumn(column)
         lexicon = PLexicon('audit_lexicon', '', HTMLWordSplitter(),
                            CaseNormalizer(), StopWordRemover())
         self._setObject('audit_lexicon', lexicon)

     portal_catalog = portal_api.get_tool('portal_catalog')
     portal_indexes = portal_catalog._catalog.indexes
     for name, index in portal_indexes.items():
         if name in self._catalog.indexes.keys():
             continue
         meta_type = index.meta_type
         if meta_type == 'DateRecurringIndex':
             continue
         if meta_type != 'ZCTextIndex':
             self.addIndex(name, meta_type)
             continue
         # Text indexes need an extras record naming the lexicon to use.
         extras = Empty()
         extras.doc_attr = name
         extras.index_type = 'Okapi BM25 Rank'
         extras.lexicon_id = 'audit_lexicon'
         self.addIndex(name, meta_type, extras)
Ejemplo n.º 8
0
def setup(lib_python):
    """Start from a fresh Data.fs and create a text-indexed catalog.

    Deletes ``<lib_python>/../../var/Data.fs`` if it can, opens the Zope
    application, adds a ZCatalog ``cat`` with a lexicon ``lex`` and an
    ``Okapi BM25 Rank`` ZCTextIndex on ``PrincipiaSearchSource``, commits,
    installs the ``system`` user as security manager and closes the
    connection.
    """
    try:
        os.remove(os.path.join(lib_python, '..', '..', 'var', 'Data.fs'))
    except OSError:
        # Best effort: the file may not exist yet or may not be removable.
        pass
    import Zope2
    # Import explicitly what is referenced below instead of relying on
    # other modules having imported it as a side effect.
    import Products.ZCatalog.ZCatalog
    import AccessControl.SecurityManagement
    import AccessControl.SpecialUsers
    app = Zope2.app()

    Products.ZCatalog.ZCatalog.manage_addZCatalog(app, 'cat', '')

    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer

    app.cat._setObject('lex',
                       PLexicon('lex', '', Splitter(), CaseNormalizer())
                       )

    # Attribute bag handed to addIndex as the ZCTextIndex configuration.
    class extra:
        doc_attr = 'PrincipiaSearchSource'
        lexicon_id = 'lex'
        index_type = 'Okapi BM25 Rank'

    app.cat.addIndex('PrincipiaSearchSource', 'ZCTextIndex', extra)

    transaction.commit()

    system = AccessControl.SpecialUsers.system
    AccessControl.SecurityManagement.newSecurityManager(None, system)

    app._p_jar.close()
Ejemplo n.º 9
0
def index(rt, mboxfile, db, profiler):
    global NUM
    idx_time = 0
    pack_time = 0
    start_time = time.time()

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    extra = Extra()
    extra.lexicon_id = 'lexicon'
    extra.doc_attr = 'text'
    extra.index_type = 'Okapi BM25 Rank'
    caller = Extra()
    caller.lexicon = lexicon
    rt["index"] = idx = ZCTextIndex("index", extra, caller)
    if not EXCLUDE_TEXT:
        rt["documents"] = docs = IOBTree()
    else:
        docs = None
    transaction.commit()

    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
    if VERBOSE:
        print "opened", mboxfile
    if not NUM:
        NUM = sys.maxint

    if profiler:
        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
    else:
        itime, ptime, i = indexmbox(mbox, idx, docs, db)
    idx_time += itime
    pack_time += ptime

    transaction.commit()

    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
        if VERBOSE >= 2:
            print "packing one last time..."
        p0 = time.clock()
        db.pack(time.time())
        p1 = time.clock()
        if VERBOSE:
            print "pack took %s sec" % (p1 - p0)
        pack_time += p1 - p0

    if VERBOSE:
        finish_time = time.time()
        print
        print "Index time", round(idx_time / 60, 3), "minutes"
        print "Pack time", round(pack_time / 60, 3), "minutes"
        print "Index bytes", Message.total_bytes
        rate = (Message.total_bytes / idx_time) / 1024
        print "Index rate %.2f KB/sec" % rate
        print "Indexing began", time.ctime(start_time)
        print "Indexing ended", time.ctime(finish_time)
        print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
Ejemplo n.º 10
0
 def prescan(self, f, msgs, uniqwords):
     pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
     for n in msgs:
         print "prescanning", n
         m = f.openmessage(n)
         text = self.getmessagetext(m, f.name)
         for p in pipeline:
             text = p.process(text)
         for word in text:
             uniqwords[word] = uniqwords.get(word, 0) + 1
Ejemplo n.º 11
0
    def testSplitterAdaptorFold(self):
        """Terms resolve to consecutive word ids regardless of source case."""
        from Products.ZCTextIndex.Lexicon import CaseNormalizer, Splitter

        lexicon = self._makeOne(Splitter(), CaseNormalizer())
        # Feed the mixed-case source first; only the ids returned for the
        # lower-case terms are checked.
        lexicon.sourceToWordIds('CATS and dogs')
        wids = lexicon.termToWordIds('cats and dogs')
        self.assertEqual(len(wids), 3)
        base = wids[0]
        self.assertEqual(wids, [base, base + 1, base + 2])
Ejemplo n.º 12
0
 def test_queryLexicon_uses_pipeline_for_normalization(self):
     """queryLexicon matches mixed-case globs against lower-case words."""
     from Products.ZCTextIndex.Lexicon import CaseNormalizer
     source_words = 'aaa bbb ccc ddd eee fff ggg'.split()
     lexicon = self._makeOne('test', 'Testing', CaseNormalizer())
     lexicon.sourceToWordIds(source_words)
     info = lexicon.queryLexicon(REQUEST=None, words=['AA*', 'Bbb*'])
     expected_scalars = (
         ('page', 0),
         ('rows', 20),
         ('cols', 4),
         ('start_word', 1),
         ('end_word', 2),
         ('word_count', 2),
     )
     for key, expected in expected_scalars:
         self.assertEqual(info[key], expected)
     self.assertEqual(list(info['page_range']), [0])
     self.assertEqual(info['page_columns'], [['aaa', 'bbb']])
Ejemplo n.º 13
0
 def testMultipleAttributes(self):
     """An index over ``text1,text2`` finds words from both attributes.

     A query whose words are spread over both attributes matches the
     document; adding a word found in neither yields no result.
     """
     lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                        StopWordRemover())
     # Use the lexicon created here so the test does not depend on
     # fixture state.
     caller = LexiconHolder(lexicon)
     zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                            'text1,text2', 'lexicon')
     doc = Indexable2('foo bar', 'alpha omega')
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('foo')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('foo alpha')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('foo alpha gamma')
     self.assertEqual(len(nbest), 0)
Ejemplo n.º 14
0
 def testListAttributes(self):
     """An indexed attribute may hold a list of strings.

     ``text2`` is a list of two sentences; words from either sentence and
     from ``text1`` are found, while a query containing a word that occurs
     nowhere yields no result.
     """
     lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                        StopWordRemover())
     # Use the lexicon created here so the test does not depend on
     # fixture state.
     caller = LexiconHolder(lexicon)
     zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                            'text1,text2', 'lexicon')
     doc = Indexable2('Hello Tim',
                      ['Now is the winter of our discontent',
                       'Made glorious summer by this sun of York', ])
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('glorious')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('York Tim')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('Tuesday Tim York')
     self.assertEqual(len(nbest), 0)
Ejemplo n.º 15
0
    def __init__(self, id='Help', title=''):
        """Set id and title and build the embedded search catalog.

        The catalog gets an HTML-aware lexicon, a ``SearchableText``
        ZCTextIndex backed by ``OkapiIndex``, two keyword indexes and the
        metadata columns listed below.
        """
        self.id = id
        self.title = title
        catalog = ZCatalog('catalog')
        self.catalog = catalog

        lexicon = PLexicon('lexicon', '', HTMLWordSplitter(),
                           CaseNormalizer(), StopWordRemover())
        catalog._setObject('lexicon', lexicon)
        text_index = ZCTextIndex('SearchableText',
                                 caller=catalog,
                                 index_factory=OkapiIndex,
                                 lexicon_id=lexicon.id)
        # not using catalog.addIndex because it depends on Product
        # initialization
        catalog._catalog.addIndex('SearchableText', text_index)
        for keyword_name in ('categories', 'permissions'):
            catalog._catalog.addIndex(keyword_name,
                                      KeywordIndex(keyword_name))
        for column in ('categories', 'permissions', 'title_or_id', 'url',
                       'id'):
            catalog.addColumn(column)
Ejemplo n.º 16
0
    def _populate(self, obj):
        """Fill the catalog *obj* with a lexicon, sample indexes and columns.

        Adds ``foo_plexicon`` with a three-element pipeline, one index each
        of the Date, DateRange, Field, Keyword, Path, Topic and ZCText
        kinds (configured as shown below), and the columns ``spam`` and
        ``eggs``.
        """
        from Products.ZCTextIndex.Lexicon import CaseNormalizer
        from Products.ZCTextIndex.Lexicon import Splitter
        from Products.ZCTextIndex.Lexicon import StopWordRemover
        from Products.ZCTextIndex.ZCTextIndex import PLexicon

        obj._setObject('foo_plexicon', PLexicon('foo_plexicon'))
        lexicon = obj.foo_plexicon
        lexicon._pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())

        obj.addIndex('foo_date', 'DateIndex')

        obj.addIndex('foo_daterange', 'DateRangeIndex')
        obj._catalog.getIndex('foo_daterange')._edit('bar', 'baz')

        # Field and keyword indexes both index the attribute 'bar'.
        for index_id, index_type in (('foo_field', 'FieldIndex'),
                                     ('foo_keyword', 'KeywordIndex')):
            obj.addIndex(index_id, index_type)
            obj._catalog.getIndex(index_id).indexed_attrs = ('bar', )

        obj.addIndex('foo_path', 'PathIndex')

        obj.addIndex('foo_topic', 'TopicIndex')
        topic_index = obj._catalog.getIndex('foo_topic')
        for set_id, expression in (('bar', 'True'), ('baz', 'False')):
            topic_index.addFilteredSet(set_id, 'PythonFilteredSet',
                                       expression)

        text_extra = _extra()
        text_extra.lexicon_id = 'foo_plexicon'
        text_extra.index_type = 'Okapi BM25 Rank'
        obj.addIndex('foo_zctext', 'ZCTextIndex', text_extra)

        for column in ('spam', 'eggs'):
            obj.addColumn(column)
Ejemplo n.º 17
0
    def _initSite(self, foo=2):
        """Create ``self.root.site`` with an emptied ``portal_catalog``.

        The catalog tool is stripped of all sub-objects, indexes and
        columns.  When ``foo`` is greater than zero a ``foo_plexicon``
        lexicon, a ``foo_zctext`` ZCTextIndex and a ``foo_zctext`` column
        are added.

        Returns the site folder.
        """
        site = self.root.site = Folder(id='site')
        ctool = site.portal_catalog = CatalogTool()

        for obj_id in ctool.objectIds():
            ctool._delObject(obj_id)
        for idx_id in ctool.indexes():
            ctool.delIndex(idx_id)
        # Copy with list() rather than slicing: it works for any iterable
        # schema() returns and still protects against mutation while
        # columns are deleted.
        for col in list(ctool.schema()):
            ctool.delColumn(col)

        if foo > 0:
            ctool._setObject('foo_plexicon', PLexicon('foo_plexicon'))
            lex = ctool.foo_plexicon
            lex._pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())

            extra = _extra()
            extra.lexicon_id = 'foo_plexicon'
            extra.index_type = 'Okapi BM25 Rank'
            ctool.addIndex('foo_zctext', 'ZCTextIndex', extra)

            ctool.addColumn('foo_zctext')

        return site
Ejemplo n.º 18
0
 def testSplitterAdaptorFold(self):
     """Lower-case terms map to the ids assigned to the mixed-case source."""
     lexicon = Lexicon(Splitter(), CaseNormalizer())
     # Populate the lexicon; the ids returned here are not checked.
     lexicon.sourceToWordIds('CATS and dogs')
     term_ids = lexicon.termToWordIds('cats and dogs')
     self.assertEqual(term_ids, [1, 2, 3])
Ejemplo n.º 19
0
 def _populate(self, obj):
     """Give *obj* a Splitter / CaseNormalizer / StopWordRemover pipeline."""
     from Products.ZCTextIndex.Lexicon import (
         CaseNormalizer,
         Splitter,
         StopWordRemover,
     )
     stages = (Splitter(), CaseNormalizer(), StopWordRemover())
     obj._pipeline = stages
Ejemplo n.º 20
0
 def __init__(self):
     """Create a lexicon and an OkapiIndex built on top of it."""
     pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
     lexicon = Lexicon(*pipeline)
     self.lexicon = lexicon
     self.index = OkapiIndex(lexicon)