Exemple #1
0
 def enumerateLexicons(self):
     """Return the lexicon specifications as a tuple of tuples.

     Each inner tuple starts with the lexicon id and is followed by
     newly created splitter, case normalizer and stop word remover
     instances, in that order.
     """
     plaintext_spec = (
         'plaintext_lexicon',
         Splitter(),
         CaseNormalizer(),
         StopWordRemover(),
     )
     htmltext_spec = (
         'htmltext_lexicon',
         HTMLWordSplitter(),
         CaseNormalizer(),
         StopWordRemover(),
     )
     return (plaintext_spec, htmltext_spec)
Exemple #2
0
 def updateIndexes(self):
     """Install the audit lexicon if needed, then copy missing indexes.

     When ``audit_lexicon`` is not available on ``self`` (or is falsy),
     the two audit indexes, the metadata columns and the lexicon itself
     are created.  Afterwards every index of ``portal_catalog`` whose
     name is not yet among this catalog's indexes is added under the
     same name and meta type; ``DateRecurringIndex`` indexes are
     skipped and ``ZCTextIndex`` indexes are bound to ``audit_lexicon``.
     """
     if not getattr(self, 'audit_lexicon', None):
         # Installing: add indexes, metadata columns and the lexicon.
         for index_id, index_type in (
                 ('last_audited_date', 'DateIndex'),
                 ('audited_action', 'KeywordIndex')):
             self.addIndex(index_id, index_type)
         for column in ('Title', 'id', 'UID',
                        'last_audited_date', 'audited_action'):
             self.addColumn(column)
         lexicon = PLexicon('audit_lexicon', '', HTMLWordSplitter(),
                            CaseNormalizer(), StopWordRemover())
         self._setObject('audit_lexicon', lexicon)

     portal_catalog = portal_api.get_tool('portal_catalog')
     for name, index in portal_catalog._catalog.indexes.items():
         if name in self._catalog.indexes:
             # Already present in this catalog.
             continue
         meta_type = index.meta_type
         if meta_type == 'DateRecurringIndex':
             continue
         if meta_type != 'ZCTextIndex':
             self.addIndex(name, meta_type)
             continue
         # Text indexes need extra settings pointing at our own lexicon.
         extras = Empty()
         extras.doc_attr = name
         extras.index_type = 'Okapi BM25 Rank'
         extras.lexicon_id = 'audit_lexicon'
         self.addIndex(name, meta_type, extras)
Exemple #3
0
 def setUp(self):
     """Create the lexicon and ZCTextIndex fixtures.

     Sets ``self.lexicon``, ``self.zc_index`` and ``self.index`` (the
     ``index`` attribute of the ZCTextIndex).
     """
     pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
     self.lexicon = PLexicon('lexicon', '', *pipeline)
     holder = LexiconHolder(self.lexicon)
     self.zc_index = ZCTextIndex(
         'name', None, holder, self.IndexFactory, 'text', 'lexicon')
     self.index = self.zc_index.index
Exemple #4
0
    def _initSite(self, foo=2):
        """Create a site folder and an emptied catalog tool.

        The catalog tool is registered as the ``ICatalogTool`` utility and
        stripped of its sub-objects, indexes and columns.  When ``foo`` is
        greater than zero, a ``foo_plexicon`` lexicon, a ``foo_zctext``
        ZCTextIndex and a ``foo_zctext`` column are added.

        Returns a ``(site, ctool)`` tuple.
        """
        site = Folder(id='site').__of__(self.app)
        ctool = CatalogTool()
        getSiteManager().registerUtility(ctool, ICatalogTool)

        # Start from an empty tool.
        for sub_id in ctool.objectIds():
            ctool._delObject(sub_id)
        for index_id in ctool.indexes():
            ctool.delIndex(index_id)
        for column in list(ctool.schema()):
            ctool.delColumn(column)

        if not (foo > 0):
            return site, ctool

        lexicon_id = 'foo_plexicon'
        ctool._setObject(lexicon_id, PLexicon(lexicon_id))
        ctool.foo_plexicon._pipeline = (
            Splitter(), CaseNormalizer(), StopWordRemover())

        zctext_extra = _extra()
        zctext_extra.lexicon_id = lexicon_id
        zctext_extra.index_type = 'Okapi BM25 Rank'
        ctool.addIndex('foo_zctext', 'ZCTextIndex', zctext_extra)

        ctool.addColumn('foo_zctext')

        return site, ctool
def setup_catalog(context):
    """Ensure the ``marginalia_catalog`` tool, its lexicon and indexes exist.

    The catalog is looked up on the site returned by ``context.getSite()``
    and created there when the lookup raises ``AttributeError``.  The
    ``plaintext_lexicon`` is installed when the catalog has no attribute of
    that name, and each of the ``edit_type``, ``note`` and ``link_title``
    indexes is added unless the catalog already lists it.
    """
    portal = context.getSite()

    catalog_name = 'marginalia_catalog'
    try:
        catalog = cmfutils.getToolByName(portal, catalog_name)
    except AttributeError:
        # register catalog
        catalog = ZCatalog(catalog_name, u'Marginalia catalog', None, portal)
        portal._setObject(catalog_name, catalog)

    # install lexicon
    lexicon_id = 'plaintext_lexicon'
    if not hasattr(catalog, lexicon_id):
        lexicon = PLexicon(lexicon_id, '', Splitter(), CaseNormalizer(),
                           StopWordRemover())
        catalog._setObject(lexicon_id, lexicon)

    # add indexes
    plaintext_extra = SimpleRecord(lexicon_id=lexicon_id,
                                   index_type='Okapi BM25 Rank')
    wanted_indexes = (
        ('edit_type', 'FieldIndex', None),
        ('note', 'ZCTextIndex', plaintext_extra),
        ('link_title', 'FieldIndex', None),
    )

    # This runs whether or not the lexicon was just installed, so a
    # catalog that already has the lexicon still gets any index it lacks.
    existing_indexes = catalog.indexes()
    for index_name, index_type, extra in wanted_indexes:
        if index_name not in existing_indexes:
            catalog.addIndex(index_name, index_type, extra=extra)
 def testReindex(self):
     """Check that reindexing a document id replaces its indexed words.

     The same document id is indexed three times with different text
     (the last one empty); after each step, queries for the previous
     text must return no hits.
     """
     lexicon = PLexicon('lexicon', '',
                        Splitter(),
                        CaseNormalizer(),
                        StopWordRemover())
     # Hand the index the lexicon built above; previously this local was
     # created but ignored in favour of ``self.lexicon``.
     caller = LexiconHolder(lexicon)
     zc_index = ZCTextIndex('name',
                            None,
                            caller,
                            self.IndexFactory,
                            'text',
                            'lexicon')
     doc = Indexable('Hello Tim')
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('glorious')
     self.assertEqual(len(nbest), 0)
     nbest, total = zc_index.query('Tim')
     self.assertEqual(len(nbest), 1)
     # reindex with another value
     doc.text = 'Goodbye George'
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('Tim')
     self.assertEqual(len(nbest), 0)
     nbest, total = zc_index.query('Goodbye')
     self.assertEqual(len(nbest), 1)
     # reindex with an empty value
     doc.text = ''
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('George')
     self.assertEqual(len(nbest), 0)
def index(rt, mboxfile, db, profiler):
    global NUM
    idx_time = 0
    pack_time = 0
    start_time = time.time()

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    extra = Extra()
    extra.lexicon_id = 'lexicon'
    extra.doc_attr = 'text'
    extra.index_type = 'Okapi BM25 Rank'
    caller = Extra()
    caller.lexicon = lexicon
    rt["index"] = idx = ZCTextIndex("index", extra, caller)
    if not EXCLUDE_TEXT:
        rt["documents"] = docs = IOBTree()
    else:
        docs = None
    transaction.commit()

    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
    if VERBOSE:
        print "opened", mboxfile
    if not NUM:
        NUM = sys.maxint

    if profiler:
        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
    else:
        itime, ptime, i = indexmbox(mbox, idx, docs, db)
    idx_time += itime
    pack_time += ptime

    transaction.commit()

    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
        if VERBOSE >= 2:
            print "packing one last time..."
        p0 = time.clock()
        db.pack(time.time())
        p1 = time.clock()
        if VERBOSE:
            print "pack took %s sec" % (p1 - p0)
        pack_time += p1 - p0

    if VERBOSE:
        finish_time = time.time()
        print
        print "Index time", round(idx_time / 60, 3), "minutes"
        print "Pack time", round(pack_time / 60, 3), "minutes"
        print "Index bytes", Message.total_bytes
        rate = (Message.total_bytes / idx_time) / 1024
        print "Index rate %.2f KB/sec" % rate
        print "Indexing began", time.ctime(start_time)
        print "Indexing ended", time.ctime(finish_time)
        print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
 def prescan(self, f, msgs, uniqwords):
     pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
     for n in msgs:
         print "prescanning", n
         m = f.openmessage(n)
         text = self.getmessagetext(m, f.name)
         for p in pipeline:
             text = p.process(text)
         for word in text:
             uniqwords[word] = uniqwords.get(word, 0) + 1
Exemple #9
0
def make_zc_index():
    """Return a ZCTextIndex named "read" with a freshly built lexicon."""
    # The constructor is handed two plain attribute holders: one carrying
    # the index settings and one carrying the lexicon.
    class Struct:
        pass

    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    extra = Struct()
    extra.lexicon_id = "lexicon"
    extra.doc_attr = "read"

    return ZCTextIndex("read", extra, caller)
Exemple #10
0
 def testMultipleAttributes(self):
     """Check querying an index configured with two attributes.

     The index is created with ``'text1,text2'``; a query mixing words
     from both attribute values must match, while a query containing a
     word found in neither must not.
     """
     lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                        StopWordRemover())
     # Hand the index the lexicon built above; previously this local was
     # created but ignored in favour of ``self.lexicon``.
     caller = LexiconHolder(lexicon)
     zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                            'text1,text2', 'lexicon')
     doc = Indexable2('foo bar', 'alpha omega')
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('foo')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('foo alpha')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('foo alpha gamma')
     self.assertEqual(len(nbest), 0)
Exemple #11
0
 def testListAttributes(self):
     """Check indexing a document whose second value is a list of strings.

     Words from the plain string and from the list entries must all be
     searchable, and a query containing a word found in none of them
     must return no hits.
     """
     lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                        StopWordRemover())
     # Hand the index the lexicon built above; previously this local was
     # created but ignored in favour of ``self.lexicon``.
     caller = LexiconHolder(lexicon)
     zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                            'text1,text2', 'lexicon')
     doc = Indexable2('Hello Tim',
                      ['Now is the winter of our discontent',
                       'Made glorious summer by this sun of York'])
     zc_index.index_object(1, doc)
     nbest, total = zc_index.query('glorious')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('York Tim')
     self.assertEqual(len(nbest), 1)
     nbest, total = zc_index.query('Tuesday Tim York')
     self.assertEqual(len(nbest), 0)
Exemple #12
0
    def __init__(self, id='Help', title=''):
        """Store id and title and build the ``catalog`` ZCatalog.

        The catalog receives a ``lexicon`` object, a ``SearchableText``
        ZCTextIndex, ``categories`` and ``permissions`` keyword indexes,
        and five metadata columns.
        """
        self.id = id
        self.title = title

        catalog = ZCatalog('catalog')
        self.catalog = catalog

        lexicon = PLexicon('lexicon', '', HTMLWordSplitter(),
                           CaseNormalizer(), StopWordRemover())
        catalog._setObject('lexicon', lexicon)

        text_index = ZCTextIndex('SearchableText',
                                 caller=catalog,
                                 index_factory=OkapiIndex,
                                 lexicon_id=lexicon.id)
        # Indexes are added to the inner _catalog directly because
        # catalog.addIndex depends on Product initialization.
        inner_catalog = catalog._catalog
        inner_catalog.addIndex('SearchableText', text_index)
        for keyword_id in ('categories', 'permissions'):
            inner_catalog.addIndex(keyword_id, KeywordIndex(keyword_id))

        for column in ('categories', 'permissions', 'title_or_id',
                       'url', 'id'):
            catalog.addColumn(column)
Exemple #13
0
    def _populate(self, obj):
        """Add a lexicon, seven indexes and two columns to ``obj``.

        The indexes are ``foo_date``, ``foo_daterange``, ``foo_field``,
        ``foo_keyword``, ``foo_path``, ``foo_topic`` and ``foo_zctext``;
        the columns are ``spam`` and ``eggs``.
        """
        from Products.ZCTextIndex.Lexicon import (
            CaseNormalizer,
            Splitter,
            StopWordRemover,
        )
        from Products.ZCTextIndex.ZCTextIndex import PLexicon

        lexicon_id = 'foo_plexicon'
        obj._setObject(lexicon_id, PLexicon(lexicon_id))
        obj.foo_plexicon._pipeline = (
            Splitter(), CaseNormalizer(), StopWordRemover())

        obj.addIndex('foo_date', 'DateIndex')

        obj.addIndex('foo_daterange', 'DateRangeIndex')
        obj._catalog.getIndex('foo_daterange')._edit('bar', 'baz')

        # Field and keyword indexes both get 'bar' as indexed attribute.
        for index_id, index_type in (('foo_field', 'FieldIndex'),
                                     ('foo_keyword', 'KeywordIndex')):
            obj.addIndex(index_id, index_type)
            obj._catalog.getIndex(index_id).indexed_attrs = ('bar', )

        obj.addIndex('foo_path', 'PathIndex')

        obj.addIndex('foo_topic', 'TopicIndex')
        topic_index = obj._catalog.getIndex('foo_topic')
        for set_id, expression in (('bar', 'True'), ('baz', 'False')):
            topic_index.addFilteredSet(set_id, 'PythonFilteredSet',
                                       expression)

        zctext_extra = _extra()
        zctext_extra.lexicon_id = lexicon_id
        zctext_extra.index_type = 'Okapi BM25 Rank'
        obj.addIndex('foo_zctext', 'ZCTextIndex', zctext_extra)

        for column in ('spam', 'eggs'):
            obj.addColumn(column)
Exemple #14
0
    def _initSite(self, foo=2):
        """Create ``self.root.site`` with an emptied ``portal_catalog``.

        The catalog tool is stripped of its sub-objects, indexes and
        columns.  When ``foo`` is greater than zero, a ``foo_plexicon``
        lexicon, a ``foo_zctext`` ZCTextIndex and a ``foo_zctext`` column
        are added.

        Returns the site folder.
        """
        site = Folder(id='site')
        self.root.site = site
        ctool = CatalogTool()
        site.portal_catalog = ctool

        # Start from an empty tool.
        for sub_id in ctool.objectIds():
            ctool._delObject(sub_id)
        for index_id in ctool.indexes():
            ctool.delIndex(index_id)
        for column in ctool.schema()[:]:
            ctool.delColumn(column)

        if not (foo > 0):
            return site

        lexicon_id = 'foo_plexicon'
        ctool._setObject(lexicon_id, PLexicon(lexicon_id))
        ctool.foo_plexicon._pipeline = (
            Splitter(), CaseNormalizer(), StopWordRemover())

        zctext_extra = _extra()
        zctext_extra.lexicon_id = lexicon_id
        zctext_extra.index_type = 'Okapi BM25 Rank'
        ctool.addIndex('foo_zctext', 'ZCTextIndex', zctext_extra)

        ctool.addColumn('foo_zctext')

        return site
Exemple #15
0
 def _populate(self, obj):
     """Set ``obj._pipeline`` to a splitter, normalizer, stop-word tuple."""
     from Products.ZCTextIndex.Lexicon import (
         CaseNormalizer,
         Splitter,
         StopWordRemover,
     )
     element_classes = (Splitter, CaseNormalizer, StopWordRemover)
     obj._pipeline = tuple(factory() for factory in element_classes)
 def __init__(self):
     """Create ``self.lexicon`` and an OkapiIndex over it as ``self.index``."""
     pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
     self.lexicon = Lexicon(*pipeline)
     self.index = OkapiIndex(self.lexicon)