def enumerateLexicons(self):
    """Describe the lexicons to install: (id, *pipeline) per lexicon."""
    plain = ('plaintext_lexicon',
             Splitter(),
             CaseNormalizer(),
             StopWordRemover())
    html = ('htmltext_lexicon',
            HTMLWordSplitter(),
            CaseNormalizer(),
            StopWordRemover())
    return (plain, html)
def index(): os.environ['STUPID_LOG_FILE'] = '' os.environ['STUPID_LOG_SEVERITY'] = '-111' import Zope2, Products.ZCatalog.ZCatalog import AccessControl.SecurityManagement, AccessControl.SpecialUsers app = Zope2.app() Products.ZCatalog.ZCatalog.manage_addZCatalog(app, 'cat', '') try: app.cat.threshold = atoi(sys.argv[2]) except IndexError: app.cat.threashold = 1000 from Products.ZCTextIndex.ZCTextIndex \ import PLexicon from Products.ZCTextIndex.Lexicon \ import Splitter, CaseNormalizer app.cat._setObject('lex', PLexicon('lex', '', Splitter(), CaseNormalizer())) class extra: doc_attr = 'PrincipiaSearchSource' lexicon_id = 'lex' index_type = 'Okapi BM25 Rank' app.cat.addIndex('PrincipiaSearchSource', 'ZCTextIndex', extra) transaction.commit() system = AccessControl.SpecialUsers.system AccessControl.SecurityManagement.newSecurityManager(None, system) r = RE() r.PARENTS = [app.cat, app] print do(Zope2.DB, indexf, (app, )) #hist(sys.argv[2]) Zope2.DB.close()
def setUp(self):
    """Create the lexicon, holder and ZCTextIndex fixtures for the tests."""
    pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
    self.lexicon = PLexicon('lexicon', '', *pipeline)
    holder = LexiconHolder(self.lexicon)
    self.zc_index = ZCTextIndex('name', None, holder, self.IndexFactory,
                                'text', 'lexicon')
    self.index = self.zc_index.index
def setup_catalog(context):
    """Install the marginalia catalog with its lexicon and indexes."""
    portal = context.getSite()
    catalog_name = 'marginalia_catalog'
    try:
        catalog = cmfutils.getToolByName(portal, catalog_name)
    except AttributeError:
        # Catalog not registered yet: create it and attach to the portal.
        catalog = ZCatalog(catalog_name, u'Marginalia catalog', None, portal)
        portal._setObject(catalog_name, catalog)

    plaintext_extra = SimpleRecord(lexicon_id='plaintext_lexicon',
                                   index_type='Okapi BM25 Rank')
    existing_indexes = catalog.indexes()
    columns = catalog.schema()

    # Install the lexicon the ZCTextIndex depends on, if not already there.
    _id = 'plaintext_lexicon'
    if not hasattr(catalog, _id):
        lexicon = PLexicon(_id, '', Splitter(), CaseNormalizer(),
                           StopWordRemover())
        catalog._setObject(_id, lexicon)

    wanted = (('edit_type', 'FieldIndex', None),
              ('note', 'ZCTextIndex', plaintext_extra),
              ('link_title', 'FieldIndex', None))
    for index_name, index_type, extra in wanted:
        if index_name not in existing_indexes:
            catalog.addIndex(index_name, index_type, extra=extra)
def _initSite(self, foo=2):
    """Register an emptied CatalogTool utility; add a ZCTextIndex when foo > 0.

    Returns the (site, ctool) pair.
    """
    site = Folder(id='site').__of__(self.app)
    ctool = CatalogTool()
    getSiteManager().registerUtility(ctool, ICatalogTool)
    # Strip the tool down to a completely empty state first.
    for oid in ctool.objectIds():
        ctool._delObject(oid)
    for iid in ctool.indexes():
        ctool.delIndex(iid)
    for column in list(ctool.schema()):
        ctool.delColumn(column)
    if foo > 0:
        ctool._setObject('foo_plexicon', PLexicon('foo_plexicon'))
        lexicon = ctool.foo_plexicon
        lexicon._pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        cfg = _extra()
        cfg.lexicon_id = 'foo_plexicon'
        cfg.index_type = 'Okapi BM25 Rank'
        ctool.addIndex('foo_zctext', 'ZCTextIndex', cfg)
        ctool.addColumn('foo_zctext')
    return site, ctool
def testReindex(self):
    """Reindexing a document must fully replace its previous postings."""
    lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                       StopWordRemover())
    # BUG FIX: the freshly built lexicon above was discarded and
    # self.lexicon was passed instead, leaving the local unused.
    caller = LexiconHolder(lexicon)
    zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                           'text', 'lexicon')
    doc = Indexable('Hello Tim')
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('glorious')
    self.assertEqual(len(nbest), 0)
    nbest, total = zc_index.query('Tim')
    self.assertEqual(len(nbest), 1)
    # Reindex with another value: the old word must stop matching.
    doc.text = 'Goodbye George'
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('Tim')
    self.assertEqual(len(nbest), 0)
    nbest, total = zc_index.query('Goodbye')
    self.assertEqual(len(nbest), 1)
    # Reindex with an empty value: nothing should match any more.
    doc.text = ''
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('George')
    self.assertEqual(len(nbest), 0)
def updateIndexes(self):
    """Mirror the portal catalog's indexes onto this audit catalog.

    On first run (no audit lexicon yet) also installs the lexicon, the
    audit-specific indexes and the metadata columns.
    """
    if not getattr(self, 'audit_lexicon', None):
        # Installing: add lexicon, indexes and metadata columns.
        self.addIndex('last_audited_date', 'DateIndex')
        self.addIndex('audited_action', 'KeywordIndex')
        for column in ('Title', 'id', 'UID', 'last_audited_date',
                       'audited_action'):
            self.addColumn(column)
        lexicon = PLexicon('audit_lexicon', '', HTMLWordSplitter(),
                           CaseNormalizer(), StopWordRemover())
        self._setObject('audit_lexicon', lexicon)
    catalog = portal_api.get_tool('portal_catalog')
    existing = self._catalog.indexes.keys()
    for name, index in catalog._catalog.indexes.items():
        if name in existing:
            continue
        meta_type = index.meta_type
        if meta_type == 'DateRecurringIndex':
            # Not supported here; skip.
            continue
        elif meta_type == 'ZCTextIndex':
            # Text indexes need the extra record pointing at our lexicon.
            extras = Empty()
            extras.doc_attr = name
            extras.index_type = 'Okapi BM25 Rank'
            extras.lexicon_id = 'audit_lexicon'
            self.addIndex(name, meta_type, extras)
        else:
            self.addIndex(name, meta_type)
def setup(lib_python):
    """Start from a fresh Data.fs and build a ZCatalog with a ZCTextIndex."""
    try:
        os.remove(os.path.join(lib_python, '..', '..', 'var', 'Data.fs'))
    except OSError:
        # BUG FIX: was a bare except, which hid every error; only a
        # missing/unremovable file is expected and ignorable here.
        pass
    import Zope2
    # BUG FIX: the subpackages actually used below were not imported —
    # only 'import Products' / 'import AccessControl.SecurityManagement'.
    import Products.ZCatalog.ZCatalog
    import AccessControl.SecurityManagement
    import AccessControl.SpecialUsers
    app = Zope2.app()
    Products.ZCatalog.ZCatalog.manage_addZCatalog(app, 'cat', '')
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
    app.cat._setObject('lex',
                       PLexicon('lex', '', Splitter(), CaseNormalizer()))

    # Minimal "extra" record the ZCTextIndex constructor expects.
    class extra:
        doc_attr = 'PrincipiaSearchSource'
        lexicon_id = 'lex'
        index_type = 'Okapi BM25 Rank'

    app.cat.addIndex('PrincipiaSearchSource', 'ZCTextIndex', extra)
    transaction.commit()
    # Run as the system user so security checks don't interfere.
    system = AccessControl.SpecialUsers.system
    AccessControl.SecurityManagement.newSecurityManager(None, system)
    app._p_jar.close()
def index(rt, mboxfile, db, profiler):
    # Index every message of a Unix mailbox into a ZCTextIndex stored in the
    # root mapping `rt`, optionally under a profiler, and report timings.
    # Relies on module globals: NUM, EXCLUDE_TEXT, VERBOSE, PACK_INTERVAL.
    global NUM
    idx_time = 0
    pack_time = 0
    start_time = time.time()
    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    # Minimal records standing in for the index 'extra' config and caller.
    extra = Extra()
    extra.lexicon_id = 'lexicon'
    extra.doc_attr = 'text'
    extra.index_type = 'Okapi BM25 Rank'
    caller = Extra()
    caller.lexicon = lexicon
    rt["index"] = idx = ZCTextIndex("index", extra, caller)
    if not EXCLUDE_TEXT:
        # Keep the raw documents around too, keyed by message number.
        rt["documents"] = docs = IOBTree()
    else:
        docs = None
    transaction.commit()
    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
    if VERBOSE:
        print "opened", mboxfile
    if not NUM:
        # 0/None means "no limit": index everything.
        NUM = sys.maxint
    if profiler:
        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
    else:
        itime, ptime, i = indexmbox(mbox, idx, docs, db)
    idx_time += itime
    pack_time += ptime
    transaction.commit()
    # indexmbox packs every PACK_INTERVAL messages; do a final pack only if
    # the last batch didn't land exactly on an interval boundary.
    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
        if VERBOSE >= 2:
            print "packing one last time..."
        p0 = time.clock()
        db.pack(time.time())
        p1 = time.clock()
        if VERBOSE:
            print "pack took %s sec" % (p1 - p0)
        pack_time += p1 - p0
    if VERBOSE:
        # Summary report: times in minutes, throughput in KB/sec.
        finish_time = time.time()
        print
        print "Index time", round(idx_time / 60, 3), "minutes"
        print "Pack time", round(pack_time / 60, 3), "minutes"
        print "Index bytes", Message.total_bytes
        rate = (Message.total_bytes / idx_time) / 1024
        print "Index rate %.2f KB/sec" % rate
        print "Indexing began", time.ctime(start_time)
        print "Indexing ended", time.ctime(finish_time)
        print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
def prescan(self, f, msgs, uniqwords): pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()] for n in msgs: print "prescanning", n m = f.openmessage(n) text = self.getmessagetext(m, f.name) for p in pipeline: text = p.process(text) for word in text: uniqwords[word] = uniqwords.get(word, 0) + 1
def testSplitterAdaptorFold(self):
    """Mixed-case source text and a lower-case query yield the same ids."""
    from Products.ZCTextIndex.Lexicon import CaseNormalizer, Splitter
    lexicon = self._makeOne(Splitter(), CaseNormalizer())
    wids = lexicon.sourceToWordIds('CATS and dogs')
    wids = lexicon.termToWordIds('cats and dogs')
    self.assertEqual(len(wids), 3)
    base = wids[0]
    self.assertEqual(wids, [base, base + 1, base + 2])
def test_queryLexicon_uses_pipeline_for_normalization(self):
    """Globbed query terms must be case-normalized through the pipeline."""
    from Products.ZCTextIndex.Lexicon import CaseNormalizer
    WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
    lexicon = self._makeOne('test', 'Testing', CaseNormalizer())
    lexicon.sourceToWordIds(WORDS)
    info = lexicon.queryLexicon(REQUEST=None, words=['AA*', 'Bbb*'])
    expected = {
        'page': 0,
        'rows': 20,
        'cols': 4,
        'start_word': 1,
        'end_word': 2,
        'word_count': 2,
    }
    for key, value in expected.items():
        self.assertEqual(info[key], value)
    self.assertEqual(list(info['page_range']), [0])
    self.assertEqual(info['page_columns'], [['aaa', 'bbb']])
def testMultipleAttributes(self):
    """An index over 'text1,text2' must match words from both attributes."""
    lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                       StopWordRemover())
    # BUG FIX: the freshly built lexicon above was discarded and
    # self.lexicon was passed instead, leaving the local unused.
    caller = LexiconHolder(lexicon)
    zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                           'text1,text2', 'lexicon')
    doc = Indexable2('foo bar', 'alpha omega')
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('foo')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('foo alpha')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('foo alpha gamma')
    self.assertEqual(len(nbest), 0)
def testListAttributes(self):
    """A list-valued attribute is indexed item by item alongside a string."""
    lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                       StopWordRemover())
    # BUG FIX: the freshly built lexicon above was discarded and
    # self.lexicon was passed instead, leaving the local unused.
    caller = LexiconHolder(lexicon)
    zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                           'text1,text2', 'lexicon')
    doc = Indexable2('Hello Tim',
                     ['Now is the winter of our discontent',
                      'Made glorious summer by this sun of York',
                      ])
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('glorious')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('York Tim')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('Tuesday Tim York')
    self.assertEqual(len(nbest), 0)
def __init__(self, id='Help', title=''):
    """Set up the help system: catalog, HTML lexicon, indexes and columns."""
    self.id = id
    self.title = title
    catalog = self.catalog = ZCatalog('catalog')
    lexicon = PLexicon('lexicon', '', HTMLWordSplitter(), CaseNormalizer(),
                       StopWordRemover())
    catalog._setObject('lexicon', lexicon)
    text_index = ZCTextIndex('SearchableText', caller=catalog,
                             index_factory=OkapiIndex, lexicon_id=lexicon.id)
    # Not using catalog.addIndex because it depends on Product initialization.
    catalog._catalog.addIndex('SearchableText', text_index)
    catalog._catalog.addIndex('categories', KeywordIndex('categories'))
    catalog._catalog.addIndex('permissions', KeywordIndex('permissions'))
    for column in ('categories', 'permissions', 'title_or_id', 'url', 'id'):
        catalog.addColumn(column)
def _populate(self, obj):
    """Fill the catalog with one index of each supported type plus columns."""
    from Products.ZCTextIndex.Lexicon import CaseNormalizer
    from Products.ZCTextIndex.Lexicon import Splitter
    from Products.ZCTextIndex.Lexicon import StopWordRemover
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    # Lexicon backing the ZCTextIndex added at the end.
    obj._setObject('foo_plexicon', PLexicon('foo_plexicon'))
    obj.foo_plexicon._pipeline = (Splitter(), CaseNormalizer(),
                                  StopWordRemover())
    obj.addIndex('foo_date', 'DateIndex')
    obj.addIndex('foo_daterange', 'DateRangeIndex')
    obj._catalog.getIndex('foo_daterange')._edit('bar', 'baz')
    obj.addIndex('foo_field', 'FieldIndex')
    obj._catalog.getIndex('foo_field').indexed_attrs = ('bar', )
    obj.addIndex('foo_keyword', 'KeywordIndex')
    obj._catalog.getIndex('foo_keyword').indexed_attrs = ('bar', )
    obj.addIndex('foo_path', 'PathIndex')
    obj.addIndex('foo_topic', 'TopicIndex')
    topic = obj._catalog.getIndex('foo_topic')
    topic.addFilteredSet('bar', 'PythonFilteredSet', 'True')
    topic.addFilteredSet('baz', 'PythonFilteredSet', 'False')
    zctext_cfg = _extra()
    zctext_cfg.lexicon_id = 'foo_plexicon'
    zctext_cfg.index_type = 'Okapi BM25 Rank'
    obj.addIndex('foo_zctext', 'ZCTextIndex', zctext_cfg)
    obj.addColumn('spam')
    obj.addColumn('eggs')
def _initSite(self, foo=2):
    """Attach an emptied CatalogTool to a new site; add a ZCTextIndex when
    foo > 0. Returns the site."""
    site = self.root.site = Folder(id='site')
    ctool = site.portal_catalog = CatalogTool()
    # Strip the tool down to a completely empty state first.
    for oid in ctool.objectIds():
        ctool._delObject(oid)
    for iid in ctool.indexes():
        ctool.delIndex(iid)
    for column in ctool.schema()[:]:
        ctool.delColumn(column)
    if foo > 0:
        ctool._setObject('foo_plexicon', PLexicon('foo_plexicon'))
        lexicon = ctool.foo_plexicon
        lexicon._pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        cfg = _extra()
        cfg.lexicon_id = 'foo_plexicon'
        cfg.index_type = 'Okapi BM25 Rank'
        ctool.addIndex('foo_zctext', 'ZCTextIndex', cfg)
        ctool.addColumn('foo_zctext')
    return site
def testSplitterAdaptorFold(self):
    """Case folding: upper-case source words match a lower-case query."""
    lexicon = Lexicon(Splitter(), CaseNormalizer())
    lexicon.sourceToWordIds('CATS and dogs')
    wids = lexicon.termToWordIds('cats and dogs')
    self.assertEqual(wids, [1, 2, 3])
def _populate(self, obj):
    """Give the object the standard split/normalize/stop-word pipeline."""
    from Products.ZCTextIndex.Lexicon import (CaseNormalizer, Splitter,
                                              StopWordRemover)
    obj._pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
def __init__(self):
    """Build a lexicon with the full pipeline and an Okapi index over it."""
    pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
    self.lexicon = Lexicon(*pipeline)
    self.index = OkapiIndex(self.lexicon)