def setUp(self):
    """Create the catalog fixture, register its indexes/columns and fill it.

    Two groups of indexes are registered (``col1``-``col3`` and
    ``att1``-``att3`` plus ``num``); every index name is also added as a
    metadata column.  ``self.upper`` dummy objects are then cataloged and
    the catalog is finally wrapped with ``__of__``.
    """
    self._catalog = self._makeOne()
    self._catalog.lexicon = PLexicon('lexicon')

    def text_index(name):
        # Text indexes resolve the lexicon through the catalog itself.
        return ZCTextIndex(name, caller=self._catalog,
                           index_factory=OkapiIndex, lexicon_id='lexicon')

    column_group = (
        ('col1', FieldIndex('col1')),
        ('col2', text_index('col2')),
        ('col3', KeywordIndex('col3')),
    )
    attribute_group = (
        ('att1', FieldIndex('att1')),
        ('att2', text_index('att2')),
        ('att3', KeywordIndex('att3')),
        ('num', FieldIndex('num')),
    )

    # Within each group all indexes are added before any column.
    for group in (column_group, attribute_group):
        for name, index in group:
            self._catalog.addIndex(name, index)
        for name, _index in group:
            self._catalog.addColumn(name)

    for position in range(self.upper):
        self._catalog.catalogObject(dummy(self.nums[position]),
                                    repr(position))

    self._catalog = self._catalog.__of__(dummy('foo'))
def testAddTextIndex(self):
    """Adding a ZCTextIndex must make it retrievable from ``indexes``."""
    self._catalog.lexicon = PLexicon('lexicon')
    idx = ZCTextIndex('id', caller=self._catalog,
                      index_factory=OkapiIndex, lexicon_id='lexicon')
    self._catalog.addIndex('id', idx)
    i = self._catalog.indexes['id']
    # assertIsInstance replaces the deprecated ``assert_`` alias (removed
    # from unittest in Python 3.12) and reports the offending object.
    self.assertIsInstance(i, ZCTextIndex, 'add text index failed')
def setUp(self):
    """Create a lexicon and a ZCTextIndex on the ``text`` attribute.

    The wrapper index is stored as ``self.zc_index`` and its underlying
    ``index`` attribute as ``self.index``.
    """
    pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
    self.lexicon = PLexicon('lexicon', '', *pipeline)
    holder = LexiconHolder(self.lexicon)
    self.zc_index = ZCTextIndex(
        'name',
        extra=None,
        caller=holder,
        index_factory=self.IndexFactory,
        field_name='text',
        lexicon_id='lexicon',
    )
    self.index = self.zc_index.index
def test_ZCTextIndex(self):
    """Assigning an XML node through the adapter must not clear the index.

    The index's ``clear`` is replaced with a function that raises
    ``AssertionError``; the test passes when setting ``adapted.node``
    completes without that function being called.
    """
    from xml.dom.minidom import parseString
    from Products.ZCTextIndex.ZCTextIndex import PLexicon, ZCTextIndex
    from Products.GenericSetup.testing import DummySetupEnviron
    from Products.GenericSetup.ZCTextIndex.exportimport import (
        ZCTextIndexNodeAdapter,
    )

    _XML = """\
<index name="foo_zctext" meta_type="ZCTextIndex">
 <indexed_attr value="bar"/>
 <extra name="index_type" value="Okapi BM25 Rank"/>
 <extra name="lexicon_id" value="foo_plexicon"/>
</index>
"""
    environ = DummySetupEnviron()

    def _no_clear(*a):
        raise AssertionError("Don't clear me!")

    catalog = DummyCatalog()
    catalog.foo_plexicon = PLexicon('foo_plexicon')

    settings = _extra()
    settings.lexicon_id = 'foo_plexicon'
    settings.index_type = 'Okapi BM25 Rank'

    unwrapped = ZCTextIndex('foo_field', extra=settings,
                            field_name='bar', caller=catalog)
    index = unwrapped.__of__(catalog)
    index.clear = _no_clear

    adapted = ZCTextIndexNodeAdapter(index, environ)
    root = parseString(_XML).documentElement
    adapted.node = root  # no raise
def _make_one(self, extra=None):
    """Return a populated, acquisition-wrapped ``Catalog``.

    The catalog gets the indexes ``att2``, ``att1``, ``num`` and ``foo``
    plus a ``num`` column.  When *extra* is given it is called with the
    catalog before ``self.upper`` dummy objects are cataloged.
    """
    from Products.ZCatalog.Catalog import Catalog

    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')

    field_att1 = FieldIndex('att1')
    text_att2 = ZCTextIndex('att2', caller=catalog,
                            index_factory=OkapiIndex, lexicon_id='lexicon')
    catalog.addIndex('att2', text_att2)
    field_num = FieldIndex('num')
    catalog.addIndex('att1', field_att1)
    catalog.addIndex('num', field_num)
    catalog.addColumn('num')
    catalog.addIndex('foo', MultiFieldIndex('foo'))

    if extra is not None:
        # Hook for callers that want to customise the catalog first.
        extra(catalog)

    for position in range(self.upper):
        catalog.catalogObject(Dummy(self.nums[position]), repr(position))

    return catalog.__of__(Dummy('foo'))
def testReindex(self):
    """Re-indexing a document id must replace its previously indexed text."""
    holder = LexiconHolder(self.lexicon)
    zc_index = ZCTextIndex(
        'name',
        extra=None,
        caller=holder,
        index_factory=self.IndexFactory,
        field_name='text',
        lexicon_id='lexicon',
    )

    def hit_count(term):
        # query() returns a (nbest, total) pair; only nbest is checked.
        nbest, total = zc_index.query(term)
        return len(nbest)

    doc = Indexable('Hello Tim')
    zc_index.index_object(1, doc)
    self.assertEqual(hit_count('glorious'), 0)
    self.assertEqual(hit_count('Tim'), 1)

    # reindex with another value
    doc.text = 'Goodbye George'
    zc_index.index_object(1, doc)
    self.assertEqual(hit_count('Tim'), 0)
    self.assertEqual(hit_count('Goodbye'), 1)

    # reindex with an empty value
    doc.text = ''
    zc_index.index_object(1, doc)
    self.assertEqual(hit_count('George'), 0)
def testLexiconIsNotFoundRaisesLookupError(self):
    """Building an index without a lexicon id must raise ``LookupError``."""
    holder = LexiconHolder(self.lexicon)
    self.assertRaises(
        LookupError,
        ZCTextIndex,
        'name',
        extra=None,
        caller=holder,
    )
def test_add_text_index(self):
    """A ZCTextIndex added under ``'id'`` is retrievable from ``indexes``."""
    catalog = self._make_one()
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex(
        'id',
        caller=catalog,
        index_factory=OkapiIndex,
        lexicon_id='lexicon',
    )
    catalog.addIndex('id', text_index)
    self.assertIsInstance(catalog.indexes['id'], ZCTextIndex)
def testDelTextIndex(self):
    """Deleting a previously added ZCTextIndex removes it from ``indexes``."""
    self._catalog.lexicon = PLexicon('lexicon')
    idx = ZCTextIndex('id', caller=self._catalog,
                      index_factory=OkapiIndex, lexicon_id='lexicon')
    self._catalog.addIndex('id', idx)
    self._catalog.delIndex('id')
    # assertNotIn replaces the deprecated ``assert_`` alias (removed from
    # unittest in Python 3.12) and shows the container on failure.
    self.assertNotIn('id', self._catalog.indexes, 'del index failed')
def test_del_text_index(self):
    """After ``delIndex('id')`` the name is gone from ``catalog.indexes``."""
    catalog = self._make_one()
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex(
        'id',
        caller=catalog,
        index_factory=OkapiIndex,
        lexicon_id='lexicon',
    )
    catalog.addIndex('id', text_index)
    catalog.delIndex('id')
    self.assertNotIn('id', catalog.indexes)
def index(rt, mboxfile, db, profiler): global NUM idx_time = 0 pack_time = 0 start_time = time.time() lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover()) extra = Extra() extra.lexicon_id = 'lexicon' extra.doc_attr = 'text' extra.index_type = 'Okapi BM25 Rank' caller = Extra() caller.lexicon = lexicon rt["index"] = idx = ZCTextIndex("index", extra, caller) if not EXCLUDE_TEXT: rt["documents"] = docs = IOBTree() else: docs = None transaction.commit() mbox = mailbox.UnixMailbox(open(mboxfile, 'rb')) if VERBOSE: print "opened", mboxfile if not NUM: NUM = sys.maxint if profiler: itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db) else: itime, ptime, i = indexmbox(mbox, idx, docs, db) idx_time += itime pack_time += ptime transaction.commit() if PACK_INTERVAL and i % PACK_INTERVAL != 0: if VERBOSE >= 2: print "packing one last time..." p0 = time.clock() db.pack(time.time()) p1 = time.clock() if VERBOSE: print "pack took %s sec" % (p1 - p0) pack_time += p1 - p0 if VERBOSE: finish_time = time.time() print print "Index time", round(idx_time / 60, 3), "minutes" print "Pack time", round(pack_time / 60, 3), "minutes" print "Index bytes", Message.total_bytes rate = (Message.total_bytes / idx_time) / 1024 print "Index rate %.2f KB/sec" % rate print "Indexing began", time.ctime(start_time) print "Indexing ended", time.ctime(finish_time) print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
def make_zc_index():
    """Return a ZCTextIndex named ``"read"`` with its own lexicon."""
    # The index takes its settings from attribute bags rather than plain
    # arguments, so two throwaway namespace objects are needed.
    class Struct:
        pass

    settings = Struct()
    settings.doc_attr = "read"
    settings.lexicon_id = "lexicon"

    holder = Struct()
    holder.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    return ZCTextIndex("read", extra=settings, caller=holder)
def testMultipleAttributes(self):
    """An index over ``text1,text2`` matches words from both attributes."""
    holder = LexiconHolder(self.lexicon)
    zc_index = ZCTextIndex(
        'name',
        extra=None,
        caller=holder,
        index_factory=self.IndexFactory,
        field_name='text1,text2',
        lexicon_id='lexicon',
    )
    zc_index.index_object(1, Indexable2('foo bar', 'alpha omega'))

    expectations = (
        ('foo', 1),
        ('foo alpha', 1),
        ('foo alpha gamma', 0),
    )
    for query, expected_hits in expectations:
        nbest, total = zc_index.query(query)
        self.assertEqual(len(nbest), expected_hits)
def setUp(self):
    """Store a wrapped ``foo_zctext`` index and the expected XML on self."""
    from Products.ZCTextIndex.ZCTextIndex import PLexicon, ZCTextIndex

    catalog = DummyCatalog()
    catalog.foo_plexicon = PLexicon('foo_plexicon')

    settings = _extra()
    settings.lexicon_id = 'foo_plexicon'
    settings.index_type = 'Okapi BM25 Rank'

    unwrapped = ZCTextIndex('foo_zctext', extra=settings, caller=catalog)
    self._obj = unwrapped.__of__(catalog)
    self._XML = _ZCTEXT_XML
def testInvalidIndexTypeRaisesValueError(self):
    """An unknown ``index_type`` on *extra* must raise ``ValueError``."""
    holder = LexiconHolder(self.lexicon)

    class Extra(object):
        index_type = 'Some invalid index type'

    self.assertRaises(
        ValueError,
        ZCTextIndex,
        'name',
        extra=Extra,
        caller=holder,
        index_factory=None,
        lexicon_id='lexicon',
    )
def test_fixOkapiIndexes(self):
    """``fixOkapiIndexes`` must reset a corrupted ``_totaldoclen`` to 0."""
    catalog = ZCatalog('catalog')
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex(
        'test',
        index_factory=OkapiIndex,
        caller=catalog,
        lexicon_id='lexicon',
    )
    catalog.addIndex('test', text_index)

    # Plant a bogus plain-integer length for the upgrade step to repair.
    catalog.Indexes['test'].index._totaldoclen = -1000

    from plone.app.upgrade.v41.final import fixOkapiIndexes
    fixOkapiIndexes(catalog)

    # After the fix the attribute is called to obtain its value.
    repaired = catalog.Indexes['test'].index._totaldoclen()
    self.assertEqual(0, repaired)
def setUp(self):
    """Create a catalog with ``title``/``true`` indexes and seven objects.

    Every cataloged object has ``true`` set to ``True`` except the one
    built from 110, which gets ``False``.
    """
    self._catalog = self._makeOne()
    self._catalog.lexicon = PLexicon('lexicon')

    title_index = ZCTextIndex(
        'title',
        caller=self._catalog,
        index_factory=OkapiIndex,
        lexicon_id='lexicon',
    )
    self._catalog.addIndex('title', title_index)
    self._catalog.addIndex('true', FieldIndex('true'))
    self._catalog.addColumn('title')

    target = self._get_catalog()
    for number in (1, 2, 3, 10, 11, 110, 111):
        obj = zdummy(number)
        obj.true = number != 110
        target.catalogObject(obj, str(number))
def testListAttributes(self):
    """An attribute holding a list of strings is indexed like plain text."""
    holder = LexiconHolder(self.lexicon)
    zc_index = ZCTextIndex(
        'name',
        extra=None,
        caller=holder,
        index_factory=self.IndexFactory,
        field_name='text1,text2',
        lexicon_id='lexicon',
    )

    verses = [
        'Now is the winter of our discontent',
        'Made glorious summer by this sun of York',
    ]
    zc_index.index_object(1, Indexable2('Hello Tim', verses))

    expectations = (
        ('glorious', 1),
        ('York Tim', 1),
        ('Tuesday Tim York', 0),
    )
    for query, expected_hits in expectations:
        nbest, total = zc_index.query(query)
        self.assertEqual(len(nbest), expected_hits)
def _make_one(self):
    """Return a wrapped ``Catalog`` with ``att1``-``att3`` indexes.

    ``self.upper`` dummy objects are cataloged under ``repr`` of their
    position before the catalog is wrapped with ``__of__``.
    """
    from Products.ZCatalog.Catalog import Catalog

    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')

    indexes = (
        ('att1', FieldIndex('att1')),
        ('att2', ZCTextIndex('att2', caller=catalog,
                             index_factory=OkapiIndex,
                             lexicon_id='lexicon')),
        ('att3', KeywordIndex('att3')),
    )
    for name, index in indexes:
        catalog.addIndex(name, index)

    for position in range(self.upper):
        catalog.catalogObject(Dummy(position), repr(position))

    return catalog.__of__(Dummy('foo'))
def _make_one(self):
    """Return a wrapped ``Catalog`` holding seven ``ZDummy`` objects.

    The catalog has a ``title`` text index, a ``true`` field index and a
    ``title`` column.  Only the object built from 110 has ``true`` set to
    ``False``.
    """
    from Products.ZCatalog.Catalog import Catalog

    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')

    title_index = ZCTextIndex(
        'title',
        caller=catalog,
        index_factory=OkapiIndex,
        lexicon_id='lexicon',
    )
    catalog.addIndex('title', title_index)
    catalog.addIndex('true', FieldIndex('true'))
    catalog.addColumn('title')

    for number in (1, 2, 3, 10, 11, 110, 111):
        obj = ZDummy(number)
        obj.true = number != 110
        catalog.catalogObject(obj, str(number))

    return catalog.__of__(ZDummy(1))
def setUp(self):
    """Fill ``self.catalogs`` with three identically populated catalogs.

    Each catalog gets ``num``, ``big``, ``number`` and ``title`` indexes
    and ten ``zdummy`` objects; ``big`` is true for values above 5.
    """
    self.catalogs = []
    for _round in range(3):
        cat = self._makeOne()
        cat.lexicon = PLexicon('lexicon')
        for field_name in ('num', 'big', 'number'):
            cat.addIndex(field_name, FieldIndex(field_name))
        title_index = ZCTextIndex(
            'title',
            caller=cat,
            index_factory=OkapiIndex,
            lexicon_id='lexicon',
        )
        cat.addIndex('title', title_index)

        # Objects are cataloged through the acquisition-wrapped catalog.
        cat = cat.__of__(zdummy(16336))
        for number in range(10):
            obj = zdummy(number)
            obj.big = number > 5
            obj.number = True
            cat.catalogObject(obj, str(number))
        self.catalogs.append(cat)
def _make_many(self):
    """Return ``(catalogs, mergeResults)`` with three populated catalogs.

    Each catalog gets ``num``, ``big``, ``number`` and ``title`` indexes
    and ten ``ZDummy`` objects; ``big`` is true for values above 5.
    """
    from Products.ZCatalog.Catalog import mergeResults

    catalogs = []
    for _round in range(3):
        cat = self._make_one()
        cat.lexicon = PLexicon('lexicon')
        for field_name in ('num', 'big', 'number'):
            cat.addIndex(field_name, FieldIndex(field_name))
        title_index = ZCTextIndex(
            'title',
            caller=cat,
            index_factory=OkapiIndex,
            lexicon_id='lexicon',
        )
        cat.addIndex('title', title_index)

        # Objects are cataloged through the acquisition-wrapped catalog.
        cat = cat.__of__(ZDummy(16336))
        for number in range(10):
            obj = ZDummy(number)
            obj.big = number > 5
            obj.number = True
            cat.catalogObject(obj, str(number))
        catalogs.append(cat)

    return catalogs, mergeResults
def __init__(self, id='Help', title=''):
    """Set id/title and build the embedded search catalog.

    The catalog stored on ``self.catalog`` gets a ``lexicon`` sub-object,
    a ``SearchableText`` text index, ``categories`` and ``permissions``
    keyword indexes, and five metadata columns.
    """
    self.id = id
    self.title = title

    catalog = ZCatalog('catalog')
    self.catalog = catalog

    lexicon = PLexicon('lexicon', '', HTMLWordSplitter(),
                       CaseNormalizer(), StopWordRemover())
    catalog._setObject('lexicon', lexicon)

    text_index = ZCTextIndex(
        'SearchableText',
        caller=catalog,
        index_factory=OkapiIndex,
        lexicon_id=lexicon.id,
    )
    # not using catalog.addIndex because it depends on Product
    # initialization; indexes go straight onto the inner catalog instead
    inner = catalog._catalog
    inner.addIndex('SearchableText', text_index)
    for keyword_name in ('categories', 'permissions'):
        inner.addIndex(keyword_name, KeywordIndex(keyword_name))

    for column in ('categories', 'permissions', 'title_or_id', 'url', 'id'):
        catalog.addColumn(column)