def setUp(self):
    """Create the shared lexicon, the ZCTextIndex wrapper and the raw index."""
    self.lexicon = PLexicon(
        'lexicon', '', Splitter(), CaseNormalizer(), StopWordRemover())
    holder = LexiconHolder(self.lexicon)
    self.zc_index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text', 'lexicon')
    # Keep a direct handle on the underlying index for low-level checks.
    self.index = self.zc_index.index
def setup_lexicons(self):
    """Register the lexicons some Plone-Catalog internal code depends on.

    We wouldn't really have to be so thorough, as we don't actually want
    to support full text searches on the people catalog, but setting it
    up as close as possible to the portal catalog ensures that there are
    no surprises.
    """
    lexicons = {
        'plone_lexicon': [
            ('Unicode Whitespace splitter', 'Word Splitter'),
            ('Unicode Ignoring Accents Case Normalizer', 'Case Normalizer'),
        ],
        'plaintext_lexicon': [
            ('HTML aware splitter', 'Word Splitter'),
            ('Case Normalizer', 'Case Normalizer'),
            ('Remove listed stop words only', 'Stop Words'),
        ],
        'htmltext_lexicon': [
            ('HTML aware splitter', 'Word Splitter'),
            ('Case Normalizer', 'Case Normalizer'),
            ('Remove listed stop words only', 'Stop Words'),
        ],
    }
    for name, elements in lexicons.items():
        # Each element is (factory title, factory group).
        pipeline = [element_factory.instantiate(group, title)
                    for title, group in elements]
        plexicon = PLexicon(name)
        plexicon._pipeline = pipeline
        self._setObject(name, plexicon)
def _make_one(self, extra=None):
    """Build a catalog with field, text and multi-field indexes.

    *extra* is an optional callable receiving the catalog for extra
    per-test configuration before any objects are indexed.
    """
    from Products.ZCatalog.Catalog import Catalog
    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')
    catalog.addIndex('att2', ZCTextIndex('att2', caller=catalog,
                                         index_factory=OkapiIndex,
                                         lexicon_id='lexicon'))
    catalog.addIndex('att1', FieldIndex('att1'))
    catalog.addIndex('num', FieldIndex('num'))
    catalog.addColumn('num')
    catalog.addIndex('foo', MultiFieldIndex('foo'))
    if extra is not None:
        extra(catalog)
    for i in range(self.upper):
        catalog.catalogObject(Dummy(self.nums[i]), repr(i))
    return catalog.__of__(Dummy('foo'))
def setup_catalog(context):
    """Create and configure the marginalia catalog on the portal.

    Registers the catalog if it does not exist yet, installs the
    plaintext lexicon and adds the indexes marginalia needs.
    """
    portal = context.getSite()
    catalog_name = 'marginalia_catalog'
    try:
        catalog = cmfutils.getToolByName(portal, catalog_name)
    except AttributeError:
        # Not installed yet: register a fresh catalog on the portal.
        catalog = ZCatalog(catalog_name, u'Marginalia catalog', None, portal)
        portal._setObject(catalog_name, catalog)
    # Settings for the full-text index below.
    plaintext_extra = SimpleRecord(lexicon_id='plaintext_lexicon',
                                   index_type='Okapi BM25 Rank')
    indexes = catalog.indexes()
    # NOTE: an unused ``columns = catalog.schema()`` call was removed here.
    # Install the lexicon the ZCTextIndex relies on.
    _id = 'plaintext_lexicon'
    if not hasattr(catalog, _id):
        lexicon = PLexicon(_id, '', Splitter(), CaseNormalizer(),
                           StopWordRemover())
        catalog._setObject(_id, lexicon)
    for index_name, index_type, extra in (
            ('edit_type', 'FieldIndex', None),
            ('note', 'ZCTextIndex', plaintext_extra),
            ('link_title', 'FieldIndex', None)):
        if index_name not in indexes:
            catalog.addIndex(index_name, index_type, extra=extra)
def test_ZCTextIndex(self):
    """Importing settings equal to the current state must not clear the index."""
    from xml.dom.minidom import parseString
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
    from Products.GenericSetup.testing import DummySetupEnviron
    from Products.GenericSetup.ZCTextIndex.exportimport \
        import ZCTextIndexNodeAdapter
    _XML = """\
<index name="foo_zctext" meta_type="ZCTextIndex">
 <indexed_attr value="bar"/>
 <extra name="index_type" value="Okapi BM25 Rank"/>
 <extra name="lexicon_id" value="foo_plexicon"/>
</index>
"""
    environ = DummySetupEnviron()

    def _no_clear(*args):
        raise AssertionError("Don't clear me!")

    catalog = DummyCatalog()
    catalog.foo_plexicon = PLexicon('foo_plexicon')
    extra = _extra()
    extra.lexicon_id = 'foo_plexicon'
    extra.index_type = 'Okapi BM25 Rank'
    index = ZCTextIndex('foo_field', extra=extra, field_name='bar',
                        caller=catalog).__of__(catalog)
    # Any attempt to clear the index fails the test loudly.
    index.clear = _no_clear
    adapted = ZCTextIndexNodeAdapter(index, environ)
    adapted.node = parseString(_XML).documentElement  # must not raise
def _initSite(self, foo=2):
    """Create a site with a clean CatalogTool; *foo* > 0 pre-installs fixtures."""
    site = Folder(id='site').__of__(self.app)
    ctool = CatalogTool()
    getSiteManager().registerUtility(ctool, ICatalogTool)
    # Strip the tool of any default objects, indexes and columns.
    for obj_id in ctool.objectIds():
        ctool._delObject(obj_id)
    for idx_id in ctool.indexes():
        ctool.delIndex(idx_id)
    for col in list(ctool.schema()):
        ctool.delColumn(col)
    if foo > 0:
        # Install a lexicon plus a text index and column backed by it.
        ctool._setObject('foo_plexicon', PLexicon('foo_plexicon'))
        ctool.foo_plexicon._pipeline = (
            Splitter(), CaseNormalizer(), StopWordRemover())
        extra = _extra()
        extra.lexicon_id = 'foo_plexicon'
        extra.index_type = 'Okapi BM25 Rank'
        ctool.addIndex('foo_zctext', 'ZCTextIndex', extra)
        ctool.addColumn('foo_zctext')
    return site, ctool
def testReindex(self):
    """Reindexing a document replaces its previously indexed words."""
    # NOTE: a stray local ``PLexicon`` used to be created here but was
    # never used -- the index is backed by ``self.lexicon``.
    caller = LexiconHolder(self.lexicon)
    zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                           'text', 'lexicon')
    doc = Indexable('Hello Tim')
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('glorious')
    self.assertEqual(len(nbest), 0)
    nbest, total = zc_index.query('Tim')
    self.assertEqual(len(nbest), 1)
    # reindex with another value
    doc.text = 'Goodbye George'
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('Tim')
    self.assertEqual(len(nbest), 0)
    nbest, total = zc_index.query('Goodbye')
    self.assertEqual(len(nbest), 1)
    # reindex with an empty value
    doc.text = ''
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('George')
    self.assertEqual(len(nbest), 0)
def index():
    """Build a ZCatalog with a ZCTextIndex and run the indexing benchmark.

    The catalog threshold comes from ``sys.argv[2]`` and falls back to
    1000 when the argument is missing or not an integer.
    """
    os.environ['STUPID_LOG_FILE'] = ''
    os.environ['STUPID_LOG_SEVERITY'] = '-111'
    import Zope2
    import Products.ZCatalog.ZCatalog
    import AccessControl.SecurityManagement
    import AccessControl.SpecialUsers
    app = Zope2.app()
    Products.ZCatalog.ZCatalog.manage_addZCatalog(app, 'cat', '')
    try:
        # ``int`` replaces the py2-only ``string.atoi``.
        app.cat.threshold = int(sys.argv[2])
    except (IndexError, ValueError):
        # BUG FIX: this used to assign a misspelled ``threashold``
        # attribute, leaving the real threshold at its default.
        app.cat.threshold = 1000
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
    app.cat._setObject('lex',
                       PLexicon('lex', '', Splitter(), CaseNormalizer()))

    class extra:
        doc_attr = 'PrincipiaSearchSource'
        lexicon_id = 'lex'
        index_type = 'Okapi BM25 Rank'

    app.cat.addIndex('PrincipiaSearchSource', 'ZCTextIndex', extra)
    transaction.commit()
    system = AccessControl.SpecialUsers.system
    AccessControl.SecurityManagement.newSecurityManager(None, system)
    r = RE()
    r.PARENTS = [app.cat, app]
    print(do(Zope2.DB, indexf, (app, )))
    Zope2.DB.close()
def updateIndexes(self):
    """Install the audit lexicon/indexes and mirror portal_catalog indexes."""
    if not getattr(self, 'audit_lexicon', None):
        # First install: add our own indexes, metadata columns and lexicon.
        self.addIndex('last_audited_date', 'DateIndex')
        self.addIndex('audited_action', 'KeywordIndex')
        self.addColumn('Title')
        self.addColumn('id')
        self.addColumn('UID')
        self.addColumn('last_audited_date')
        self.addColumn('audited_action')
        # Renamed from the ambiguous single-letter ``l``.
        lexicon = PLexicon('audit_lexicon', '', HTMLWordSplitter(),
                           CaseNormalizer(), StopWordRemover())
        self._setObject('audit_lexicon', lexicon)
    catalog = portal_api.get_tool('portal_catalog')
    for name, index in catalog._catalog.indexes.items():
        # Membership test directly on the mapping instead of ``.keys()``.
        if name in self._catalog.indexes:
            continue
        if index.meta_type == 'DateRecurringIndex':
            # Not supported by this catalog.
            continue
        elif index.meta_type == 'ZCTextIndex':
            extras = Empty()
            extras.doc_attr = name
            extras.index_type = 'Okapi BM25 Rank'
            extras.lexicon_id = 'audit_lexicon'
            self.addIndex(name, index.meta_type, extras)
        else:
            self.addIndex(name, index.meta_type)
def setUp(self):
    """Populate a catalog with three ``col*`` and four ``att*`` indexes/columns."""
    self._catalog = self._makeOne()
    self._catalog.lexicon = PLexicon('lexicon')

    def text_index(name):
        # All text indexes share the catalog lexicon.
        return ZCTextIndex(name, caller=self._catalog,
                           index_factory=OkapiIndex, lexicon_id='lexicon')

    for name, index in (('col1', FieldIndex('col1')),
                        ('col2', text_index('col2')),
                        ('col3', KeywordIndex('col3'))):
        self._catalog.addIndex(name, index)
    for name in ('col1', 'col2', 'col3'):
        self._catalog.addColumn(name)
    for name, index in (('att1', FieldIndex('att1')),
                        ('att2', text_index('att2')),
                        ('att3', KeywordIndex('att3')),
                        ('num', FieldIndex('num'))):
        self._catalog.addIndex(name, index)
    for name in ('att1', 'att2', 'att3', 'num'):
        self._catalog.addColumn(name)
    for i in range(self.upper):
        self._catalog.catalogObject(dummy(self.nums[i]), repr(i))
    self._catalog = self._catalog.__of__(dummy('foo'))
def setup(lib_python):
    """Create a fresh Data.fs containing a ZCatalog with a ZCTextIndex."""
    try:
        os.remove(os.path.join(lib_python, '..', '..', 'var', 'Data.fs'))
    except OSError:
        # No previous database -- nothing to remove.  (This used to be a
        # bare ``except`` that silently swallowed every error.)
        pass
    import Zope2
    # Explicit submodule imports; ``import Products`` alone does not
    # guarantee ``Products.ZCatalog.ZCatalog`` is importable.
    import Products.ZCatalog.ZCatalog
    import AccessControl.SecurityManagement
    import AccessControl.SpecialUsers
    app = Zope2.app()
    Products.ZCatalog.ZCatalog.manage_addZCatalog(app, 'cat', '')
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
    app.cat._setObject('lex',
                       PLexicon('lex', '', Splitter(), CaseNormalizer()))

    class extra:
        doc_attr = 'PrincipiaSearchSource'
        lexicon_id = 'lex'
        index_type = 'Okapi BM25 Rank'

    app.cat.addIndex('PrincipiaSearchSource', 'ZCTextIndex', extra)
    transaction.commit()
    system = AccessControl.SpecialUsers.system
    AccessControl.SecurityManagement.newSecurityManager(None, system)
    app._p_jar.close()
def testAddTextIndex(self):
    """A ZCTextIndex can be added to the catalog."""
    self._catalog.lexicon = PLexicon('lexicon')
    idx = ZCTextIndex('id', caller=self._catalog,
                      index_factory=OkapiIndex, lexicon_id='lexicon')
    self._catalog.addIndex('id', idx)
    # ``assert_`` is a deprecated alias; use the specific assertion.
    self.assertIsInstance(self._catalog.indexes['id'], ZCTextIndex)
def test_add_text_index(self):
    """Adding a ZCTextIndex stores a ZCTextIndex instance in the catalog."""
    catalog = self._make_one()
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex('id', caller=catalog,
                             index_factory=OkapiIndex,
                             lexicon_id='lexicon')
    catalog.addIndex('id', text_index)
    self.assertIsInstance(catalog.indexes['id'], ZCTextIndex)
def testDelTextIndex(self):
    """A ZCTextIndex can be removed from the catalog again."""
    self._catalog.lexicon = PLexicon('lexicon')
    idx = ZCTextIndex('id', caller=self._catalog,
                      index_factory=OkapiIndex, lexicon_id='lexicon')
    self._catalog.addIndex('id', idx)
    self._catalog.delIndex('id')
    # ``assert_`` is deprecated; assertNotIn reports failures more clearly.
    self.assertNotIn('id', self._catalog.indexes)
def test_del_text_index(self):
    """Deleting a previously added ZCTextIndex removes it from the catalog."""
    catalog = self._make_one()
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex('id', caller=catalog,
                             index_factory=OkapiIndex,
                             lexicon_id='lexicon')
    catalog.addIndex('id', text_index)
    catalog.delIndex('id')
    self.assertNotIn('id', catalog.indexes)
def setUp(self):
    """Load the required ZCML and prepare the lexicon fixture."""
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    PlacelessSetup.setUp(self)
    # meta.zcml must be processed before the package configuration.
    zcml.load_config('meta.zcml', Products.Five)
    zcml.load_config('configure.zcml', Products.GenericSetup.ZCTextIndex)
    self._XML = _PLEXICON_XML
    self._obj = PLexicon('foo_plexicon')
def setUp(self):
    """Load the adapter's ZCML and prepare the lexicon fixture."""
    import Products.GenericSetup.ZCTextIndex
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    NodeAdapterTestCase.setUp(self)
    zcml.load_config('configure.zcml', Products.GenericSetup.ZCTextIndex)
    self._XML = _PLEXICON_XML
    self._obj = PLexicon('foo_plexicon')
def setUp(self):
    """Create a ZCTextIndex wrapped by a dummy catalog for the tests."""
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
    catalog = DummyCatalog()
    catalog.foo_plexicon = PLexicon('foo_plexicon')
    settings = _extra()
    settings.lexicon_id = 'foo_plexicon'
    settings.index_type = 'Okapi BM25 Rank'
    index = ZCTextIndex('foo_zctext', extra=settings, caller=catalog)
    self._obj = index.__of__(catalog)
    self._XML = _ZCTEXT_XML
def _populate_special(self, obj):
    """Populate *obj* and add a legacy lexicon, text index and extra column."""
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    self._populate(self._obj)
    obj._setObject('old_plexicon', PLexicon('old_plexicon'))
    settings = _extra()
    settings.lexicon_id = 'old_plexicon'
    settings.index_type = 'Cosine Measure'
    obj.addIndex('foo_text', 'ZCTextIndex', settings)
    obj.addColumn('bacon')
def testMultipleAttributes(self):
    """Words from every listed attribute are searchable together."""
    # NOTE: an unused local ``PLexicon`` used to be built here; the
    # index actually uses ``self.lexicon``.
    caller = LexiconHolder(self.lexicon)
    zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                           'text1,text2', 'lexicon')
    doc = Indexable2('foo bar', 'alpha omega')
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('foo')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('foo alpha')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('foo alpha gamma')
    self.assertEqual(len(nbest), 0)
def test_fixOkapiIndexes(self):
    """fixOkapiIndexes resets a corrupted (negative) total document length."""
    from plone.app.upgrade.v41.final import fixOkapiIndexes
    catalog = ZCatalog('catalog')
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex('test', index_factory=OkapiIndex,
                             caller=catalog, lexicon_id='lexicon')
    catalog.addIndex('test', text_index)
    # Simulate the corruption the upgrade step is meant to repair.
    catalog.Indexes['test'].index._totaldoclen = -1000
    fixOkapiIndexes(catalog)
    self.assertEqual(0, catalog.Indexes['test'].index._totaldoclen())
def _make_one(self):
    """Return a catalog with att1-att3 indexes and ``self.upper`` dummies."""
    from Products.ZCatalog.Catalog import Catalog
    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')
    indexes = (
        ('att1', FieldIndex('att1')),
        ('att2', ZCTextIndex('att2', caller=catalog,
                             index_factory=OkapiIndex,
                             lexicon_id='lexicon')),
        ('att3', KeywordIndex('att3')),
    )
    for name, index in indexes:
        catalog.addIndex(name, index)
    for i in range(self.upper):
        catalog.catalogObject(Dummy(i), repr(i))
    return catalog.__of__(Dummy('foo'))
def setUp(self):
    """Index a handful of objects; only #110 gets a false ``true`` field."""
    self._catalog = self._makeOne()
    self._catalog.lexicon = PLexicon('lexicon')
    title_index = ZCTextIndex('title', caller=self._catalog,
                              index_factory=OkapiIndex,
                              lexicon_id='lexicon')
    self._catalog.addIndex('title', title_index)
    self._catalog.addIndex('true', FieldIndex('true'))
    self._catalog.addColumn('title')
    cat = self._get_catalog()
    for i in (1, 2, 3, 10, 11, 110, 111):
        obj = zdummy(i)
        obj.true = (i != 110)
        cat.catalogObject(obj, str(i))
def testListAttributes(self):
    """Attribute values that are lists are indexed element by element."""
    # NOTE: an unused local ``PLexicon`` used to be built here; the
    # index actually uses ``self.lexicon``.
    caller = LexiconHolder(self.lexicon)
    zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                           'text1,text2', 'lexicon')
    doc = Indexable2('Hello Tim',
                     ['Now is the winter of our discontent',
                      'Made glorious summer by this sun of York',
                      ])
    zc_index.index_object(1, doc)
    nbest, total = zc_index.query('glorious')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('York Tim')
    self.assertEqual(len(nbest), 1)
    nbest, total = zc_index.query('Tuesday Tim York')
    self.assertEqual(len(nbest), 0)
def setUp(self):
    """Load ZCML and build a ZCTextIndex wrapped by a dummy catalog."""
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
    PlacelessSetup.setUp(self)
    # meta.zcml must be processed before the package configuration.
    zcml.load_config('meta.zcml', Products.Five)
    zcml.load_config('configure.zcml', Products.GenericSetup.ZCTextIndex)
    catalog = DummyCatalog()
    catalog.foo_plexicon = PLexicon('foo_plexicon')
    settings = _extra()
    settings.lexicon_id = 'foo_plexicon'
    settings.index_type = 'Okapi BM25 Rank'
    index = ZCTextIndex('foo_zctext', extra=settings, caller=catalog)
    self._obj = index.__of__(catalog)
    self._XML = _ZCTEXT_XML
def _initIndexes(self):
    """(Re)build the lexicons, content indexes and metadata columns."""
    # ZCTextIndex lexicons.
    for lexicon_id, splitter, normalizer, sw_remover in \
            self.enumerateLexicons():
        # Loop variable renamed from ``id`` to stop shadowing the builtin.
        lexicon = PLexicon(lexicon_id, '', splitter, normalizer, sw_remover)
        self._setObject(lexicon_id, lexicon)
    # Content indexes.
    self._catalog.indexes.clear()
    for index_name, index_type, extra in self.enumerateIndexes():
        self.addIndex(index_name, index_type, extra=extra)
    # Cached metadata.
    self._catalog.names = ()
    self._catalog.schema.clear()
    for column_name in self.enumerateColumns():
        self.addColumn(column_name)
def _make_one(self):
    """Catalog with title/true indexes; only object 110 has ``true`` False."""
    from Products.ZCatalog.Catalog import Catalog
    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')
    title_index = ZCTextIndex('title', caller=catalog,
                              index_factory=OkapiIndex,
                              lexicon_id='lexicon')
    catalog.addIndex('title', title_index)
    catalog.addIndex('true', FieldIndex('true'))
    catalog.addColumn('title')
    for i in (1, 2, 3, 10, 11, 110, 111):
        obj = ZDummy(i)
        obj.true = (i != 110)
        catalog.catalogObject(obj, str(i))
    return catalog.__of__(ZDummy(1))
def __init__(self, FULLTEXT=False):
    """Set up the catalog; pass FULLTEXT=True to enable full-text search."""
    self.no_refresh = True
    CatalogTool.__init__(self)
    self._catalog = PlominoCatalog()
    plaintext = PLexicon('plaintext_lexicon', '', Splitter(),
                         CaseNormalizer())
    self._setObject('plaintext_lexicon', plaintext)
    self.addIndex('Form', "FieldIndex")
    self.addIndex('id', "FieldIndex")
    self.addColumn('id')
    self.addIndex('getPlominoReaders', "KeywordIndex")
    self.addIndex('path', "ExtendedPathIndex")
    if FULLTEXT:
        self.createFieldIndex('SearchableText', 'RICHTEXT')
    self.no_refresh = False
def setUp(self):
    """Build three identical catalogs, each holding ten dummy objects."""
    self.catalogs = []
    for _ in range(3):
        cat = self._makeOne()
        cat.lexicon = PLexicon('lexicon')
        for field in ('num', 'big', 'number'):
            cat.addIndex(field, FieldIndex(field))
        cat.addIndex('title',
                     ZCTextIndex('title', caller=cat,
                                 index_factory=OkapiIndex,
                                 lexicon_id='lexicon'))
        cat = cat.__of__(zdummy(16336))
        for num in range(10):
            obj = zdummy(num)
            obj.big = num > 5
            obj.number = True
            cat.catalogObject(obj, str(num))
        self.catalogs.append(cat)
def _make_many(self):
    """Return three populated catalogs plus the ``mergeResults`` helper."""
    from Products.ZCatalog.Catalog import mergeResults
    catalogs = []
    for _ in range(3):
        cat = self._make_one()
        cat.lexicon = PLexicon('lexicon')
        for field in ('num', 'big', 'number'):
            cat.addIndex(field, FieldIndex(field))
        cat.addIndex('title',
                     ZCTextIndex('title', caller=cat,
                                 index_factory=OkapiIndex,
                                 lexicon_id='lexicon'))
        cat = cat.__of__(ZDummy(16336))
        for num in range(10):
            obj = ZDummy(num)
            obj.big = num > 5
            obj.number = True
            cat.catalogObject(obj, str(num))
        catalogs.append(cat)
    return catalogs, mergeResults
def __init__(self, FULLTEXT=False):
    """Set up the catalog; pass FULLTEXT=True to enable full-text search."""
    self.no_refresh = True
    ZCatalog.__init__(self, self.getId())
    self._catalog = PlominoCatalog()
    # TODO: use TextindexNG3 (a StopWordRemover was once considered for
    # the lexicon pipeline as well).
    plaintext = PLexicon('plaintext_lexicon', '', Splitter(),
                         CaseNormalizer())
    self._setObject('plaintext_lexicon', plaintext)
    self.addIndex('Form', "FieldIndex")
    self.addIndex('id', "FieldIndex")
    self.addColumn('id')
    self.addIndex('getPlominoReaders', "KeywordIndex")
    if FULLTEXT:
        self.createFieldIndex('SearchableText', 'RICHTEXT')
    self.no_refresh = False
class ZCIndexTestsBase:
    """Shared ZCTextIndex tests; subclasses provide ``IndexFactory``.

    Modernized: ``dict.has_key`` (removed in Python 3) replaced by ``in``,
    deprecated ``assert_`` replaced by ``assertTrue``, regex made raw, and
    unused local lexicons removed.
    """

    def setUp(self):
        # Build the shared lexicon, the index wrapper and the raw index.
        self.lexicon = PLexicon('lexicon', '', Splitter(),
                                CaseNormalizer(), StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', None, caller,
                                    self.IndexFactory, 'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        # The query must be rejected by the parser.
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        # The query must parse and return exactly *n* hits.
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        # The index is backed by ``self.lexicon`` (a stray unused local
        # lexicon used to be created here).
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York',
                          ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)
        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)
        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()
        d = {}  # word -> list of version numbers containing that word
        for version, i in zip(text, range(N)):
            # use a simple splitter rather than an official one
            words = [w for w in re.split(r"\W+", version.lower())
                     if len(w) > 1 and w not in stop]
            word_seen = {}
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1
        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)
        for version, i in zip(text, range(N)):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0,
                                     "did not expect to find %s" % w)
class ZCIndexTestsBase(object):
    """Shared ZCTextIndex tests; subclasses provide ``IndexFactory``."""

    def setUp(self):
        # Build the shared lexicon, the index wrapper and the raw index.
        self.lexicon = PLexicon('lexicon', '', Splitter(),
                                CaseNormalizer(), StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', None, caller,
                                    self.IndexFactory, 'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        # Helper: the query must be rejected by the parser.
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        # Helper: the query must parse and return exactly *n* hits.
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        # Words from every listed attribute are searchable together.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller,
                               self.IndexFactory, 'text1,text2', 'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        # Attribute values that are lists are indexed element by element.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller,
                               self.IndexFactory, 'text1,text2', 'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York',
                          ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testReindex(self):
        # Reindexing a document replaces its previously indexed words.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller,
                               self.IndexFactory, 'text', 'lexicon')
        doc = Indexable('Hello Tim')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 0)
        nbest, total = zc_index.query('Tim')
        self.assertEqual(len(nbest), 1)
        # reindex with another value
        doc.text = 'Goodbye George'
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('Tim')
        self.assertEqual(len(nbest), 0)
        nbest, total = zc_index.query('Goodbye')
        self.assertEqual(len(nbest), 1)
        # reindex with an empty value
        doc.text = ''
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('George')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ('to be or not to be '
                'that is the question')
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != 'question':
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)
        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)
        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()
        d = {}  # word -> list of version numbers containing that word
        for version, i in zip(text, range(N)):
            # use a simple splitter rather than an official one
            words = [w for w in re.split(r'\W+', version.lower())
                     if len(w) > 1 and w not in stop]
            word_seen = {}
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1
        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assertGreater(len(common), 0)
        self.assertGreater(len(unique), 0)
        # Index each version in turn: the common words must always be
        # found, and words unique to *other* versions must not be.
        for version, i in zip(text, range(N)):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, 'did not find {0}'.format(w))
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(
                        total, 0,
                        'did not expect to find {0}'.format(w)
                    )

    def testLexiconIsNotFoundRaisesLookupError(self):
        # Constructing an index without a lexicon id must fail.
        caller = LexiconHolder(self.lexicon)
        with self.assertRaises(LookupError):
            ZCTextIndex(
                'name',
                extra=None,
                caller=caller,
            )

    def testInvalidIndexTypeRaisesValueError(self):
        # Constructing an index with an unknown index type must fail.
        caller = LexiconHolder(self.lexicon)

        class Extra(object):
            index_type = 'Some invalid index type'

        with self.assertRaises(ValueError):
            ZCTextIndex(
                'name',
                extra=Extra,
                caller=caller,
                index_factory=None,
                lexicon_id='lexicon'
            )