def test_ZCTextIndex(self):
    """Check that assigning a node to the adapter does not clear the index.

    The index's ``clear`` attribute is replaced by a function that raises
    AssertionError, so the test fails if setting ``node`` calls it.
    """
    from xml.dom.minidom import parseString
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
    from Products.GenericSetup.testing import DummySetupEnviron
    from Products.GenericSetup.ZCTextIndex.exportimport import (
        ZCTextIndexNodeAdapter,
    )

    xml_source = """\
    <index name="foo_zctext" meta_type="ZCTextIndex">
    <indexed_attr value="bar"/>
    <extra name="index_type" value="Okapi BM25 Rank"/>
    <extra name="lexicon_id" value="foo_plexicon"/>
    </index>
    """
    environ = DummySetupEnviron()

    def _fail_on_clear(*args):
        raise AssertionError("Don't clear me!")

    # The catalog carries the lexicon under the id the index refers to.
    holder = DummyCatalog()
    holder.foo_plexicon = PLexicon('foo_plexicon')

    settings = _extra()
    settings.lexicon_id = 'foo_plexicon'
    settings.index_type = 'Okapi BM25 Rank'

    unwrapped = ZCTextIndex(
        'foo_field',
        extra=settings,
        field_name='bar',
        caller=holder,
    )
    index = unwrapped.__of__(holder)
    index.clear = _fail_on_clear

    adapter = ZCTextIndexNodeAdapter(index, environ)
    root = parseString(xml_source).documentElement
    adapter.node = root  # no raise
def setUp(self):
    """Build the lexicon and the ZCTextIndex under test.

    Stores the lexicon on ``self.lexicon``, the ZCTextIndex on
    ``self.zc_index`` and its ``index`` attribute on ``self.index``.
    """
    pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
    self.lexicon = PLexicon('lexicon', '', *pipeline)

    holder = LexiconHolder(self.lexicon)
    self.zc_index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text', 'lexicon')
    self.index = self.zc_index.index
def setUp(self):
    """Create a catalog with two groups of indexes and fill it.

    Registers col1/col2/col3 and att1/att2/att3/num as both indexes and
    columns, catalogs ``self.upper`` dummy objects taken from
    ``self.nums``, then stores the catalog wrapped in a dummy parent.
    """
    catalog = self._catalog = self._makeOne()
    catalog.lexicon = PLexicon('lexicon')

    def text_index(index_id):
        return ZCTextIndex(index_id, caller=catalog,
                           index_factory=OkapiIndex, lexicon_id='lexicon')

    def register(entries):
        # All indexes of a group are added before any of its columns.
        for index_id, index in entries:
            catalog.addIndex(index_id, index)
        for index_id, _index in entries:
            catalog.addColumn(index_id)

    register([
        ('col1', FieldIndex('col1')),
        ('col2', text_index('col2')),
        ('col3', KeywordIndex('col3')),
    ])
    register([
        ('att1', FieldIndex('att1')),
        ('att2', text_index('att2')),
        ('att3', KeywordIndex('att3')),
        ('num', FieldIndex('num')),
    ])

    for position in range(self.upper):
        catalog.catalogObject(dummy(self.nums[position]), repr(position))

    self._catalog = catalog.__of__(dummy('foo'))
def test_ZCTextIndex(self):
    """Setting ``node`` on the adapted index must never call ``clear``.

    ``clear`` is swapped for a function raising AssertionError, so any
    call made while the node is assigned makes the test fail.
    """
    from xml.dom.minidom import parseString
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
    from Products.GenericSetup.testing import DummySetupEnviron
    from Products.GenericSetup.ZCTextIndex.exportimport import (
        ZCTextIndexNodeAdapter,
    )

    index_xml = """\
    <index name="foo_zctext" meta_type="ZCTextIndex">
    <indexed_attr value="bar"/>
    <extra name="index_type" value="Okapi BM25 Rank"/>
    <extra name="lexicon_id" value="foo_plexicon"/>
    </index>
    """
    environ = DummySetupEnviron()

    def _refuse_clear(*ignored):
        raise AssertionError("Don't clear me!")

    catalog = DummyCatalog()
    catalog.foo_plexicon = PLexicon('foo_plexicon')

    options = _extra()
    options.lexicon_id = 'foo_plexicon'
    options.index_type = 'Okapi BM25 Rank'

    index = ZCTextIndex(
        'foo_field', extra=options, field_name='bar', caller=catalog)
    index = index.__of__(catalog)
    index.clear = _refuse_clear

    adapted = ZCTextIndexNodeAdapter(index, environ)
    document_element = parseString(index_xml).documentElement
    adapted.node = document_element  # no raise
def testAddTextIndex(self):
    """Adding a ZCTextIndex makes it available under its id."""
    self._catalog.lexicon = PLexicon('lexicon')
    idx = ZCTextIndex('id', caller=self._catalog,
                      index_factory=OkapiIndex, lexicon_id='lexicon')
    self._catalog.addIndex('id', idx)
    i = self._catalog.indexes['id']
    # assertIsInstance replaces the deprecated assert_ alias (removed in
    # Python 3.12) and reports the offending object on failure.
    self.assertIsInstance(i, ZCTextIndex, 'add text index failed')
def _make_one(self, extra=None):
    """Return a populated Catalog wrapped in a Dummy parent.

    The catalog gets a lexicon, the indexes att2, att1, num and foo, and
    a ``num`` column.  If ``extra`` is given it is called with the
    catalog before ``self.upper`` objects built from ``self.nums`` are
    cataloged.
    """
    from Products.ZCatalog.Catalog import Catalog

    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')

    text_index = ZCTextIndex('att2', caller=catalog,
                             index_factory=OkapiIndex, lexicon_id='lexicon')
    catalog.addIndex('att2', text_index)
    catalog.addIndex('att1', FieldIndex('att1'))
    catalog.addIndex('num', FieldIndex('num'))
    catalog.addColumn('num')
    catalog.addIndex('foo', MultiFieldIndex('foo'))

    # Hook for callers that need additional setup before cataloging.
    if extra is not None:
        extra(catalog)

    for position in range(self.upper):
        catalog.catalogObject(Dummy(self.nums[position]), repr(position))

    return catalog.__of__(Dummy('foo'))
def test_add_text_index(self):
    """A ZCTextIndex added under 'id' is retrievable from ``indexes``."""
    catalog = self._make_one()
    catalog.lexicon = PLexicon('lexicon')

    text_index = ZCTextIndex(
        'id',
        caller=catalog,
        index_factory=OkapiIndex,
        lexicon_id='lexicon',
    )
    catalog.addIndex('id', text_index)

    self.assertIsInstance(catalog.indexes['id'], ZCTextIndex)
class QueryTestsBase(object):
    """Fixture shared by query tests: a lexicon, an index and four docs."""

    # Subclasses of QueryTestsBase must set a class variable IndexFactory
    # to the kind of index to be constructed.
    IndexFactory = None

    # The FauxIndex in testQueryEngine contains four documents.
    # docid 1: foo, bar, ham
    # docid 2: bar, ham
    # docid 3: foo, ham
    # docid 4: ham
    docs = ['foo bar ham', 'bar ham', 'foo ham', 'ham']

    def setUp(self):
        """Create lexicon, index and parser, then index ``self.docs``."""
        pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        self.lexicon = PLexicon('lexicon', '', *pipeline)
        holder = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text', 'lexicon')
        self.parser = QueryParser(self.lexicon)
        self.index = self.zc_index.index
        self.add_docs()

    def add_docs(self):
        """Index every entry of ``self.docs`` under docids 1, 2, 3, ..."""
        for docid, text in enumerate(self.docs, 1):
            self.zc_index.index_object(docid, Indexable(text))

    def compareSet(self, set, dict):
        """Assert that both mappings have the same keys."""
        # The FauxIndex and the real Index score documents very
        # differently.  The set comparison can't actually compare the
        # items, but it can compare the keys.  That will have to do for now.
        self.assertEqual(sorted(set.keys()), sorted(dict.keys()))
def testDelTextIndex(self):
    """Deleting a ZCTextIndex removes its id from the catalog indexes."""
    self._catalog.lexicon = PLexicon('lexicon')
    idx = ZCTextIndex('id', caller=self._catalog,
                      index_factory=OkapiIndex, lexicon_id='lexicon')
    self._catalog.addIndex('id', idx)
    self._catalog.delIndex('id')
    # assertNotIn replaces the deprecated assert_ alias (removed in
    # Python 3.12) and shows the container contents on failure.
    self.assertNotIn('id', self._catalog.indexes, 'del index failed')
def test_del_text_index(self):
    """An index added under 'id' is gone from ``indexes`` after delIndex."""
    catalog = self._make_one()
    catalog.lexicon = PLexicon('lexicon')

    text_index = ZCTextIndex(
        'id',
        caller=catalog,
        index_factory=OkapiIndex,
        lexicon_id='lexicon',
    )
    catalog.addIndex('id', text_index)
    catalog.delIndex('id')

    self.assertNotIn('id', catalog.indexes)
def testLexiconIsNotFoundRaisesLookupError(self):
    """Constructing the index without a lexicon id raises LookupError."""
    holder = LexiconHolder(self.lexicon)
    self.assertRaises(
        LookupError, ZCTextIndex, 'name', extra=None, caller=holder)
def index(rt, mboxfile, db, profiler):
    """Index the messages of a Unix mailbox file into a new ZCTextIndex.

    A fresh ZCTextIndex is stored under ``rt["index"]`` (and, unless
    EXCLUDE_TEXT is set, an IOBTree under ``rt["documents"]``), the
    mailbox is handed to ``indexmbox``, and timing statistics are
    printed when VERBOSE is set.  Nothing is returned.

    rt -- object supporting item assignment; receives the index and
          the optional documents tree
    mboxfile -- path of the mailbox file, opened in binary mode
    db -- object passed on to ``indexmbox`` and packed at the end
    profiler -- if true, ``indexmbox`` is run via ``profiler.runcall``
    """
    # NUM is a module-level setting that may be rewritten below.
    global NUM
    idx_time = 0
    pack_time = 0
    start_time = time.time()

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    # ZCTextIndex reads its settings from attributes of plain objects:
    # ``extra`` carries the options, ``caller`` carries the lexicon under
    # the attribute name given by extra.lexicon_id.
    extra = Extra()
    extra.lexicon_id = 'lexicon'
    extra.doc_attr = 'text'
    extra.index_type = 'Okapi BM25 Rank'
    caller = Extra()
    caller.lexicon = lexicon
    rt["index"] = idx = ZCTextIndex("index", extra, caller)
    if not EXCLUDE_TEXT:
        rt["documents"] = docs = IOBTree()
    else:
        docs = None
    # Commit the empty structures before indexing starts.
    transaction.commit()

    # NOTE(review): the file object opened here is never closed
    # explicitly in this function.
    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
    if VERBOSE:
        print "opened", mboxfile
    # A false NUM is replaced by sys.maxint.
    if not NUM:
        NUM = sys.maxint

    # indexmbox returns a 3-tuple; presumably (index seconds, pack
    # seconds, number of messages) -- confirm against indexmbox.
    if profiler:
        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
    else:
        itime, ptime, i = indexmbox(mbox, idx, docs, db)
    idx_time += itime
    pack_time += ptime

    transaction.commit()

    # Final pack only when i is not a multiple of PACK_INTERVAL;
    # presumably indexmbox already packed at the multiples -- confirm.
    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
        if VERBOSE >= 2:
            print "packing one last time..."
        p0 = time.clock()
        db.pack(time.time())
        p1 = time.clock()
        if VERBOSE:
            print "pack took %s sec" % (p1 - p0)
        pack_time += p1 - p0

    if VERBOSE:
        finish_time = time.time()
        print
        print "Index time", round(idx_time / 60, 3), "minutes"
        print "Pack time", round(pack_time / 60, 3), "minutes"
        print "Index bytes", Message.total_bytes
        # NOTE(review): divides by idx_time, which is 0 if indexmbox
        # reported no indexing time.
        rate = (Message.total_bytes / idx_time) / 1024
        print "Index rate %.2f KB/sec" % rate
        print "Indexing began", time.ctime(start_time)
        print "Indexing ended", time.ctime(finish_time)
        print "Wall clock minutes", round((finish_time - start_time) / 60, 3)
def make_zc_index():
    """Return a ZCTextIndex named "read" with its own HTML lexicon."""
    # there's an elaborate dance necessary to construct an index:
    # both the options and the lexicon travel as attributes of
    # otherwise empty objects.
    class Struct:
        pass

    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"

    index = ZCTextIndex("read", extra, caller)
    return index
def getLexicon(self):
    """Get the lexicon for this index

    The base class lookup is tried first.  If it fails, the attribute
    named by ``self.lexicon_id`` is fetched from the site's
    ``portal_catalog`` tool, cached on ``self._v_lexicon`` and returned.

    Raises TypeError if the object found on the catalog does not
    provide ILexicon.
    """
    try:
        return ZCTextIndex.getLexicon(self)
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed by the
        # fallback lookup.
        catalog = getToolByName(getSite(), 'portal_catalog')
        lexicon = getattr(catalog, self.lexicon_id)
        if not ILexicon.providedBy(lexicon):
            raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                            % repr(lexicon))
        self._v_lexicon = lexicon
        return lexicon
def setUp(self):
    """Store a catalog-wrapped ZCTextIndex and the XML it is compared to.

    Sets ``self._obj`` to the wrapped index and ``self._XML`` to the
    module's ``_ZCTEXT_XML``.
    """
    from Products.ZCTextIndex.ZCTextIndex import PLexicon
    from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex

    holder = DummyCatalog()
    holder.foo_plexicon = PLexicon('foo_plexicon')

    settings = _extra()
    settings.lexicon_id = 'foo_plexicon'
    settings.index_type = 'Okapi BM25 Rank'

    index = ZCTextIndex('foo_zctext', extra=settings, caller=holder)
    self._obj = index.__of__(holder)
    self._XML = _ZCTEXT_XML
def testInvalidIndexTypeRaisesValueError(self):
    """An unknown ``index_type`` on ``extra`` raises ValueError."""
    holder = LexiconHolder(self.lexicon)

    class Extra(object):
        index_type = 'Some invalid index type'

    self.assertRaises(
        ValueError,
        ZCTextIndex,
        'name',
        extra=Extra,
        caller=holder,
        index_factory=None,
        lexicon_id='lexicon',
    )
def testMultipleAttributes(self):
    """Query an index that reads the two attributes text1 and text2."""
    holder = LexiconHolder(self.lexicon)
    two_field_index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon')
    two_field_index.index_object(1, Indexable2('foo bar', 'alpha omega'))

    # (query, expected number of hits)
    expectations = (
        ('foo', 1),
        ('foo alpha', 1),
        ('foo alpha gamma', 0),
    )
    for query, hits in expectations:
        nbest, total = two_field_index.query(query)
        self.assertEqual(len(nbest), hits)
def test_fixOkapiIndexes(self):
    """fixOkapiIndexes resets a corrupted ``_totaldoclen`` to zero."""
    catalog = ZCatalog('catalog')
    catalog.lexicon = PLexicon('lexicon')
    text_index = ZCTextIndex(
        'test',
        index_factory=OkapiIndex,
        caller=catalog,
        lexicon_id='lexicon',
    )
    catalog.addIndex('test', text_index)

    # Corrupt the stored total with a plain negative integer.
    catalog.Indexes['test'].index._totaldoclen = -1000

    from plone.app.upgrade.v41.final import fixOkapiIndexes
    fixOkapiIndexes(catalog)

    # After the fix the attribute is called to obtain its value.
    repaired = catalog.Indexes['test'].index._totaldoclen()
    self.assertEqual(0, repaired)
def _make_one(self):
    """Return a wrapped Catalog with att1/att2/att3 indexes, populated.

    ``self.upper`` Dummy objects, numbered from 0, are cataloged under
    the repr of their number.
    """
    from Products.ZCatalog.Catalog import Catalog

    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')

    field_index = FieldIndex('att1')
    text_index = ZCTextIndex('att2', caller=catalog,
                             index_factory=OkapiIndex, lexicon_id='lexicon')
    keyword_index = KeywordIndex('att3')
    for index_id, index in (('att1', field_index),
                            ('att2', text_index),
                            ('att3', keyword_index)):
        catalog.addIndex(index_id, index)

    for number in range(self.upper):
        catalog.catalogObject(Dummy(number), repr(number))

    return catalog.__of__(Dummy('foo'))
def setUp(self):
    """Create a catalog with 'title' and 'true' indexes and seven objects.

    Every object gets ``true = True`` except number 110, which gets
    ``False``.  Objects are cataloged through ``self._get_catalog()``.
    """
    catalog = self._catalog = self._makeOne()
    catalog.lexicon = PLexicon('lexicon')

    title_index = ZCTextIndex('title', caller=catalog,
                              index_factory=OkapiIndex, lexicon_id='lexicon')
    catalog.addIndex('title', title_index)
    catalog.addIndex('true', FieldIndex('true'))
    catalog.addColumn('title')

    target = self._get_catalog()
    for number in (1, 2, 3, 10, 11, 110, 111):
        obj = zdummy(number)
        obj.true = number != 110
        target.catalogObject(obj, str(number))
def _make_one(self):
    """Return a wrapped Catalog with 'title' and 'true' indexes, populated.

    Seven ZDummy objects are cataloged; all have ``true = True`` except
    number 110, which has ``False``.
    """
    from Products.ZCatalog.Catalog import Catalog

    catalog = Catalog()
    catalog.lexicon = PLexicon('lexicon')

    title_index = ZCTextIndex('title', caller=catalog,
                              index_factory=OkapiIndex, lexicon_id='lexicon')
    catalog.addIndex('title', title_index)
    catalog.addIndex('true', FieldIndex('true'))
    catalog.addColumn('title')

    for number in (1, 2, 3, 10, 11, 110, 111):
        obj = ZDummy(number)
        obj.true = number != 110
        catalog.catalogObject(obj, str(number))

    return catalog.__of__(ZDummy(1))
def testListAttributes(self):
    """Query an index whose second attribute value is a list of strings."""
    holder = LexiconHolder(self.lexicon)
    list_index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon')

    lines = [
        'Now is the winter of our discontent',
        'Made glorious summer by this sun of York',
    ]
    list_index.index_object(1, Indexable2('Hello Tim', lines))

    # (query, expected number of hits)
    for query, hits in (('glorious', 1),
                        ('York Tim', 1),
                        ('Tuesday Tim York', 0)):
        nbest, total = list_index.query(query)
        self.assertEqual(len(nbest), hits)
def setUp(self):
    """Fill ``self.catalogs`` with three identically populated catalogs.

    Each catalog has num/big/number field indexes and a 'title' text
    index, is wrapped in a zdummy parent, and holds ten objects whose
    ``big`` flag is true for numbers above 5.
    """
    self.catalogs = []
    for _round in range(3):
        cat = self._makeOne()
        cat.lexicon = PLexicon('lexicon')
        for field in ('num', 'big', 'number'):
            cat.addIndex(field, FieldIndex(field))
        title_index = ZCTextIndex('title', caller=cat,
                                  index_factory=OkapiIndex,
                                  lexicon_id='lexicon')
        cat.addIndex('title', title_index)

        wrapped = cat.__of__(zdummy(16336))
        for number in range(10):
            obj = zdummy(number)
            obj.big = number > 5
            obj.number = True
            wrapped.catalogObject(obj, str(number))
        self.catalogs.append(wrapped)
def _make_many(self):
    """Return ``(catalogs, mergeResults)`` with three populated catalogs.

    Each catalog has num/big/number field indexes and a 'title' text
    index, is wrapped in a ZDummy parent, and holds ten objects whose
    ``big`` flag is true for numbers above 5.
    """
    from Products.ZCatalog.Catalog import mergeResults

    catalogs = []
    for _round in range(3):
        cat = self._make_one()
        cat.lexicon = PLexicon('lexicon')
        for field in ('num', 'big', 'number'):
            cat.addIndex(field, FieldIndex(field))
        title_index = ZCTextIndex('title', caller=cat,
                                  index_factory=OkapiIndex,
                                  lexicon_id='lexicon')
        cat.addIndex('title', title_index)

        wrapped = cat.__of__(ZDummy(16336))
        for number in range(10):
            obj = ZDummy(number)
            obj.big = number > 5
            obj.number = True
            wrapped.catalogObject(obj, str(number))
        catalogs.append(wrapped)

    return catalogs, mergeResults
def __init__(self, id='Help', title=''):
    """Store id and title and build the embedded search catalog.

    The catalog gets an HTML-aware lexicon, a 'SearchableText' text
    index, 'categories' and 'permissions' keyword indexes, and five
    metadata columns.
    """
    self.id = id
    self.title = title

    catalog = self.catalog = ZCatalog('catalog')
    lexicon = PLexicon('lexicon', '', HTMLWordSplitter(),
                       CaseNormalizer(), StopWordRemover())
    catalog._setObject('lexicon', lexicon)

    text_index = ZCTextIndex('SearchableText', caller=catalog,
                             index_factory=OkapiIndex,
                             lexicon_id=lexicon.id)
    # not using c.addIndex because it depends on Product initialization
    inner = catalog._catalog
    inner.addIndex('SearchableText', text_index)
    for keyword in ('categories', 'permissions'):
        inner.addIndex(keyword, KeywordIndex(keyword))

    for column in ('categories', 'permissions', 'title_or_id', 'url', 'id'):
        catalog.addColumn(column)
def testListAttributes(self):
    """A list-valued attribute is searchable together with a string one."""
    holder = LexiconHolder(self.lexicon)
    index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon')

    verse = [
        'Now is the winter of our discontent',
        'Made glorious summer by this sun of York',
    ]
    document = Indexable2('Hello Tim', verse)
    index.index_object(1, document)

    # query -> expected number of hits, checked in this order
    checks = [
        ('glorious', 1),
        ('York Tim', 1),
        ('Tuesday Tim York', 0),
    ]
    for query, hits in checks:
        nbest, total = index.query(query)
        self.assertEqual(len(nbest), hits)
class ZCIndexTestsBase:
    """ZCTextIndex tests that do not depend on the concrete index type.

    The methods read ``self.IndexFactory`` and call ``assert*`` methods;
    neither is defined in this class.
    """

    def setUp(self):
        """Create the lexicon and the single-attribute index under test."""
        self.lexicon = PLexicon('lexicon', '', Splitter(),
                                CaseNormalizer(), StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                                    'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        """Assert that querying the index with ``query`` raises ParseError."""
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        """Assert ``query`` reports ``n`` hits and, if any, docid 1 first."""
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        """Query an index that reads the attributes text1 and text2."""
        # The index shares the lexicon created in setUp; no second
        # lexicon is needed.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        """Query an index whose second attribute value is a list."""
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York',
                          ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        """Stop words get no word ids and cannot form a query alone."""
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        """Re-indexing one docid replaces the words of its old version.

        Uses the module-level ``text`` sequence of document versions.
        """
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for i, version in enumerate(text):
            # use a simple splitter rather than an official one
            # (raw string: "\W" is not a valid str escape; ``in`` instead
            # of dict.has_key, which Python 3 does not have)
            words = [w for w in re.split(r"\W+", version.lower())
                     if len(w) > 1 and w not in stop]
            word_seen = {}
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)

        # assertTrue replaces the assert_ alias removed in Python 3.12.
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)

        for i, version in enumerate(text):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                # words unique to the version just indexed may be found
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0,
                                     "did not expect to find %s" % w)
class ZCIndexTestsBase:
    """ZCTextIndex tests that do not depend on the concrete index type.

    The methods read ``self.IndexFactory`` and call ``assert*`` methods;
    neither is defined in this class.
    """

    def setUp(self):
        """Create the lexicon and the single-attribute index under test."""
        self.lexicon = PLexicon('lexicon', '', Splitter(),
                                CaseNormalizer(), StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                                    'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        """Assert that querying the index with ``query`` raises ParseError."""
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        """Assert ``query`` reports ``n`` hits and, if any, docid 1 first."""
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        """Query an index that reads the attributes text1 and text2."""
        # The index shares the lexicon created in setUp; no second
        # lexicon is needed.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        """Query an index whose second attribute value is a list."""
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York',
                          ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        """Stop words get no word ids and cannot form a query alone."""
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        """Re-indexing one docid replaces the words of its old version.

        Uses the module-level ``text`` sequence of document versions.
        """
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for i, version in enumerate(text):
            # use a simple splitter rather than an official one
            # (raw string: "\W" is not a valid str escape; ``in`` instead
            # of dict.has_key, which Python 3 does not have)
            words = [w for w in re.split(r"\W+", version.lower())
                     if len(w) > 1 and w not in stop]
            word_seen = {}
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)

        # assertTrue replaces the assert_ alias removed in Python 3.12.
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)

        for i, version in enumerate(text):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                # words unique to the version just indexed may be found
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0,
                                     "did not expect to find %s" % w)
class ZCIndexTestsBase(object):
    """ZCTextIndex tests that do not depend on the concrete index type.

    The methods read ``self.IndexFactory`` and call ``assert*`` methods;
    neither is defined in this class.
    """

    def setUp(self):
        """Create the lexicon and the single-attribute index under test."""
        pipeline = (Splitter(), CaseNormalizer(), StopWordRemover())
        self.lexicon = PLexicon('lexicon', '', *pipeline)
        holder = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        """Assert that querying the index with ``query`` raises ParseError."""
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        """Assert ``query`` reports ``n`` hits and, if any, docid 1 first."""
        results, count = self.zc_index.query(query)
        self.assertEqual(count, n)
        if not n:
            return
        self.assertEqual(results[0][0], 1)

    def testMultipleAttributes(self):
        """Query an index that reads the attributes text1 and text2."""
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text1,text2',
            'lexicon')
        index.index_object(1, Indexable2('foo bar', 'alpha omega'))
        for query, hits in (('foo', 1),
                            ('foo alpha', 1),
                            ('foo alpha gamma', 0)):
            nbest, total = index.query(query)
            self.assertEqual(len(nbest), hits)

    def testListAttributes(self):
        """Query an index whose second attribute value is a list."""
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text1,text2',
            'lexicon')
        verse = [
            'Now is the winter of our discontent',
            'Made glorious summer by this sun of York',
        ]
        index.index_object(1, Indexable2('Hello Tim', verse))
        for query, hits in (('glorious', 1),
                            ('York Tim', 1),
                            ('Tuesday Tim York', 0)):
            nbest, total = index.query(query)
            self.assertEqual(len(nbest), hits)

    def testReindex(self):
        """Re-indexing a docid replaces what can be found for it."""
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text', 'lexicon')

        def hit_count(query):
            nbest, total = index.query(query)
            return len(nbest)

        doc = Indexable('Hello Tim')
        index.index_object(1, doc)
        self.assertEqual(hit_count('glorious'), 0)
        self.assertEqual(hit_count('Tim'), 1)

        # reindex with another value
        doc.text = 'Goodbye George'
        index.index_object(1, doc)
        self.assertEqual(hit_count('Tim'), 0)
        self.assertEqual(hit_count('Goodbye'), 1)

        # reindex with an empty value
        doc.text = ''
        index.index_object(1, doc)
        self.assertEqual(hit_count('George'), 0)

    def testStopWords(self):
        """Stop words get no word ids and cannot form a query alone."""
        # the only non-stopword is question
        text = ('to be or not to be '
                'that is the question')
        self.zc_index.index_object(1, Indexable(text))
        for word in text.split():
            if word == 'question':
                continue
            self.assertEqual(self.lexicon.termToWordIds(word), [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        accepted = (
            ('question', 1),
            ('question AND to AND be', 1),
            ('to AND question AND be', 1),
            ('question AND NOT gardenia', 1),
            ('question AND gardenia', 0),
            ('gardenia', 0),
            ('question OR gardenia', 1),
            ('question AND NOT to AND NOT be', 1),
            ('question OR to OR be', 1),
            ('question to be', 1),
        )
        for query, hits in accepted:
            self.parserSuccess(query, hits)

        rejected = (
            'to be',
            'to AND be',
            'to OR be',
            'to AND NOT be',
            'to AND NOT question',
            'to AND NOT gardenia',
        )
        for query in rejected:
            self.parserFailure(query)

    def testDocUpdate(self):
        """Re-indexing one docid replaces the words of its old version.

        Uses the module-level ``text`` sequence of document versions.
        """
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for number, version in enumerate(text):
            # use a simple splitter rather than an official one
            seen = set()
            for w in re.split(r'\W+', version.lower()):
                if len(w) <= 1 or w in stop or w in seen:
                    continue
                seen.add(w)
                d.setdefault(w, []).append(number)

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versions in d.items():
            if len(versions) == 1:
                unique.setdefault(versions[0], []).append(w)
            elif len(versions) == N:
                common.append(w)

        self.assertGreater(len(common), 0)
        self.assertGreater(len(unique), 0)

        for number, version in enumerate(text):
            self.zc_index.index_object(docid, Indexable(version))
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, 'did not find {0}'.format(w))
            for owner, words in unique.items():
                # words unique to the version just indexed may be found
                if owner == number:
                    continue
                for w in words:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(
                        total, 0,
                        'did not expect to find {0}'.format(w)
                    )

    def testLexiconIsNotFoundRaisesLookupError(self):
        """Constructing the index without a lexicon id raises LookupError."""
        holder = LexiconHolder(self.lexicon)
        self.assertRaises(
            LookupError, ZCTextIndex, 'name', extra=None, caller=holder)

    def testInvalidIndexTypeRaisesValueError(self):
        """An unknown ``index_type`` on ``extra`` raises ValueError."""
        holder = LexiconHolder(self.lexicon)

        class Extra(object):
            index_type = 'Some invalid index type'

        self.assertRaises(
            ValueError,
            ZCTextIndex,
            'name',
            extra=Extra,
            caller=holder,
            index_factory=None,
            lexicon_id='lexicon',
        )
def testReindex(self):
    """Re-indexing a docid replaces what can be found for it."""
    holder = LexiconHolder(self.lexicon)
    index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text', 'lexicon')

    def hit_count(query):
        nbest, total = index.query(query)
        return len(nbest)

    doc = Indexable('Hello Tim')
    index.index_object(1, doc)
    self.assertEqual(hit_count('glorious'), 0)
    self.assertEqual(hit_count('Tim'), 1)

    # reindex with another value
    doc.text = 'Goodbye George'
    index.index_object(1, doc)
    self.assertEqual(hit_count('Tim'), 0)
    self.assertEqual(hit_count('Goodbye'), 1)

    # reindex with an empty value
    doc.text = ''
    index.index_object(1, doc)
    self.assertEqual(hit_count('George'), 0)