def updateIndexes(self):
    """Set up the audit catalog's lexicon/indexes on first install, then
    mirror every portal_catalog index that this catalog does not have yet.
    """
    if not getattr(self, 'audit_lexicon', None):
        # First install: create the audit-specific indexes, metadata
        # columns and the lexicon used by the ZCTextIndexes below.
        self.addIndex('last_audited_date', 'DateIndex')
        self.addIndex('audited_action', 'KeywordIndex')
        for column in ('Title', 'id', 'UID',
                       'last_audited_date', 'audited_action'):
            self.addColumn(column)
        lexicon = PLexicon('audit_lexicon', '', HTMLWordSplitter(),
                           CaseNormalizer(), StopWordRemover())
        self._setObject('audit_lexicon', lexicon)
    portal_catalog = portal_api.get_tool('portal_catalog')
    for index_name, index in portal_catalog._catalog.indexes.items():
        if index_name in self._catalog.indexes.keys():
            # Already present here; nothing to do.
            continue
        if index.meta_type == 'DateRecurringIndex':
            # Recurrence indexes are not mirrored.
            continue
        if index.meta_type == 'ZCTextIndex':
            # ZCTextIndex construction needs an "extra" record that
            # points at the audit lexicon created above.
            extras = Empty()
            extras.doc_attr = index_name
            extras.index_type = 'Okapi BM25 Rank'
            extras.lexicon_id = 'audit_lexicon'
            self.addIndex(index_name, index.meta_type, extras)
        else:
            self.addIndex(index_name, index.meta_type)
class MySplitter:
    """Callable splitter that tokenizes text and filters out stop words
    via the caller-supplied stop-word dictionary."""

    def __init__(self):
        # _v_ prefix: volatile attribute, not persisted by ZODB.
        self._v_splitter = HTMLWordSplitter()

    def __call__(self, text, stopdict, *args, **kwargs):
        tokens = self._v_splitter._split(text)
        # Words found in stopdict map to their (falsy) replacement and are
        # dropped by filter(None, ...); the filter/map form is kept so the
        # return type matches the original exactly.
        return filter(None, map(lambda word: stopdict.get(word, word), tokens))
def make_zc_index():
    """Construct a ZCTextIndex over the "read" attribute.

    ZCTextIndex's constructor wants both an "extra" record (doc_attr,
    lexicon_id) and a caller object carrying the lexicon, so we fabricate
    both from a throwaway attribute-bag class.
    """
    class _Record:
        pass

    extra = _Record()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"

    caller = _Record()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    return ZCTextIndex("read", extra, caller)
def enumerateLexicons(self):
    """Return (lexicon_id, *pipeline_elements) tuples for the lexicons
    this object provides: one for plain text, one for HTML."""
    plaintext = ('plaintext_lexicon',
                 Splitter(), CaseNormalizer(), StopWordRemover())
    htmltext = ('htmltext_lexicon',
                HTMLWordSplitter(), CaseNormalizer(), StopWordRemover())
    return (plaintext, htmltext)
def testSplitterLocaleAwareness(self):
    """Verify the splitters honour the active locale when classifying
    word characters (German umlauts in Latin-1 bytes).

    Skips silently (returns) when no German locale is installed.
    """
    from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
    import locale
    loc = locale.setlocale(locale.LC_ALL)  # remember current locale
    # Switch to a German locale so umlauts count as word characters.
    try:
        if sys.platform != 'win32':
            locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
        else:
            locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
    except locale.Error:
        return  # German locale unavailable; test cannot run here
    try:
        expected = ['m\xfclltonne', 'waschb\xe4r',
                    'beh\xf6rde', '\xfcberflieger']
        words = [" ".join(expected)]
        words = Splitter().process(words)
        self.assertEqual(words, expected)
        # Re-splitting already-split words must be a no-op.
        words = HTMLWordSplitter().process(words)
        self.assertEqual(words, expected)
    finally:
        # BUGFIX: restore the saved locale even when an assertion fails
        # or process() raises; the original restored only on success,
        # leaving the German locale to poison later locale-sensitive
        # tests in the same process.
        locale.setlocale(locale.LC_ALL, loc)
def __init__(self, id='Help', title=''):
    """Initialize the help system: build its private ZCatalog with a
    lexicon, a SearchableText ZCTextIndex, keyword indexes and the
    metadata columns used when rendering results."""
    self.id = id
    self.title = title

    catalog = self.catalog = ZCatalog('catalog')
    lexicon = PLexicon('lexicon', '', HTMLWordSplitter(),
                       CaseNormalizer(), StopWordRemover())
    catalog._setObject('lexicon', lexicon)

    text_index = ZCTextIndex('SearchableText',
                             caller=catalog,
                             index_factory=OkapiIndex,
                             lexicon_id=lexicon.id)
    # Go through _catalog.addIndex directly: catalog.addIndex depends on
    # Product initialization, which has not happened yet.
    catalog._catalog.addIndex('SearchableText', text_index)
    catalog._catalog.addIndex('categories', KeywordIndex('categories'))
    catalog._catalog.addIndex('permissions', KeywordIndex('permissions'))

    for column in ('categories', 'permissions', 'title_or_id', 'url', 'id'):
        catalog.addColumn(column)
def __init__(self):
    # _v_ prefix marks a volatile attribute (never persisted by ZODB);
    # holds the HTML-aware word splitter this object delegates to.
    self._v_splitter = HTMLWordSplitter()