gb_encoding = getSupportedEncoding(['gb18030', 'mbcs', 'gbk', 'gb2312']) class GBSplitter(CJKSplitter): default_encoding = gb_encoding big5_encoding = getSupportedEncoding(['big5', 'mbcs']) class BIG5Splitter(CJKSplitter): default_encoding = big5_encoding try: element_factory.registerFactory('Word Splitter', 'CJK splitter', CJKSplitter) element_factory.registerFactory('Word Splitter', 'CJK GB splitter', GBSplitter) element_factory.registerFactory('Word Splitter', 'CJK BIG5 splitter', BIG5Splitter) except: # ValueError: # in case the splitter is already registered, ValueError is raised pass if __name__ == '__main__': words = ['abc def我们的很 好。', '金益康eHR产品', '户外广告测试文', '上', '上海', '上海人', '上海人民'] for word in words: print '=====now test:', word u = unicode(word, 'gbk').encode('utf8') s = CJKSplitter() print 'no glob result:'
someResultWordStrings = [] if not theStringsList: return someResultWordStrings aDefaultEncoding = self.default_encoding for aString in theStringsList: if aString: aUnicodeString = aString if not isinstance(aString, UnicodeType): aUnicodeString = unicode(aString, aDefaultEncoding, 'replace') if aUnicodeString: someWords = self.fReplaceCharsAndSplitWords( aUnicodeString, theIsGlob) if someWords: someResultWordStrings.extend(someWords) return someResultWordStrings def fReplaceCharsAndSplitWords(self, theString, theIsGlob=False): return fgReplaceCharsAndSplitWords_asUnicode(theString, theIsGlob) try: element_factory.registerFactory('Word Splitter', 'gvSIG-i18n splitter', TRASplitter) except: # ValueError: # in case the splitter is already registered, ValueError is raised pass
# non-unicode text. try: if not isinstance(s, unicode): s = unicode(s, enc) except (UnicodeDecodeError, TypeError): # Fall back to locale aware splitter result += self.rxGlob_L.findall(s) else: words = self.rxGlob_U.findall(s) result += [w.encode(enc) for w in words] return result ###classImplements(Splitter, Splitter.__implements__) try: element_factory.registerFactory('Word Splitter', 'Unicode Whitespace splitter', Splitter) except ValueError: # In case the splitter is already registered, ValueError is raised pass class CaseNormalizer: def process(self, lst): result = [] for s in lst: # This is a hack to get the normalizer working with # non-unicode text. try: if not isinstance(s, unicode): s = unicode(s, enc) except (UnicodeDecodeError, TypeError):
        # NOTE(review): fragment — the enclosing "def process" line is
        # outside this view.
        for word in lst:
            # ASCII-fold accented characters before splitting.
            norm_word = encode_ascii(word)
            result.extend(self.rx.findall(norm_word))
        return result

    def processGlob(self, lst):
        # Same as process(), but the glob pattern keeps '*' and '?' so
        # globbing searches still work.
        result = []
        for word in lst:
            norm_word = encode_ascii(word)
            result.extend(self.rxGlob.findall(norm_word))
        return result


try:
    element_factory.registerFactory("Glossary Latin normalizer and splitter",
                                    "Glossary Latin normalizer and splitter",
                                    LatinNormalizerAndSplitter)
except ValueError:
    # in case the normalizer is already registered, ValueError is raised
    pass


class PloneGlossaryCatalog(ZCatalog):
    """Catalog for PloneGlossary"""

    id = PLONEGLOSSARY_CATALOG
    title = "Glossary Catalog"
    security = ClassSecurityInfo()

    def __init__(self):
        # NOTE(review): fragment — the enclosing "def process" line is
        # outside this view.
        splat = []
        for t in text:
            splat += self._split(t, wordpat)
        return splat

    def processGlob(self, text):
        # see Lexicon.globToWordIds()
        return self.process(text, r"(?L)\w+[\w*?]*")

    def _split(self, text, wordpat):
        # Lower-case, strip HTML tags and character entities, then split
        # into words with the given pattern.
        text = text.lower()
        remove = [r"<[^<>]*>", r"&[A-Za-z]+;"]
        for pat in remove:
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)


element_factory.registerFactory('Word Splitter', 'HTML aware splitter',
                                HTMLWordSplitter)

if __name__ == "__main__":
    # Manual check (Python 2): split each file named on the command line.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        buf = f.read()
        f.close()
        print path
        print splitter.process([buf])
dict = read_stopwords()['stopwords'] try: from Products.ZCTextIndex.stopper import process as _process except ImportError: def process(self, lst): has_key = self.dict.has_key return [w for w in lst if not has_key(w)] else: def process(self, lst): return self._process(self.dict, lst) try: element_factory.registerFactory('Stop Words', ALISS_LEXICON_REMOVE_SW, ALiSSStopWordRemover) except: pass class ALiSSStopWordAndSingleCharRemover(StopWordRemover): dict = read_stopwords()['stopwords'] for c in range(255): dict[chr(c)] = None try: element_factory.registerFactory('Stop Words', ALISS_LEXICON_REMOVE_SW_AND_SINGLE, ALiSSStopWordAndSingleCharRemover) except:
        # NOTE(review): fragment — the enclosing method's "def" line is
        # outside this view.
        return result

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return the list of str.
        """
        enc = 'utf-8'
        result = [process_str_post(s, enc) for s in lst]
        return result


classImplements(BigramSplitter, BigramSplitter.__implements__)

try:
    element_factory.registerFactory('Word Splitter', 'Bigram Splitter',
                                    BigramSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class BigramCaseNormalizer(object):
    # Pipeline element: lower-cases words; byte strings are decoded as
    # UTF-8 first.

    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
# add the last word to the catalog if not isGlob: result.append(w[-1]) else: result.append(w) # return [word.encode('utf8') for word in result] return result def processGlob(self, lst): return self.process(lst, 1) gb_encoding = getSupportedEncoding(['gb18030', 'mbcs', 'gbk', 'gb2312']) class GBSplitter(CJKSplitter): default_encoding = gb_encoding big5_encoding = getSupportedEncoding(['big5', 'mbcs']) class BIG5Splitter(CJKSplitter): default_encoding = big5_encoding try: element_factory.registerFactory('Word Splitter', 'CJK splitter', CJKSplitter) element_factory.registerFactory('Word Splitter', 'CJK GB splitter', GBSplitter) element_factory.registerFactory('Word Splitter', 'CJK BIG5 splitter', BIG5Splitter) except:# ValueError: # in case the splitter is already registered, ValueError is raised pass
    # NOTE(review): fragment — the "if" of this conditional (presumably a
    # Python-2-vs-3 version check) is outside this view.
    word_pattern = r"(?L)\w+"
    glob_pattern = r"(?L)\w+[\w*?]*"
else:
    # in Python 3, the locale flag can only be applied to bytes patterns
    word_pattern = r"\w+"
    glob_pattern = r"\w+[\w*?]*"


@implementer(ISplitter)
class HTMLWordSplitter(object):
    # Splitter that strips HTML markup before extracting words.

    def process(self, text, wordpat=word_pattern):
        # `text` is a sequence of strings; returns one flat list of words.
        splat = []
        for t in text:
            splat += self._split(t, wordpat)
        return splat

    def processGlob(self, text):
        # see Lexicon.globToWordIds()
        return self.process(text, glob_pattern)

    def _split(self, text, wordpat):
        # Lower-case, remove tags and character entities, then split.
        text = text.lower()
        remove = [r"<[^<>]*>", r"&[A-Za-z]+;"]
        for pat in remove:
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)


element_factory.registerFactory('Word Splitter', 'HTML aware splitter',
                                HTMLWordSplitter)
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

ENC = "utf-8"


def process_unicode(s):
    # Tokenize a unicode string with janome; return the surface forms.
    tokenizer = Tokenizer()
    return [t.surface for t in tokenizer.tokenize(s)]


class JanomeSplitter(object):
    """ Japanese Splitter by janome """
    __implements__ = ISplitter

    def process(self, lst):
        """ Will be called when indexing.
        Receive list of str, then return the list of str.
        """
        result = []
        for s in lst:
            if not isinstance(s, unicode):
                s = s.decode(ENC, 'replace')
            # Ignore '*' and '?' globbing.
            # BUG FIX: str.replace returns a new string; the original code
            # discarded the result, so the globbing characters were never
            # actually removed. Rebind `s` to the stripped string.
            s = s.replace(u"?", u"").replace(u"*", u"")
            result += process_unicode(s)
        return result


element_factory.registerFactory('Word Splitter', 'JanomeSplitter',
                                JanomeSplitter)
    # NOTE(review): fragment — the enclosing class statement is outside
    # this view.
    def processGlob(self, lst):
        """ Will be called once when searching.
        Receive list of str, make it bi-grammed considering globbing,
        then return the list of str.
        """
        return [x for s in lst for x in process_str_glob(s)]

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return the list of str.
        """
        return [process_str_post(s) for s in lst]


try:
    element_factory.registerFactory('Word Splitter',
                                    'Unicode Whitespace splitter', Splitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class CaseNormalizer(object):
    # Pipeline element: lower-cases words (body continues past this view);
    # byte strings are decoded as UTF-8 first.

    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
        # NOTE(review): fragment — the enclosing method's "def" line is
        # outside this view.
        return list(res)


# ------------------------------------------------------------------------------
class TextIndexer:
    '''Extracts, from text field values, a normalized value to index.'''

    def process(self, texts):
        # Normalize each text and collect the union of its words; the set
        # deduplicates across texts.
        res = set()
        for text in texts:
            cleanText = normalizeText(text)
            res = res.union(splitIntoWords(cleanText))
        return list(res)


class ListIndexer:
    '''This lexicon does nothing: list of values must be indexed as is.'''

    def process(self, texts):
        return texts


# ------------------------------------------------------------------------------
try:
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
except ImportError:
    # May occur at generation time.
    pass
# ------------------------------------------------------------------------------
        # NOTE(review): fragment — the enclosing "def process" line is
        # outside this view.
        for word in lst:
            # ASCII-fold accented characters before splitting.
            norm_word = encode_ascii(word)
            result.extend(self.rx.findall(norm_word))
        return result

    def processGlob(self, lst):
        # Same as process(), but the glob pattern keeps '*' and '?' so
        # globbing searches still work.
        result = []
        for word in lst:
            norm_word = encode_ascii(word)
            result.extend(self.rxGlob.findall(norm_word))
        return result


try:
    element_factory.registerFactory(
        "Glossary Latin normalizer and splitter",
        "Glossary Latin normalizer and splitter",
        LatinNormalizerAndSplitter)
except ValueError:
    # in case the normalizer is already registered, ValueError is raised
    pass


class PloneGlossaryCatalog(ZCatalog):
    """Catalog for PloneGlossary"""

    id = PLONEGLOSSARY_CATALOG
    title = "Glossary Catalog"
    security = ClassSecurityInfo()

    def __init__(self):
    # NOTE(review): fragment — the enclosing class statement is outside
    # this view. Unicode-aware word patterns; the glob variant also keeps
    # '*' and '?' so globbing searches survive splitting.
    rx = re.compile(r"\w+", re.UNICODE)
    rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

    def process(self, lst):
        result = []
        for s in lst:
            result += self.rx.findall(s)
        return result

    def processGlob(self, lst):
        result = []
        for s in lst:
            result += self.rxGlob.findall(s)
        return result


try:
    element_factory.registerFactory(
        'Word Splitter', 'Unicode Whitespace splitter', Splitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

if __name__ == "__main__":
    # Manual check (Python 2): split each file named on the command line.
    import sys
    splitter = Splitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        buf = f.read()
        f.close()
        print path
        print splitter.process([buf])
class UnicodeHTMLWordSplitter(UnicodeWordSplitter):
    """Splitter variant that asks the base class to strip HTML markup."""

    def process(self, lst, glob=False):
        # Identical to the base splitter, with the HTML flag forced on.
        return UnicodeWordSplitter.process(self, lst, glob, True)


class UnicodeCaseNormalizer:
    """Lower-case every word, decoding byte strings to unicode first."""

    def process(self, lst):
        def _as_unicode(word):
            # Byte strings are decoded with the module-level encoding.
            return word if isinstance(word, unicode) else unicode(word, enc)

        return [_as_unicode(word).lower() for word in lst]


try:
    element_factory.registerFactory('Word Splitter',
                                    'Unicode Whitespace splitter',
                                    UnicodeWordSplitter)
    element_factory.registerFactory('Word Splitter',
                                    'Unicode HTML aware splitter',
                                    UnicodeHTMLWordSplitter)
    element_factory.registerFactory('Case Normalizer',
                                    'Unicode Case normalizer',
                                    UnicodeCaseNormalizer)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
        globbing, then return the list of str.
        """
        return [x for s in lst for x in process_str_glob(s)]

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return the list of str.
        """
        return [process_str_post(s) for s in lst]


try:
    element_factory.registerFactory(
        'Word Splitter',
        'Unicode Whitespace splitter',
        Splitter,
    )
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class CaseNormalizer:
    # Pipeline element: lower-cases words (body continues past this view);
    # byte strings are decoded as UTF-8 first.

    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above def process(self, lst): result = [] for s in lst: result += self.rx.findall(s) return result def processGlob(self, lst): result = [] for s in lst: result += self.rxGlob.findall(s) return result element_factory.registerFactory('Word Splitter', 'Whitespace splitter', Splitter) class CaseNormalizer: def process(self, lst): return [w.lower() for w in lst] element_factory.registerFactory('Case Normalizer', 'Case Normalizer', CaseNormalizer) element_factory.registerFactory('Stop Words', ' Don\'t remove stop words', None) class StopWordRemover:
rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above def process(self, lst): result = [] for s in lst: result += self.rx.findall(s) return result def processGlob(self, lst): result = [] for s in lst: result += self.rxGlob.findall(s) return result element_factory.registerFactory('Word Splitter', 'Whitespace splitter', Splitter) class CaseNormalizer: def process(self, lst): return [w.lower() for w in lst] element_factory.registerFactory('Case Normalizer', 'Case Normalizer', CaseNormalizer) element_factory.registerFactory('Stop Words', ' Don\'t remove stop words', None)
        # NOTE(review): fragment — the enclosing method's "def" line is
        # outside this view.
        enc = 'utf-8'
        result = [x for s in lst for x in process_str_glob(s, enc)]
        return result

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return the list of str.
        """
        enc = 'utf-8'
        result = [process_str_post(s, enc) for s in lst]
        return result


try:
    element_factory.registerFactory('Word Splitter',
                                    'Unicode Whitespace splitter', Splitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class CaseNormalizer(object):
    # Pipeline element: lower-cases words (body continues past this view);
    # byte strings are decoded as UTF-8 first.

    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
        # NOTE(review): fragment — the enclosing "def process" line is
        # outside this view.
        for text in texts:
            # Wrap the value in <p> so the extractor always gets a valid
            # XHTML fragment; parse errors are suppressed.
            extractor = XhtmlTextExtractor(raiseOnError=False)
            cleanText = extractor.parse('<p>%s</p>' % text)
            res = res.union(splitIntoWords(cleanText))
        return list(res)


# ------------------------------------------------------------------------------
class TextIndexer:
    '''Extracts, from text field values, a normalized value to index.'''

    def process(self, texts):
        # Normalize each text and collect the union of its words; the set
        # deduplicates across texts.
        res = set()
        for text in texts:
            cleanText = normalizeText(text)
            res = res.union(splitIntoWords(cleanText))
        return list(res)


class ListIndexer:
    '''This lexicon does nothing: list of values must be indexed as is.'''

    def process(self, texts):
        return texts


# ------------------------------------------------------------------------------
try:
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
except ImportError:
    # May occur at generation time.
    pass
# ------------------------------------------------------------------------------