Example #1
0
# Pick the first codec supported by this Python build; gb18030 is the most
# complete GB-family codec, with fallbacks for older interpreters.
gb_encoding = getSupportedEncoding(['gb18030', 'mbcs', 'gbk', 'gb2312'])


class GBSplitter(CJKSplitter):
    # CJK splitter variant that decodes byte input with a GB-family codec.
    default_encoding = gb_encoding


big5_encoding = getSupportedEncoding(['big5', 'mbcs'])


class BIG5Splitter(CJKSplitter):
    # CJK splitter variant that decodes byte input with a Big5-family codec.
    default_encoding = big5_encoding


# Register the three CJK splitters with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter', 'CJK splitter',
                                    CJKSplitter)
    element_factory.registerFactory('Word Splitter', 'CJK GB splitter',
                                    GBSplitter)
    element_factory.registerFactory('Word Splitter', 'CJK BIG5 splitter',
                                    BIG5Splitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # the bare except here used to hide every other failure as well.
    pass

if __name__ == '__main__':
    words = ['abc def我们的很 好。', '金益康eHR产品', '户外广告测试文', '上', '上海', '上海人', '上海人民']
    for word in words:
        print '=====now test:', word
        u = unicode(word, 'gbk').encode('utf8')
        s = CJKSplitter()
        print 'no glob result:'
Example #2
0
        someResultWordStrings = []
        if not theStringsList:
            return someResultWordStrings

        aDefaultEncoding = self.default_encoding
        for aString in theStringsList:
            if aString:
                aUnicodeString = aString
                if not isinstance(aString, UnicodeType):
                    aUnicodeString = unicode(aString, aDefaultEncoding,
                                             'replace')

                if aUnicodeString:
                    someWords = self.fReplaceCharsAndSplitWords(
                        aUnicodeString, theIsGlob)
                    if someWords:
                        someResultWordStrings.extend(someWords)

        return someResultWordStrings

    def fReplaceCharsAndSplitWords(self, theString, theIsGlob=False):
        # Delegate splitting to the module-level helper; theIsGlob
        # presumably preserves the * / ? glob wildcards -- confirm in
        # fgReplaceCharsAndSplitWords_asUnicode.
        return fgReplaceCharsAndSplitWords_asUnicode(theString, theIsGlob)


# Register the gvSIG-i18n splitter with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter', 'gvSIG-i18n splitter',
                                    TRASplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # catching only ValueError lets real errors propagate.
    pass
Example #3
0
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
            except (UnicodeDecodeError, TypeError):
                # Fall back to locale aware splitter
                result += self.rxGlob_L.findall(s)
            else:
                words = self.rxGlob_U.findall(s)
                result += [w.encode(enc) for w in words]
        return result

###classImplements(Splitter, Splitter.__implements__)

# Register the whitespace splitter with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter',
        'Unicode Whitespace splitter', Splitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass

class CaseNormalizer:

    def process(self, lst):
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
            except (UnicodeDecodeError, TypeError):
Example #4
0
        for word in lst:
            norm_word = encode_ascii(word)
            result.extend(self.rx.findall(norm_word))
        return result

    def processGlob(self, lst):
        # ASCII-fold each word, then pull out the glob-capable tokens.
        tokens = []
        for word in lst:
            tokens.extend(self.rxGlob.findall(encode_ascii(word)))
        return tokens


# Register the glossary normalizer/splitter; duplicates are tolerated.
try:
    element_factory.registerFactory("Glossary Latin normalizer and splitter",
                                    "Glossary Latin normalizer and splitter",
                                    LatinNormalizerAndSplitter)
except ValueError:
    # in case the normalizer is already registered, ValueError is raised
    pass


class PloneGlossaryCatalog(ZCatalog):
    """Catalog for PloneGlossary"""

    id = PLONEGLOSSARY_CATALOG
    title = "Glossary Catalog"

    security = ClassSecurityInfo()

    def __init__(self):
Example #5
0
        splat = []
        for t in text:
            splat += self._split(t, wordpat)
        return splat

    def processGlob(self, text):
        # see Lexicon.globToWordIds(): the pattern keeps * and ? so
        # glob searches survive splitting.
        return self.process(text, r"(?L)\w+[\w*?]*")

    def _split(self, text, wordpat):
        text = text.lower()
        remove = [r"<[^<>]*>",
                  r"&[A-Za-z]+;"]
        for pat in remove:
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)

# Unguarded registration: unlike the try/except blocks elsewhere in this
# file, a duplicate name here would raise.
element_factory.registerFactory('Word Splitter',
                                'HTML aware splitter',
                                HTMLWordSplitter)

if __name__ == "__main__":
    # Ad-hoc smoke test: tokenize every file named on the command line.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        # Context manager guarantees the handle is closed even if read()
        # fails (the original leaked it on error).
        with open(path, "rb") as f:
            buf = f.read()
        print(path)
        print(splitter.process([buf]))
Example #6
0
    # Stop-word table, loaded once at class-definition time; keys are the
    # words to drop.  NOTE: the attribute name shadows the builtin `dict`.
    dict = read_stopwords()['stopwords']

    try:
        # Prefer the C-accelerated stopper when it is importable.
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        def process(self, lst):
            # Pure-Python fallback: keep only words absent from the table.
            has_key = self.dict.has_key
            return [w for w in lst if not has_key(w)]
    else:
        def process(self, lst):
            return self._process(self.dict, lst)

# Register the stop-word remover; tolerate re-registration only.
try:
    element_factory.registerFactory('Stop Words',
                                    ALISS_LEXICON_REMOVE_SW,
                                    ALiSSStopWordRemover)
except ValueError:
    # registerFactory raises ValueError on a duplicate name; the bare
    # except here used to swallow every other error as well.
    pass

class ALiSSStopWordAndSingleCharRemover(StopWordRemover):

    # Stop words plus every single 8-bit character, so lone characters
    # are dropped from the index as well.
    dict = read_stopwords()['stopwords']
    for c in range(255):
        # NOTE(review): range(255) covers chr(0)..chr(254) only; chr(255)
        # is excluded -- confirm whether this off-by-one is intentional.
        dict[chr(c)] = None

try:
    element_factory.registerFactory('Stop Words',
                                    ALISS_LEXICON_REMOVE_SW_AND_SINGLE,
                                    ALiSSStopWordAndSingleCharRemover)
except:
        return result

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return
        the list of str.
        """
        return [process_str_post(term, 'utf-8') for term in lst]


# Declare the Zope interfaces listed on BigramSplitter.__implements__.
classImplements(BigramSplitter, BigramSplitter.__implements__)

# Register the bigram splitter with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter', 'Bigram Splitter',
                                    BigramSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class BigramCaseNormalizer(object):
    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
Example #8
0
                        # add the last word to the catalog
			if not isGlob:
                            result.append(w[-1])
                else:
                    result.append(w)
        # return [word.encode('utf8') for word in result]
        return result

    def processGlob(self, lst):
        return self.process(lst, 1)

# First available simplified-Chinese codec on this interpreter.
gb_encoding = getSupportedEncoding(['gb18030', 'mbcs', 'gbk', 'gb2312'])


class GBSplitter(CJKSplitter):
    default_encoding = gb_encoding


# First available traditional-Chinese codec on this interpreter.
big5_encoding = getSupportedEncoding(['big5', 'mbcs'])


class BIG5Splitter(CJKSplitter):
    default_encoding = big5_encoding

# Register the three CJK splitters with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter',
          'CJK splitter', CJKSplitter)
    element_factory.registerFactory('Word Splitter',
          'CJK GB splitter', GBSplitter)
    element_factory.registerFactory('Word Splitter',
          'CJK BIG5 splitter', BIG5Splitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # the bare except here used to hide every other failure too.
    pass

Example #9
0
    word_pattern = r"(?L)\w+"
    glob_pattern = r"(?L)\w+[\w*?]*"
else:
    # in Python 3, the locale flag can only be applied to bytes patterns
    word_pattern = r"\w+"
    glob_pattern = r"\w+[\w*?]*"


@implementer(ISplitter)
class HTMLWordSplitter(object):
    """Splitter that strips HTML tags and entities before tokenizing."""

    def process(self, text, wordpat=word_pattern):
        # Concatenate the tokens produced from every input chunk.
        words = []
        for chunk in text:
            words.extend(self._split(chunk, wordpat))
        return words

    def processGlob(self, text):
        # see Lexicon.globToWordIds()
        return self.process(text, glob_pattern)

    def _split(self, text, wordpat):
        # Lower-case first so the removal patterns and tokenizer agree.
        text = text.lower()
        for pat in (r"<[^<>]*>", r"&[A-Za-z]+;"):
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)


# Unguarded registration: a duplicate name would raise, unlike the
# try/except-guarded registrations elsewhere in this file.
element_factory.registerFactory('Word Splitter', 'HTML aware splitter',
                                HTMLWordSplitter)
Example #10
0
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

# Encoding assumed for byte-string input.
ENC = "utf-8"

def process_unicode(s):
    # Tokenize with janome and return each token's surface form.
    # NOTE(review): a fresh Tokenizer is built per call -- consider
    # caching one at module level if construction proves expensive.
    tokenizer = Tokenizer()
    return [t.surface for t in tokenizer.tokenize(s)]

class JanomeSplitter(object):
    """
    Japanese Splitter by janome
    """
    __implements__ = ISplitter

    def process(self, lst):
        """ Will be called when indexing.
        Receive list of str, then return the list of str.

        Byte strings are decoded as ENC (undecodable bytes replaced) and
        the globbing wildcards '?' and '*' are stripped before tokenizing.
        """
        result = []
        for s in lst:
            if not isinstance(s, unicode):
                s = s.decode(ENC, 'replace')
            # BUG FIX: str.replace returns a new string; the original
            # discarded the result, so '?' and '*' were never removed.
            s = s.replace(u"?", u"").replace(u"*", u"")
            result += process_unicode(s)
        return result

# Unguarded registration: a duplicate name would raise ValueError.
element_factory.registerFactory('Word Splitter',
                        'JanomeSplitter', JanomeSplitter)
Example #11
0
    def processGlob(self, lst):
        """ Will be called once when searching.
        Receive list of str, make it bi-grammed considering
        globbing, then return the list of str.
        """
        grams = []
        for term in lst:
            grams.extend(process_str_glob(term))
        return grams

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return
        the list of str.
        """
        stripped = []
        for term in lst:
            stripped.append(process_str_post(term))
        return stripped

# Register the whitespace splitter with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter',
        'Unicode Whitespace splitter', Splitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class CaseNormalizer(object):

    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
Example #12
0
        return list(res)


# ------------------------------------------------------------------------------
class TextIndexer:
    '''Extracts, from text field values, a normalized value to index.'''

    def process(self, texts):
        # Normalize every value and accumulate the unique words.
        words = set()
        for value in texts:
            words.update(splitIntoWords(normalizeText(value)))
        return list(words)


class ListIndexer:
    '''This lexicon does nothing: list of values must be indexed as is.'''

    def process(self, texts):
        # Identity transform: hand the values back untouched.
        return texts


# ------------------------------------------------------------------------------
try:
    # Imported locally so environments without ZCTextIndex can still
    # load this module.
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
except ImportError:
    # May occur at generation time.
    pass
# ------------------------------------------------------------------------------
        for word in lst:
            norm_word = encode_ascii(word)
            result.extend(self.rx.findall(norm_word))
        return result

    def processGlob(self, lst):
        # ASCII-fold every word, then collect the glob-capable tokens.
        found = []
        for token in lst:
            found.extend(self.rxGlob.findall(encode_ascii(token)))
        return found


# Register the glossary normalizer/splitter; duplicates are tolerated.
try:
    element_factory.registerFactory(
        "Glossary Latin normalizer and splitter",
        "Glossary Latin normalizer and splitter",
        LatinNormalizerAndSplitter)
except ValueError:
    # in case the normalizer is already registered, ValueError is raised
    pass


class PloneGlossaryCatalog(ZCatalog):
    """Catalog for PloneGlossary"""

    id = PLONEGLOSSARY_CATALOG
    title = "Glossary Catalog"

    security = ClassSecurityInfo()

    def __init__(self):
Example #14
0
    # Unicode-aware token patterns: rx matches plain words, rxGlob also
    # keeps trailing * / ? so glob search terms survive splitting.
    rx = re.compile(r"\w+", re.UNICODE)
    rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

    def process(self, lst):
        # Concatenate the word matches from every input string.
        hits = []
        for text in lst:
            hits.extend(self.rx.findall(text))
        return hits

    def processGlob(self, lst):
        # Like process(), but the pattern keeps * and ? for globbing.
        hits = []
        for text in lst:
            hits.extend(self.rxGlob.findall(text))
        return hits
try:
    element_factory.registerFactory(
        'Word Splitter', 'Unicode Whitespace splitter', Splitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

if __name__ == "__main__":
    # Ad-hoc smoke test: split every file named on the command line.
    import sys
    splitter = Splitter()
    for path in sys.argv[1:]:
        # Context manager guarantees the handle is closed even if read()
        # fails (the original leaked it on error).
        with open(path, "rb") as f:
            buf = f.read()
        print(path)
        print(splitter.process([buf]))
Example #15
0

class UnicodeHTMLWordSplitter(UnicodeWordSplitter):
    # Passes True as the base class's extra flag -- presumably enabling
    # HTML-aware splitting; confirm against UnicodeWordSplitter.process.
    def process(self, lst, glob=False):
        return UnicodeWordSplitter.process(self, lst, glob, True)


class UnicodeCaseNormalizer:
    # Lower-case every word, decoding byte strings with the module-level
    # encoding `enc` first.
    def process(self, lst):
        normalized = []
        for word in lst:
            if not isinstance(word, unicode):
                word = unicode(word, enc)
            normalized.append(word.lower())
        return normalized


# Register all three Unicode pipeline elements in one guarded block.
try:
    element_factory.registerFactory('Word Splitter',
                                    'Unicode Whitespace splitter',
                                    UnicodeWordSplitter)
    element_factory.registerFactory('Word Splitter',
                                    'Unicode HTML aware splitter',
                                    UnicodeHTMLWordSplitter)
    element_factory.registerFactory('Case Normalizer',
                                    'Unicode Case normalizer',
                                    UnicodeCaseNormalizer)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
Example #16
0
        globbing, then return the list of str.
        """
        return [x for s in lst for x in process_str_glob(s)]

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return
        the list of str.
        """
        cleaned = []
        for term in lst:
            cleaned.append(process_str_post(term))
        return cleaned


# Register the whitespace splitter with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory(
        'Word Splitter',
        'Unicode Whitespace splitter',
        Splitter,
    )
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class CaseNormalizer:
    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
Example #17
0
    # NOTE(review): the (?L) locale flag on a str pattern is Python 2
    # only; Python 3 restricts re.LOCALE to bytes patterns.
    rxGlob = re.compile(r"(?L)\w+[\w*?]*")  # See globToWordIds() above

    def process(self, lst):
        # Tokenize every input string and concatenate the matches.
        tokens = []
        for chunk in lst:
            tokens.extend(self.rx.findall(chunk))
        return tokens

    def processGlob(self, lst):
        # Same as process(), but rxGlob keeps * and ? for glob searches.
        tokens = []
        for chunk in lst:
            tokens.extend(self.rxGlob.findall(chunk))
        return tokens


# Unguarded registration: a duplicate name would raise ValueError.
element_factory.registerFactory('Word Splitter', 'Whitespace splitter',
                                Splitter)


class CaseNormalizer:
    """Case-folding pipeline element: lower-case every token."""

    def process(self, lst):
        lowered = [word.lower() for word in lst]
        return lowered


element_factory.registerFactory('Case Normalizer', 'Case Normalizer',
                                CaseNormalizer)

# Registering None presumably makes the stop-word stage a no-op,
# matching the option label -- confirm in PipelineFactory.
element_factory.registerFactory('Stop Words', ' Don\'t remove stop words',
                                None)


class StopWordRemover:
Example #18
0
    # NOTE(review): the (?L) locale flag on a str pattern is Python 2
    # only; Python 3 restricts re.LOCALE to bytes patterns.
    rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above

    def process(self, lst):
        # Collect word matches across all input strings.
        matches = []
        for text in lst:
            matches.extend(self.rx.findall(text))
        return matches

    def processGlob(self, lst):
        # Like process(), but the pattern also keeps * and ? wildcards.
        matches = []
        for text in lst:
            matches.extend(self.rxGlob.findall(text))
        return matches

# Unguarded registration: a duplicate name would raise ValueError.
element_factory.registerFactory('Word Splitter',
                                 'Whitespace splitter',
                                 Splitter)

class CaseNormalizer:
    """Lower-case every token in the list."""

    def process(self, lst):
        result = []
        for w in lst:
            result.append(w.lower())
        return result

element_factory.registerFactory('Case Normalizer',
                                'Case Normalizer',
                                CaseNormalizer)

# Registering None presumably makes the stop-word stage a no-op,
# matching the option label -- confirm in PipelineFactory.
element_factory.registerFactory('Stop Words',
                                ' Don\'t remove stop words',
                                None)
Example #19
0
        enc = 'utf-8'
        result = [x for s in lst for x in process_str_glob(s, enc)]
        return result

    def process_post_glob(self, lst):
        """ Will be called twice when searching.
        Receive list of str, Remove ? and *, then return
        the list of str.
        """
        return [process_str_post(term, 'utf-8') for term in lst]


# Register the whitespace splitter with the ZCTextIndex pipeline factory.
try:
    element_factory.registerFactory('Word Splitter',
                                    'Unicode Whitespace splitter', Splitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised
    pass


class CaseNormalizer(object):
    def process(self, lst):
        enc = 'utf-8'
        result = []
        for s in lst:
            # This is a hack to get the normalizer working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
Example #20
0
        for text in texts:
            extractor = XhtmlTextExtractor(raiseOnError=False)
            cleanText = extractor.parse('<p>%s</p>' % text)
            res = res.union(splitIntoWords(cleanText))
        return list(res)

# ------------------------------------------------------------------------------
class TextIndexer:
    '''Extracts, from text field values, a normalized value to index.'''
    def process(self, texts):
        # Normalize every value and accumulate the unique words.
        words = set()
        for value in texts:
            words.update(splitIntoWords(normalizeText(value)))
        return list(words)

class ListIndexer:
    '''This lexicon does nothing: list of values must be indexed as is.'''

    def process(self, texts):
        # Identity transform: return the values untouched.
        return texts

# ------------------------------------------------------------------------------
try:
    # Imported locally so environments without ZCTextIndex can still
    # load this module.
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
except ImportError:
    # May occur at generation time.
    pass
# ------------------------------------------------------------------------------