コード例 #1
0
ファイル: chardb.py プロジェクト: cburgmer/eclectus
    def __init__(self, language, characterDomain=None, databaseUrl=None,
        dbConnectInst=None, ignoreIllegalSettings=False, **options):
        """
        Set up a character lookup for the given language.

        @param language: key into C{LANGUAGE_CHAR_LOCALE_MAPPING} and
            C{LANGUAGE_CHAR_DOMAIN_MAPPING}
        @param characterDomain: requested character domain; if empty a domain
            is chosen through C{_getCharacterDomain()}
        @param databaseUrl: passed to C{getDatabaseConfiguration()} when no
            connector instance is supplied
        @param dbConnectInst: existing database connector to reuse
        @param ignoreIllegalSettings: if true, an illegal character domain is
            silently replaced instead of raising
        @raise ValueError: if the character domain is not legal for the
            language and C{ignoreIllegalSettings} is false
        """
        if not dbConnectInst:
            dbConnectInst = getDBConnector(
                getDatabaseConfiguration(databaseUrl))

        locale = self.LANGUAGE_CHAR_LOCALE_MAPPING[language]
        CharacterLookup.__init__(self, locale, characterDomain or 'Unicode',
            dbConnectInst=dbConnectInst)

        self.language = language

        # Validate an explicitly requested domain; the mapping is only
        # consulted when a domain was actually given.
        if characterDomain:
            legalDomains = self.LANGUAGE_CHAR_DOMAIN_MAPPING[self.language]
            if characterDomain not in legalDomains:
                if not ignoreIllegalSettings:
                    raise ValueError(
                        "Illegal character domain '%s' for language '%s'"
                        % (characterDomain, self.language))
                characterDomain = None

        # choose a better character domain if none specified
        if not characterDomain:
            self.setCharacterDomain(self._getCharacterDomain())

        # Non-traditional locales keep an extra lookup for traditional forms
        if locale != 'T':
            self._characterLookupTraditional = CharacterLookup('T',
                dbConnectInst=self.db)
コード例 #2
0
 def divideIntoSections(self):
     '''
     Same as lyricsParser.divideIntoSections, except that the syllables are
     read from self.listSyllables.

     Strips punctuation from each syllable, converts Mandarin text to
     toneless pinyin (first reading only) and appends one list of syllables
     to self.listSentences for every sentence-ending punctuation found.
     Syllables whose text is 'REST' are not converted.

     NOTE(review): syllables following the last sentence-ending punctuation
     are never appended to self.listSentences -- confirm this is intended.
     '''
     # Created on first use and shared by all syllables; building a lookup
     # per syllable is needlessly expensive.
     cjk = None
     currSectionLyrics = []
     for syl in self.listSyllables:

         isEndOfSentence, syl.text = stripPunctuationSings(syl.text)

         ### convert from mandarin to pinyin
         if syl.text != 'REST':
             if cjk is None:
                 cjk = CharacterLookup('C')
             textPinYinList = cjk.getReadingForCharacter(syl.text, 'Pinyin', toneMarkType='none')
             if len(textPinYinList) > 1:
                 self.logger.warn("converted syllable {} has {} parts".format(textPinYinList, len(textPinYinList)))
             if textPinYinList:
                 # take only first variant of pinyin interpretations
                 syl.text = textPinYinList[0]
             # with no reading available the text is left unchanged instead
             # of failing with an IndexError

         currSectionLyrics.append(syl)

         ### finish up sentence when punctuation present
         if isEndOfSentence:
             self.listSentences.append(currSectionLyrics)
             currSectionLyrics = []
コード例 #3
0
def mandarinToPinyin(mandarinChar):
    """
    Return the toneless pinyin reading of a Mandarin character.

    Only the first reading reported by the lookup is used; a notice is
    printed when several readings exist. If the lookup reports no reading
    at all, the input is returned unchanged instead of raising IndexError.
    """
    cjk = CharacterLookup('C')
    textPinYinList = cjk.getReadingForCharacter(mandarinChar,
                                                'Pinyin',
                                                toneMarkType='none')
    if not textPinYinList:
        return mandarinChar
    if len(textPinYinList) > 1:
        # single-argument print call behaves the same on Python 2 and 3
        print("converted syllable {} has {} parts".format(
            textPinYinList, len(textPinYinList)))
    # take only first variant of pinyin interpretations
    return textPinYinList[0]
コード例 #4
0
ファイル: check-strokes.py プロジェクト: yueqianzhang/cjklib
    def _checkStrokeOrderFromDecomposition(self, decomposition, index=0):
        """
        Walks the decomposition starting at C{index} and checks whether a
        stroke order can be derived for the sub tree found there.

        Returns a tuple C{(hasFullOrder, index, missingChars)} where C{index}
        is the position of the last entry consumed.
        """
        entry = decomposition[index]

        if type(entry) is tuple:
            # no IDS operator but character
            char, glyph = entry
            # unknown component, nothing can be derived
            if char == u'?':
                return False, index, []
            # recursion
            fullOrder, missingChars = self.checkStrokeOrder(char, glyph)
            if not (fullOrder or missingChars):
                missingChars = [char]
            return fullOrder, index, missingChars

        # IDS operator: find out how many components follow
        if CharacterLookup.isBinaryIDSOperator(entry):
            # check for IDS operators we can't make any order
            # assumption about
            if entry not in self.ALLOWED_COMPONENT_STRUCTURE:
                return False, index, []
            componentCount = 2
        elif CharacterLookup.isTrinaryIDSOperator(entry):
            componentCount = 3
        else:
            assert False, 'not an IDS character'
            # only reached when asserts are disabled: consume nothing
            componentCount = 0

        hasFullOrder = True
        missingChars = []
        # Get stroke order for every component of the operator
        for _ in range(componentCount):
            fullOrder, index, missing \
                = self._checkStrokeOrderFromDecomposition(
                    decomposition, index + 1)
            if not fullOrder:
                missingChars.extend(missing)

            hasFullOrder = hasFullOrder and fullOrder

        return hasFullOrder, index, missingChars
コード例 #5
0
        def consumeComponent(decomposition):
            """
            Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
            consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

            Returns the remainder of the decomposition, or C{None} if the
            first entry is neither a component tuple nor a binary/trinary IDS
            operator.
            """
            head = decomposition[0]
            if type(head) is tuple:
                # consume one component
                return decomposition[1:]

            if CharacterLookup.isBinaryIDSOperator(head):
                operandCount = 2
            elif CharacterLookup.isTrinaryIDSOperator(head):
                operandCount = 3
            else:
                return None

            # drop the operator, then consume each of its operands in turn
            remainder = decomposition[1:]
            for _ in range(operandCount):
                remainder = consumeComponent(remainder)
            return remainder
コード例 #6
0
 def decompositionFromString(decomposition):
     """
     Parse a decomposition string into a list of components.

     Taken from CharacterLookup, but adapted to return None as glyph if no
     glyph is given. IDS operators are kept as plain strings, every other
     entry becomes a tuple (char, glyph). Pseudo characters written as '#'
     followed by a number become (int, 0).

     Raises ValueError if a pseudo character number or a glyph index cannot
     be converted to an integer, or if a '[' has no closing ']'.
     """
     componentsList = []
     index = 0
     while index < len(decomposition):
         char = decomposition[index]
         if CharacterLookup.isIDSOperator(char):
             componentsList.append(char)
         else:
             # is Chinese character
             # Special handling for surrogate pairs on UCS-2 systems
             if util.isValidSurrogate(decomposition[index:index + 2]):
                 char = decomposition[index:index +
                                      2]  # A surrogate pair now
                 index += 1  # Bypass trailing surrogate
             if char == '#':
                 # pseudo character, find digit end
                 offset = 2
                 while index+offset < len(decomposition) \
                     and decomposition[index+offset].isdigit():
                     offset += 1
                 # Convert only the part after '#'; including the marker
                 # itself made int() fail for every pseudo character.
                 char = int(decomposition[index + 1:index + offset])
                 charGlyph = 0
                 # Skip the consumed number so its digits are not parsed
                 # again as separate characters.
                 index += offset - 1
             elif index+1 < len(decomposition)\
                 and decomposition[index+1] == '[':
                 # extract glyph information
                 endIndex = decomposition.index(']', index + 1)
                 charGlyph = int(decomposition[index + 2:endIndex])
                 index = endIndex
             else:
                 charGlyph = None
             componentsList.append((char, charGlyph))
         index = index + 1
     return componentsList
コード例 #7
0
ファイル: models.py プロジェクト: SAPikachu/pikapika-py
def get_cat_code(s):
    """
    Return a one-letter, upper-cased category code for ``s``.

    The code is the first letter of the first Pinyin reading of the first
    character of ``s``; if the lookup reports no reading, the first character
    itself is used.
    """
    first = unicode(s)[0]
    readings = CharacterLookup("C").getReadingForCharacter(first, "Pinyin")

    # It's very hard to determine which reading is correct for our case, so
    # just use the first one and let users fix it if it is incorrect.
    # Without any reading the text is not Chinese and the character itself
    # serves as the code.
    letter = readings[0][0] if readings else first
    return letter.upper()
コード例 #8
0
 def decompositionFromString(decomposition):
     """
     Parse a decomposition string into a list of components.

     Taken from CharacterLookup, but adapted to return None as glyph if no
     glyph is given. IDS operators are kept as plain strings, every other
     entry becomes a tuple (char, glyph). Pseudo characters written as "#"
     followed by a number become (int, 0).

     Raises ValueError if a pseudo character number or a glyph index cannot
     be converted to an integer, or if a "[" has no closing "]".
     """
     componentsList = []
     index = 0
     while index < len(decomposition):
         char = decomposition[index]
         if CharacterLookup.isIDSOperator(char):
             componentsList.append(char)
         else:
             # is Chinese character
             # Special handling for surrogate pairs on UCS-2 systems
             if util.isValidSurrogate(decomposition[index : index + 2]):
                 char = decomposition[index : index + 2]  # A surrogate pair now
                 index += 1  # Bypass trailing surrogate
             if char == "#":
                 # pseudo character, find digit end
                 offset = 2
                 while index + offset < len(decomposition) and decomposition[index + offset].isdigit():
                     offset += 1
                 # Convert only the part after "#"; including the marker
                 # itself made int() fail for every pseudo character.
                 char = int(decomposition[index + 1 : index + offset])
                 charGlyph = 0
                 # Skip the consumed number so its digits are not parsed
                 # again as separate characters.
                 index += offset - 1
             elif index + 1 < len(decomposition) and decomposition[index + 1] == "[":
                 # extract glyph information
                 endIndex = decomposition.index("]", index + 1)
                 charGlyph = int(decomposition[index + 2 : endIndex])
                 index = endIndex
             else:
                 charGlyph = None
             componentsList.append((char, charGlyph))
         index = index + 1
     return componentsList
コード例 #9
0
    def run(self):
        """
        Read all decomposition entries, clean them up and print them as CSV
        lines of the form C{"char","decomposition",glyph,index,flags}.

        Pseudo characters are merged away unless C{includePseudoCharacters}
        is set, decompositions of length one are dropped unless
        C{includeMinimal} is set, and similar decompositions are merged.
        """
        decompositionEntries, flagEntries = self.read()

        # Remove pseudo characters by merging entries
        if not self.includePseudoCharacters:
            decompositionEntries, flagEntries = self._removePseudoCharacters(decompositionEntries, flagEntries)

        # Remove minimal component entries
        if not self.includeMinimal:
            for char in sorted(decompositionEntries.keys()):
                for glyph in decompositionEntries[char]:
                    # iterate over a copy, the set is modified in the loop
                    for decomposition in decompositionEntries[char][glyph].copy():

                        if len(decomposition) == 1:
                            decompositionEntries[char][glyph].remove(decomposition)
                            del flagEntries[char][glyph][decomposition]

        # Merge similar decompositions, removing inferior ones
        self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

        # Write entries
        for char in sorted(decompositionEntries.keys()):
            # Keep the dictionary key untouched and format pseudo characters
            # into a separate name; rebinding the key itself made the later
            # dictionary lookups use the string "#<n>" in place of the key.
            if type(char) == type(0):
                # pseudo character
                charStr = "#%d" % char
            else:
                charStr = char
            for glyph in decompositionEntries[char]:
                for idx, decomposition in enumerate(sorted(decompositionEntries[char][glyph])):
                    decompStr = CharacterLookup.decompositionToString(decomposition)
                    flagStr = "".join(sorted(flagEntries[char][glyph][decomposition]))
                    line = (
                        '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                        % {"char": charStr, "decomp": decompStr, "glyph": glyph, "index": idx, "flags": flagStr}
                    )
                    # encode before printing, as the original Python 2
                    # print statement did
                    print(line.encode(default_encoding))
コード例 #10
0
        def consumeComponent(decomposition):
            """
            Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
            consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

            Returns what is left of the decomposition afterwards, or C{None}
            if it starts with something that is neither a component tuple nor
            a binary/trinary IDS operator.
            """
            first = decomposition[0]
            tail = decomposition[1:]
            if type(first) is tuple:
                # a plain component: consuming it leaves the tail
                return tail

            if CharacterLookup.isBinaryIDSOperator(first):
                arity = 2
            elif CharacterLookup.isTrinaryIDSOperator(first):
                arity = 3
            else:
                return None

            # an operator consumes as many components as it has operands
            for _ in range(arity):
                tail = consumeComponent(tail)
            return tail
コード例 #11
0
 def getDecomposition(structure):
     """
     Add glyph information to a structure: IDS operators are kept as they
     are, every other entry is paired with glyph 0.
     """
     return [
         c if type(c) is type(u'') and CharacterLookup.isIDSOperator(c)
         else (c, 0)
         for c in structure
     ]
コード例 #12
0
 def getDecomposition(structure):
     """
     Turn a structure into a decomposition by attaching glyph 0 to every
     entry that is not an IDS operator string.
     """
     def withGlyph(entry):
         isOperator = (type(entry) is type(u"")
                       and CharacterLookup.isIDSOperator(entry))
         return entry if isOperator else (entry, 0)

     return [withGlyph(c) for c in structure]
コード例 #13
0
class Mapper(object):
    """Maps a character/reading pair onto the variants of the character."""

    def __init__(self, variant='T'):
        """Store the variant type and create the character lookup."""
        self.characterLookup = CharacterLookup('T')
        self.variant = variant

    def mapEntry(self, char, reading):
        """
        Return a list of (variant, reading) tuples, one for each variant the
        lookup reports for char and the configured variant type.
        """
        variants = self.characterLookup.getCharacterVariants(
            char, self.variant)
        return [(var, reading) for var in variants]
コード例 #14
0
def tokenize(input, output):
    """
    Copy the UTF-8 text file ``input`` to ``output``, keeping only those
    characters for which the lookup reports a Pinyin reading.

    Every input line yields one output line. If the input file cannot be
    opened a message is printed and the process exits via sys.exit().
    """
    # io.open decodes/encodes UTF-8 at the file boundary and works the same
    # on Python 2 and 3; imported locally to keep the block self-contained.
    import io

    try:
        # the context manager closes the handle that was previously leaked
        with io.open(input, 'r', encoding='utf-8') as source:
            text = source.readlines()
    except IOError:
        print("IOError: could not open %s" % input)
        sys.exit()

    cjk = CharacterLookup('T')

    with io.open(output, 'w', encoding='utf-8') as out:
        for line in text:
            kept = [char for char in line
                    if cjk.getReadingForCharacter(char, 'Pinyin')]
            out.write(u''.join(kept) + u'\n')
コード例 #15
0
class Mapper(object):
    """Expands a dictionary entry to all variants of its character."""

    def __init__(self, variant='T'):
        """Remember the variant type and set up the character lookup."""
        self.characterLookup = CharacterLookup('T')
        self.variant = variant

    def mapEntry(self, char, reading):
        """
        Pair every variant of char (for the configured variant type) with
        the given reading and return the pairs as a list.
        """
        lookup = self.characterLookup
        return [(var, reading)
                for var in lookup.getCharacterVariants(char, self.variant)]
コード例 #16
0
ファイル: importcjklib.py プロジェクト: KentVu/cjklib
        class GlyphIterator(object):
            """
            Iterates over the characters of the lookup's character domain
            and yields one string 'char/glyph' for every glyph of each
            character. Characters without glyph information are skipped.
            """
            def __init__(self):
                self._cjk = CharacterLookup('T', 'Unicode')
                self.characterIterator = self._cjk.getDomainCharacterIterator()
                self.curChar = None
                self.glyphQueue = []

            def __iter__(self):
                return self

            def next(self):
                """
                Return the next 'char/glyph' string. StopIteration raised by
                the character iterator ends the iteration.
                """
                # refill the queue from the next character that has glyphs
                while not self.glyphQueue:
                    # builtin next() works for Python 2 and 3 iterators
                    self.curChar = next(self.characterIterator)
                    try:
                        glyphs = self._cjk.getCharacterGlyphs(self.curChar)
                        self.glyphQueue.extend(glyphs)
                    except exception.NoInformationError:
                        # no glyph data for this character, try the next one
                        pass

                return '%s/%d' % (self.curChar, self.glyphQueue.pop())

            # Python 3 iterator protocol; 'next' stays for Python 2 callers
            __next__ = next
コード例 #17
0
        class GlyphIterator(object):
            """
            Iterator producing 'char/glyph' strings for all glyphs of all
            characters in the lookup's character domain. Characters for
            which no glyph information exists are passed over.
            """
            def __init__(self):
                self._cjk = CharacterLookup('T', 'Unicode')
                self.characterIterator = self._cjk.getDomainCharacterIterator()
                self.curChar = None
                self.glyphQueue = []

            def __iter__(self):
                return self

            def next(self):
                """
                Return the next 'char/glyph' string; iteration stops when
                the character iterator is exhausted.
                """
                while not self.glyphQueue:
                    # the builtin next() supports both Python 2 and Python 3
                    # iterators, unlike calling the .next() method directly
                    self.curChar = next(self.characterIterator)
                    try:
                        glyphs = self._cjk.getCharacterGlyphs(self.curChar)
                        self.glyphQueue.extend(glyphs)
                    except exception.NoInformationError:
                        # character without glyph data: continue with the
                        # following character
                        pass

                return '%s/%d' % (self.curChar, self.glyphQueue.pop())

            # make the class a valid iterator under Python 3 as well
            __next__ = next
コード例 #18
0
def to_pinyin(filename):
    """
    Print, for each line, every character that has a Pinyin reading,
    prefixed by the ASCII transliteration of its first reading.

    If ``filename`` cannot be opened a message is printed and the process
    exits via sys.exit().
    """
    try:
        # the context manager closes the handle that was previously leaked
        with open(filename, 'r') as handle:
            lines = handle.readlines()
    except IOError:
        print("IOError: could not open %s" % filename)
        sys.exit()

    cjk = CharacterLookup('T')

    # NOTE(review): this hard-coded sample replaces the file contents read
    # above, so the file is only checked for readability. It looks like
    # left-over debugging code -- confirm before removing it.
    lines = [u'我喜歡他']

    for line in lines:
        pieces = []
        for char in line:
            pinyin = cjk.getReadingForCharacter(char, 'Pinyin')
            if pinyin:
                # single-argument print calls behave the same on Python 2
                # and Python 3
                print([unidecode(x) for x in pinyin])
                simplified = unidecode(pinyin[0])
                pieces.append(simplified + char + " ")
        print("".join(pieces))
コード例 #19
0
        def parseIDS(decomposition, index):
            """
            Parse one complete IDS (sub) tree starting at C{index} and return
            the index following it.

            Raises C{ValueError} if the decomposition ends prematurely or an
            entry is neither a component tuple nor a binary/trinary IDS
            operator.
            """
            if index >= len(decomposition):
                raise ValueError()

            current = decomposition[index]
            if type(current) is tuple:
                # consume one component
                return index + 1

            if not CharacterLookup.isIDSOperator(current):
                # simple chars should be IDS operators
                raise ValueError()

            if CharacterLookup.isBinaryIDSOperator(current):
                operandCount = 2
            elif CharacterLookup.isTrinaryIDSOperator(current):
                operandCount = 3
            else:
                raise ValueError()

            # skip the operator, then parse each operand in sequence
            position = index + 1
            for _ in range(operandCount):
                position = parseIDS(decomposition, position)
            return position
コード例 #20
0
        def parseIDS(decomposition, index):
            """
            Consume a full IDS expression beginning at C{index}; the return
            value is the index of the first entry behind it.

            A C{ValueError} signals a malformed decomposition: running past
            the end, or an entry that is neither a component tuple nor a
            binary/trinary IDS operator.
            """
            if index >= len(decomposition):
                raise ValueError()

            item = decomposition[index]
            if type(item) is tuple:
                # consume one component
                return index + 1

            if not CharacterLookup.isIDSOperator(item):
                # simple chars should be IDS operators
                raise ValueError()

            if CharacterLookup.isBinaryIDSOperator(item):
                arity = 2
            elif CharacterLookup.isTrinaryIDSOperator(item):
                arity = 3
            else:
                raise ValueError()

            # step over the operator and parse its operands one by one
            cursor = index + 1
            for _ in range(arity):
                cursor = parseIDS(decomposition, cursor)
            return cursor
コード例 #21
0
ファイル: kanRen.py プロジェクト: fenildf/anki_addons-1
def getStrokeOrd(fin, kl):
    """
        Append one bullet line with the stroke order of each
        character in kl to the list fin and return fin.

        Trying for awareness of glyph locale in lookup: characters
        found in cedict.simplified use locale 'C', those found in
        cedict.traditional use 'T', all others 'J'.
        """
    # One lookup per locale, created on first use; a fresh lookup for every
    # single character is needlessly expensive.
    lookups = {}

    def lookupFor(locale):
        if locale not in lookups:
            from cjklib.characterlookup import CharacterLookup
            lookups[locale] = CharacterLookup(locale)
        return lookups[locale]

    for i in kl:
        if i in cedict.simplified:
            locale = 'C'
        elif i in cedict.traditional:
            locale = 'T'
        else:
            locale = 'J'
        j = lookupFor(locale).getStrokeOrder(i)
        fin.append(u'• ' + u' '.join(j))
    return fin
コード例 #22
0
ファイル: kanRen.py プロジェクト: fenildf/anki_addons-1
def auxSOrd(i):
    """
        Try to get stroke decomposition
        if subcomponent decomposition fails.

        Returns the strokes of character i joined by spaces, or
        u'[x]' if the stroke order lookup fails. The locale is
        'C' for characters in cedict.simplified, 'T' for those in
        cedict.traditional and 'J' otherwise.
        """
    from cjklib.characterlookup import CharacterLookup
    if i in cedict.simplified:
        locale = 'C'
    elif i in cedict.traditional:
        locale = 'T'
    else:
        locale = 'J'
    cjk = CharacterLookup(locale)
    try:
        j = cjk.getStrokeOrder(i)
    except Exception:
        # Best effort: any lookup failure yields the placeholder. Unlike a
        # bare except this lets SystemExit/KeyboardInterrupt propagate.
        return u'[x]'
    return u' '.join(j)
コード例 #23
0
    def next(self):
        """
        Return the next entry as a tuple
        C{(char, 0, decomposition, set('C'))}.

        Raises C{StopIteration} once C{_getNextEntry()} returns C{None}.
        """
        entry = self._getNextEntry()
        if entry is None:
            raise StopIteration()

        char, decompString = entry
        # TODO support CHISE private character entries
        # until then replace them with the unknown-component marker
        decompString = re.sub("&[^;]+;", u'?', decompString)

        # IDS operators stay as they are, everything else gets glyph 0
        decomposition = [
            c if CharacterLookup.isIDSOperator(c) else (c, 0)
            for c in decompString]
        # flag 'C'HISE
        return (char, 0, decomposition, set('C'))
コード例 #24
0
    def next(self):
        """
        Fetch the next entry and return it as the tuple
        C{(char, 0, decomposition, set("C"))}.

        C{StopIteration} is raised when C{_getNextEntry()} yields C{None}.
        """
        entry = self._getNextEntry()
        if entry is None:
            raise StopIteration()

        char, rawDecomposition = entry
        # TODO support CHISE private character entries
        # for now they are replaced by the unknown-component marker
        cleaned = re.sub("&[^;]+;", u"?", rawDecomposition)

        def toComponent(c):
            # operators are kept verbatim, characters get glyph 0
            return c if CharacterLookup.isIDSOperator(c) else (c, 0)

        decomposition = [toComponent(c) for c in cleaned]
        # flag 'C'HISE
        return (char, 0, decomposition, set("C"))
コード例 #25
0
    def run(self):
        """
        Read all decomposition entries, clean them up and print them as CSV
        lines of the form C{"char","decomposition",glyph,index,flags}.

        Pseudo characters are merged away unless C{includePseudoCharacters}
        is set, decompositions of length one are dropped unless
        C{includeMinimal} is set, and similar decompositions are merged.
        """
        decompositionEntries, flagEntries = self.read()

        # Remove pseudo characters by merging entries
        if not self.includePseudoCharacters:
            decompositionEntries, flagEntries = self._removePseudoCharacters(
                decompositionEntries, flagEntries)

        # Remove minimal component entries
        if not self.includeMinimal:
            for char in sorted(decompositionEntries.keys()):
                for glyph in decompositionEntries[char]:
                    # iterate over a copy, the set is modified in the loop
                    for decomposition \
                        in decompositionEntries[char][glyph].copy():

                        if len(decomposition) == 1:
                            decompositionEntries[char][glyph].remove(
                                decomposition)
                            del flagEntries[char][glyph][decomposition]

        # Merge similar decompositions, removing inferior ones
        self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

        # Write entries
        for char in sorted(decompositionEntries.keys()):
            # Keep the dictionary key untouched and format pseudo characters
            # into a separate name; rebinding the key itself made the later
            # dictionary lookups use the string '#<n>' in place of the key.
            if type(char) == type(0):
                # pseudo character
                charStr = '#%d' % char
            else:
                charStr = char
            for glyph in decompositionEntries[char]:
                for idx, decomposition in enumerate(
                        sorted(decompositionEntries[char][glyph])):
                    decompStr = CharacterLookup.decompositionToString(
                        decomposition)
                    flagStr = ''.join(
                        sorted(flagEntries[char][glyph][decomposition]))
                    line = (
                        '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                        % {
                            'char': charStr,
                            'decomp': decompStr,
                            'glyph': glyph,
                            'index': idx,
                            'flags': flagStr
                        })
                    # encode before printing, as the original Python 2
                    # print statement did
                    print(line.encode(default_encoding))
コード例 #26
0
    def next(self):
        """
        Return the next entry of the file as a tuple
        C{(char, glyph, decomposition, flags)}.

        The file is opened on the first call. Entries whose character field
        is longer than one character and does not start with C{#} are
        reported on stderr and skipped; C{#<n>} entries yield the integer
        C{n} as character.
        """
        if not hasattr(self, "_fileIterator"):
            if not self.quiet:
                print >>sys.stderr, "FILE: reading '%s'" % self.filePath
            self._fileIterator = UnicodeCSVFileIterator(
                codecs.open(self.filePath, "r", default_encoding))

        while True:
            row = self._fileIterator.next()
            char, decompString, glyph, _, flags = row

            isPseudo = len(char) > 1
            if isPseudo and not char.startswith("#"):
                # malformed entry, report it and go on with the next one
                print >>sys.stderr, ("FILE: Error parsing entry '%s', %s" % (char, glyph)).encode(default_encoding)
                continue
            if isPseudo:
                # pseudo char
                char = int(char[1:])

            decomposition = CharacterLookup.decompositionFromString(decompString)
            return (char, int(glyph), decomposition, set(flags))
コード例 #27
0
    def next(self):
        """
        Read entries from the file until a valid one is found and return it
        as C{(char, glyph, decomposition, flags)}.

        The file iterator is created lazily on the first call. A character
        field of the form C{#<n>} denotes a pseudo character and is returned
        as integer; other multi-character fields are reported on stderr and
        the entry is skipped.
        """
        if not hasattr(self, '_fileIterator'):
            if not self.quiet:
                print >> sys.stderr, "FILE: reading '%s'" % self.filePath
            handle = codecs.open(self.filePath, 'r', default_encoding)
            self._fileIterator = UnicodeCSVFileIterator(handle)

        while True:
            char, decompString, glyph, _, flags = self._fileIterator.next()

            if len(char) > 1:
                if char.startswith('#'):
                    # pseudo char
                    char = int(char[1:])
                else:
                    print >> sys.stderr, (
                        "FILE: Error parsing entry '%s', %s" %
                        (char, glyph)).encode(default_encoding)
                    continue

            parsed = CharacterLookup.decompositionFromString(decompString)
            return (char, int(glyph), parsed, set(flags))
コード例 #28
0
    def _getDecompositionEntriesDict(cls):
        """
        Gets the decomposition table from the database.

        Despite the method's name the result is a list, not a dictionary.

        @rtype: list
        @return: list of tuples C{(character, glyph, decomposition, flags)},
            ordered by the table's C{SubIndex} column, with the decomposition
            parsed by C{CharacterLookup.decompositionFromString()} and the
            flags given as a set
        """
        # get entries from database
        db = dbconnector.getDBConnector()
        table = db.tables['CharacterDecomposition']

        result = db.selectRows(select([table.c.ChineseCharacter,
            table.c.Glyph, table.c.Decomposition, table.c.Flags])\
                .order_by(table.c.SubIndex))

        return [
            (char, glyph,
                CharacterLookup.decompositionFromString(decompString),
                set(flags))
            for char, glyph, decompString, flags in result]
コード例 #29
0
    def _getDecompositionEntriesDict(cls):
        """
        Gets the decomposition table from the database.

        Despite the method's name the result is a list, not a dictionary.

        @rtype: list
        @return: list of tuples C{(character, glyph, decomposition, flags)},
            ordered by the table's C{SubIndex} column, with the decomposition
            parsed by C{CharacterLookup.decompositionFromString()} and the
            flags given as a set
        """
        # get entries from database
        db = dbconnector.getDBConnector()
        table = db.tables["CharacterDecomposition"]

        result = db.selectRows(
            select([table.c.ChineseCharacter, table.c.Glyph, table.c.Decomposition, table.c.Flags]).order_by(
                table.c.SubIndex
            )
        )

        return [
            (char, glyph, CharacterLookup.decompositionFromString(decompString), set(flags))
            for char, glyph, decompString, flags in result
        ]
コード例 #30
0
ファイル: importcjklib.py プロジェクト: KentVu/cjklib
 def __init__(self):
     """
     Create the character lookup, obtain its domain character iterator and
     start with no current character and an empty glyph queue.
     """
     lookup = CharacterLookup('T', 'Unicode')
     self._cjk = lookup
     self.characterIterator = lookup.getDomainCharacterIterator()
     self.curChar, self.glyphQueue = None, []
コード例 #31
0
ファイル: importcjklib.py プロジェクト: KentVu/cjklib
 def getCharacters(self):
     """
     Return all characters of the character domain named by self.title,
     joined by single spaces.
     """
     lookup = CharacterLookup('T', self.title)
     domainCharacters = lookup.getDomainCharacterIterator()
     return ' '.join(domainCharacters)
コード例 #32
0
ファイル: test_ictclas.py プロジェクト: hinesmr/mica
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import mica
import cjklib

from cjklib.dictionary import CEDICT
from cjklib.characterlookup import CharacterLookup

d = CEDICT()
cjk = CharacterLookup('C')

src = """
小明五岁,他有一个哥哥,哥哥是学生。他爸爸妈妈都工作。小明说,他家一共五口人。

今天星期六,我们不上课。小王说,晚上有一个好电影,他和我一起去看,我很高兴。下午六点我去食堂吃饭,六点半去小王的宿舍,七点我们去看电影。

张丽英家有四口人:爸爸,妈妈,姐姐和她。她爸爸是大夫,五十七岁了,身体很好。他工作很忙,星期天常常不休息。妈妈是银行职员,今年五十岁。她姐姐是老师,今年二月结婚了。她不住在爸爸妈妈家。昨天是星期五,下午没有课。我们去她家了。她家在北京饭店旁边。我们到她家的时候,她爸爸妈妈不在家。我们和她一起谈话,听音乐,看电视。五点半张丽英的爸爸妈妈回家了。她姐姐也来了。我们在她家吃饭,晚上八点半我们就回学校了。

教学楼前边的自行车很多。田芳下课后要找自己的自行车。田芳的自行车是新的。张东问她,你的自行车是什么颜色的?田芳说是蓝的。张东说,那辆蓝车是不是你的?田芳说,我的自行车是新的,不是旧的,那辆车不是我的。忽然,田芳看见了自己的自行车,她说,啊,我的自行车在那儿呢,我找到了
"""

def tryce(uni, fail_if_more_than_one = False) :
    count = 0
    results = d.getFor(uni)
    trans = u''
    last = None

    for e in results : 
        if count > 0 and e[2].lower() == last[2].lower() : 
#           print "Duplicate CEDICT pinyin!"
           count -= 1
コード例 #33
0
    def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries):
        """
        Merges two decompositions, if they are the same, except:
            - one has an unknown component while the other doesn't,
            - one has a subtree that is the decomposition of the corresponding
              component of the other decomposition.

        Both arguments are modified in place: C{decompositionEntries} is
        indexed as C{[char][glyph]} and receives a new set of the remaining
        decompositions, C{flagEntries} is indexed as
        C{[char][glyph][decomposition]} and loses the entries of removed
        decompositions. Pairs that cannot be compared are reported on stderr
        and left untouched.
        """
        def consumeComponent(decomposition):
            """
            Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
            consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.
            """
            if type(decomposition[0]) == type(()):
                # consume one component
                return decomposition[1:]

            # an operator consumes itself plus one component per operand
            if CharacterLookup.isBinaryIDSOperator(decomposition[0]):
                decomposition = consumeComponent(decomposition[1:])
                return consumeComponent(decomposition)
            elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]):
                decomposition = consumeComponent(decomposition[1:])
                decomposition = consumeComponent(decomposition)
                return consumeComponent(decomposition)

        def compareTrees(decompositionA, decompositionB):
            """
            Checks for similar decomposition trees, taking care of unknown
            components.

            Returns C{None} if the trees are not equal, a integer if the trees
            are similar. If the left tree (decompositionA) should be preferred a
            negative number is returned, or a positive number for the right tree
            (decompositionB). If C{0} is returned, both trees are equally good
            to choose from.
            """
            if not decompositionA and not decompositionB:
                # equal
                return 0
            elif not decompositionA or not decompositionB:
                # if all preceding components are the same that shouldn't happen
                raise ValueError()
            elif decompositionA[0] == decompositionB[0]:
                # same head on both sides, compare the remainders
                return compareTrees(decompositionA[1:], decompositionB[1:])

            elif (type(decompositionA[0]) == type(())
                  and decompositionA[0][0] == u'?'):
                # unknown component on the left: skip the matching component
                # (possibly a whole sub tree) on the right
                decompositionB = consumeComponent(decompositionB)
                result = compareTrees(decompositionA[1:], decompositionB)
                if result is None or result < 0:
                    # unequal or the left side is preferred later on
                    return None
                else:
                    return +1

            elif (type(decompositionB[0]) == type(())
                  and decompositionB[0][0] == u'?'):
                # unknown component on the right: skip the matching component
                # (possibly a whole sub tree) on the left
                decompositionA = consumeComponent(decompositionA)
                result = compareTrees(decompositionA, decompositionB[1:])
                if result is None or result > 0:
                    # unequal or the right side is preferred later on
                    return None
                else:
                    return -1

            elif (CharacterLookup.isIDSOperator(decompositionA[0])
                  and CharacterLookup.isIDSOperator(decompositionB[0])):
                # No way these decompositions can be equal
                #   (simplified subseq. checking)
                return None

            elif CharacterLookup.isIDSOperator(decompositionA[0]):
                # expand tree B
                char, glyph = decompositionB[0]
                if (char in decompositionEntries
                        and glyph in decompositionEntries[char]):

                    # try every known decomposition of the component
                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(
                            decompositionA, decomposition + decompositionB[1:])
                        if result is not None and result >= 0:
                            # right side preferred and so do we...
                            #   A shorted description is better
                            return 1

                return None

            elif CharacterLookup.isIDSOperator(decompositionB[0]):
                # expand tree A
                char, glyph = decompositionA[0]
                if (char in decompositionEntries
                        and glyph in decompositionEntries[char]):

                    # try every known decomposition of the component
                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(
                            decomposition + decompositionA[1:], decompositionB)
                        if result is not None and result <= 0:
                            # left side preferred and so do we...
                            #   A shorted description is better
                            return -1
                return None
            else:
                # two different plain components
                return None

        for char in decompositionEntries:
            for glyph in decompositionEntries[char]:
                idxA = 0
                # work on a list copy so entries can be deleted by index
                decompositions = list(decompositionEntries[char][glyph])
                flagsDict = flagEntries[char][glyph]
                # Check every decomposition with all others to the right
                while idxA < len(decompositions):
                    idxB = idxA + 1
                    while idxB < len(decompositions):
                        try:
                            result = compareTrees(decompositions[idxA],
                                                  decompositions[idxB])
                            if result is not None and result == 0:
                                # Entries are equal, we can transfer flags
                                flagsDict[decompositions[idxA]].update(
                                    flagsDict[decompositions[idxB]])
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result < 0:
                                # left entry preferred, drop the right one
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result > 0:
                                # right entry preferred, drop the left one
                                del flagsDict[decompositions[idxA]]
                                del decompositions[idxA]
                                # No need for further testing for this decomp
                                break
                            else:
                                # Only increase if the list didn't shift to the
                                #   left
                                idxB += 1
                        except ValueError:
                            print >> sys.stderr, (
                                "Error comparing decompositions %s and %s"
                                % (CharacterLookup.decompositionToString(
                                    decompositions[idxA]),
                                    CharacterLookup.decompositionToString(
                                        decompositions[idxB])))\
                                    .encode(default_encoding)
                            idxB += 1
                    else:
                        # inner loop ended without break: the entry at idxA
                        # was kept, so move on to the next one
                        idxA += 1
                decompositionEntries[char][glyph] = set(decompositions)
コード例 #34
0
ファイル: check-strokes.py プロジェクト: yueqianzhang/cjklib
    def __init__(self, options, args):
        """
        Store locale and character domain taken from C{options} and create
        the character lookup for them. C{args} is not used.
        """
        locale = options.locale
        characterDomain = options.characterDomain

        self._locale = locale
        self._characterDomain = characterDomain
        self._cjk = CharacterLookup(locale, characterDomain)
コード例 #35
0
ファイル: check-strokes.py プロジェクト: yueqianzhang/cjklib
class StrokeChecker(object):
    ALLOWED_COMPONENT_STRUCTURE = [u'⿰', u'⿱', u'⿵', u'⿶', u'⿸', u'⿹', u'⿺',
        u'⿲', u'⿳']
    """
    Component structures that allow derivation of stroke order from components.
    """

    MIN_COMPONENT_PRODUCTIVITY = 2
    """
    Min productivity when reporting out-domain components that could help boost
    the in-domain set.
    """

    def __init__(self, options, args):
        self._locale = options.locale
        self._characterDomain = options.characterDomain

        self._cjk = CharacterLookup(self._locale, self._characterDomain)

    def run(self):
        charCount = 0
        charFullCount = 0

        missingCharsDict = {}
        missingSingleCharacters = []
        # iterate through all characters of the character set
        for char in self._cjk.getDomainCharacterIterator():
        #for char in iter([u'亄', u'乿', u'仜', u'伳']): # DEBUG
            charCount += 1
            if charCount % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

            hasFullOrder, missingChars = self.checkStrokeOrder(char)

            if hasFullOrder:
                charFullCount += 1
            else:
                if missingChars:
                    # list components that can help us build this transform.
                    for missing in missingChars:
                        if missing not in missingCharsDict:
                            missingCharsDict[missing] = []
                        missingCharsDict[missing].append(char)
                else:
                    missingSingleCharacters.append(char)

        sys.stdout.write('\n')

        output_encoding = sys.stdout.encoding or locale.getpreferredencoding() \
            or 'ascii'

        print 'Total characters: %d' % charCount
        print 'Characters with full stroke data: %d (%d%%)' % (charFullCount,
            100 * charFullCount / charCount)


        # missing single characters
        # Extend by those with components, that have a component with low
        #   productivity.
        inDomainComponents = set(
            self._cjk.filterDomainCharacters(missingCharsDict.keys()))

        lowProductivityComponentChars = []
        for component, chars in missingCharsDict.items():
            if component not in inDomainComponents \
                and len(chars) < self.MIN_COMPONENT_PRODUCTIVITY:
                lowProductivityComponentChars.extend(chars)
                del missingCharsDict[component]
        missingSingleCharacters.extend(lowProductivityComponentChars)

        print 'Missing single characters:',
        print ''.join(missingSingleCharacters).encode(output_encoding,
            'replace')

        # remove characters that we already placed in "single"
        _missingSingleCharacters = set(missingSingleCharacters)
        for component, chars in missingCharsDict.items():
            missingCharsDict[component] = list(
                set(chars) - _missingSingleCharacters)
            if not missingCharsDict[component]:
                del missingCharsDict[component]

        # missing components

        missingComponents = sorted(missingCharsDict.items(),
            key=lambda (x,y): len(y))
        missingComponents.reverse()

        inDomainComponentList = [(component, chars) \
            for component, chars in missingComponents \
            if component in inDomainComponents]
        # only show "out-domain" components if they have productivity > 1
        outDomainComponentList = [(component, chars) \
            for component, chars in missingComponents \
            if component not in inDomainComponents and len(chars) > 1]

        print 'Missing components: %d' % (len(inDomainComponentList) \
            + len(outDomainComponentList))
        print 'Missing in-domain components:',
        print ', '.join(['%s (%s)' % (component, ''.join(chars)) \
            for component, chars in inDomainComponentList])\
            .encode(output_encoding, 'replace')
        print 'Missing out-domain components:',
        print ', '.join(['%s (%s)' % (component, ''.join(chars)) \
            for component, chars in outDomainComponentList])\
            .encode(output_encoding, 'replace')

    def checkStrokeOrder(self, char, glyph=None):
        try:
            self._cjk.getStrokeOrder(char, glyph)
            return True, []
        except NoInformationError:
            pass

        # add decompositions, limit to upper bound max_samples
        missingChars = []
        decompositions = self._cjk.getDecompositionEntries(char, glyph)
        for decomposition in decompositions:
            hasFullOrder, _, missing = self._checkStrokeOrderFromDecomposition(
                decomposition)
            assert not hasFullOrder
            missingChars.extend(missing)

        return False, missingChars

    def _checkStrokeOrderFromDecomposition(self, decomposition, index=0):
        """Goes through a decomposition"""
        if type(decomposition[index]) != type(()):
            # IDS operator
            character = decomposition[index]
            missingChars = []
            hasFullOrder = True
            if CharacterLookup.isBinaryIDSOperator(character):
                # check for IDS operators we can't make any order
                # assumption about
                if character not in self.ALLOWED_COMPONENT_STRUCTURE:
                    return False, index, []
                else:
                    # Get stroke order for both components
                    for _ in range(0, 2):
                        fullOrder, index, missing \
                            = self._checkStrokeOrderFromDecomposition(
                                decomposition, index+1)
                        if not fullOrder:
                            missingChars.extend(missing)

                        hasFullOrder = hasFullOrder and fullOrder

            elif CharacterLookup.isTrinaryIDSOperator(character):
                # Get stroke order for three components
                for _ in range(0, 3):
                    fullOrder, index, missing \
                        = self._checkStrokeOrderFromDecomposition(
                            decomposition, index+1)
                    if not fullOrder:
                        missingChars.extend(missing)

                    hasFullOrder = hasFullOrder and fullOrder
            else:
                assert False, 'not an IDS character'

            return hasFullOrder, index, missingChars
        else:
            # no IDS operator but character
            char, glyph = decomposition[index]
            # if the character is unknown or there is none raise
            if char == u'?':
                return False, index, []
            else:
                # recursion
                fullOrder, missingChars = self.checkStrokeOrder(char, glyph)
                if not fullOrder and not missingChars:
                    missingChars = [char]
                return fullOrder, index, missingChars

        assert False
コード例 #36
0
 def __init__(self, variant='T'):
     """
     Creates the character lookup and remembers the requested variant.

     The lookup is always created for locale C{'T'}, independent of
     C{variant}.
     """
     lookup = CharacterLookup('T')
     self.characterLookup, self.variant = lookup, variant
コード例 #37
0
ファイル: checkcjkradicals.py プロジェクト: ninchanese/cjklib
def main():
    """
    Checks a radical table given on standard input against cjklib's data.

    Lines starting with C{#} and blank lines are skipped. Every other line
    has to hold a radical index (followed by C{'} for a variant form), the
    hexadecimal code point of the radical form and the hexadecimal code
    point of its equivalent character, e.g. C{1; 2F00; 4E00}.

    Every discrepancy between table and library is printed to standard
    output, followed by a summary line with the totals.
    """
    cjk = CharacterLookup('T')
    cjkSimplified = CharacterLookup('C')

    def simplifiedOnlyForms(radicalIdx):
        """
        Returns the forms locale C knows for the radical but locale T
        doesn't: the radical form if it differs, plus the extra variants.
        """
        forms = set()
        # add simplified form, if different
        simplifiedForm = cjkSimplified.getKangxiRadicalForm(radicalIdx)
        if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
            forms.add(simplifiedForm)
        # add simplified variant
        forms.update(
            set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
            - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))
        return forms

    entryPattern = re.compile(
        r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});"
        + r"\s+([1234567890ABCDEF]{4,5})\s*$")
    # the radical indices checked for completeness
    radicalIndices = range(1, 215)

    fileEntryCount = 0
    databaseMissingEntryCount = 0
    noEntryCount = 0
    wrongEquivalentCount = 0
    seenRadicalFormIndices = set()
    seenRadicalVariantIndices = set()
    for line in sys.stdin:
        # Python 3 hands out text already, str has no decode(); only decode
        #   if the stream really delivers bytes
        if isinstance(line, bytes):
            line = line.decode(default_encoding)

        if re.match(r'\s*#', line) or re.match(r'\s+$', line):
            continue
        fileEntryCount += 1

        matchObj = entryPattern.match(line)
        if not matchObj:
            # Messages are printed as text; printing encoded bytes would
            #   only show their b'...' representation
            print("error reading line: '" + line + "'")
            continue

        index, variant, radicalCP, equivalentCP = matchObj.groups()
        radicalIdx = int(index)
        radicalForm = chr(int(radicalCP, 16))
        equivalentForm = chr(int(equivalentCP, 16))

        # check radicalForm
        if variant:
            seenRadicalVariantIndices.add(radicalIdx)
            targetForms = simplifiedOnlyForms(radicalIdx)
        else:
            seenRadicalFormIndices.add(radicalIdx)
            targetForms = set([cjk.getKangxiRadicalForm(radicalIdx)])

        if radicalForm not in targetForms:
            # cjklib is missing something
            print("No entry for radical form '%s' with index %d%s"
                % (radicalForm, radicalIdx, variant))
            databaseMissingEntryCount += 1

        tableMissingForms = targetForms - set([radicalForm])
        if tableMissingForms:
            # CJKRadicals.txt is missing something
            for form in tableMissingForms:
                print("Database entry '%s' with radical index %d%s"
                    % (form, radicalIdx, variant)
                    + " not included in table")
            noEntryCount += 1

        # check equivalentForm
        libraryEquivalentForm \
            = cjk.getRadicalFormEquivalentCharacter(radicalForm)
        if libraryEquivalentForm != equivalentForm:
            print("Equivalent radical form '%s' with index %d%s"
                % (libraryEquivalentForm, radicalIdx, variant)
                + " not backed by table: '%s'" % equivalentForm)
            wrongEquivalentCount += 1

    for radicalIdx in set(radicalIndices) - seenRadicalFormIndices:
        print("No table entry for radical index %d" % radicalIdx)
        noEntryCount += 1

    for radicalIdx in set(radicalIndices) - seenRadicalVariantIndices:
        for form in simplifiedOnlyForms(radicalIdx):
            print("No table entry for simplified radical %s with index %d'"
                % (form, radicalIdx))
            noEntryCount += 1

    for radicalIdx in radicalIndices:
        otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \
            - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        for form in otherVariants:
            print("No table entry for variant %s with index %d'"
                % (form, radicalIdx))
            noEntryCount += 1

    print("Total %d entries" % fileEntryCount \
        + ", %d missing from cjklib" % databaseMissingEntryCount \
        + ", %d mismatches in equivalent forms" % wrongEquivalentCount \
        + ", not found in source list: %d" % noEntryCount)
コード例 #38
0
ファイル: checkcjkradicals.py プロジェクト: KentVu/cjklib
def main():
    cjk = CharacterLookup('T')
    cjkSimplified = CharacterLookup('C')

    fileEntryCount = 0
    databaseMissingEntryCount = 0
    noEntryCount = 0
    wrongEquivalentCount = 0
    seenRadicalFormIndices = set()
    seenRadicalVariantIndices = set()
    for line in sys.stdin:
        line = line.decode(default_encoding)

        if re.match(r'\s*#', line) or re.match(r'\s+$', line):
            continue
        else:
            fileEntryCount = fileEntryCount + 1

            matchObj = re.match(r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});" \
                + r"\s+([1234567890ABCDEF]{4,5})\s*$", line)
            if matchObj:
                index, variant, radicalCP, equivalentCP = matchObj.groups()
                radicalIdx = int(index)
                radicalForm = unichr(int(radicalCP, 16))
                equivalentForm = unichr(int(equivalentCP, 16))

                if variant:
                    seenRadicalVariantIndices.add(radicalIdx)
                else:
                    seenRadicalFormIndices.add(radicalIdx)
                # check radicalForm
                if not variant:
                    targetForms = set([cjk.getKangxiRadicalForm(radicalIdx)])
                else:
                    targetForms = set()
                    # add simplified form, if different
                    simplifiedForm = cjkSimplified.getKangxiRadicalForm(
                        radicalIdx)
                    if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
                        targetForms.add(simplifiedForm)
                    # add simplified variant
                    targetForms.update(
                        set(cjkSimplified.getKangxiRadicalVariantForms(
                            radicalIdx)) \
                        - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))

                if radicalForm not in targetForms:
                    # cjklib is missing something
                    print ("No entry for radical form '%s' with index %d%s"
                        % (radicalForm, radicalIdx, variant))\
                        .encode(default_encoding)
                    databaseMissingEntryCount += 1
                if targetForms - set([radicalForm]):
                    # CJKRadicals.txt is missing something
                    for form in targetForms - set([radicalForm]):
                        print ("Database entry '%s' with radical index %d%s" \
                            % (form, radicalIdx, variant) \
                            + " not included in table")\
                            .encode(default_encoding)
                    noEntryCount += 1

                # check equivalentForm
                libraryEquivalentForm \
                    = cjk.getRadicalFormEquivalentCharacter(radicalForm)
                if libraryEquivalentForm != equivalentForm:
                    print ("Equivalent radical form '%s' with index %d%s"
                        % (libraryEquivalentForm, radicalIdx, variant) \
                        + " not backed by table: '%s'" % equivalentForm)\
                        .encode(default_encoding)
                    wrongEquivalentCount += 1

            else:
                print ("error reading line: '" + line + "'")\
                    .encode(default_encoding)


    for radicalIdx in set(range(1, 215)) - seenRadicalFormIndices:
        print ("No table entry for radical index %d" % radicalIdx)\
            .encode(default_encoding)
        noEntryCount += 1

    for radicalIdx in set(range(1, 215)) - seenRadicalVariantIndices:
        simplifiedForms = set()
        # add simplified form, if different
        simplifiedForm = cjkSimplified.getKangxiRadicalForm(
            radicalIdx)
        if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
            simplifiedForms.add(simplifiedForm)
        # add simplified variant
        simplifiedForms.update(
            set(cjkSimplified.getKangxiRadicalVariantForms(
                radicalIdx)) \
            - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))
        for form in simplifiedForms:
            print ("No table entry for simplified radical %s with index %d'"
                % (form, radicalIdx)).encode(default_encoding)
            noEntryCount += 1

    for radicalIdx in range(1, 215):
        otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \
            - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        for form in otherVariants:
            print ("No table entry for variant %s with index %d'"
                % (form, radicalIdx)).encode(default_encoding)
            noEntryCount += 1

    print "Total %d entries" % fileEntryCount \
        + ", %d missing from cjklib" % databaseMissingEntryCount \
        + ", %d mismatches in equivalent forms" % wrongEquivalentCount \
        + ", not found in source list: %d" % noEntryCount
コード例 #39
0
 def _characterLookup(cls):
     """
     Returns the character lookup stored on C{cls} as C{_cjk}, creating it
     for locale C{'T'} and domain C{'Unicode'} on first use.
     """
     if hasattr(cls, '_cjk'):
         return cls._cjk

     cls._cjk = CharacterLookup('T', 'Unicode')
     return cls._cjk
コード例 #40
0
    def _removePseudoCharacters(self, decompositionEntries, flagEntries):
        """
        Builds new decomposition and flag entries without pseudo characters.

        Pseudo characters are the entries keyed by an integer instead of a
        character. Their own entries are dropped and every occurrence inside
        another decomposition is replaced by the pseudo character's
        decompositions. Decompositions that reference an unknown pseudo
        character are left out and reported on stderr unless C{self.quiet}
        is set.

        @return: tuple of the new decomposition entries and the new flag
            entries, both mapping character -> glyph -> ...
        """
        # find pseudo characters first
        pseudoCharacterMap = {}
        for char in decompositionEntries:
            if type(char) is int:
                for glyph in decompositionEntries[char]:
                    pseudoCharacterMap[(char, glyph)] \
                        = decompositionEntries[char][glyph]

        def substitutePseudoCharacters(decomposition):
            # one list of alternatives per entry of the decomposition
            alternatives = []
            for component in decomposition:
                if type(component) is not tuple:
                    # IDS
                    alternatives.append([[component]])
                    continue

                char, _ = component
                if type(char) is not int:
                    # normal char
                    alternatives.append([[component]])
                    continue

                if component not in pseudoCharacterMap:
                    # unknown pseudo character, can't be resolved
                    return

                # get all decompositions of this pseudo character
                resolved = []
                for pseudoDecomposition in pseudoCharacterMap[component]:
                    resolved.extend(
                        substitutePseudoCharacters(pseudoDecomposition) or [])
                alternatives.append(resolved)

            # all combinations of sub-decompositions
            flattened = set()
            for combination in cross(*alternatives):
                flattened.add(
                    tuple(item for part in combination for item in part))
            return flattened

        # now apply
        newDecompositionsEntries = {}
        newFlagEntries = {}
        for char in decompositionEntries:
            if type(char) is int:
                continue

            glyphEntries = decompositionEntries[char]
            resolvedByGlyph = newDecompositionsEntries[char] = {}
            flagsByGlyph = newFlagEntries[char] = {}
            for glyph in glyphEntries:
                resolved = resolvedByGlyph[glyph] = set()
                resolvedFlags = flagsByGlyph[glyph] = {}
                for decomposition in glyphEntries[glyph]:
                    substitutes = substitutePseudoCharacters(decomposition)
                    if substitutes:
                        resolved.update(substitutes)
                        # transfer flags
                        flags = flagEntries[char][glyph][decomposition]
                        for substitute in substitutes:
                            resolvedFlags[substitute] = flags
                    elif not self.quiet:
                        message = "Unable to resolve decomposition" \
                            + " with pseudo character for '%s': " % char \
                            + CharacterLookup.decompositionToString(
                                decomposition)
                        print >>sys.stderr, message.encode(default_encoding)

        return newDecompositionsEntries, newFlagEntries
コード例 #41
0
        def compareTrees(decompositionA, decompositionB):
            """
            Compares two decomposition trees, tolerating unknown components
            and components replaced by their own decomposition.

            Returns C{None} if the trees differ. For similar trees an
            integer is returned: negative if the left tree (decompositionA)
            should be preferred, positive if the right tree (decompositionB)
            should be preferred and C{0} if both are equally good.

            Raises C{ValueError} if only one of the trees is exhausted.
            """
            if not decompositionA and not decompositionB:
                # both exhausted, equal
                return 0
            if not decompositionA or not decompositionB:
                # if all preceding components are the same that shouldn't happen
                raise ValueError()

            headA, headB = decompositionA[0], decompositionB[0]
            if headA == headB:
                return compareTrees(decompositionA[1:], decompositionB[1:])

            if type(headA) is tuple and headA[0] == u"?":
                # unknown component left, skip its counterpart on the right
                restB = consumeComponent(decompositionB)
                outcome = compareTrees(decompositionA[1:], restB)
                if outcome is None or outcome < 0:
                    # unequal or the left side is preferred later on
                    return None
                return +1

            if type(headB) is tuple and headB[0] == u"?":
                # unknown component right, skip its counterpart on the left
                restA = consumeComponent(decompositionA)
                outcome = compareTrees(restA, decompositionB[1:])
                if outcome is None or outcome > 0:
                    # unequal or the right side is preferred later on
                    return None
                return -1

            leftIsOperator = CharacterLookup.isIDSOperator(headA)
            if leftIsOperator and CharacterLookup.isIDSOperator(headB):
                # No way these decompositions can be equal
                #   (simplified subseq. checking)
                return None

            if leftIsOperator:
                # expand tree B
                char, glyph = headB
                if char not in decompositionEntries \
                    or glyph not in decompositionEntries[char]:
                    return None
                for expansion in decompositionEntries[char][glyph]:
                    outcome = compareTrees(decompositionA,
                        expansion + decompositionB[1:])
                    if outcome is not None and outcome >= 0:
                        # right side preferred and so do we...
                        #   A shorter description is better
                        return 1
                return None

            if CharacterLookup.isIDSOperator(headB):
                # expand tree A
                char, glyph = headA
                if char not in decompositionEntries \
                    or glyph not in decompositionEntries[char]:
                    return None
                for expansion in decompositionEntries[char][glyph]:
                    outcome = compareTrees(expansion + decompositionA[1:],
                        decompositionB)
                    if outcome is not None and outcome <= 0:
                        # left side preferred and so do we...
                        #   A shorter description is better
                        return -1
                return None

            # two different plain components
            return None
コード例 #42
0
 def __init__(self):
     """
     Creates a character lookup for locale C{'T'} and domain C{'Unicode'}
     and fetches its domain character iterator. The instance starts without
     a current character and with an empty glyph queue.
     """
     lookup = CharacterLookup('T', 'Unicode')
     self._cjk = lookup
     self.characterIterator = lookup.getDomainCharacterIterator()
     self.curChar, self.glyphQueue = None, []
コード例 #43
0
 def getCharacters(self):
     """
     Returns all characters of the character domain named by C{self.title}
     as one string, separated by single spaces.
     """
     domainCharacters = CharacterLookup(
         'T', self.title).getDomainCharacterIterator()
     return ' '.join(domainCharacters)
コード例 #44
0
    def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries):
        """
        Merges two decompositions, if they are the same, except:
            - one has an unknown component while the other doesn't,
            - one has a subtree that is the decomposition of the corresponding
              component of the other decomposition.

        Both arguments are changed in place and nothing is returned: the
        decomposition set of every character/glyph is replaced by the
        reduced set, and the flag entries of removed decompositions are
        deleted. If two decompositions are equally good, the flags of the
        removed one are added to the flags of the one that is kept.

        Pairs that can't be compared (C{ValueError}) are reported on stderr
        and left untouched.
        """

        def consumeComponent(decomposition):
            """
            Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
            consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

            Returns the rest of the decomposition after the first complete
            component. Nothing is returned explicitly if the first entry is
            neither a tuple nor a binary or trinary IDS operator.
            """
            if type(decomposition[0]) == type(()):
                # consume one component
                return decomposition[1:]

            # An operator consumes itself and then one component per operand
            if CharacterLookup.isBinaryIDSOperator(decomposition[0]):
                decomposition = consumeComponent(decomposition[1:])
                return consumeComponent(decomposition)
            elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]):
                decomposition = consumeComponent(decomposition[1:])
                decomposition = consumeComponent(decomposition)
                return consumeComponent(decomposition)

        def compareTrees(decompositionA, decompositionB):
            """
            Checks for similar decomposition trees, taking care of unknown
            components.

            Returns C{None} if the trees are not equal, an integer if the
            trees are similar. If the left tree (decompositionA) should be
            preferred a negative number is returned, or a positive number for
            the right tree (decompositionB). If C{0} is returned, both trees
            are equally good to choose from.

            Raises C{ValueError} if only one of the two trees is exhausted.
            """
            if not decompositionA and not decompositionB:
                # equal
                return 0
            elif not decompositionA or not decompositionB:
                # if all preceding components are the same that shouldn't happen
                raise ValueError()
            elif decompositionA[0] == decompositionB[0]:
                # same entry on both sides, compare the remainders
                return compareTrees(decompositionA[1:], decompositionB[1:])

            elif type(decompositionA[0]) == type(()) and decompositionA[0][0] == u"?":
                # unknown component on the left: skip the corresponding
                #   component on the right, which is the better description
                decompositionB = consumeComponent(decompositionB)
                result = compareTrees(decompositionA[1:], decompositionB)
                if result is None or result < 0:
                    # unequal or the left side is preferred later on
                    return None
                else:
                    return +1

            elif type(decompositionB[0]) == type(()) and decompositionB[0][0] == u"?":
                # unknown component on the right: mirror image of the case
                #   above
                decompositionA = consumeComponent(decompositionA)
                result = compareTrees(decompositionA, decompositionB[1:])
                if result is None or result > 0:
                    # unequal or the right side is preferred later on
                    return None
                else:
                    return -1

            elif CharacterLookup.isIDSOperator(decompositionA[0]) and CharacterLookup.isIDSOperator(decompositionB[0]):
                # No way these decompositions can be equal
                #   (simplified subseq. checking)
                return None

            elif CharacterLookup.isIDSOperator(decompositionA[0]):
                # expand tree B
                char, glyph = decompositionB[0]
                if char in decompositionEntries and glyph in decompositionEntries[char]:

                    # try every known decomposition of the component in B
                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(decompositionA, decomposition + decompositionB[1:])
                        if result is not None and result >= 0:
                            # right side preferred and so do we...
                            #   A shorter description is better
                            return 1

                return None

            elif CharacterLookup.isIDSOperator(decompositionB[0]):
                # expand tree A
                char, glyph = decompositionA[0]
                if char in decompositionEntries and glyph in decompositionEntries[char]:

                    # try every known decomposition of the component in A
                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(decomposition + decompositionA[1:], decompositionB)
                        if result is not None and result <= 0:
                            # left side preferred and so do we...
                            #   A shorter description is better
                            return -1
                return None
            else:
                # two different components, none of them an operator
                return None

        for char in decompositionEntries:
            for glyph in decompositionEntries[char]:
                idxA = 0
                # work on a list copy, entries get deleted by index below
                decompositions = list(decompositionEntries[char][glyph])
                flagsDict = flagEntries[char][glyph]
                # Check every decomposition with all others to the right
                while idxA < len(decompositions):
                    idxB = idxA + 1
                    while idxB < len(decompositions):
                        try:
                            result = compareTrees(decompositions[idxA], decompositions[idxB])
                            if result is not None and result == 0:
                                # Entries are equal, we can transfer flags
                                flagsDict[decompositions[idxA]].update(flagsDict[decompositions[idxB]])
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result < 0:
                                # left entry preferred, drop the right one;
                                #   idxB now points to the next candidate
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result > 0:
                                # right entry preferred, drop the left one;
                                #   idxA now points to the next entry
                                del flagsDict[decompositions[idxA]]
                                del decompositions[idxA]
                                # No need for further testing for this decomp
                                break
                            else:
                                # Only increase if the list didn't shift to the
                                #   left
                                idxB += 1
                        except ValueError:
                            # the pair can't be compared, report and skip it
                            print >>sys.stderr, (
                                "Error comparing decompositions %s and %s"
                                % (
                                    CharacterLookup.decompositionToString(decompositions[idxA]),
                                    CharacterLookup.decompositionToString(decompositions[idxB]),
                                )
                            ).encode(default_encoding)
                            idxB += 1
                    else:
                        # inner loop ended without break, i.e. the entry at
                        #   idxA was kept: go on with the next one
                        idxA += 1
                decompositionEntries[char][glyph] = set(decompositions)
コード例 #45
0
import requests
import codecs

import gevent
from gevent import monkey

# NOTE(review): the standard library is patched here after requests has
#   already been imported; gevent recommends patching as early as possible
#   - confirm this ordering is intended
monkey.patch_all()
from bs4 import BeautifulSoup

# Pinyin lookup for Chinese characters
from pypinyin import pinyin, lazy_pinyin, Style

# Stroke count lookup
from cjklib.characterlookup import CharacterLookup

# shared lookup instance, created for locale 'C'
cjk = CharacterLookup('C')

# Radical (component) lookup for Chinese characters
from lib.component import *

# Python 2 idiom to make UTF-8 the default string encoding (reload() is not
#   a builtin on Python 3).
# NOTE(review): sys is not imported in the visible lines - presumably it is
#   imported further up; confirm
reload(sys)
sys.setdefaultencoding("utf-8")

# Proxy configuration
proxies = {}

class BabyName():
    def __init__(self,
                 config={},
                 name_dict={},
コード例 #46
0
    def _removePseudoCharacters(self, decompositionEntries, flagEntries):
        """
        Removes all pseudo character entries and subsitutes their occurence
        by their own entries.
        """
        def substitutePseudoCharacters(decomposition):
            newDecomposition = []
            for c in decomposition:
                if type(c) != type(()):
                    # IDS
                    newDecomposition.append([[c]])
                else:
                    char, _ = c
                    if type(char) == type(0):
                        if c in pseudoCharacterMap:
                            # get all decompositions of this pseudo character
                            newPseudoDecomp = []
                            for decomp in pseudoCharacterMap[c]:
                                newDecomps = substitutePseudoCharacters(decomp)
                                if newDecomps:
                                    newPseudoDecomp.extend(newDecomps)
                            newDecomposition.append(newPseudoDecomp)
                        else:
                            return
                    else:
                        # normal char
                        newDecomposition.append([[c]])
            # all combinations of sub-decompositions
            flatDecomp = set()
            for newDecomp in cross(*newDecomposition):
                flatEntry = []
                for entry in newDecomp:
                    flatEntry.extend(entry)
                flatDecomp.add(tuple(flatEntry))
            return flatDecomp

        # find pseude characters first
        pseudoCharacterMap = {}
        for char in decompositionEntries:
            if type(char) == type(0):
                for glyph in decompositionEntries[char]:
                    pseudoCharacterMap[(char, glyph)] \
                        = decompositionEntries[char][glyph]

        # now apply
        newDecompositionsEntries = {}
        newFlagEntries = {}
        for char in decompositionEntries:
            if type(char) == type(0):
                continue
            newDecompositionsEntries[char] = {}
            newFlagEntries[char] = {}
            for glyph in decompositionEntries[char]:
                newDecompositionsEntries[char][glyph] = set()
                newFlagEntries[char][glyph] = {}
                for decomposition in decompositionEntries[char][glyph]:
                    newDecompositions = substitutePseudoCharacters(
                        decomposition)
                    if newDecompositions:
                        newDecompositionsEntries[char][glyph].update(
                            newDecompositions)
                        # transfer flags
                        for newDecomposition in newDecompositions:
                            newFlagEntries[char][glyph][newDecomposition] \
                                = flagEntries[char][glyph][decomposition]
                    elif not self.quiet:
                        print >> sys.stderr, ("Unable to resolve decomposition"
                            + " with pseudo character for '%s': " % char
                            + CharacterLookup.decompositionToString(
                                decomposition))\
                            .encode(default_encoding)

        return newDecompositionsEntries, newFlagEntries
コード例 #47
0
        def compareTrees(decompositionA, decompositionB):
            """
            Compares two decomposition trees given in flat form, allowing for
            unknown components (C{u'?'}) and for components that one side
            spells out further than the other.

            @param decompositionA: left decomposition
            @param decompositionB: right decomposition
            @return: C{None} if the trees do not match. Otherwise an integer:
                negative if the left tree (decompositionA) should be
                preferred, positive if the right tree (decompositionB) should
                be preferred, and C{0} if both are equally good to choose
                from.
            @raise ValueError: if only one of the two decompositions is
                exhausted
            """
            if not decompositionA and not decompositionB:
                # both exhausted at the same time: equal
                return 0
            if not decompositionA or not decompositionB:
                # can't happen as long as all preceding components matched
                raise ValueError()

            headA, tailA = decompositionA[0], decompositionA[1:]
            headB, tailB = decompositionB[0], decompositionB[1:]

            if headA == headB:
                return compareTrees(tailA, tailB)

            if type(headA) is tuple and headA[0] == u'?':
                # Unknown component on the left: let consumeComponent() strip
                #   its counterpart from the right side, then go on
                outcome = compareTrees(tailA, consumeComponent(decompositionB))
                if outcome is None or outcome < 0:
                    # unequal or the left side is preferred later on
                    return None
                return +1

            if type(headB) is tuple and headB[0] == u'?':
                # Unknown component on the right: mirror image of the above
                outcome = compareTrees(consumeComponent(decompositionA), tailB)
                if outcome is None or outcome > 0:
                    # unequal or the right side is preferred later on
                    return None
                return -1

            operatorA = CharacterLookup.isIDSOperator(headA)
            operatorB = CharacterLookup.isIDSOperator(headB)

            if operatorA and operatorB:
                # Two different operators, the decompositions can't be equal
                #   (simplified subsequent checking)
                return None

            if operatorA:
                # Left side goes into more detail, expand the leading
                #   component of the right side with its known decompositions
                char, glyph = headB
                if (char in decompositionEntries
                        and glyph in decompositionEntries[char]):
                    for expansion in decompositionEntries[char][glyph]:
                        outcome = compareTrees(
                            decompositionA, expansion + tailB)
                        if outcome is not None and outcome >= 0:
                            # right side preferred and so do we...
                            #   A shorter description is better
                            return 1
                return None

            if operatorB:
                # Right side goes into more detail, expand the leading
                #   component of the left side with its known decompositions
                char, glyph = headA
                if (char in decompositionEntries
                        and glyph in decompositionEntries[char]):
                    for expansion in decompositionEntries[char][glyph]:
                        outcome = compareTrees(
                            expansion + tailA, decompositionB)
                        if outcome is not None and outcome <= 0:
                            # left side preferred and so do we...
                            #   A shorter description is better
                            return -1
                return None

            # two different plain components
            return None
コード例 #48
0
 def __init__(self, variant='T'):
     self.characterLookup = CharacterLookup('T')
     self.variant = variant