Exemple #1
0
    def testInitialization(self):
        """
        Test which constructor arguments ``CharacterLookup`` accepts.

        The known locales, the default database connector, the ``'Unicode'``
        domain and a custom domain backed by a ``<domain>Set`` table with a
        ``ChineseCharacter`` column have to be accepted. An unknown locale, a
        domain whose table is mocked as missing and a domain whose table
        lacks the ``ChineseCharacter`` column have to raise ``ValueError``.
        """
        # test if locales are accepted
        for locale in 'TCJKV':
            characterlookup.CharacterLookup(locale, dbConnectInst=self.db)

        # test if locale is rejected
        self.assertRaises(ValueError,
                          characterlookup.CharacterLookup,
                          'F',
                          dbConnectInst=self.db)

        # test default database connector
        characterlookup.CharacterLookup('T')

        # test if character domain 'Unicode' is accepted
        characterlookup.CharacterLookup('T', 'Unicode', dbConnectInst=self.db)

        # test if character domain is accepted: its table is mocked as
        # existing and carries the 'ChineseCharacter' column
        from sqlalchemy import Table, Column, String
        domain = 'MyDomain'
        tableObj = Table(domain + 'Set',
                         self.db.metadata,
                         Column('ChineseCharacter', String),
                         useexisting=True)
        try:
            mydb = DatabaseConnectorMock(self.db,
                                         mockTables=[domain + 'Set'],
                                         mockTableDefinition=[tableObj])
            characterlookup.CharacterLookup('T', domain, dbConnectInst=mydb)
        finally:
            # unregister the mock table even if the check above fails, so it
            # cannot leak into the shared metadata used by other tests
            self.db.metadata.remove(tableObj)

        # test if character domain is rejected: its table is mocked as
        # not existing
        domain = 'MyDomain'
        mydb = DatabaseConnectorMock(self.db, mockNonTables=[domain + 'Set'])
        self.assertRaises(ValueError,
                          characterlookup.CharacterLookup,
                          'T',
                          domain,
                          dbConnectInst=mydb)

        # test if character domain is rejected: its table exists but has no
        # 'ChineseCharacter' column
        domain = 'MyOtherDomain'
        tableObj = Table(domain + 'Set',
                         self.db.metadata,
                         Column('SomeColumn', String),
                         useexisting=True)
        try:
            mydb = DatabaseConnectorMock(self.db,
                                         mockTables=[domain + 'Set'],
                                         mockTableDefinition=[tableObj])
            self.assertRaises(ValueError,
                              characterlookup.CharacterLookup,
                              'T',
                              domain,
                              dbConnectInst=mydb)
        finally:
            # same cleanup guarantee as for the accepted domain above
            self.db.metadata.remove(tableObj)
Exemple #2
0
def chunk_gen(text, sub=' '):
    """
    Generate the characters of ``text`` one by one in a normalised form.

    Characters are first passed through
    ``base_standardizer.remove_unwanted_gen``; whatever comes back equal to
    ``sub`` is yielded untouched. Every other character is replaced by the
    smallest string among itself and those of its variants that are typed
    'P' or 'M'.

    :param text: input text
    :param sub: thing to substitute for unwanted characters
    :return: generator
    """
    # Only these variant types may replace a character
    accepted_types = {'P', 'M'}

    # Lookup characters in chinese locale
    cjk = characterlookup.CharacterLookup(locale='C')

    for symbol in base_standardizer.remove_unwanted_gen(text, sub):
        if symbol == sub:
            yield symbol
            continue

        # see https://github.com/cburgmer/cjklib/blob/3faf249e1416ed5dca4d7b9a3341400bf64a9e50/cjklib/characterlookup.py
        # A single call fetches all variant types at once; the list is
        # empty if the character is not found.
        candidates = cjk.getAllCharacterVariants(symbol)
        # The character itself always takes part, tagged with type 'M'
        candidates.append((symbol, 'M'))

        yield min(entry[0] for entry in candidates
                  if entry[1] in accepted_types)
 def testDomainCharsAccepted(self):
     """
     Test if all characters in the character domain are accepted.

     For every available character domain a ``CharacterLookup`` for that
     domain is created, and each character yielded by its
     ``getDomainCharacterIterator()`` has to pass ``isCharacterInDomain()``.
     """
     for domain in self.characterLookup.getAvailableCharacterDomains():
         characterLookupDomain = characterlookup.CharacterLookup(
             'T', domain, dbConnectInst=self.db)
         for char in characterLookupDomain.getDomainCharacterIterator():
             # assertTrue replaces the deprecated assert_ alias, which was
             # removed from unittest in Python 3.12
             self.assertTrue(
                 characterLookupDomain.isCharacterInDomain(char))
Exemple #4
0
    def getCharacterLookupInst(self, options):
        """
        Return the ``CharacterLookup`` instance belonging to ``options``.

        Instances are created on first request and kept in the per-object
        dictionary ``_instanceDict``, so repeated calls with equal options
        return the same instance.

        :param options: positional arguments handed to ``CharacterLookup``;
            also used as the dictionary key, so it has to be hashable
        :return: the (possibly cached) ``CharacterLookup`` instance
        """
        try:
            cache = self._instanceDict
        except AttributeError:
            # first call on this object: create the cache lazily
            cache = self._instanceDict = {}

        if options in cache:
            return cache[options]

        instance = characterlookup.CharacterLookup(*options,
                                                   dbConnectInst=self.db)
        cache[options] = instance
        return instance
 def is_subchar(self, char, subchar):
     """
     Tell whether ``subchar`` occurs in the first decomposition of ``char``.

     Only the first entry returned by ``getDecompositionEntries`` is
     inspected. Its leading element is skipped and ``subchar`` is compared
     with the first item of each remaining element.

     :param char: character to decompose
     :param subchar: UTF-8 encoded byte string; decoded before comparing
     :return: True if found, False if not found or no decomposition exists
     """
     import cjklib.characterlookup as cl

     entries = cl.CharacterLookup('C').getDecompositionEntries(char)
     if not entries:
         return False

     components = entries[0][1:]
     wanted = subchar.decode('utf-8')
     return wanted in [part[0] for part in components]
Exemple #6
0
 def testFilterIdentityOnSelf(self):
     """
     Test if ``filterDomainCharacters`` returns the characters of a domain
     unchanged when they are filtered by that very domain.
     """
     availableDomains = self.characterLookup.getAvailableCharacterDomains()
     for domainName in availableDomains:
         domainLookup = characterlookup.CharacterLookup(
             'T', domainName, dbConnectInst=self.db)
         # materialise the iterator once so it can be filtered and compared
         expected = list(domainLookup.getDomainCharacterIterator())
         filtered = domainLookup.filterDomainCharacters(expected)
         self.assertTrue(expected == filtered)
Exemple #7
0
 def testCharacterDomainInUnicode(self):
     """
     Test if the characters of every available domain survive filtering by
     the test's own ``characterLookup`` instance unchanged, i.e. each
     domain is included in that instance's domain.
     """
     availableDomains = self.characterLookup.getAvailableCharacterDomains()
     for domainName in availableDomains:
         domainLookup = characterlookup.CharacterLookup(
             'T', domainName, dbConnectInst=self.db)
         # materialise the iterator once so it can be filtered and compared
         expected = list(domainLookup.getDomainCharacterIterator())
         filtered = self.characterLookup.filterDomainCharacters(expected)
         self.assertTrue(expected == filtered)
Exemple #8
0
def getCJK():
    """
    Returns the module-wide L{CharacterLookup} instance, creating it with
    locale C{'T'} on first use and storing it in the global C{_cjk}.

    @rtype: object
    @return: an instance of the L{CharacterLookup} object
    """
    global _cjk
    if _cjk:
        # already created by an earlier call
        return _cjk
    _cjk = characterlookup.CharacterLookup('T')
    return _cjk
Exemple #9
0
    def testAvailableCharacterDomains(self):
        """
        Test if ``getAvailableCharacterDomains()`` returns proper domains.

        ``'Unicode'`` always has to be reported. A custom domain has to be
        reported only if its ``<domain>Set`` table exists and carries a
        ``ChineseCharacter`` column.
        """
        # test default domain
        self.assertTrue(
            'Unicode' in self.characterLookup.getAvailableCharacterDomains())

        # test provided domain: its table is mocked as existing and carries
        # the 'ChineseCharacter' column
        from sqlalchemy import Table, Column, String
        domain = 'MyDomain'
        tableObj = Table(domain + 'Set',
                         self.db.metadata,
                         Column('ChineseCharacter', String),
                         useexisting=True)
        try:
            mydb = DatabaseConnectorMock(self.db,
                                         mockTables=[domain + 'Set'],
                                         mockTableDefinition=[tableObj])
            cjk = characterlookup.CharacterLookup('T', dbConnectInst=mydb)
            self.assertTrue(domain in cjk.getAvailableCharacterDomains())
        finally:
            # unregister the mock table even if the check above fails, so it
            # cannot leak into the shared metadata used by other tests
            self.db.metadata.remove(tableObj)

        # test domain not included: its table is mocked as not existing
        domain = 'MyDomain'
        mydb = DatabaseConnectorMock(self.db, mockNonTables=[domain + 'Set'])
        cjk = characterlookup.CharacterLookup('T', dbConnectInst=mydb)
        self.assertTrue(domain not in cjk.getAvailableCharacterDomains())

        # test domain not included: its table exists but has no
        # 'ChineseCharacter' column
        domain = 'MyOtherDomain'
        tableObj = Table(domain + 'Set',
                         self.db.metadata,
                         Column('SomeColumn', String),
                         useexisting=True)
        try:
            mydb = DatabaseConnectorMock(self.db,
                                         mockTables=[domain + 'Set'],
                                         mockTableDefinition=[tableObj])
            cjk = characterlookup.CharacterLookup('T', dbConnectInst=mydb)
            self.assertTrue(domain not in cjk.getAvailableCharacterDomains())
        finally:
            # same cleanup guarantee as for the provided domain above
            self.db.metadata.remove(tableObj)
Exemple #10
0
def characterIsSimpTrad(c, simpTrad):
    """
    Guess whether the character ``c`` belongs to the requested locale.

    ``simpTrad == 0`` selects "C" as the requested code and "T" as the other
    one; any other value selects the reverse.

    Returns True if ``c`` has variants for the other code, or if it has no
    variants for either code; returns False otherwise.
    """
    from db import database
    from cjklib import characterlookup

    if simpTrad == 0:
        thislocale, otherlocale = "C", "T"
    else:
        thislocale, otherlocale = "T", "C"

    # NB: it is unclear whether the locale passed here makes any difference.
    # NOTE(review): the same codes are used both as the CharacterLookup locale
    # and as the variant type for getCharacterVariants -- confirm that both
    # APIs interpret them the same way.
    clookup = characterlookup.CharacterLookup(thislocale,
                                              dbConnectInst=database())

    # Find all the variants of this character for the relevant locales
    othervariants = clookup.getCharacterVariants(c, otherlocale)
    thisvariants = clookup.getCharacterVariants(c, thislocale)

    # Any variant in the other locale suggests the character is written in
    # the requested one.
    if len(othervariants) != 0:
        return True

    # Without data in either direction, fall back to assuming the requested
    # locale; variants only in the requested locale mean it is not.
    return len(thisvariants) == 0
Exemple #11
0
 def testStrokeOrderMatchesStrokeCount(self):
     """
     Tests if the length of the stroke order returned by ``getStrokeOrder``
     equals the stroke count returned by ``getStrokeCount`` for every
     character of the ``'GlyphInformation'`` domain. Characters for which
     either lookup raises ``NoInformationError`` are skipped.
     """
     lookup = characterlookup.CharacterLookup(
         'T', 'GlyphInformation', dbConnectInst=self.db)

     for char in lookup.getDomainCharacterIterator():
         try:
             order = lookup.getStrokeOrder(char, includePartial=True)
             count = lookup.getStrokeCount(char)
         except exception.NoInformationError:
             # no stroke data for this character, nothing to compare
             continue

         message = ("Stroke count %d does not match stroke order (%d)" %
                    (count, len(order)) +
                    " for character '%s'" % char)
         self.assertTrue(len(order) == count, message)
Exemple #12
0
 def __init__(self):
     """Create the ``CharacterLookup`` (locale 'T') kept in ``self.cjk``."""
     lookup = characterlookup.CharacterLookup('T')
     self.cjk = lookup
Exemple #13
0
def test_character_lookup():
    """Check the decomposition entries reported for '兴' with locale 'C'."""
    from cjklib import characterlookup

    expected = [['⿳', ('⺍', 0), ('一', 0), ('八', 2)]]
    actual = characterlookup.CharacterLookup('C').getDecompositionEntries('兴')
    assert actual == expected

def pinyin_re_sub():
    """
    Build the regular expression source that matches one pinyin syllable.

    The pattern is either an initial followed by a final, or a standalone
    syllable introduced by an apostrophe. Alternatives are tried in the
    order in which they are listed.

    :return: the pattern as a string (not compiled)
    """
    standalones = u"'[āáǎàa]ng|'[ēéěèe]ng|'[ēéěèe]r|'[āáǎàa]i|'[ēéěèe]i|'[āáǎàa]o|'[ōóǒòo]u|'[āáǎàa]n|'[ēéěèe]n|'[āáǎàa]|'[ēéěèe]|'[ōóǒòo]"
    finals = u"i[ōóǒòo]ng|[ūúǔùu]ng|[āáǎàa]ng|[ēéěèe]ng|i[āɑ̄áɑ́ɑ́ǎɑ̌àɑ̀aāáǎàa]ng|[īíǐìi]ng|i[āáǎàa]n|u[āáǎàa]n|[ōóǒòo]ng|[ēéěèe]r|i[āáǎàa]|i[ēéěèe]|i[āáǎàa]o|i[ūúǔùu]|[īíǐìi]n|u[āáǎàa]|u[ōóǒòo]|u[āáǎàa]i|u[īíǐìi]|[ūúǔùu]n|u[ēéěèe]|ü[ēéěèe]|v[ēéěèe]|i[ōóǒòo]|[āáǎàa]i|[ēéěèe]i|[āáǎàa]o|[ōóǒòo]u|[āáǎàa]n|[ēéěèe]n|[āáǎàa]|[ēéěèe]|[ōóǒòo]|[īíǐìi]|[ūúǔùu]|[ǖǘǚǜüv]"
    inits = u"zh|sh|ch|[bpmfdtnlgkhjqxrzscwy]"

    # (initial)(final) | (standalone), wrapped in one outer group
    return "((%s)(%s)|(%s))" % (inits, finals, standalones)


# Pattern source for a single pinyin syllable.
pinyin_re = pinyin_re_sub()

# Two directly adjacent pinyin syllables, captured as the named groups "one"
# and "two"; matching ignores case.
pinyin_two_re = re.compile(
    "".join(["(?P<one>", pinyin_re, ")(?P<two>", pinyin_re, ")"]),
    flags=re.I)

try:
    # The locale is one of TCJKV (Taiwan, China, Japan, Korea, Vietnam).
    # It is unclear what difference it actually makes here.
    characterLookup = characterlookup.CharacterLookup('C')
except Exception:
    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not mistaken for a cjklib failure.
    #
    # Mornir's bug (Issue #29): on Windows, cjklib will fail if the user path
    # contains special characters (e.g. the profile name contains an accent).
    from aqt.utils import showInfo
    showInfo(
        '<b>Chinese Support Add-on</b> seems to be experiencing Mornir\'s bug. Please refer to <a href="https://github.com/ttempe/chinese-support-addon/wiki/Mornir%27s-bug">this help page</a> to solve the issue.'
    )
    # Run the constructor again so that the actual error surfaces after the
    # help message has been shown.
    characterLookup = characterlookup.CharacterLookup('C')

# Maps the modifier characters ˊ, ˇ, ˋ and ˙ to the digit strings "2" to "5"
# (presumably bopomofo tone marks to tone numbers, going by the name).
bopomofo_notes = {u"ˊ": "2", u"ˇ": "3", u"ˋ": "4", u"˙": "5"}


def extract_sound_tags(text):
    sound_tags = re.findall(r"\[sound:.*?\]", text)
Exemple #15
0
 def setUp(self):
     """
     Run the ``NeedsDatabaseTest`` set-up, then create the
     ``CharacterLookup`` (locale 'T') on the test database connection and
     store it as ``self.characterLookup``.
     """
     NeedsDatabaseTest.setUp(self)
     lookup = characterlookup.CharacterLookup('T', dbConnectInst=self.db)
     self.characterLookup = lookup
                decompositionTable.c.Glyph
            ],
                   decompositionTable.c.ChineseCharacter.in_(
                       select([charsetTable.c.ChineseCharacter])),
                   distinct=True),
            select([
                strokeOrderTable.c.ChineseCharacter, strokeOrderTable.c.Glyph
            ],
                   strokeOrderTable.c.ChineseCharacter.in_(
                       select([charsetTable.c.ChineseCharacter])),
                   distinct=True))))
"""Queue of characters needed to be checked."""
# Filled below: keys are (char, glyph) tuples, values are whatever
# getDecompositionEntries() returned for that pair.
characterDecomposition = {}
"""Mapping of character to its decomposition(s)."""

cjk = characterlookup.CharacterLookup('T')

# get mappings
# Iterate over a copy, because entries without a decomposition are removed
# from characterQueue inside the loop.
for char, glyph in characterQueue.copy():
    decompositions = cjk.getDecompositionEntries(char, glyph=glyph)
    if decompositions:
        characterDecomposition[(char, glyph)] = decompositions
    else:
        # nothing to decompose: drop the entry from the queue and record
        # the character as a minimal basic component
        characterQueue.remove((char, glyph))
        minimalBasicComponents.add(char)

# process queue
while characterQueue:
    for charEntry in characterQueue.copy():
        fullyDecomposed = True
        for decomposition in characterDecomposition[charEntry]:
Exemple #17
0
import sys

from cjklib import characterlookup


# Maps the romanisation command line argument to the (reading, toneMarkType)
# params of CharacterLookup.getReadingForCharacter().
# Note that the 'CantoneseJyutping' argument maps to the reading name
# 'Jyutping', and that it is only offered with tone mark type 'numbers'.
rom_param_map = {
    'Pinyin': ('Pinyin', 'diacritics'),
    'PinyinNum': ('Pinyin', 'numbers'),
    'CantoneseYale': ('CantoneseYale', 'diacritics'),
    'CantoneseYaleNum': ('CantoneseYale', 'numbers'),
    'CantoneseJyutping': ('Jyutping', 'numbers'),
}

# Module-wide lookup instance, created once at import time with locale 'C'.
CharLookup = characterlookup.CharacterLookup('C')

def subtitle_line(line, romanisation):
    """
    Subtitles the given line of Chinese text using the given romanisation.
    Returns a tuple where the first element is the list of Chinese characters
    and the second element is the list of corresponding romanisations.
    """
    if len(line)<=0:
        return None
    zh_chars = []
    rom_chars = []
    for ch in line:
        param = rom_param_map[romanisation]
        rom = CharLookup.getReadingForCharacter(ch, param[0], toneMarkType=param[1])
        zh_chars.append(ch)