def testInitialization(self):
    """
    Test initialisation.

    Checks which locales and character domains ``CharacterLookup``
    accepts on construction. Temporary table definitions registered on the
    shared ``self.db.metadata`` are removed in ``finally`` clauses, so a
    failing check cannot leave them behind for other tests.
    """
    from sqlalchemy import Table, Column, String

    # test if locales are accepted
    for locale in 'TCJKV':
        characterlookup.CharacterLookup(locale, dbConnectInst=self.db)

    # test if locale is rejected
    self.assertRaises(ValueError, characterlookup.CharacterLookup, 'F',
        dbConnectInst=self.db)

    # test default database connector
    characterlookup.CharacterLookup('T')

    # test if character domain 'Unicode' is accepted
    characterlookup.CharacterLookup('T', 'Unicode', dbConnectInst=self.db)

    # test if character domain is accepted: the mocked table carries a
    # 'ChineseCharacter' column
    domain = 'MyDomain'
    tableObj = Table(domain + 'Set', self.db.metadata,
        Column('ChineseCharacter', String), useexisting=True)
    try:
        mydb = DatabaseConnectorMock(self.db, mockTables=[domain + 'Set'],
            mockTableDefinition=[tableObj])
        characterlookup.CharacterLookup('T', domain, dbConnectInst=mydb)
    finally:
        # unregister the table even if the constructor raised
        self.db.metadata.remove(tableObj)

    # test if character domain is rejected: the table is listed under
    # mockNonTables (presumably reported as missing by the mock)
    domain = 'MyDomain'
    mydb = DatabaseConnectorMock(self.db, mockNonTables=[domain + 'Set'])
    self.assertRaises(ValueError, characterlookup.CharacterLookup, 'T',
        domain, dbConnectInst=mydb)

    # test if character domain is rejected: the mocked table has no
    # 'ChineseCharacter' column
    domain = 'MyOtherDomain'
    tableObj = Table(domain + 'Set', self.db.metadata,
        Column('SomeColumn', String), useexisting=True)
    try:
        mydb = DatabaseConnectorMock(self.db, mockTables=[domain + 'Set'],
            mockTableDefinition=[tableObj])
        self.assertRaises(ValueError, characterlookup.CharacterLookup, 'T',
            domain, dbConnectInst=mydb)
    finally:
        # unregister the table even if the assertion failed
        self.db.metadata.remove(tableObj)
def chunk_gen(text, sub=' '):
    """
    Yield the characters of ``text`` one by one in standardised form.

    Characters come from ``base_standardizer.remove_unwanted_gen``; any
    character equal to ``sub`` is passed through unchanged. Every other
    character is replaced by the smallest (by string comparison) of itself
    and those of its variants whose type is 'P' or 'M'.

    :param text: input text
    :param sub: thing to substitute for unwanted characters
    :return: generator
    """
    # Only variants of these types compete with the character itself
    wanted_types = {'P', 'M'}

    # Lookup characters in chinese locale
    lookup = characterlookup.CharacterLookup(locale='C')

    for char in base_standardizer.remove_unwanted_gen(text, sub):
        if char == sub:
            yield char
            continue

        # see https://github.com/cburgmer/cjklib/blob/3faf249e1416ed5dca4d7b9a3341400bf64a9e50/cjklib/characterlookup.py
        # A single call fetches every variant (one db hit); the result is
        # an empty list if the character is not found.
        found = lookup.getAllCharacterVariants(char)
        candidates = [entry[0] for entry in found if entry[1] in wanted_types]
        # The character itself always takes part in the comparison
        candidates.append(char)
        yield min(candidates)
def testDomainCharsAccepted(self):
    """
    Test if all characters in the character domain are accepted.

    For every domain reported by ``getAvailableCharacterDomains()`` a
    ``CharacterLookup`` restricted to that domain is created, and each
    character from its domain iterator must pass ``isCharacterInDomain()``.
    """
    for domain in self.characterLookup.getAvailableCharacterDomains():
        characterLookupDomain = characterlookup.CharacterLookup(
            'T', domain, dbConnectInst=self.db)
        for char in characterLookupDomain.getDomainCharacterIterator():
            # assertTrue instead of the deprecated assert_ alias
            self.assertTrue(
                characterLookupDomain.isCharacterInDomain(char))
def getCharacterLookupInst(self, options):
    """
    Return a ``CharacterLookup`` instance for ``options``, creating it once.

    Instances are kept in ``self._instanceDict`` keyed by ``options``, so
    ``options`` must be hashable. It is unpacked as the positional
    arguments of ``CharacterLookup``; ``self.db`` is passed as
    ``dbConnectInst``.
    """
    # Create the cache lazily on first use
    try:
        cache = self._instanceDict
    except AttributeError:
        cache = self._instanceDict = {}

    try:
        return cache[options]
    except KeyError:
        pass

    instance = characterlookup.CharacterLookup(
        dbConnectInst=self.db, *options)
    cache[options] = instance
    return instance
def is_subchar(self, char, subchar):
    """
    Tell whether ``subchar`` appears in the first decomposition of ``char``.

    Only the first entry returned by ``getDecompositionEntries(char)`` is
    inspected, and its leading element is skipped. ``subchar`` is compared
    with the first item of each remaining element.

    ``subchar`` may be a UTF-8 encoded byte string (the historic calling
    convention) or an already decoded text string.

    Returns ``False`` when no decomposition is available for ``char``.
    """
    import cjklib.characterlookup as cl

    # Decode only byte strings; text input is used as given
    if isinstance(subchar, bytes):
        subchar = subchar.decode('utf-8')

    cjk = cl.CharacterLookup('C')
    decomp = cjk.getDecompositionEntries(char)
    if not decomp:
        return False

    # Skip the leading element of the first decomposition
    subchars = decomp[0][1:]
    return subchar in [x[0] for x in subchars]
def testFilterIdentityOnSelf(self):
    """
    Test that ``filterDomainCharacters`` leaves a domain's own characters
    untouched.

    For each available domain, the characters from that domain's iterator
    are filtered by a lookup for the same domain; the result must equal
    the input list.
    """
    availableDomains = self.characterLookup.getAvailableCharacterDomains()
    for domainName in availableDomains:
        lookup = characterlookup.CharacterLookup(
            'T', domainName, dbConnectInst=self.db)
        chars = list(lookup.getDomainCharacterIterator())
        filtered = lookup.filterDomainCharacters(chars)
        self.assertTrue(chars == filtered)
def testCharacterDomainInUnicode(self):
    """
    Test that every character domain is contained in the domain of
    ``self.characterLookup``.

    For each available domain, the characters from that domain's iterator
    are filtered by ``self.characterLookup``; the result must equal the
    input list.
    """
    availableDomains = self.characterLookup.getAvailableCharacterDomains()
    for domainName in availableDomains:
        lookup = characterlookup.CharacterLookup(
            'T', domainName, dbConnectInst=self.db)
        chars = list(lookup.getDomainCharacterIterator())
        # Filter with the fixture's lookup, not the per-domain one
        filtered = self.characterLookup.filterDomainCharacters(chars)
        self.assertTrue(chars == filtered)
def getCJK():
    """
    Returns the module-wide L{CharacterLookup} object, creating it with
    locale C{'T'} on first use (or whenever the stored value is false).

    @rtype: object
    @return: an instance of the L{CharacterLookup} object
    """
    global _cjk
    if _cjk:
        return _cjk
    _cjk = characterlookup.CharacterLookup('T')
    return _cjk
def testAvailableCharacterDomains(self):
    """
    Test if ``getAvailableCharacterDomains()`` returns proper domains.

    Temporary table definitions registered on the shared
    ``self.db.metadata`` are removed in ``finally`` clauses, so a failing
    assertion cannot leave them behind for other tests.
    """
    from sqlalchemy import Table, Column, String

    # test default domain
    self.assertTrue('Unicode' \
        in self.characterLookup.getAvailableCharacterDomains())

    # test provided domain: the mocked table carries a 'ChineseCharacter'
    # column
    domain = 'MyDomain'
    tableObj = Table(domain + 'Set', self.db.metadata,
        Column('ChineseCharacter', String), useexisting=True)
    try:
        mydb = DatabaseConnectorMock(self.db, mockTables=[domain + 'Set'],
            mockTableDefinition=[tableObj])
        cjk = characterlookup.CharacterLookup('T', dbConnectInst=mydb)
        self.assertTrue(domain in cjk.getAvailableCharacterDomains())
    finally:
        # unregister the table even if the assertion failed
        self.db.metadata.remove(tableObj)

    # test domain not included: the table is listed under mockNonTables
    # (presumably reported as missing by the mock)
    domain = 'MyDomain'
    mydb = DatabaseConnectorMock(self.db, mockNonTables=[domain + 'Set'])
    cjk = characterlookup.CharacterLookup('T', dbConnectInst=mydb)
    self.assertTrue(domain not in cjk.getAvailableCharacterDomains())

    # test domain not included: the mocked table has no 'ChineseCharacter'
    # column
    domain = 'MyOtherDomain'
    tableObj = Table(domain + 'Set', self.db.metadata,
        Column('SomeColumn', String), useexisting=True)
    try:
        mydb = DatabaseConnectorMock(self.db, mockTables=[domain + 'Set'],
            mockTableDefinition=[tableObj])
        cjk = characterlookup.CharacterLookup('T', dbConnectInst=mydb)
        self.assertTrue(domain not in cjk.getAvailableCharacterDomains())
    finally:
        # unregister the table even if the assertion failed
        self.db.metadata.remove(tableObj)
def characterIsSimpTrad(c, simpTrad):
    """
    Guess whether character ``c`` belongs to the requested script.

    ``simpTrad == 0`` selects "C" as the requested locale and "T" as the
    other one; any other value selects the reverse (presumably
    0 = simplified, otherwise traditional -- confirm against callers).

    Returns True if ``c`` has at least one variant for the other locale,
    or if it has no variants for the requested locale either.
    """
    from db import database
    from cjklib import characterlookup

    if simpTrad == 0:
        thislocale, otherlocale = "C", "T"
    else:
        thislocale, otherlocale = "T", "C"

    # NB: not sure that thislocale actually makes any difference here
    clookup = characterlookup.CharacterLookup(
        thislocale, dbConnectInst=database())

    # Find all the variants of this character for the relevant locales
    othervariants = clookup.getCharacterVariants(c, otherlocale)
    thisvariants = clookup.getCharacterVariants(c, thislocale)

    # If there are any variants at all, guess that we must have a character
    # in the original locale.
    if len(othervariants) != 0:
        return True

    # To deal nicely with situations where we lack data, guess that things
    # are in the requested locale if we *also* don't have any versions of
    # them in the original locale.
    return len(thisvariants) == 0
def testStrokeOrderMatchesStrokeCount(self):
    """
    Test that the length of the stroke order from ``getStrokeOrder``
    equals the number given by ``getStrokeCount``.

    Runs over all characters of the 'GlyphInformation' domain; characters
    for which either lookup raises ``NoInformationError`` are skipped.
    """
    cjk = characterlookup.CharacterLookup('T', 'GlyphInformation',
        dbConnectInst=self.db)

    for char in cjk.getDomainCharacterIterator():
        try:
            order = cjk.getStrokeOrder(char, includePartial=True)
            count = cjk.getStrokeCount(char)
        except exception.NoInformationError:
            # no data for this character, nothing to compare
            continue

        message = "Stroke count %d does not match stroke order (%d)" \
            % (count, len(order)) \
            + " for character '%s'" % char
        self.assertTrue(len(order) == count, message)
def __init__(self):
    """Create a ``CharacterLookup`` for locale 'T' and keep it as ``self.cjk``."""
    lookup = characterlookup.CharacterLookup('T')
    self.cjk = lookup
def test_character_lookup():
    """Check the decomposition entries returned for '兴' under locale 'C'."""
    from cjklib import characterlookup

    expected = [['⿳', ('⺍', 0), ('一', 0), ('八', 2)]]
    decomposition = characterlookup.CharacterLookup(
        'C').getDecompositionEntries('兴')
    assert decomposition == expected
def pinyin_re_sub():
    """
    Return the regular-expression source matching a single pinyin syllable.

    The pattern has the shape ``((inits)(finals)|(standalones))``: either an
    initial followed by a final, or one of the apostrophe-prefixed
    syllables that have no initial. Tone-marked and unmarked vowels are
    both covered by the character classes.
    """
    inits = u"zh|sh|ch|[bpmfdtnlgkhjqxrzscwy]"
    finals = u"i[ōóǒòo]ng|[ūúǔùu]ng|[āáǎàa]ng|[ēéěèe]ng|i[āɑ̄áɑ́ɑ́ǎɑ̌àɑ̀aāáǎàa]ng|[īíǐìi]ng|i[āáǎàa]n|u[āáǎàa]n|[ōóǒòo]ng|[ēéěèe]r|i[āáǎàa]|i[ēéěèe]|i[āáǎàa]o|i[ūúǔùu]|[īíǐìi]n|u[āáǎàa]|u[ōóǒòo]|u[āáǎàa]i|u[īíǐìi]|[ūúǔùu]n|u[ēéěèe]|ü[ēéěèe]|v[ēéěèe]|i[ōóǒòo]|[āáǎàa]i|[ēéěèe]i|[āáǎàa]o|[ōóǒòo]u|[āáǎàa]n|[ēéěèe]n|[āáǎàa]|[ēéěèe]|[ōóǒòo]|[īíǐìi]|[ūúǔùu]|[ǖǘǚǜüv]"
    standalones = u"'[āáǎàa]ng|'[ēéěèe]ng|'[ēéěèe]r|'[āáǎàa]i|'[ēéěèe]i|'[āáǎàa]o|'[ōóǒòo]u|'[āáǎàa]n|'[ēéěèe]n|'[āáǎàa]|'[ēéěèe]|'[ōóǒòo]"
    return "((" + inits + ")(" + finals + ")|(" + standalones + "))"


# Source of the single-syllable pattern, built once at import time
pinyin_re = pinyin_re_sub()

# Two consecutive syllables, captured as the named groups "one" and "two";
# matching is case-insensitive
pinyin_two_re = re.compile("(?P<one>" + pinyin_re + ")(?P<two>" + pinyin_re + ")", flags=re.I)

try:
    # One of TCJKV (Taiwan, China, Japan, Korea, Vietnam). I don't know what
    # difference it actually makes
    characterLookup = characterlookup.CharacterLookup('C')
except Exception:
    # Mornir's bug (Issue #29): on Windows, cjklib will fail if the user
    # path contains special characters (eg: the profile name contains an
    # accent).
    # Catch Exception rather than everything, so KeyboardInterrupt and
    # SystemExit are not mistaken for this bug.
    from aqt.utils import showInfo
    showInfo(
        '<b>Chinese Support Add-on</b> seem to be experiencing Mornir\'s bug. Please refer to <a href="https://github.com/ttempe/chinese-support-addon/wiki/Mornir%27s-bug">this help plage</a> to solve the issue.'
    )
    # cause the actual error after showing the help message.
    characterLookup = characterlookup.CharacterLookup('C')

# Maps bopomofo tone marks to tone numbers (as strings)
bopomofo_notes = {u"ˊ": "2", u"ˇ": "3", u"ˋ": "4", u"˙": "5"}


def extract_sound_tags(text):
    sound_tags = re.findall(r"\[sound:.*?\]", text)
def setUp(self):
    """
    Run the ``NeedsDatabaseTest`` set-up, then create the
    ``CharacterLookup`` fixture for locale 'T' on ``self.db``.
    """
    NeedsDatabaseTest.setUp(self)
    lookup = characterlookup.CharacterLookup('T', dbConnectInst=self.db)
    self.characterLookup = lookup
decompositionTable.c.Glyph ], decompositionTable.c.ChineseCharacter.in_( select([charsetTable.c.ChineseCharacter])), distinct=True), select([ strokeOrderTable.c.ChineseCharacter, strokeOrderTable.c.Glyph ], strokeOrderTable.c.ChineseCharacter.in_( select([charsetTable.c.ChineseCharacter])), distinct=True)))) """Queue of characters needed to be checked.""" characterDecomposition = {} """Mapping of character to its decomposition(s).""" cjk = characterlookup.CharacterLookup('T') # get mappings for char, glyph in characterQueue.copy(): decompositions = cjk.getDecompositionEntries(char, glyph=glyph) if decompositions: characterDecomposition[(char, glyph)] = decompositions else: characterQueue.remove((char, glyph)) minimalBasicComponents.add(char) # process queue while characterQueue: for charEntry in characterQueue.copy(): fullyDecomposed = True for decomposition in characterDecomposition[charEntry]:
import sys from cjklib import characterlookup # Maps the romanisation command line argument to the (reading,toneMarkType) # params of CharacterLookup.getReadingForCharacter() rom_param_map = { 'Pinyin': ('Pinyin', 'diacritics'), 'PinyinNum': ('Pinyin', 'numbers'), 'CantoneseYale': ('CantoneseYale', 'diacritics'), 'CantoneseYaleNum': ('CantoneseYale', 'numbers'), 'CantoneseJyutping': ('Jyutping', 'numbers'), } CharLookup = characterlookup.CharacterLookup('C') def subtitle_line(line, romanisation): """ Subtitles the given line of Chinese text using the given romanisation. Returns a tuple where the first element is the list of Chinese characters and the second element is the list of corresponding romanisations. """ if len(line)<=0: return None zh_chars = [] rom_chars = [] for ch in line: param = rom_param_map[romanisation] rom = CharLookup.getReadingForCharacter(ch, param[0], toneMarkType=param[1]) zh_chars.append(ch)