def getStrokeOrd(fin, kl):
    """
    Append one stroke-order line per character of ``kl`` to ``fin``.

    Trying for awareness of glyph locale in lookup: a character found in
    ``cedict.simplified`` is looked up with locale 'C', one found in
    ``cedict.traditional`` with locale 'T', and anything else with 'J'.

    :param fin: list that receives the formatted lines (mutated in place)
    :param kl: iterable of characters to look up
    :return: ``fin``, the same object that was passed in

    Any exception raised by ``getStrokeOrder`` propagates to the caller.
    """
    from cjklib.characterlookup import CharacterLookup

    # One lookup object per locale, created on first use, instead of a fresh
    # CharacterLookup for every single character.
    lookups = {}

    for char in kl:
        if char in cedict.simplified:
            locale = 'C'
        elif char in cedict.traditional:
            locale = 'T'
        else:
            locale = 'J'

        cjk = lookups.get(locale)
        if cjk is None:
            cjk = lookups[locale] = CharacterLookup(locale)

        strokes = cjk.getStrokeOrder(char)
        fin.append(u'• ' + u' '.join(strokes))
    return fin
def auxSOrd(i):
    """
    Try to get stroke decomposition if subcomponent decomposition fails.

    The lookup locale is 'C' when ``i`` is in ``cedict.simplified``, 'T' when
    it is in ``cedict.traditional`` and 'J' otherwise.

    :param i: the character to look up
    :return: the strokes joined by single spaces, or ``u'[x]'`` when the
        stroke-order lookup raises
    """
    from cjklib.characterlookup import CharacterLookup

    if i in cedict.simplified:
        locale = 'C'
    elif i in cedict.traditional:
        locale = 'T'
    else:
        locale = 'J'
    cjk = CharacterLookup(locale)

    try:
        strokes = cjk.getStrokeOrder(i)
    except Exception:
        # Best-effort fallback marker.  ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt / SystemExit are not swallowed.
        return u'[x]'
    return u' '.join(strokes)
def mandarinToPinyin(mandarinChar):
    """
    Return the first Pinyin reading (without tone marks) of a character.

    The reading is requested from a locale 'C' CharacterLookup with
    ``toneMarkType='none'``.  When more than one reading comes back, a
    notice listing them is printed and the first one is still returned.

    :param mandarinChar: the character to convert
    :return: the first entry of the list of readings
    """
    lookup = CharacterLookup('C')
    readings = lookup.getReadingForCharacter(
        mandarinChar, 'Pinyin', toneMarkType='none')

    variantCount = len(readings)
    if variantCount > 1:
        print("converted syllable {} has {} parts".format(
            readings, variantCount))

    # take only first variant of pinyin interpretations
    return readings[0]
def tokenize(input, output):
    """
    Copy ``input`` to ``output`` keeping only characters with a Pinyin reading.

    Every line of ``input`` is decoded as UTF-8; each character for which the
    locale 'T' CharacterLookup returns a non-empty Pinyin reading is kept, all
    others are dropped.  One line (terminated by a newline) is written to
    ``output``, UTF-8 encoded, for every input line.

    :param input: path of the file to read
    :param output: path of the file to write (opened with mode 'w')

    If ``input`` cannot be read, a message is printed and the process exits
    with a non-zero status.
    """
    try:
        # ``with`` closes the handle even if reading fails part-way.
        with open(input, 'r') as source:
            text = source.readlines()
    except IOError:
        print("IOError: could not open %s" % (input,))
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)

    cjk = CharacterLookup('T')

    # ``with`` guarantees the output is flushed and closed even when a lookup
    # or a decode error interrupts the loop.
    with open(output, 'w') as out:
        for line in text:
            line = line.decode('utf-8')
            kept = [char for char in line
                    if cjk.getReadingForCharacter(char, 'Pinyin')]
            out.write((u''.join(kept) + u'\n').encode('utf-8'))
def to_pinyin(filename):
    """
    Print characters annotated with their transliterated Pinyin reading.

    For each character that has a Pinyin reading (locale 'T' lookup), the
    list of all its ``unidecode``-d readings is printed, and the first
    reading followed by the character and a space is added to the output
    line.  The assembled line is printed once all characters are processed.

    :param filename: path of a file that must be readable; if it is not, a
        message is printed and the process exits with a non-zero status
    """
    try:
        # ``with`` closes the handle instead of leaking it.
        with open(filename, 'r') as source:
            source.readlines()
    except IOError:
        print("IOError: could not open %s" % (filename,))
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)

    cjk = CharacterLookup('T')

    # NOTE(review): the lines read from ``filename`` are discarded and this
    # hard-coded sample is processed instead (the UTF-8 decode step is absent
    # as well).  Looks like leftover debugging -- confirm before relying on
    # the file contents being converted.
    lines = [u'我喜歡他']

    for line in lines:
        pieces = []
        for char in line:
            pinyin = cjk.getReadingForCharacter(char, 'Pinyin')
            if pinyin:
                print([unidecode(x) for x in pinyin])
                pieces.append(unidecode(pinyin[0]) + char + " ")
        print("".join(pieces))
import requests import codecs import gevent from gevent import monkey monkey.patch_all() from bs4 import BeautifulSoup # 汉字拼音识别 from pypinyin import pinyin, lazy_pinyin, Style # 笔划数识别 from cjklib.characterlookup import CharacterLookup cjk = CharacterLookup('C') # 汉字偏旁识别 from lib.component import * reload(sys) sys.setdefaultencoding("utf-8") # 代理配置 proxies = {} class BabyName(): def __init__(self, config={}, name_dict={},
def _characterLookup(cls):
    """
    Return the CharacterLookup stored on ``cls`` as ``_cjk``.

    If ``cls`` has no ``_cjk`` attribute yet, a lookup for locale 'T' and
    character domain 'Unicode' is created and stored there first.
    """
    if hasattr(cls, '_cjk'):
        return cls._cjk

    # First use: build the lookup and keep it on the class for later calls.
    lookup = CharacterLookup('T', 'Unicode')
    cls._cjk = lookup
    return cls._cjk
def getCharacters(self):
    """
    Return the characters of the domain named by ``self.title``.

    A locale 'T' CharacterLookup restricted to that domain is created and
    everything its domain iterator yields is joined with single spaces.
    """
    lookup = CharacterLookup('T', self.title)
    domainChars = lookup.getDomainCharacterIterator()
    return ' '.join(domainChars)
def __init__(self):
    """
    Set up the lookup, its character iterator and empty working state.

    The lookup uses locale 'T' and the 'Unicode' character domain; the
    current character starts as ``None`` and the glyph queue starts empty.
    """
    lookup = CharacterLookup('T', 'Unicode')
    self._cjk = lookup
    self.characterIterator = lookup.getDomainCharacterIterator()

    # Nothing has been read from the iterator yet.
    self.curChar = None
    self.glyphQueue = []
def __init__(self, options, args):
    """
    Store locale and character domain and build the matching lookup.

    :param options: object providing ``locale`` and ``characterDomain``
    :param args: accepted but not used here
    """
    locale = options.locale
    domain = options.characterDomain

    self._locale = locale
    self._characterDomain = domain
    self._cjk = CharacterLookup(locale, domain)
def __init__(self, variant='T'):
    """
    Remember the requested variant and create the character lookup.

    :param variant: stored as ``self.variant``; defaults to 'T'

    The lookup itself is always created for locale 'T', whatever
    ``variant`` is.
    """
    self.variant = variant
    self.characterLookup = CharacterLookup('T')
def main():
    """
    Compare a radical table read from standard input with cjklib's data.

    Each non-comment, non-blank input line is expected to have the form
    ``<index>['];  <radical code point>;  <equivalent code point>`` (code
    points as 4-5 upper-case hex digits).  For every such line the radical
    form and its equivalent character are checked against what the locale
    'T' and locale 'C' lookups report, and every disagreement is printed.
    Afterwards radical indices 1 to 214 are scanned for forms the table never
    mentioned, and a summary line with all counters is printed.
    """
    cjk = CharacterLookup('T')
    cjkSimplified = CharacterLookup('C')

    # Counters reported in the final summary line.
    fileEntryCount = 0
    databaseMissingEntryCount = 0
    noEntryCount = 0
    wrongEquivalentCount = 0
    # Radical indices seen without / with the "'" variant marker.
    seenRadicalFormIndices = set()
    seenRadicalVariantIndices = set()

    for line in sys.stdin:
        # NOTE(review): .decode() exists on byte strings only; if sys.stdin
        # yields str here (Python 3 default) this raises AttributeError --
        # confirm how stdin is set up for this script.
        line = line.decode(default_encoding)

        # Skip comment lines and whitespace-only lines.
        if re.match(r'\s*#', line) or re.match(r'\s+$', line):
            continue
        else:
            fileEntryCount = fileEntryCount + 1

            # Groups: index, optional "'" marker, radical code point,
            # equivalent code point.
            matchObj = re.match(r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});" \
                + r"\s+([1234567890ABCDEF]{4,5})\s*$", line)
            if matchObj:
                index, variant, radicalCP, equivalentCP = matchObj.groups()
                radicalIdx = int(index)
                radicalForm = chr(int(radicalCP, 16))
                equivalentForm = chr(int(equivalentCP, 16))
                if variant:
                    seenRadicalVariantIndices.add(radicalIdx)
                else:
                    seenRadicalFormIndices.add(radicalIdx)

                # check radicalForm
                if not variant:
                    # Unmarked entry: the only acceptable form is the
                    # locale 'T' Kangxi radical form.
                    targetForms = set([cjk.getKangxiRadicalForm(radicalIdx)])
                else:
                    # Marked entry: acceptable forms are those the locale
                    # 'C' lookup has on top of the locale 'T' lookup.
                    targetForms = set()
                    # add simplified form, if different
                    simplifiedForm = cjkSimplified.getKangxiRadicalForm(
                        radicalIdx)
                    if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
                        targetForms.add(simplifiedForm)
                    # add simplified variant
                    targetForms.update(
                        set(cjkSimplified.getKangxiRadicalVariantForms(
                            radicalIdx)) \
                        - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))

                # NOTE(review): every message below is encoded before being
                # handed to print(); under Python 3 print() shows a bytes
                # object as its b'...' repr -- confirm this is intended.
                if radicalForm not in targetForms:
                    # cjklib is missing something
                    print(("No entry for radical form '%s' with index %d%s"
                        % (radicalForm, radicalIdx, variant))\
                        .encode(default_encoding))
                    databaseMissingEntryCount += 1
                if targetForms - set([radicalForm]):
                    # CJKRadicals.txt is missing something
                    for form in targetForms - set([radicalForm]):
                        print(("Database entry '%s' with radical index %d%s" \
                            % (form, radicalIdx, variant) \
                            + " not included in table")\
                            .encode(default_encoding))
                    # Counted once for the line, however many forms were
                    # listed above.
                    noEntryCount += 1

                # check equivalentForm
                libraryEquivalentForm \
                    = cjk.getRadicalFormEquivalentCharacter(radicalForm)
                if libraryEquivalentForm != equivalentForm:
                    print(("Equivalent radical form '%s' with index %d%s"
                        % (libraryEquivalentForm, radicalIdx, variant) \
                        + " not backed by table: '%s'" % equivalentForm)\
                        .encode(default_encoding))
                    wrongEquivalentCount += 1
            else:
                # The line is neither a comment nor blank, yet does not match
                # the expected entry format.
                print(("error reading line: '" + line + "'")\
                    .encode(default_encoding))

    # Radical indices (1..214) for which no unmarked entry was read.
    for radicalIdx in set(range(1, 215)) - seenRadicalFormIndices:
        print(("No table entry for radical index %d" % radicalIdx)\
            .encode(default_encoding))
        noEntryCount += 1

    # Radical indices for which no marked entry was read: report every form
    # only the locale 'C' lookup knows about.
    for radicalIdx in set(range(1, 215)) - seenRadicalVariantIndices:
        simplifiedForms = set()
        # add simplified form, if different
        simplifiedForm = cjkSimplified.getKangxiRadicalForm(
            radicalIdx)
        if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
            simplifiedForms.add(simplifiedForm)
        # add simplified variant
        simplifiedForms.update(
            set(cjkSimplified.getKangxiRadicalVariantForms(
                radicalIdx)) \
            - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))
        for form in simplifiedForms:
            print(("No table entry for simplified radical %s with index %d'"
                % (form, radicalIdx)).encode(default_encoding))
            noEntryCount += 1

    # Variant forms known to the locale 'T' lookup but not to the locale 'C'
    # lookup are reported for every radical index, whether or not the table
    # had an entry for that index.
    for radicalIdx in range(1, 215):
        otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \
            - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        for form in otherVariants:
            print(("No table entry for variant %s with index %d'"
                % (form, radicalIdx)).encode(default_encoding))
            noEntryCount += 1

    # Summary of all counters (this line is printed without encoding).
    print("Total %d entries" % fileEntryCount \
        + ", %d missing from cjklib" % databaseMissingEntryCount \
        + ", %d mismatches in equivalent forms" % wrongEquivalentCount \
        + ", not found in source list: %d" % noEntryCount)