def get_D_prop_aliases(self):
    """
    Get a map of Unicode property aliases,
    e.g. {'ccc': 'Canonical_Combining_Class', ...}

    Keys are lower-cased aliases; values are the property names as they
    appear in PropertyAliases.txt.
    """
    D = {}
    with open(data_path('chardata', 'unidata/source/PropertyAliases.txt'),
              'r', encoding='utf-8') as f:
        for line in f:
            # Strip trailing comments and whitespace
            line = line.split('#')[0].strip()
            # Fields: abbrev_alias ; property_name [; extra_alias ...]
            L = [i.strip() for i in line.split(';') if i.strip()]
            if not L:
                continue
            prop = L[1]
            LAliases = [L[0]] + L[2:]
            for alias in LAliases:
                # BUGFIX: duplicate detection must test the lower-cased key
                # that is actually stored in D; the original checked the
                # original-case alias, so it never fired for mixed-case
                # aliases.
                assert alias.lower() not in D
                D[alias.lower()] = prop
    # HACKS!
    D['canonical_combining_class'] = D[
        'ccc'] = 'canonical_combining_classes'
    return D
def __init__(self):
    """
    Register every Unicode Character Database source file with the
    WriteBase machinery: specialised parsers first, then a generic
    `self.simple` parser for the remaining key/path pairs.
    """
    from char_data.data_processors.internal.data_sources.unicode.UnicodeData import UnicodeData
    WriteBase.__init__(self, UnicodeData())

    # Basic data
    self.unicode_data('UnicodeData.txt')
    self.names_list(data_path('chardata', 'unidata/source/NamesList.txt'))

    # Arabic shaping
    self.arabic_shaping('ArabicShaping.txt')

    # Normalization/composition
    self.composition_exclusions('CompositionExclusions.txt')
    self.normalization_props('DerivedNormalizationProps.txt')
    self.normalization_corrections('NormalizationCorrections.txt')

    # Casing
    self.case_folding('CaseFolding.txt')  # Also adds in "Unicode General"
    self.special_casing('SpecialCasing.txt')

    # Files below need no specialised handling and share `self.simple`.
    for key, path in [
            # Rendering/case folding/display etc
            ('east asian width', 'EastAsianWidth.txt'),
            ('property list', 'PropList.txt'),

            # Derived properties
            ('core properties', 'DerivedCoreProperties.txt'),
            ('age', 'DerivedAge.txt'),

            # Definitions
            ('conscript name', 'UnicodeDataConscript.txt'),
            #('named aliases', 'NameAliases.txt'),  # FIXME! ==================================

            # Layout
            ('line break', 'LineBreak.txt'),
            ('joining type', 'extracted/DerivedJoiningType.txt'),
            ('grapheme break', 'auxiliary/GraphemeBreakProperty.txt'),
            ('sentence break', 'auxiliary/SentenceBreakProperty.txt'),
            ('word break', 'auxiliary/WordBreakProperty.txt'),
            ('bidi mirroring', 'BidiMirroring.txt'),

            # Blocks/scripts
            ('block', 'Blocks.txt'),
            ('script', 'Scripts.txt'),
            ('conscript blocks', 'ConscriptBlocks.txt')
    ]:
        self.simple(key, path)
def __get_file_and_D_config(self, key, append_idx=False):
    """
    Open the binary output file for `key` and load its JSON config.

    :param key: dataset name, e.g. 'unihan' — used for both the output
        directory and the file stem.
    :param append_idx: if True, open the '-idx' variants of the
        .json/.bin pair instead of the plain ones.
    :return: (open file object in 'r+b' mode, dict mapping
        get_key_name(json key) -> JSON config).  The caller owns the
        file handle and must close it.
    """
    DRtn = {}
    output_path = data_path('chardata', '%s/output/%s' % (key, key))
    # Pick the plain or index variant of the .json/.bin pair
    suffix = '-idx' if append_idx else ''
    DKeys = load('%s%s.json' % (output_path, suffix))
    f = open('%s%s.bin' % (output_path, suffix), 'r+b')

    # BUGFIX/cleanup: the loop variable previously shadowed the `key`
    # parameter; renamed for clarity (output_path was already computed,
    # so behavior is unchanged).
    for json_key, DJSON in DKeys.items():
        set_key_to = get_key_name(json_key)
        assert set_key_to not in DRtn
        DRtn[set_key_to] = DJSON
    return f, DRtn
def open_ccdict(self):
    """
    Yield (key, codepoint, value) triples parsed from the CCDict
    source file, stripping the leading 'f' from field names.
    """
    #=======================================================#
    #                     CCDict Data                       #
    #=======================================================#
    source_files = [data_path('chardata', 'ccdict/source/ccdict.txt')]
    for record in open_unihan(source_files):
        codepoint = record['codepoint']
        for field, value in list(record.items()):
            if field == 'codepoint':
                continue
            if field.startswith('f'):
                # CCDict field names carry an 'f' prefix — drop it
                field = field[1:]
            yield field, codepoint, value
def get_D_general_cat_aliases(self):
    """
    Get a map of lower-cased General_Category aliases to property
    names, read from GeneralCatAliases.txt (tab-separated
    "property<TAB>alias" lines; '#' starts a comment).
    """
    D = {}
    with open(data_path('chardata', 'GeneralCatAliases.txt'),
              'r', encoding='utf-8') as f:
        for line in f:
            line = line.split('#')[0].strip()
            if not line:
                continue
            prop, alias = line.split('\t')
            alias = alias.lower()
            if alias in D:
                # A repeated alias must map to the same property
                assert D[alias] == prop
            # Cleanup: alias is already lower-cased above; the original
            # redundantly lowered it again here.
            D[alias] = prop
    return D
def open_unihan(self):
    """
    Yield (key, codepoint, value) triples from all Unihan source
    files, after running the specialised per-record handlers.
    """
    #=======================================================#
    #                     Unihan Data                       #
    #=======================================================#
    pattern = data_path('chardata', 'unihan/source/*.txt')
    LPaths = glob.glob(pattern)
    assert LPaths, pattern

    for record in open_unihan(LPaths):
        # Get the codepoint, deleting the 'Word' key
        codepoint = record['codepoint']

        # Specialised handlers; presumably these may mutate the record —
        # hence the list() snapshot below (TODO confirm).
        self.IICore(codepoint, record)
        self.HDZRadBreak(codepoint, record)
        self.Fenn(codepoint, record)
        self.CheungBauer(codepoint, record)

        for field, value in list(record.items()):
            if field == 'codepoint':
                continue
            yield field, codepoint, value
def get_trad_ja_maps():
    """
    Returns dicts which convert traditional chinese to Japanese
    characters, and Japanese to traditional, respectively

    Has keys of single Hanzi/Kanji, and values as multi-character
    Unicode strings
    """
    DTradToJa = defaultdict(str)
    DJaToTrad = defaultdict(str)

    # BUGFIX: builtin open() has no positional encoding parameter —
    # open(path, 'rb', 'utf-8') raised TypeError (buffering must be an
    # int); presumably a leftover codecs.open() call.  Open as UTF-8
    # text and close the handle deterministically.
    with open(data_path('chardata', 'j_simplified/JSimplified.txt'),
              'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Each line: "<japanese> <traditional...>"
            ja, trad = line.split()
            for i in trad:
                DTradToJa[i] += ja
            DJaToTrad[ja] += trad
    return DTradToJa, DJaToTrad
from char_data.data_paths import data_path

# Path template for files in the Unicode data source directory
UNICODE_PATH = data_path('chardata', 'unidata/source/%s')


def uni_open(file_name):
    """
    Open the named UCD source file as UTF-8 text, replacing any
    undecodable bytes rather than raising.
    """
    return open(UNICODE_PATH % file_name, 'r', encoding='utf-8',
                errors='replace')
def convert(i):
    """
    Try to interpret string `i` as a 4-6 character upper-case/digit hex
    codepoint; return the int value, or `i` unchanged otherwise.
    """
    # Cleanup: the original raised a bare Exception and swallowed it
    # with `except:`; test the cheap condition directly and only catch
    # the ValueError int() can actually raise.
    if (i.isupper() or i.isdigit()) and len(i) in (4, 5, 6):
        try:
            return int(i, 16)
        except ValueError:
            # Right length/case but not valid hex (e.g. 'ABCG')
            return i
    return i


def convert_hex(L):
    """
    For each string in L, split on spaces and convert every field with
    convert(); an entry whose fields all convert to ints is replaced by
    the list of ints, otherwise the original string is kept.
    """
    nL = []
    for item in L:
        # NOTE: the original guarded this with `if ' ' in item or True:`
        # — the `or True` made the single-token else branch dead code, so
        # the split path is now simply unconditional.
        fields = [convert(j) for j in item.split(' ')]
        if all(isinstance(j, int) for j in fields):
            nL.append(fields)
        else:
            #nL.append(longest(fields))
            nL.append(item)
    return nL


if __name__ == '__main__':
    for mode, D in NamesList(
            data_path('chardata', 'unidata/source/NamesList.txt')):
        print((mode, D))
def run():
    """Run the Unihan import, writing results under unihan/output/unihan."""
    ImportUnihan().write(data_path('chardata', 'unihan/output/unihan'))
def open_names_list(self):
    """
    Yield (key, codepoint, value) triples from the Unicode
    NamesList.txt, flattening the current block/subblock context onto
    each character record as it is emitted.

    Raises KeyError on an unrecognised subblock/codepoint key and a
    plain Exception on an unknown record kind.
    """
    # Context carried forward onto each subsequent 'character' record
    current_D_block = None
    current_D_sub_block = None
    nl = NamesList(data_path('chardata', 'unidata/source/NamesList.txt'))
    for kind, D in nl:
        if kind == 'information':
            # Copyright info etc
            # Will implement this at a different level, so will ignore here
            pass
        elif kind == 'block':
            # Information that pertains to the entire block (e.g. Basic Latin etc)
            current_D_block = D
            current_D_sub_block = None
        elif kind == 'subblock':
            # Information about part of a block
            current_D_sub_block = D
        elif kind == 'character':
            # Info about specific characters
            ord_ = int(D['codepoint'])

            # Re-emit the enclosing block's metadata for this codepoint
            if current_D_block:
                for key, value in list(current_D_block.items()):
                    if key in ('block name', 'block description'):
                        yield key, ord_, value
                    elif key == 'has separator':
                        yield key, ord_, str(
                            value
                        )  # HACK: PLEASE MAKE WORK WITH ENUMS!!!! ====================================

            # Re-emit the enclosing subblock's metadata for this codepoint
            if current_D_sub_block:
                for key, value in list(current_D_sub_block.items()):
                    if key in ('subblock heading',
                               'subblock technical notice'):
                        yield key, ord_, value
                    elif key == 'subblock see also':
                        # value presumably holds (codepoint, extra) pairs;
                        # only the codepoints are emitted
                        yield key, ord_, [
                            sa_codepoint for sa_codepoint, _ in value
                        ]
                    else:
                        raise KeyError("Unknown subblock key: %s" % key)

            # Finally the character's own fields
            for key in D:
                if key in ('codepoint', 'name', 'compatibility mapping',
                           'decomposed form'):
                    # Handled elsewhere / intentionally not emitted here
                    pass
                elif key == 'see also':
                    yield 'see also', ord_, [
                        sa_codepoint for sa_codepoint, _ in D['see also']
                    ]
                elif key in ('also called', 'formally also called',
                             'technical notice', 'comments'):
                    yield key, ord_, D[key]
                else:
                    raise KeyError("Unknown codepoint key: %s" % key)
        else:
            raise Exception("Unknown kind: %s" % kind)
def run():
    """Run the CCDict import, writing results under ccdict/output/ccdict."""
    ImportCCDict().write(data_path('chardata', 'ccdict/output/ccdict'))
def run():
    """Run the Unicode data import, writing results under unidata/output/unidata."""
    ImportUnicode().write(data_path('chardata', 'unidata/output/unidata'))
def run():
    """Run the Kanjidic import, writing results under kanjidic/output/kanjidic."""
    ImportKanjidic().write(data_path('chardata', 'kanjidic/output/kanjidic'))
if key in ('subblock heading', 'subblock technical notice'): yield key, ord_, value elif key == 'subblock see also': yield key, ord_, [ sa_codepoint for sa_codepoint, _ in value ] else: raise KeyError("Unknown subblock key: %s" % key) for key in D: if key in ('codepoint', 'name', 'compatibility mapping', 'decomposed form'): pass elif key == 'see also': yield 'see also', ord_, [ sa_codepoint for sa_codepoint, _ in D['see also'] ] elif key in ('also called', 'formally also called', 'technical notice', 'comments'): yield key, ord_, D[key] else: raise KeyError("Unknown codepoint key: %s" % key) else: raise Exception("Unknown kind: %s" % kind) if __name__ == '__main__': nli = NamesListImport() nli.write(data_path('chardata', 'unidata/nameslist'))
def __init__(self):
    """
    Initialise the writer with a Kanjidic data source and start
    processing the kanjidic2.xml source file.
    """
    from char_data.data_processors.internal.data_sources.kanjidic.Kanjidic import Kanjidic
    WriteBase.__init__(self, Kanjidic())

    kanjidic_xml = data_path('chardata', 'kanjidic/source/kanjidic2.xml')
    self.open_kanjidic(kanjidic_xml)
from char_data.abstract_base_classes.formatters.ExternalFormatterBase import ExternalFormatterBase
from char_data.data_processors.consts import HEADER_VARIANTS

#from warnings import warn
#warn("PLEASE FIX CEDictVariants to be not reliant on Flazzle dictionary modules!!!")

# HACK!
#DReverseLinkKeys = {}
#REVERSE = None

LLinkKeys = list(DReverseLinkKeys.keys())
#LLinkKeys = ['other variant', 'less common variant', 'popular variant', 'Erhua variant', 'abbreviated form', 'correct form', 'unabbreviated form', 'PRC variant', 'Chinese classifier', 'words which can use classifier', 'non-PRC Variant', 'archaic variant', 'non-Erhua variant', 'see also', 'non-Japanese variant', 'same as', 'obscure variant', 'more common variant', 'modern form', 'archaic form', 'antonym', 'variant of', 'erroneous form', 'Japanese variant', 'modern variant']

# BUGFIX: builtin open() takes no positional encoding argument — the
# original open(path, 'rb', 'utf-8') raised TypeError (buffering must be
# an int); presumably a leftover codecs.open().  Open as UTF-8 text.
with open(data_path('chardata', 'cedict/variants.json'),
          'r', encoding='utf-8') as f:
    DVariants = json.loads(f.read())


class CEDictVariantsFormatter(ExternalFormatterBase):
    def __init__(self, parent, key):
        """
        Formatter for one CEDict variant-link kind.

        :param parent: passed through to ExternalFormatterBase.
        :param key: variant-link name; also used as the formatter's
            original_name/short_desc and to look up its reverse keys.
        """
        self.LKeys = DReverseLinkKeys[key]
        self.key = key
        ExternalFormatterBase.__init__(
            self,
            parent,
            HEADER_VARIANTS,
            original_name=key,
            short_desc=key,
            LISOs=['zh', 'zh_Hant']  # CHECK ME!!!!! =====================================
        )