Exemple #1
0
    def get_D_prop_aliases(self):
        """
        Get a map of Unicode property aliases,
        e.g. {'ccc': 'Canonical_Combining_Class', ...}

        Keys are lowercased aliases; values are the canonical property
        names parsed from PropertyAliases.txt.
        """
        D = {}
        with open(data_path('chardata', 'unidata/source/PropertyAliases.txt'),
                  'r',
                  encoding='utf-8') as f:

            for line in f:
                # Strip trailing '#' comments; fields are ';'-separated
                # as "alias ; Canonical_Name ; alias2 ..."
                line = line.split('#')[0].strip()
                L = [i.strip() for i in line.split(';') if i.strip()]
                if not L:
                    continue

                prop = L[1]
                LAliases = [L[0]] + L[2:]

                for alias in LAliases:
                    alias = alias.lower()
                    # Bug fix: the duplicate check previously tested the
                    # original-case alias while the lowercased form was
                    # stored, so case-differing collisions went undetected.
                    assert alias not in D, alias
                    D[alias] = prop

        # HACKS!
        D['canonical_combining_class'] = D[
            'ccc'] = 'canonical_combining_classes'
        return D
Exemple #2
0
    def __init__(self):
        """
        Register every Unicode source file with its parser, then hand the
        remaining simple tabular files to the generic `simple` handler.
        """
        from char_data.data_processors.internal.data_sources.unicode.UnicodeData import UnicodeData
        WriteBase.__init__(self, UnicodeData())

        # Core character database + the annotated names list
        self.unicode_data('UnicodeData.txt')
        self.names_list(data_path('chardata', 'unidata/source/NamesList.txt'))

        # Arabic shaping
        self.arabic_shaping('ArabicShaping.txt')

        # Normalization/composition
        self.composition_exclusions('CompositionExclusions.txt')
        self.normalization_props('DerivedNormalizationProps.txt')
        self.normalization_corrections('NormalizationCorrections.txt')

        # Casing
        self.case_folding('CaseFolding.txt')  # Also adds in "Unicode General"
        self.special_casing('SpecialCasing.txt')

        # Everything below is a plain "key -> one source file" mapping
        # handled uniformly by `simple`.
        simple_sources = (
            # Rendering/display
            ('east asian width', 'EastAsianWidth.txt'),
            ('property list', 'PropList.txt'),

            # Derived properties
            ('core properties', 'DerivedCoreProperties.txt'),
            ('age', 'DerivedAge.txt'),

            # Definitions
            ('conscript name', 'UnicodeDataConscript.txt'),
            # ('named aliases', 'NameAliases.txt'),  # FIXME! ==================================

            # Layout
            ('line break', 'LineBreak.txt'),
            ('joining type', 'extracted/DerivedJoiningType.txt'),
            ('grapheme break', 'auxiliary/GraphemeBreakProperty.txt'),
            ('sentence break', 'auxiliary/SentenceBreakProperty.txt'),
            ('word break', 'auxiliary/WordBreakProperty.txt'),
            ('bidi mirroring', 'BidiMirroring.txt'),

            # Blocks/scripts
            ('block', 'Blocks.txt'),
            ('script', 'Scripts.txt'),
            ('conscript blocks', 'ConscriptBlocks.txt'),
        )
        for data_key, file_name in simple_sources:
            self.simple(data_key, file_name)
    def __get_file_and_D_config(self, key, append_idx=False):
        """
        Open the binary data file for `key` and load its JSON key map.

        :param key: data source name, e.g. 'unihan'; used for both the
                    directory and file basename under <key>/output/
        :param append_idx: if True, open the '-idx' variants of the
                           .bin/.json pair instead
        :return: (open binary file handle in 'r+b' mode, dict mapping
                 normalized key names -> per-key JSON config)

        Note: the returned file handle is deliberately left open for the
        caller to use/close.
        """
        DRtn = {}
        output_path = data_path('chardata', '%s/output/%s' % (key, key))

        if append_idx:
            DKeys = load(output_path + '-idx.json')
            f = open('%s-idx.bin' % output_path, 'r+b')
        else:
            DKeys = load(output_path + '.json')
            f = open('%s.bin' % output_path, 'r+b')

        # Bug fix: the loop previously rebound the `key` parameter,
        # shadowing it; use a distinct name. The list() copy was also
        # unnecessary as DKeys is not mutated here.
        for json_key, DJSON in DKeys.items():
            set_key_to = get_key_name(json_key)
            assert set_key_to not in DRtn, set_key_to
            DRtn[set_key_to] = DJSON

        return f, DRtn
Exemple #4
0
    def open_ccdict(self):
        """
        Yield (key, codepoint, value) tuples parsed from the CCDict
        source file, stripping the 'f'-prefix that CCDict field names
        carry and skipping the 'codepoint' bookkeeping key.
        """
        #=======================================================#
        #                      CCDict Data                      #
        #=======================================================#

        for D in open_unihan(
            [data_path('chardata', 'ccdict/source/ccdict.txt')]):
            ord_ = D['codepoint']

            # Fix: D is not mutated while iterating, so the previous
            # list(D.items()) copy was an unnecessary allocation.
            for key, value in D.items():
                if key == 'codepoint':
                    continue
                elif key.startswith('f'):
                    # CCDict field names are prefixed with 'f'; strip it
                    key = key[1:]

                yield key, ord_, value
Exemple #5
0
    def get_D_general_cat_aliases(self):
        """
        Get a map of lowercased general category aliases -> category
        name, parsed from the tab-separated GeneralCatAliases.txt.
        """
        D = {}
        with open(data_path('chardata', 'GeneralCatAliases.txt'),
                  'r',
                  encoding='utf-8') as f:

            for line in f:
                # Strip trailing '#' comments; skip blank lines
                line = line.split('#')[0].strip()
                if not line:
                    continue

                prop, alias = line.split('\t')
                alias = alias.lower()
                if alias in D:
                    # A repeated alias must agree with the earlier entry
                    assert D[alias] == prop, alias

                # Fix: alias is already lowercased above; the second
                # .lower() call was redundant.
                D[alias] = prop
        return D
Exemple #6
0
    def open_unihan(self):
        """
        Yield (key, codepoint, value) tuples from all Unihan source
        files, after feeding each record through the IICore,
        HDZRadBreak, Fenn and CheungBauer handlers.
        """
        #=======================================================#
        #                      Unihan Data                      #
        #=======================================================#

        path = data_path('chardata', 'unihan/source/*.txt')
        LUni = glob.glob(path)
        # Fail loudly (with the glob pattern) if the source data is missing
        assert LUni, path

        for D in open_unihan(LUni):
            # Get the record's codepoint.
            # NOTE(review): the original comment here claimed the 'Word'
            # key is deleted, but nothing is deleted in this block -
            # verify against the `open_unihan` helper.
            ord_ = D['codepoint']

            # Per-record side-effect handlers; presumably these may
            # mutate D - TODO confirm (it is why list() is used below)
            self.IICore(ord_, D)
            self.HDZRadBreak(ord_, D)
            self.Fenn(ord_, D)
            self.CheungBauer(ord_, D)

            for key, value in list(D.items()):
                if key == 'codepoint':
                    continue

                yield key, ord_, value
Exemple #7
0
def get_trad_ja_maps():
    """
    Returns dicts which convert traditional chinese to Japanese
    characters, and Japanese to traditional, respectively

    Has keys of single Hanzi/Kanji, and values as multi-character
    Unicode strings
    """
    DTradToJa = defaultdict(str)
    DJaToTrad = defaultdict(str)

    # Bug fix: open(path, 'rb', 'utf-8') passed 'utf-8' as the positional
    # `buffering` argument (the Python 2 codecs.open convention), which
    # raises TypeError on Python 3. Open in text mode with an explicit
    # encoding, and use a context manager so the file is closed.
    with open(data_path('chardata', 'j_simplified/JSimplified.txt'),
              'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Each line is "<japanese> <traditional>" separated by whitespace
            ja, trad = line.split()

            for hanzi in trad:
                DTradToJa[hanzi] += ja

            DJaToTrad[ja] += trad

    return DTradToJa, DJaToTrad
Exemple #8
0
from char_data.data_paths import data_path

UNICODE_PATH = data_path('chardata', 'unidata/source/%s')


def uni_open(file_name):
    """
    Open `file_name` from the Unicode source data directory as UTF-8
    text (undecodable bytes replaced) and return the open file handle.
    """
    return open(UNICODE_PATH % file_name,
                'r', encoding='utf-8', errors='replace')
Exemple #9
0
def convert(i):
    """
    Convert a 4-6 character uppercase-hex (or all-digit) string to its
    integer codepoint value; return `i` unchanged when it doesn't look
    like a codepoint.
    """
    try:
        # Only strings that look like Unicode codepoints qualify:
        # 4-6 chars, all uppercase hex digits or all decimal digits.
        if (not i.isupper() and not i.isdigit()) or len(i) not in (4, 5, 6):
            return i
        return int(i, 16)
    # Bug fix: was a bare `except:` raising a bare Exception as control
    # flow - it swallowed KeyboardInterrupt/SystemExit too. Catch only
    # what can actually occur for a malformed value.
    except (AttributeError, TypeError, ValueError):
        return i


def convert_hex(L):
    """
    Convert each entry of `L` (a space-separated string of codepoint
    fields) into a list of ints where every field parses as a codepoint,
    otherwise keep the original string unchanged.
    """
    nL = []
    for i in L:
        # Fix: the original guarded this branch with `if ' ' in i or True:`,
        # which is always true, leaving an unreachable `else` behind.
        # Dead code removed; behavior is unchanged - every entry is
        # treated as a (possibly single-element) space-separated list.
        i_nL = [convert(j) for j in i.split(' ')]

        if all(isinstance(j, int) for j in i_nL):
            nL.append(i_nL)
        else:
            nL.append(i)
    return nL


if __name__ == '__main__':
    # Smoke test: parse the Unicode NamesList source and dump each
    # (mode, D) record as the parser emits it.
    for mode, D in NamesList(
            data_path('chardata', 'unidata/source/NamesList.txt')):
        print((mode, D))
Exemple #10
0
def run():
    """Import the Unihan data and write it to unihan/output/unihan."""
    output_path = data_path('chardata', 'unihan/output/unihan')
    ImportUnihan().write(output_path)
Exemple #11
0
    def open_names_list(self):
        """
        Yield (key, codepoint, value) tuples from the Unicode NamesList
        source, attaching block-level and subblock-level metadata to
        every character that follows it in the file.
        """
        # Block/subblock context accumulated while scanning; each
        # character record inherits the most recent of each.
        current_D_block = None
        current_D_sub_block = None

        nl = NamesList(data_path('chardata', 'unidata/source/NamesList.txt'))

        for kind, D in nl:
            if kind == 'information':
                # Copyright info etc
                # Will implement this at a different level, so will ignore here
                pass

            elif kind == 'block':
                # Information that pertains to the entire block (e.g. Basic Latin etc)
                # A new block resets the subblock context.
                current_D_block = D
                current_D_sub_block = None

            elif kind == 'subblock':
                # Information about part of a block
                current_D_sub_block = D

            elif kind == 'character':
                # Info about specific characters
                ord_ = int(D['codepoint'])

                if current_D_block:
                    # NOTE(review): unlike the subblock branch below,
                    # unknown block keys are silently ignored here -
                    # presumably intentional, verify against NamesList.
                    for key, value in list(current_D_block.items()):
                        if key in ('block name', 'block description'):
                            yield key, ord_, value
                        elif key == 'has separator':
                            yield key, ord_, str(
                                value
                            )  # HACK: PLEASE MAKE WORK WITH ENUMS!!!! ====================================

                if current_D_sub_block:
                    for key, value in list(current_D_sub_block.items()):
                        if key in ('subblock heading',
                                   'subblock technical notice'):
                            yield key, ord_, value
                        elif key == 'subblock see also':
                            # Keep only the codepoints, dropping the
                            # second element of each (codepoint, _) pair
                            yield key, ord_, [
                                sa_codepoint for sa_codepoint, _ in value
                            ]
                        else:
                            raise KeyError("Unknown subblock key: %s" % key)

                for key in D:
                    if key in ('codepoint', 'name', 'compatibility mapping',
                               'decomposed form'):
                        # Handled elsewhere / bookkeeping; not re-emitted here
                        pass
                    elif key == 'see also':
                        yield 'see also', ord_, [
                            sa_codepoint for sa_codepoint, _ in D['see also']
                        ]
                    elif key in ('also called', 'formally also called',
                                 'technical notice', 'comments'):
                        yield key, ord_, D[key]
                    else:
                        raise KeyError("Unknown codepoint key: %s" % key)
            else:
                raise Exception("Unknown kind: %s" % kind)
Exemple #12
0
def run():
    """Import the CCDict data and write it to ccdict/output/ccdict."""
    output_path = data_path('chardata', 'ccdict/output/ccdict')
    ImportCCDict().write(output_path)
Exemple #13
0
def run():
    """Import the Unicode data and write it to unidata/output/unidata."""
    output_path = data_path('chardata', 'unidata/output/unidata')
    ImportUnicode().write(output_path)
Exemple #14
0
def run():
    """Import the Kanjidic data and write it to kanjidic/output/kanjidic."""
    output_path = data_path('chardata', 'kanjidic/output/kanjidic')
    ImportKanjidic().write(output_path)
Exemple #15
0
                        if key in ('subblock heading',
                                   'subblock technical notice'):
                            yield key, ord_, value
                        elif key == 'subblock see also':
                            yield key, ord_, [
                                sa_codepoint for sa_codepoint, _ in value
                            ]
                        else:
                            raise KeyError("Unknown subblock key: %s" % key)

                for key in D:
                    if key in ('codepoint', 'name', 'compatibility mapping',
                               'decomposed form'):
                        pass
                    elif key == 'see also':
                        yield 'see also', ord_, [
                            sa_codepoint for sa_codepoint, _ in D['see also']
                        ]
                    elif key in ('also called', 'formally also called',
                                 'technical notice', 'comments'):
                        yield key, ord_, D[key]
                    else:
                        raise KeyError("Unknown codepoint key: %s" % key)
            else:
                raise Exception("Unknown kind: %s" % kind)


if __name__ == '__main__':
    # Build the NamesList-derived data and write it under unidata/nameslist.
    nli = NamesListImport()
    nli.write(data_path('chardata', 'unidata/nameslist'))
Exemple #16
0
 def __init__(self):
     """
     Register the Kanjidic data source and parse the kanjidic2 XML.

     NOTE(review): this fragment uses a 1-space method indent - verify
     it matches the enclosing class in the real file.
     """
     from char_data.data_processors.internal.data_sources.kanjidic.Kanjidic import Kanjidic
     WriteBase.__init__(self, Kanjidic())
     self.open_kanjidic(
         data_path('chardata', 'kanjidic/source/kanjidic2.xml'))
from char_data.abstract_base_classes.formatters.ExternalFormatterBase import ExternalFormatterBase
from char_data.data_processors.consts import HEADER_VARIANTS

#from warnings import warn
#warn("PLEASE FIX CEDictVariants to be not reliant on Flazzle dictionary modules!!!")
# HACK!
#DReverseLinkKeys = {}
#REVERSE = None


# NOTE(review): DReverseLinkKeys only appears in the commented-out HACK
# block above - presumably defined/imported elsewhere; verify this does
# not NameError at import time.
LLinkKeys = list(DReverseLinkKeys.keys())
#LLinkKeys = ['other variant', 'less common variant', 'popular variant', 'Erhua variant', 'abbreviated form', 'correct form', 'unabbreviated form', 'PRC variant', 'Chinese classifier', 'words which can use classifier', 'non-PRC Variant', 'archaic variant', 'non-Erhua variant', 'see also', 'non-Japanese variant', 'same as', 'obscure variant', 'more common variant', 'modern form', 'archaic form', 'antonym', 'variant of', 'erroneous form', 'Japanese variant', 'modern variant']


# Load the CEDict variant-links map.
# Bug fix: `open(path, 'rb', 'utf-8')` passed 'utf-8' as the positional
# `buffering` argument (the Python 2 codecs.open convention), which
# raises TypeError on Python 3. Open as UTF-8 text instead.
with open(
    data_path('chardata', 'cedict/variants.json'),
    'r', encoding='utf-8'
) as f:
    DVariants = json.loads(f.read())


class CEDictVariantsFormatter(ExternalFormatterBase):
    """
    External formatter exposing CEDict variant links (e.g. 'variant of',
    'same as') for a single link key.
    """
    def __init__(self, parent, key):
        # The reverse-link keys this formatter handles for `key`
        self.LKeys = DReverseLinkKeys[key]
        self.key = key

        ExternalFormatterBase.__init__(
            self, parent, HEADER_VARIANTS, original_name=key,
            short_desc=key, LISOs=['zh', 'zh_Hant']  # CHECK ME!!!!! =====================================
        )