Exemple #1
0
    def __add_to_two_level_mappings(self):
        """
        Group every possible ISO code under a region heading and
        publish the grouping as a two-level mapping.

        The result is a list of `(region, [iso, ...])` tuples sorted
        by region, stored in
        `DTwoLevelMappings['cldr_alphabets.alphabets']`.
        Nothing is returned.
        """
        # TODO: ADD TO DTwoLevelMappings!!!
        by_region = {}

        for code in get_L_possible_isos():
            parts = ISOTools.split(code)

            if not parts.territory:
                # No territory in the code: fall back to the country
                # information recorded against the language itself.
                # OPEN ISSUE: Use LCountry[2] here, to use continent
                # rather than country??
                try:
                    heading = ISOCodes.get_D_iso(parts.lang)['LCountry'][1]
                except KeyError:
                    heading = 'Unknown'
            else:
                # Imported lazily, only once a territory is actually seen
                from iso_tools.ISOCodes import DCountries
                # TODO: ALLOW FOR i18n etc!!!
                heading = DCountries.get(parts.territory, ['Unknown'])[0]

            if heading not in by_region:
                by_region[heading] = []
            by_region[heading].append(code)

        # dict items are already (region, codes) tuples
        grouped = sorted(by_region.items())

        from char_data.data_processors.consts import DTwoLevelMappings
        DTwoLevelMappings['cldr_alphabets.alphabets'] = grouped  # HACK!
Exemple #2
0
    def split(self, s):
        """
        Break the ISO string `s` into its language, script,
        territory and variant parts and return them as an `ISOCode`.

        `s` must match `RE_ISO`, otherwise an AssertionError is raised.
        When `SPLIT_CHECKING` is enabled each part that is present is
        also validated (again via assertions):

        * the language must be a known ISO code, and a three-letter
          code must not have a shorter `part1` equivalent
        * the script must be a key of `self.DScript2Name`
        * the territory must be in `self.STerritories`
        """
        match = RE_ISO.match(s)
        assert match, "invalid ISO code format: %s" % s
        lang, script, territory, variant = match.group(1, 2, 3, 4)

        if not SPLIT_CHECKING:
            return ISOCode(lang, script, territory, variant)

        if lang and lang not in self.SOKISOs:
            assert lang in ISOCodes, \
                "lang code %s not valid!" % lang

            # A three-letter code is only acceptable when
            # no two-letter form exists for it
            if len(lang) == 3:
                iso_record = ISOCodes.get_D_iso(lang)
                assert 'part1' not in iso_record, \
                    "lang code %s should be shortened to %s!" % (lang, iso_record['part1'])

            # The language check is very slow, so remember
            # codes which have already passed it
            self.SOKISOs.add(lang)

        if script:
            assert script in self.DScript2Name, \
                "script %s not valid!" % script

        if territory:
            assert territory in self.STerritories, \
                "territory code %s not valid!" % territory

        return ISOCode(lang, script, territory, variant)
Exemple #3
0
    def get_lang_name(self, part3, typ=LANG_DEFAULT, default=KeyError):
        """
        Look up the localized name of the language `part3`
        (an ISO 639-3 code) for the name type `typ`.

        If `part3` is not itself a key of the language table, its
        ISO record is consulted and the two-letter `part1` code is
        tried instead (when the record has one).

        If the ISO record lookup raises KeyError, that error is
        re-raised when `default` is `KeyError`, otherwise `default`
        is returned. The final table lookup is delegated to the
        `get` helper, which is also handed `default`.

        NOTE: See also ISOPrettifier for ISO string
        aware (e.g. "en_Latn-AU")-style pretty formatting!

        TODO: Add support for profiles/scripts/variants, e.g. "zh_CN"!
        TODO: Make sure CLDR doesn't use ISO 639-2!
        """
        lang_table = self.D['DLangs']
        lookup_code = part3

        if part3 not in lang_table:
            # Not stored under the given code: try the two-letter form
            try:
                iso_record = ISOCodes.get_D_iso(part3, add_alternates=False)
            except KeyError:
                if default == KeyError:
                    raise
                return default

            if 'part1' in iso_record:
                lookup_code = iso_record['part1']
            # WARNING: without a part1 code the original
            # code is looked up again unchanged

        return get(lang_table, default, [lookup_code, typ])
Exemple #4
0
    def get_L_part3(self):
        """
        Return a sorted list of the language codes in
        `self.D['DLangs']`, with each two-letter code
        converted by `ISOCodes.to_part3`.

        The two-letter code 'bh' is left out entirely.
        """
        part3_codes = []

        for key in self.D['DLangs']:
            if len(key) != 2:
                part3_codes.append(key)
            elif key != 'bh':
                # HACK: 'bh' (Bihari) is skipped as it's not in 639-3
                part3_codes.append(ISOCodes.to_part3(key))

        part3_codes.sort()
        return part3_codes
Exemple #5
0
    def get_L_pretty(self, s):
        """
        Return a 4-tuple of localized names
        `(language, script, territory, variant)` for the ISO
        string `s`. Any part which `s` doesn't specify is None.

        Fallbacks passed on as `default` to the name lookups:
        * language: the ISO record's 'short_name' if present,
          otherwise its 'long_name'
        * territory: the first item of the `DCountries` entry,
          or the territory code itself if there is no entry
        * variant: the variant string itself
        """
        from iso_tools.ISOTools import ISOTools
        lang, script, territory, variant = ISOTools.split(s)

        if lang:
            iso_record = ISOCodes.get_D_iso(lang)

        territory_fallback = DCountries.get(territory, [territory])[0]

        lang_name = script_name = territory_name = variant_name = None

        if lang:
            lang_fallback = iso_record.get('short_name',
                                           iso_record['long_name'])
            lang_name = self.get_lang_name(lang, default=lang_fallback)

        if script:
            script_name = self.get_script_name(script)

        if territory:
            territory_name = self.get_territory_name(
                territory, default=territory_fallback
            )

        if variant:
            variant_name = self.get_variant_name(variant, default=variant)

        return (lang_name, script_name, territory_name, variant_name)
Exemple #6
0
    def get_D_lang_names_to_iso(self, make_case_insensitive=False):
        """
        Build a reverse mapping from localized language
        names to language codes.

        Every name stored in `self.D['DLangs']` (for all name types)
        is mapped to the two-letter `part1` code of its language
        where the ISO record provides one, otherwise to the key it
        was stored under. If the ISO record lookup raises KeyError,
        the stored key is used as-is.

        If `make_case_insensitive` is true the names
        are lowercased before being used as keys.

        When two languages share a name, the one
        which comes later in `DLangs` wins.
        """
        DRtn = {}

        for part3, DLang in self.D['DLangs'].items():
            # The ISO record only depends on the language, so look it
            # up once here rather than once for each name type
            try:
                DISO = ISOCodes.get_D_iso(part3, add_alternates=False)
            except KeyError:
                DISO = {}

            # Try to convert to two-char codes
            code = DISO['part1'] if 'part1' in DISO else part3

            for loc_lang_name in DLang.values():
                if make_case_insensitive:
                    loc_lang_name = loc_lang_name.lower()
                DRtn[loc_lang_name] = code

        return DRtn
Exemple #7
0
    def join(self, part3=None,
                   script=None,
                   territory=None,
                   variant=None,
                   part2=None):
        """
        Assemble an ISO string of the form
        `lang_Script-TERRITORY|variant` from its parts.
        Parts which aren't given are omitted, along
        with the separator that would precede them.

        * `part3`/`part2`: the language code. `part3` is checked
          against `ISOCodes`; `part2` is converted by
          `ISOCodes.to_part3`. The code 'und' is dropped, and a
          three-letter code is shortened to its `part1` form when
          the ISO record has one. Results are cached in
          `self.DPart3Cache`/`self.DPart2Cache`.
        * `script`: either a four-letter title-case script code, or
          a script name looked up (lowercased) in `self.DName2Script`.
        * `territory`: either a two-letter uppercase or all-digit
          code, or a territory name looked up (lowercased)
          in `DRevCountries`.
        * `variant`: used as-is.

        Invalid parts raise AssertionError.

        NOTE(review): if both `part3` and `part2` are given and only
        `part2` has a cache entry, the cached `part2` result is used
        and `part3` is not looked at.
        """

        # ---- Language ----
        if part3 and part3 in self.DPart3Cache:
            lang_out = self.DPart3Cache[part3]

        elif part2 and part2 in self.DPart2Cache:
            lang_out = self.DPart2Cache[part2]

        else:
            lang_out = None
            if part3:
                assert part3 in ISOCodes, \
                    "language code %s not valid!" % part3
                lang_out = part3
            elif part2:
                lang_out = ISOCodes.to_part3(part2)

            # "undefined" iso codes will be omitted
            if lang_out == 'und':
                lang_out = None

            # Prefer the two-letter code where there is one
            if lang_out and len(lang_out) == 3:
                iso_record = ISOCodes.get_D_iso(lang_out)
                if iso_record and 'part1' in iso_record:
                    lang_out = iso_record['part1']

            # Remember the outcome for next time
            if part3:
                self.DPart3Cache[part3] = lang_out
            else:
                self.DPart2Cache[part2] = lang_out

        # ---- Script ----
        script_out = None
        if script:
            if len(script) == 4 and script.istitle():
                # Already looks like a script code: just validate it
                assert script in self.DScript2Name, \
                    "script %s not valid!" % script
                script_out = script
            else:
                # Otherwise treat it as a script name
                assert script.lower() in self.DName2Script,\
                    "script name %s not valid!" % script
                script_out = self.DName2Script[script.lower()]

        # ---- Territory ----
        territory_out = None
        if territory:
            looks_like_code = (
                (len(territory) == 2 and territory.isupper())
                or territory.isdigit()
            )
            if looks_like_code:
                # Just verify the territory code is valid
                assert territory in self.STerritories, \
                    "territory code %s not valid!" % territory
                territory_out = territory
            else:
                # Otherwise treat it as a territory name
                assert territory.lower() in DRevCountries,\
                    "territory name %s was not found!" % territory
                territory_out = DRevCountries[territory.lower()]

        # ---- Variant ----
        # (currently no checking, but it may pay
        #  to check if a localization is available etc)
        variant_out = variant if variant else None

        # ---- Assemble, adding separators only where needed ----
        lang_piece = lang_out if lang_out else ''

        script_piece = ''
        if script_out:
            script_piece = ('_' if lang_out else '') + script_out

        territory_piece = ''
        if territory_out:
            territory_piece = (
                ('-' if script_out or lang_out else '') + territory_out
            )

        variant_piece = ''
        if variant_out:
            variant_piece = '|' + variant_out

        return '%s%s%s%s' % (
            lang_piece, script_piece, territory_piece, variant_piece
        )
Exemple #8
0
def get_D_name_to_iso():
    """
    Build and return a dict mapping lowercased
    language names to language codes.

    Names are collected from every record in `ISOCodes`: the
    'short_name', the 'long_name', and (where present) the
    'language name' alternates together with their
    'alternate language name' entries. A record's two-letter
    'part1' code is used in preference to its three-letter code.

    A few fix-ups are then applied:
    * names which map to 'cmn' are pointed at 'zh' instead
    * for names like "chinese, hakka", the part after the
      comma is added as a name of its own if not already taken
    * some hand-picked names are added/overridden
    """
    from iso_tools.ISOCodes import ISOCodes
    name_to_code = {}

    for iso in ISOCodes:
        record = ISOCodes.get_D_iso(iso, add_alternates=True)

        # HACK: shorten the code!!!
        code = record['part1'] if 'part1' in record else iso

        name_to_code[record['short_name'].lower()] = code
        name_to_code[record['long_name'].lower()] = code

        alternates = record['DAlt']
        if 'language name' not in alternates:
            continue

        for name, details in alternates['language name'].items():
            name_to_code[name.lower()] = code

            if 'alternate language name' in details:
                for alt_name in details['alternate language name']:
                    name_to_code[alt_name.lower()] = code

    # Iterate over a snapshot of the keys, as
    # new names can be added inside the loop
    for name in list(name_to_code):
        # HACK: Always use the generic "zh"
        # Chinese code for Mandarin Chinese!
        if name_to_code[name] == 'cmn':
            name_to_code[name] = 'zh'

        # HACK: Fix e.g. "chinese, hakka" -> "hakka"
        if ', ' in name:
            tail = name.partition(', ')[2]
            if tail not in name_to_code:
                name_to_code[tail] = name_to_code[name]

    name_to_code['cantonese'] = 'yue'
    name_to_code['mandarin'] = 'zh'  # HACK!
    name_to_code['translingual'] = 'mul'

    # Codes from Wiktionary headers with more than
    # ~400 entries that aren't in the above data
    # (the trailing numbers are kept from the original listing)
    wiktionary_extras = {
        'middle english': 'enm',  # 21579,
        'ancient greek': 'grc',  # 19189,
        #'norman': 'nrf',  # 10941,
        'old english': 'ang',  # 8847,
        'old french': 'fro',  # 7636,
        'old armenian': 'xcl',  # 6945,
        'khmer': 'khm',  # 4121,
        'middle french': 'frm',  # 4049,
        'interlingua': 'ina',  # 3212,
        'livonian': 'liv',  # 2995,
        'old irish': 'sga',  # 2920,
        'west frisian': 'fry',  # 2865,
        # 'westrobothnian': 2735,
        'egyptian': 'egy',  # 2622,
        'old church slavonic': 'chu',  # 2419,
        'middle dutch': 'dum',  # 1958,
        'aramaic': 'arc',  # 1755,
        'alemannic german': 'gsw',  # 1585,
        'swazi': 'ssw',  # 1408,
        'vilamovian': 'wym',  # 1385,
        'old high german': 'goh',  # 1334,
        'tocharian b': 'txb',  # 1258,
        'punjabi': 'pan',  # 1240,
        'ottoman turkish': 'ota',  # 1106,
        'saterland frisian': 'stq',  # 1085,
        'german low german': 'nds',  # 945,
        'haitian creole': 'hat',  # 913,
        'chichewa': 'nya',  # 881,
        'mauritian creole': 'mfe',  # 852,
        'dupaningan agta': 'duo',  # 702,
        'zazaki': 'zza',  # 688,
        # 'old swedish': 640,
        'bikol central': 'bcl',  # 607,
        # 'tarantino': 593,
        'old occitan': 'pro',  # 580,
        # 'old portuguese': 573,
        'north frisian': 'frr',  # 567,
        # 'bourguignon': 539,
        'taos': 'twf',  # 523,
        'romani': 'rom',  # 494,
        # 'central franconian': 486,
        'okinawan': 'ryu',  # 429,
        'ojibwe': 'oji',  # 403,
    }

    # These override anything collected above, again
    # shortened to the two-letter code where one exists
    for name, code in wiktionary_extras.items():
        record = ISOCodes.get_D_iso(code, add_alternates=True)
        name_to_code[name] = record['part1'] if 'part1' in record else code

    return name_to_code