Example #1
def main(db_filename):
    # Create the database
    with LanguageDB(db_filename) as db:
        db.setup()
        load_registry(db, parse_registry(), 'en')
        load_cldr(db, Path(data_filename('cldr')))
        load_bibliographic_aliases(db, Path(data_filename('bibliographic_codes.csv')))
Example #2
def save_reverse_name_tables(category, rev_dict):
    for language, lang_dict in rev_dict.items():
        if language in CLDR_LANGUAGES or language == 'und':
            os.makedirs(data_filename('trie/{}'.format(language)),
                        exist_ok=True)
            save_trie(
                resolve_names(lang_dict, debug=True),
                data_filename('trie/{}/name_to_{}.marisa'.format(
                    language, category)))
Example #3
def read_cldr_supplemental(dataname):
    cldr_supp_path = data_filename('cldr-core-json/supplemental')
    # cldr_supp_path is already a resolved path, so don't wrap it in
    # data_filename() a second time
    filename = f'{cldr_supp_path}/{dataname}.json'
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    if dataname == 'aliases':
        # the alias table is nested one level deeper than the other datasets
        data = fulldata['supplemental']['metadata']['alias']
    else:
        data = fulldata['supplemental'][dataname]
    return data
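A hypothetical usage of the function above; the 'languageAlias', '_replacement', and '_reason' keys follow the CLDR supplemental JSON schema that build_data (Example #15) also assumes:

# Hypothetical usage: fetch the CLDR alias table and check one entry.
aliases = read_cldr_supplemental('aliases')
sh_alias = aliases['languageAlias'].get('sh')
if sh_alias is not None:
    # recent CLDR releases replace 'sh' with 'sr_Latn'
    print(sh_alias['_replacement'], sh_alias['_reason'])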
Example #4
def main(db_filename):
    # Create the database
    with LanguageDB(db_filename) as db:
        db.setup()
        load_cldr(db, Path(data_filename('cldr')))
        load_registry(db, parse_registry(), 'en')
        load_bibliographic_aliases(
            db, Path(data_filename('bibliographic_codes.csv')))
        load_custom_aliases(db, Path(data_filename('aliases.csv')))
Example #5
def build_tries(cldr_path):
    language_names_rev = {}
    territory_names_rev = {}
    script_names_rev = {}
    language_names_fwd = {}
    territory_names_fwd = {}
    script_names_fwd = {}
    cldr_main_path = Path(cldr_path) / 'main'

    for subpath in sorted(cldr_main_path.iterdir()):
        if subpath.is_dir():
            langcode = subpath.name
            if (subpath / 'languages.json').exists():
                language_data = read_cldr_name_file(cldr_main_path, langcode,
                                                    'languages')
                update_names(language_names_fwd, language_names_rev,
                             language_data)

                script_data = read_cldr_name_file(cldr_main_path, langcode,
                                                  'scripts')
                update_names(script_names_fwd, script_names_rev, script_data)

                territory_data = read_cldr_name_file(cldr_main_path, langcode,
                                                     'territories')
                update_names(territory_names_fwd, territory_names_rev,
                             territory_data)

    iana_languages, iana_scripts, iana_territories = read_iana_registry_names()
    update_names(language_names_fwd, language_names_rev, iana_languages)
    update_names(script_names_fwd, script_names_rev, iana_scripts)
    update_names(territory_names_fwd, territory_names_rev, iana_territories)

    wiktionary_data = read_wiktionary_names(
        data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(language_names_fwd, language_names_rev, wiktionary_data)

    extra_language_data = read_csv_names(
        data_filename('extra_language_names.csv'))
    update_names(language_names_fwd, language_names_rev, extra_language_data)

    save_reverse_name_tables('language', language_names_rev)
    save_reverse_name_tables('script', script_names_rev)
    save_reverse_name_tables('territory', territory_names_rev)
    save_trie(language_names_fwd,
              data_filename('trie/language_to_name.marisa'))
    save_trie(script_names_fwd, data_filename('trie/script_to_name.marisa'))
    save_trie(territory_names_fwd,
              data_filename('trie/territory_to_name.marisa'))
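The forward tables built here are keyed as '{code}@{language}', which is what the lookups in code_to_names (Example #11) and the autonym check in build_data (Example #15) rely on. A hypothetical spot-check after the tries are saved:

# Hypothetical check: CLDR's English name for the language code 'fr'.
fwd = load_trie(data_filename('trie/language_to_name.marisa'))
print(get_trie_value(fwd, 'fr@en'))  # expected: 'French'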
Example #6
def read_cldr_supplemental(path, dataname):
    filename = data_filename('{}/supplemental/{}.json'.format(path, dataname))
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    if dataname == 'aliases':
        # the alias table is nested one level deeper than the other datasets
        data = fulldata['supplemental']['metadata']['alias']
    else:
        data = fulldata['supplemental'][dataname]
    return data
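The same reader, parameterized by the data path; a hypothetical call matching how build_data (Example #15) uses it, assuming the supplemental data lives under 'cldr-core-json' as in Example #3:

likely = read_cldr_supplemental('cldr-core-json', 'likelySubtags')
print(likely.get('und'))  # the tag CLDR considers most likely overall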
Example #7
def read_cldr_names(path, language, category):
    """
    Read CLDR's names for things in a particular language.
    """
    filename = data_filename('{}/{}/{}.json'.format(path, language, category))
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    data = fulldata['main'][language]['localeDisplayNames'][category]
    return data
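A hypothetical call, assuming the per-language CLDR JSON files live under a 'main' directory as in build_tries (Example #5):

english_names = read_cldr_names('cldr/main', 'en', 'languages')
print(english_names.get('fr'))  # 'French'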
Example #8
def parse_registry():
    """
    Yield a sequence of dictionaries containing the info in the included
    IANA subtag registry file.
    """
    with open(data_filename('language-subtag-registry.txt'),
              encoding='utf-8') as data_file:
        # 'yield from' instead of returning, so that we only close the file
        # when finished.
        yield from parse_file(data_file)
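A hypothetical way to consume the generator above; parse_file (not shown in these examples) is assumed to yield one dictionary per registry record, with fields such as 'Type' taken from the IANA file format:

from collections import Counter

# Tally the registry records by type (language, extlang, script, region, ...)
type_counts = Counter(record.get('Type') for record in parse_registry())
print(type_counts.most_common(5))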
Example #10
def read_language_distances():
    language_info_path = data_filename(
        'cldr/common/supplemental/languageInfo.xml')
    root = ET.parse(language_info_path).getroot()
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])

                # The 'languageInfo' data file contains distances for the unnormalized
                # tag 'sh', but we work mostly with normalized tags, and they don't
                # describe at all how to cope with this.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching languages we
                # aren't matching scripts yet, so when 'sh' appears we'll add a
                # corresponding match for 'sr'.
                #
                # Then because we're kind of making this plan up, add 1 to the distance
                # so it's a worse match than ones that are actually clearly defined
                # in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # don't try to define a non-zero distance for sr <=> sr
                        desired_distance = tag_distances.setdefault(
                            desired, {})
                        desired_distance[supported] = int(
                            attribs['distance']) + 1

    return tag_distances
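A hypothetical look at the structure the function returns: a two-level dict keyed by desired tag, then supported tag.

distances = read_language_distances()
# Closely related pairs get small distances in CLDR's written_new data,
# e.g. Norwegian and Norwegian Bokmål, if present in this CLDR version.
print(distances.get('no', {}).get('nb'))
# The synthesized 'sh' entries appear under 'sr' (see the comment above).
print(distances.get('sr', {}))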
Example #11
def code_to_names(category, code):
    """
    Given the code for a language, script, or region, get a dictionary of its
    names in various languages.
    """
    trie_name = '{}_to_name'.format(category)
    if trie_name not in TRIES:
        TRIES[trie_name] = load_trie(data_filename('trie/{}.marisa'.format(trie_name)))

    trie = TRIES[trie_name]
    lookup = code.lower() + '@'
    possible_keys = trie.keys(lookup)
    names = {}
    for key in possible_keys:
        target_language = key.split('@')[1]
        names[target_language] = get_trie_value(trie, key)
    return names
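A hypothetical call, following the docstring: the keys of the result are the languages the names are written in.

names = code_to_names('language', 'fr')
print(names.get('en'))  # 'French'
print(names.get('fr'))  # 'français'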
Example #12
def read_validity_regex():
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr-core/common/validity/{codetype}.xml')
        root = ET.parse(validity_path).getroot()
        matches = root.findall('./idValidity/id')
        for match in matches:
            for item in match.text.strip().split():
                if '~' in item:
                    assert item[-2] == '~'
                    prefix = item[:-3]
                    range_start = item[-3]
                    range_end = item[-1]
                    option = f"{prefix}[{range_start}-{range_end}]"
                    validity_options.append(option)
                else:
                    validity_options.append(item)
    return '|'.join(validity_options)
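A worked illustration of the '~' range expansion in the loop above: CLDR validity files compress runs of codes into ranged items, and the single-character range suffix (enforced by the assert) becomes a regex character class.

item = 'qaa~z'          # illustrative ranged item: 'qaa' through 'qaz'
assert item[-2] == '~'
prefix, range_start, range_end = item[:-3], item[-3], item[-1]
print(f"{prefix}[{range_start}-{range_end}]")  # -> 'qa[a-z]'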
Example #13
def name_to_code(category, name, language: str = 'und'):
    """
    Get a language, script, or territory by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.
    """
    assert '/' not in language, "Language codes cannot contain slashes"
    assert '-' not in language, "This code should be reduced to a language subtag only"
    trie_name = '{}/name_to_{}'.format(language, category)
    if trie_name not in TRIES:
        TRIES[trie_name] = load_trie(
            data_filename('trie/{}.marisa'.format(trie_name)))

    trie = TRIES[trie_name]
    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)
    else:
        # Is this a language name plus extra verbiage? Maybe it has "...isch",
        # "... language", or "... Chinese" attached to it, for example. Look
        # for a matching prefix of the desired name with at least 4 characters.
        prefixes = trie.prefixes(lookup)
        if prefixes and len(prefixes[-1]) >= 4:
            return get_trie_value(trie, prefixes[-1])
        else:
            return None
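Hypothetical calls exercising the cases the docstring describes:

print(name_to_code('language', 'French'))         # -> 'fr'
print(name_to_code('language', 'Hakka Chinese'))  # prefix-matches 'Hakka'
print(name_to_code('language', 'Malayo', 'es'))   # the Spanish name for Malay
print(name_to_code('language', 'Malayo', 'en'))   # a different language entirely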
Example #14
def name_to_code(category, name, language: str = 'und'):
    """
    Get a language, script, or region by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.
    """
    assert '/' not in language, "Language codes cannot contain slashes"
    assert '-' not in language, "This code should be reduced to a language subtag only"
    trie_name = '{}/name_to_{}'.format(language, category)
    if trie_name not in TRIES:
        TRIES[trie_name] = load_trie(data_filename('trie/{}.marisa'.format(trie_name)))

    trie = TRIES[trie_name]
    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)
    else:
        # Is this a language plus extra junk? Maybe it has "...isch", "... language",
        # or "... Chinese" attached to it, for example.
        prefixes = trie.prefixes(lookup)
        if prefixes and len(prefixes[-1]) >= 4:
            return get_trie_value(trie, prefixes[-1])
        else:
            return None
Example #15
def build_data(cldr_path, cldr_supp_path):
    lang_scripts = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()
    display_separators = read_display_separators()

    alias_data = read_cldr_supplemental(cldr_supp_path, 'aliases')
    likely_subtags = read_cldr_supplemental(cldr_supp_path, 'likelySubtags')
    replacements = {}
    norm_macrolanguages = {}
    for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:
        aliases = alias_data[alias_type]
        # Initially populate 'languageAlias' with the aliases from the IANA file
        if alias_type == 'languageAlias':
            replacements[alias_type] = iana_replacements
            replacements[alias_type]['root'] = 'und'
        else:
            replacements[alias_type] = {}
        for code, value in aliases.items():
            # Make all keys lowercase so they can be looked up
            # case-insensitively
            code = code.lower()

            # If there are multiple replacements, take the first one. For example,
            # we just replace the Soviet Union (SU) with Russia (RU), instead of
            # trying to do something context-sensitive and poorly standardized
            # that selects one of the successor countries to the Soviet Union.
            replacement = value['_replacement'].split()[0]
            if value['_reason'] == 'macrolanguage':
                norm_macrolanguages[code] = replacement
            else:
                replacements[alias_type][code] = replacement

    # This section builds the trie lookups. It was formerly a separate
    # function, `build_tries`, but it lives here now so that we can keep
    # track of which set of languages it built names for.
    language_names_rev = {}
    territory_names_rev = {}
    script_names_rev = {}
    language_names_fwd = {}
    territory_names_fwd = {}
    script_names_fwd = {}
    cldr_main_path = Path(cldr_path) / 'main'

    override_language_data = read_csv_names(
        data_filename('override_language_names.csv'))
    update_names(language_names_fwd, language_names_rev,
                 override_language_data)

    for langcode in get_name_languages(cldr_path):
        language_data = read_cldr_name_file(cldr_main_path, langcode,
                                            'languages')
        update_names(language_names_fwd, language_names_rev, language_data)

        script_data = read_cldr_name_file(cldr_main_path, langcode, 'scripts')
        update_names(script_names_fwd, script_names_rev, script_data)

        territory_data = read_cldr_name_file(cldr_main_path, langcode,
                                             'territories')
        update_names(territory_names_fwd, territory_names_rev, territory_data)

    iana_languages, iana_scripts, iana_territories = read_iana_registry_names()
    update_names(language_names_fwd, language_names_rev, iana_languages)
    update_names(script_names_fwd, script_names_rev, iana_scripts)
    update_names(territory_names_fwd, territory_names_rev, iana_territories)

    wiktionary_data = read_wiktionary_names(
        data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(language_names_fwd, language_names_rev, wiktionary_data)

    extra_language_data = read_csv_names(
        data_filename('extra_language_names.csv'))
    update_names(language_names_fwd, language_names_rev, extra_language_data)

    save_reverse_name_tables('language', language_names_rev)
    save_reverse_name_tables('script', script_names_rev)
    save_reverse_name_tables('territory', territory_names_rev)
    save_trie(language_names_fwd,
              data_filename('trie/language_to_name.marisa'))
    save_trie(script_names_fwd, data_filename('trie/script_to_name.marisa'))
    save_trie(territory_names_fwd,
              data_filename('trie/territory_to_name.marisa'))

    # Get the list of languages where we have any name data. These are base
    # language codes (without scripts or territories) which contain a name for
    # themselves.
    name_languages = [
        langcode for langcode in get_name_languages(cldr_path)
        if '-' not in langcode
        and '{}@{}'.format(langcode, langcode) in language_names_fwd
    ]

    # Add the languages that have autonyms in extra_language_data, perhaps because
    # we specifically put them there to get their autonyms right
    name_languages += [
        lang1 for (lang1, lang2, _, _) in extra_language_data if lang1 == lang2
    ]

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts)
        write_python_dict(outfile, 'LANGUAGE_REPLACEMENTS',
                          replacements['languageAlias'])
        write_python_dict(outfile, 'SCRIPT_REPLACEMENTS',
                          replacements['scriptAlias'])
        write_python_dict(outfile, 'TERRITORY_REPLACEMENTS',
                          replacements['territoryAlias'])
        write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)
        write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES',
                          norm_macrolanguages)
        write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)
        write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)
        write_python_dict(outfile, 'DISPLAY_SEPARATORS', display_separators)
        write_python_set(outfile, 'LANGUAGES_WITH_NAME_DATA', name_languages)
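write_python_dict and write_python_set are not shown in these examples. A minimal sketch of what they might look like, assuming they emit sorted Python literals into the generated module:

def write_python_dict(outfile, name, d):
    # Emit the dict as a sorted literal under the given top-level name.
    print(f'{name} = {{', file=outfile)
    for key in sorted(d):
        print(f'    {key!r}: {d[key]!r},', file=outfile)
    print('}', file=outfile)


def write_python_set(outfile, name, values):
    # Emit the values as a sorted set literal, deduplicated for stable diffs.
    print(f'{name} = {{', file=outfile)
    for value in sorted(set(values)):
        print(f'    {value!r},', file=outfile)
    print('}', file=outfile)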