def main(db_filename):
    """
    Populate the language database at `db_filename`: set it up, then load
    the IANA registry, the CLDR data, and the bibliographic alias table.
    """
    with LanguageDB(db_filename) as db:
        db.setup()
        load_registry(db, parse_registry(), 'en')
        load_cldr(db, Path(data_filename('cldr')))
        bib_path = Path(data_filename('bibliographic_codes.csv'))
        load_bibliographic_aliases(db, bib_path)
def save_reverse_name_tables(category, rev_dict):
    """
    Save one name-to-code trie per language for the given `category`.

    `rev_dict` maps a language code to a dictionary of names in that
    language. Only languages in CLDR_LANGUAGES, plus 'und', get a trie.
    """
    for language, name_dict in rev_dict.items():
        if language not in CLDR_LANGUAGES and language != 'und':
            continue
        os.makedirs(data_filename(f'trie/{language}'), exist_ok=True)
        trie_path = data_filename(f'trie/{language}/name_to_{category}.marisa')
        save_trie(resolve_names(name_dict, debug=True), trie_path)
def read_cldr_supplemental(dataname):
    """
    Read a block of data from CLDR's supplemental JSON files.

    `dataname` is the base name of the JSON file, such as 'aliases' or
    'likelySubtags'. Returns the relevant sub-dictionary of the parsed JSON.
    """
    # The original passed an already-resolved data_filename() result back
    # into data_filename(), which only worked because os.path.join returns
    # the second path unchanged when it's absolute. Resolve the relative
    # path in a single call instead.
    filename = data_filename(f'cldr-core-json/supplemental/{dataname}.json')
    # Use a context manager so the file handle is closed promptly instead
    # of being leaked.
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    if dataname == 'aliases':
        # The alias table is nested one level deeper than other
        # supplemental data.
        return fulldata['supplemental']['metadata']['alias']
    return fulldata['supplemental'][dataname]
def main(db_filename):
    """
    Populate the language database at `db_filename`: set it up, then load
    CLDR data, the IANA registry, bibliographic aliases, and custom aliases.
    """
    cldr_path = Path(data_filename('cldr'))
    bib_path = Path(data_filename('bibliographic_codes.csv'))
    alias_path = Path(data_filename('aliases.csv'))

    with LanguageDB(db_filename) as db:
        db.setup()
        load_cldr(db, cldr_path)
        load_registry(db, parse_registry(), 'en')
        load_bibliographic_aliases(db, bib_path)
        load_custom_aliases(db, alias_path)
def build_tries(cldr_path):
    """
    Build the forward (code-to-name) and reverse (name-to-code) lookup
    tables for languages, scripts, and territories, and save them as
    marisa tries.

    Names come from the CLDR 'main' locale files, the IANA subtag
    registry, Wiktionary's code list, and our extra_language_names.csv.
    """
    categories = ('language', 'script', 'territory')
    # Per-category JSON file names inside each CLDR locale directory.
    category_files = {
        'language': 'languages',
        'script': 'scripts',
        'territory': 'territories',
    }
    fwd = {cat: {} for cat in categories}
    rev = {cat: {} for cat in categories}

    cldr_main_path = Path(cldr_path) / 'main'
    for subpath in sorted(cldr_main_path.iterdir()):
        if not subpath.is_dir():
            continue
        langcode = subpath.name
        # Only use locales that actually provide language names.
        if not (subpath / 'languages.json').exists():
            continue
        for cat in categories:
            data = read_cldr_name_file(cldr_main_path, langcode,
                                       category_files[cat])
            update_names(fwd[cat], rev[cat], data)

    iana_name_data = read_iana_registry_names()
    for cat, data in zip(categories, iana_name_data):
        update_names(fwd[cat], rev[cat], data)

    wiktionary_data = read_wiktionary_names(
        data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(fwd['language'], rev['language'], wiktionary_data)

    extra_language_data = read_csv_names(
        data_filename('extra_language_names.csv'))
    update_names(fwd['language'], rev['language'], extra_language_data)

    for cat in categories:
        save_reverse_name_tables(cat, rev[cat])
    for cat in categories:
        save_trie(fwd[cat], data_filename(f'trie/{cat}_to_name.marisa'))
def read_cldr_supplemental(path, dataname):
    """
    Read a block of data from the CLDR supplemental JSON directory under
    `path`.

    `dataname` is the base name of the JSON file, such as 'aliases' or
    'likelySubtags'. Returns the relevant sub-dictionary of the parsed JSON.
    """
    filename = data_filename('{}/supplemental/{}.json'.format(path, dataname))
    # Use a context manager so the file handle is closed promptly instead
    # of being leaked.
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    if dataname == 'aliases':
        # The alias table is nested one level deeper than other
        # supplemental data.
        return fulldata['supplemental']['metadata']['alias']
    return fulldata['supplemental'][dataname]
def read_cldr_names(path, language, category):
    """
    Read CLDR's names for things in a particular language.

    `path` is the root of the CLDR name data, `language` is the locale to
    read names in, and `category` is the kind of thing being named, such
    as 'languages' or 'territories'.
    """
    filename = data_filename('{}/{}/{}.json'.format(path, language, category))
    # Use a context manager so the file handle is closed promptly instead
    # of being leaked.
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    return fulldata['main'][language]['localeDisplayNames'][category]
def parse_registry():
    """
    Yield a sequence of dictionaries, containing the info in the included
    IANA subtag registry file.
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as data_file:
        # Delegating with 'yield from' (instead of returning) keeps the
        # file open until the generator is exhausted.
        yield from parse_file(data_file)
def read_language_distances():
    """
    Read CLDR's languageInfo.xml and return a nested dictionary mapping
    a desired language tag to {supported tag: distance}, for use in
    language matching.
    """
    language_info_path = data_filename(
        'cldr/common/supplemental/languageInfo.xml')
    # Read via a context manager so the XML file is closed promptly
    # instead of being leaked.
    with open(language_info_path) as file:
        root = ET.fromstring(file.read())
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        # Skip entries whose 'desired' tag has 3 or more underscore-separated
        # parts; only 1- and 2-part rules are used here.
        n_parts = attribs['desired'].count('_') + 1
        if n_parts < 3:
            if attribs.get('oneway') == 'true':
                pairs = [(attribs['desired'], attribs['supported'])]
            else:
                # Symmetric match: record the distance in both directions.
                pairs = [
                    (attribs['desired'], attribs['supported']),
                    (attribs['supported'], attribs['desired']),
                ]
            for (desired, supported) in pairs:
                desired_distance = tag_distances.setdefault(desired, {})
                desired_distance[supported] = int(attribs['distance'])

                # The 'languageInfo' data file contains distances for the
                # unnormalized tag 'sh', but we work mostly with normalized
                # tags, and they don't describe at all how to cope with this.
                #
                # 'sh' normalizes to 'sr-Latn', and when we're matching
                # languages we aren't matching scripts yet, so when 'sh'
                # appears we'll add a corresponding match for 'sr'.
                #
                # Then because we're kind of making this plan up, add 1 to
                # the distance so it's a worse match than ones that are
                # actually clearly defined in languageInfo.
                if desired == 'sh' or supported == 'sh':
                    if desired == 'sh':
                        desired = 'sr'
                    if supported == 'sh':
                        supported = 'sr'
                    if desired != supported:
                        # don't try to define a non-zero distance for sr <=> sr
                        desired_distance = tag_distances.setdefault(
                            desired, {})
                        desired_distance[supported] = int(
                            attribs['distance']) + 1
    return tag_distances
def code_to_names(category, code):
    """
    Given the code for a language, script, or region, get a dictionary of
    its names in various languages.
    """
    trie_name = f'{category}_to_name'
    if trie_name not in TRIES:
        # Load the trie lazily and cache it for later lookups.
        trie_path = data_filename(f'trie/{trie_name}.marisa')
        TRIES[trie_name] = load_trie(trie_path)
    trie = TRIES[trie_name]

    # Keys have the form 'code@language'; collect every name stored for
    # this code, keyed by the language the name is in.
    prefix = code.lower() + '@'
    return {
        key.split('@')[1]: get_trie_value(trie, key)
        for key in trie.keys(prefix)
    }
def read_validity_regex():
    """
    Build a regex alternation that matches every valid subtag code, from
    CLDR's validity XML files for languages, regions, scripts, and
    variants.

    Entries like 'qaa~z' describe a range and expand into a character
    class, such as 'qa[a-z]'.
    """
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr-core/common/validity/{codetype}.xml')
        # Read via a context manager so the XML file is closed promptly
        # instead of being leaked.
        with open(validity_path) as file:
            root = ET.fromstring(file.read())
        matches = root.findall('./idValidity/id')
        for match in matches:
            for item in match.text.strip().split():
                if '~' in item:
                    # A range such as 'qaa~z': the character before '~'
                    # starts the range, the character after it ends it.
                    assert item[-2] == '~'
                    prefix = item[:-3]
                    range_start = item[-3]
                    range_end = item[-1]
                    option = f"{prefix}[{range_start}-{range_end}]"
                    validity_options.append(option)
                else:
                    validity_options.append(item)
    return '|'.join(validity_options)
def name_to_code(category, name, language: str = 'und'):
    """
    Get a language, script, or territory by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.
    """
    assert '/' not in language, "Language codes cannot contain slashes"
    assert '-' not in language, "This code should be reduced to a language subtag only"

    # Each (language, category) pair has its own trie, loaded lazily and
    # cached in TRIES.
    trie_name = f'{language}/name_to_{category}'
    if trie_name not in TRIES:
        TRIES[trie_name] = load_trie(data_filename(f'trie/{trie_name}.marisa'))
    trie = TRIES[trie_name]

    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)

    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Look
    # for a matching prefix of the desired name with at least 4 characters.
    prefixes = trie.prefixes(lookup)
    if prefixes and len(prefixes[-1]) >= 4:
        return get_trie_value(trie, prefixes[-1])
    return None
def name_to_code(category, name, language: str = 'und'):
    """
    Get a language, script, or region by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.
    """
    assert '/' not in language, "Language codes cannot contain slashes"
    assert '-' not in language, "This code should be reduced to a language subtag only"
    # Each (language, category) pair has its own trie, loaded lazily and
    # cached in the module-level TRIES dictionary.
    trie_name = '{}/name_to_{}'.format(language, category)
    if trie_name not in TRIES:
        TRIES[trie_name] = load_trie(data_filename('trie/{}.marisa'.format(trie_name)))
    trie = TRIES[trie_name]
    lookup = normalize_name(name)
    if lookup in trie:
        # Exact match on the normalized name.
        return get_trie_value(trie, lookup)
    else:
        # Is this a language plus extra junk? Maybe it has "...isch", "... language",
        # or "... Chinese" attached to it, for example.
        prefixes = trie.prefixes(lookup)
        # Only accept a prefix match of at least 4 characters, to avoid
        # matching on a meaninglessly short fragment.
        if prefixes and len(prefixes[-1]) >= 4:
            return get_trie_value(trie, prefixes[-1])
        else:
            return None
def build_data(cldr_path, cldr_supp_path):
    """
    Build the generated data this package relies on: the replacement and
    macrolanguage tables from CLDR and IANA data, the name-lookup tries,
    and the `data_dicts.py` module of generated dictionaries.

    `cldr_path` is the root of the CLDR name data; `cldr_supp_path` is the
    directory containing CLDR's supplemental JSON data.
    """
    lang_scripts = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()
    display_separators = read_display_separators()

    alias_data = read_cldr_supplemental(cldr_supp_path, 'aliases')
    likely_subtags = read_cldr_supplemental(cldr_supp_path, 'likelySubtags')

    # Build the replacement tables for deprecated or renamed subtags, and
    # the normalization table for macrolanguages.
    replacements = {}
    norm_macrolanguages = {}
    for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:
        aliases = alias_data[alias_type]
        # Initially populate 'languageAlias' with the aliases from the IANA file
        if alias_type == 'languageAlias':
            replacements[alias_type] = iana_replacements
            replacements[alias_type]['root'] = 'und'
        else:
            replacements[alias_type] = {}
        for code, value in aliases.items():
            # Make all keys lowercase so they can be looked up
            # case-insensitively
            code = code.lower()

            # If there are multiple replacements, take the first one. For example,
            # we just replace the Soviet Union (SU) with Russia (RU), instead of
            # trying to do something context-sensitive and poorly standardized
            # that selects one of the successor countries to the Soviet Union.
            replacement = value['_replacement'].split()[0]
            if value['_reason'] == 'macrolanguage':
                norm_macrolanguages[code] = replacement
            else:
                replacements[alias_type][code] = replacement

    # This section builds the trie lookups. It was formerly a separate function,
    # `build_tries`, but now we want to know what set of languages it built names
    # for, so we want to have that information here.
    language_names_rev = {}
    territory_names_rev = {}
    script_names_rev = {}
    language_names_fwd = {}
    territory_names_fwd = {}
    script_names_fwd = {}
    cldr_main_path = Path(cldr_path) / 'main'

    # Apply the override names first, so they take precedence.
    override_language_data = read_csv_names(
        data_filename('override_language_names.csv'))
    update_names(language_names_fwd, language_names_rev,
                 override_language_data)

    # Collect names from each CLDR locale that provides name data.
    for langcode in get_name_languages(cldr_path):
        language_data = read_cldr_name_file(cldr_main_path, langcode,
                                            'languages')
        update_names(language_names_fwd, language_names_rev, language_data)

        script_data = read_cldr_name_file(cldr_main_path, langcode,
                                          'scripts')
        update_names(script_names_fwd, script_names_rev, script_data)

        territory_data = read_cldr_name_file(cldr_main_path, langcode,
                                             'territories')
        update_names(territory_names_fwd, territory_names_rev,
                     territory_data)

    # Add names from the IANA subtag registry.
    iana_languages, iana_scripts, iana_territories = read_iana_registry_names()
    update_names(language_names_fwd, language_names_rev, iana_languages)
    update_names(script_names_fwd, script_names_rev, iana_scripts)
    update_names(territory_names_fwd, territory_names_rev, iana_territories)

    # Add English-language names collected from Wiktionary.
    wiktionary_data = read_wiktionary_names(
        data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(language_names_fwd, language_names_rev, wiktionary_data)

    # Add our own supplementary language names.
    extra_language_data = read_csv_names(
        data_filename('extra_language_names.csv'))
    update_names(language_names_fwd, language_names_rev, extra_language_data)

    # Save the reverse (name-to-code) tries, one per language, and the
    # forward (code-to-name) tries, one per category.
    save_reverse_name_tables('language', language_names_rev)
    save_reverse_name_tables('script', script_names_rev)
    save_reverse_name_tables('territory', territory_names_rev)
    save_trie(language_names_fwd,
              data_filename('trie/language_to_name.marisa'))
    save_trie(script_names_fwd,
              data_filename('trie/script_to_name.marisa'))
    save_trie(territory_names_fwd,
              data_filename('trie/territory_to_name.marisa'))

    # Get the list of languages where we have any name data. These are base
    # language codes (without scripts or territories) which contain a name for
    # themselves.
    name_languages = [
        langcode for langcode in get_name_languages(cldr_path)
        if '-' not in langcode
        and '{}@{}'.format(langcode, langcode) in language_names_fwd
    ]

    # Add the languages that have autonyms in extra_language_data, perhaps because
    # we specifically put them there to get their autonyms right
    name_languages += [
        lang1 for (lang1, lang2, _, _) in extra_language_data
        if lang1 == lang2
    ]

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts)
        write_python_dict(outfile, 'LANGUAGE_REPLACEMENTS',
                          replacements['languageAlias'])
        write_python_dict(outfile, 'SCRIPT_REPLACEMENTS',
                          replacements['scriptAlias'])
        write_python_dict(outfile, 'TERRITORY_REPLACEMENTS',
                          replacements['territoryAlias'])
        write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)
        write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES',
                          norm_macrolanguages)
        write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)
        write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)
        write_python_dict(outfile, 'DISPLAY_SEPARATORS', display_separators)
        write_python_set(outfile, 'LANGUAGES_WITH_NAME_DATA', name_languages)