Esempio n. 1
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Scripts are gathered per language from, in order:
  1. the lang-script pairs CLDR lists for each known region (these count as
     'used'),
  2. the scripts CLDR lists for each known language,
  3. the likely script of every language seen so far, unless excluded,
  4. every non-excluded Unicode script code, attached to its likely language
     (or to 'und' when no language is known and the script is new).

  Diagnostic messages are printed to stdout while the data is built.

  Returns:
    A dict mapping each lang to a tuple of:
    - a set of scripts used in some region
    - a set of scripts not used in any region.
    Both sets are fresh objects owned by the caller.
  """

  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        # The script is recorded as known, but an undetermined language is
        # never treated as 'used'.
        print('used lang is und for script %s in region %s' %
              (script, region))
        continue
      # Anything used is also registered in all_lang_scripts, so the keys of
      # used_lang_scripts are always a subset of those of all_lang_scripts.
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    # Only the first likely subtag is consulted; presumably it is the
    # language code -- TODO confirm against cldr_data.get_likely_subtags.
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      print('adding script with unknown language %s' % script)
      # lang is 'und' here, so the script is filed under 'und'.
      all_lang_scripts[lang].add(script)
    else:
      print('### script %s with unknown language already seen' % script)

  # One pass over all_lang_scripts covers every language exactly once (see
  # the subset note above). .get() is used so that looking up a language
  # with no used scripts does not insert a key into the defaultdict.
  lang_data = {}
  for lang, scripts in all_lang_scripts.items():
    used = used_lang_scripts.get(lang, set())
    # set(used) and the set difference both produce new sets, so callers
    # never share state with the working tables.
    lang_data[lang] = (set(used), scripts - used)

  return lang_data
Esempio n. 2
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Scripts are gathered per language from, in order:
  1. the lang-script pairs CLDR lists for each known region (these count as
     'used'),
  2. the scripts CLDR lists for each known language,
  3. the likely script of every language seen so far, unless excluded,
  4. every non-excluded Unicode script code, attached to its likely language
     (or to 'und' when no language is known and the script is new),
  5. a hard-coded patch adding Jpan to ryu.

  When the module-level _DEBUG flag is set, diagnostic messages are printed
  to stdout while the data is built.

  Returns:
    A dict mapping each lang to a tuple of:
    - a set of scripts used in some region
    - a set of scripts not used in any region.
    Both sets are fresh objects owned by the caller.
  """

  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        # The script is recorded as known, but an undetermined language is
        # never treated as 'used'.
        if _DEBUG:
          print('used lang is und for script %s in region %s' %
                (script, region))
        continue
      # Anything used is also registered in all_lang_scripts, so the keys of
      # used_lang_scripts are always a subset of those of all_lang_scripts.
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    # Only the first likely subtag is consulted; presumably it is the
    # language code -- TODO confirm against cldr_data.get_likely_subtags.
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if _DEBUG and script not in all_lang_scripts[lang]:
        print('# adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      if _DEBUG:
        print('# adding script with unknown language %s' % script)
      # lang is 'und' here, so the script is filed under 'und'.
      all_lang_scripts[lang].add(script)
    elif _DEBUG:
      print('### script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # One pass over all_lang_scripts covers every language exactly once (see
  # the subset note above). .get() is used so that looking up a language
  # with no used scripts does not insert a key into the defaultdict.
  lang_data = {}
  for lang, scripts in all_lang_scripts.items():
    used = used_lang_scripts.get(lang, set())
    # set(used) and the set difference both produce new sets, so callers
    # never share state with the working tables.
    lang_data[lang] = (set(used), scripts - used)

  return lang_data