Example #1
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Scripts for each language are collected from, in order:
  - the lang-script pairs CLDR lists for each known region (these count
    as 'used'),
  - the scripts CLDR lists for each known language,
  - the likely script of each language seen so far, unless excluded,
  - the likely language of each non-excluded Unicode script code.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region."""

  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Region data: the only source of 'used' scripts.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        print('used lang is und for script %s in region %s' % (script, region))
        continue
      # Every used script is also recorded in all_lang_scripts, so the
      # keys of used_lang_scripts are always a subset of all_lang_scripts.
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # Per-language script lists.
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Likely script of each language.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Likely language of each script.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      print('adding script with unknown language %s' % script)
      # lang is 'und' here; the script is filed under the unknown language.
      all_lang_scripts[lang].add(script)
    else:
      print('### script %s with unknown language already seen' % script)

  # Split each language's scripts into used / unused.  Iterating
  # all_lang_scripts alone covers every language (see the subset note above).
  lang_data = {}
  for lang, scripts in all_lang_scripts.items():
    # .get() avoids inserting an empty entry into the defaultdict.
    used = set(used_lang_scripts.get(lang, ()))
    lang_data[lang] = (used, scripts - used)

  return lang_data
Example #2
0
def _create_script_to_default_lang(lang_script_data):
  """Iterates over all the scripts in lang_script_data, and returns a map from each script
  to the default language code, generally based on cldr likely subtag data.  This assigns
  'en' to Latn by fiat (cldr defaults to 'und').  Some other scripts (e.g. Dsrt) just
  get 'und'.

  lang_script_data maps each lang to a (used scripts, unused scripts) pair of sets.
  The scripts 'Zsym' and 'Qaae' are always included in the result.

  When cldr gives 'und' for a script other than Latn, the alphabetically first
  language that uses the script is chosen (preferring 'used' over 'unused'), so the
  result does not depend on set iteration order.

  This checks that the default lang for a script actually uses that script in lang_script_data,
  when the default lang is not 'und'.  A KeyError is raised if that default lang is not
  a key of lang_script_data at all.
  """

  script_to_default_lang = {}
  all_scripts = set()
  script_to_used = collections.defaultdict(set)
  script_to_unused = collections.defaultdict(set)
  for lang, (used, unused) in lang_script_data.items():
    all_scripts |= used
    all_scripts |= unused
    for script in used:
      script_to_used[script].add(lang)
    for script in unused:
      script_to_unused[script].add(lang)

  # Add scripts without langs.
  all_scripts.add('Zsym')
  all_scripts.add('Qaae')

  for script in sorted(all_scripts):
    default_lang = cldr_data.get_likely_subtags('und-' + script)[0]

    # NOTE: an override that forced 'und' for Kthi, Khar and Brah (CLDR has no
    # names for their likely languages, and two are collectives) used to live
    # here but is disabled.

    if default_lang == 'und':
      if script == 'Latn':
        default_lang = 'en' # cultural bias...
      else:
        print('no default lang for script %s' % script)
        langs = script_to_used[script]
        if langs:
          # min() rather than an arbitrary set element keeps this deterministic.
          default_lang = min(langs)
          print('using used lang %s from %s' % (default_lang, langs))
        else:
          langs = script_to_unused[script]
          if langs:
            default_lang = min(langs)
            print('using unused lang %s from %s' % (default_lang, langs))
          else:
            print('defaulting to \'und\'')
    else:
      used, unused = lang_script_data[default_lang]
      assert script in used or script in unused, (
          'default lang %s does not list script %s' % (default_lang, script))

    script_to_default_lang[script] = default_lang

  return script_to_default_lang
Example #3
0
def _create_script_to_default_lang(lang_script_data):
  """Iterates over all the scripts in lang_script_data, and returns a map
  from each script to the default language code, generally based on cldr
  likely subtag data.  This assigns 'en' to Latn by fiat (cldr defaults to
  'und').  Some other scripts (e.g. Dsrt) just get 'und'.

  lang_script_data maps each lang to a (used scripts, unused scripts) pair
  of sets.  The scripts 'Zsym' and 'Zsye' are always included in the result.

  When cldr gives 'und' for a script other than Latn, the alphabetically
  first language that uses the script is chosen (preferring 'used' over
  'unused'), so the result does not depend on set iteration order.

  This checks that the default lang for a script actually uses that script
  in lang_script_data, when the default lang is not 'und'.  A KeyError is
  raised if that default lang is not a key of lang_script_data at all.
  """

  script_to_default_lang = {}
  all_scripts = set()
  script_to_used = collections.defaultdict(set)
  script_to_unused = collections.defaultdict(set)
  for lang, (used, unused) in lang_script_data.items():
    all_scripts |= used
    all_scripts |= unused
    for script in used:
      script_to_used[script].add(lang)
    for script in unused:
      script_to_unused[script].add(lang)

  # Add scripts without langs.
  all_scripts.add('Zsym')
  all_scripts.add('Zsye')

  # Patch Klingon as default lang for (unused) script pIqaD.  This only
  # affects the fallback below; it does not add Piqd to all_scripts.
  script_to_used['Piqd'].add('tlh')

  for script in sorted(all_scripts):
    default_lang = cldr_data.get_likely_subtags('und-' + script)[0]

    if default_lang == 'und':
      if script == 'Latn':
        default_lang = 'en' # cultural bias...
      else:
        print('no default lang for script %s' % script)
        langs = script_to_used[script]
        if langs:
          # min() rather than an arbitrary set element keeps this
          # deterministic.
          default_lang = min(langs)
          print('using used lang %s from %s' % (default_lang, langs))
        else:
          langs = script_to_unused[script]
          if langs:
            default_lang = min(langs)
            print('using unused lang %s from %s' % (default_lang, langs))
          else:
            print('defaulting to \'und\'')
    else:
      used, unused = lang_script_data[default_lang]
      assert script in used or script in unused, (
          'default lang %s does not list script %s' % (default_lang, script))

    script_to_default_lang[script] = default_lang

  return script_to_default_lang
def get_family_id_to_default_lang_tag(family_id_to_lang_tags):
  """Return a mapping from family id to default lang tag.

  Every family id in family_id_to_lang_tags gets an entry (callers
  presumably pass only families that have multiple lang tags).  The default
  is based on likely subtags and the script of the family: the script is
  the capitalized second '-'-separated part of the family id, or Latn when
  the id has a single part (LGC).

  The chosen tag is, in order of preference:
  - 'en' for Latn and 'ur' for Aran when the likely lang is 'und',
  - 'und-<script>' for any other script whose likely lang is 'und',
  - the likely lang itself when it is one of the family's lang tags,
  - otherwise '<lang>-<script>' (a message is printed if that tag is not
    among the family's lang tags either, but it is still used).
  """
  # TODO(dougfelt): this reintroduces language tags that we'd previously filtered
  # out.  We should not be doing this here.  Figure out a better way to handle this.

  family_id_to_default_lang_tag = {}
  # .items() works on both Python 2 and 3 dicts.
  for family_id, lang_tags in family_id_to_lang_tags.items():
    parts = family_id.split('-')
    # A single-part id is 'sans' or 'serif'.
    script = 'Latn' if len(parts) == 1 else parts[1].capitalize()

    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    # CLDR has no names for these, and two are collectives, so it's simpler to omit them.
    if script in ('Kthi', 'Khar', 'Brah'):
      print('likely lang for %s is %s, replace with und' % (script, lang))
      lang = 'und'

    if lang == 'und':
      # special case
      if script == 'Latn':
        lang_tag = 'en'
      elif script == 'Aran':
        lang_tag = 'ur'
      else:
        lang_tag = 'und' + '-' + script
    elif lang in lang_tags:
      lang_tag = lang
    else:
      lang_tag = lang + '-' + script
      if lang_tag not in lang_tags:
        print('Akk, lang and lang_scr \'%s\' not listed for family %s' % (
            lang_tag, family_id))
    family_id_to_default_lang_tag[family_id] = lang_tag
  return family_id_to_default_lang_tag
Example #5
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Scripts for each language are collected from, in order:
  - the lang-script pairs CLDR lists for each known region (these count
    as 'used'),
  - the scripts CLDR lists for each known language,
  - the likely script of each language seen so far, unless excluded,
  - the likely language of each non-excluded Unicode script code,
  - a patch adding Jpan to 'ryu'.

  Diagnostic messages are printed only when _DEBUG is true.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region."""

  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Region data: the only source of 'used' scripts.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        if _DEBUG:
          print('used lang is und for script %s in region %s' % (script, region))
        continue
      # Every used script is also recorded in all_lang_scripts, so the
      # keys of used_lang_scripts are always a subset of all_lang_scripts.
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # Per-language script lists.
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  # Likely script of each language.
  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Likely language of each script.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if _DEBUG and script not in all_lang_scripts[lang]:
        print('# adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      if _DEBUG:
        print('# adding script with unknown language %s' % script)
      # lang is 'und' here; the script is filed under the unknown language.
      all_lang_scripts[lang].add(script)
    elif _DEBUG:
      print('### script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Split each language's scripts into used / unused.  Iterating
  # all_lang_scripts alone covers every language (see the subset note above).
  lang_data = {}
  for lang, scripts in all_lang_scripts.items():
    # .get() avoids inserting an empty entry into the defaultdict.
    used = set(used_lang_scripts.get(lang, ()))
    lang_data[lang] = (used, scripts - used)

  return lang_data
def get_used_lang_data(supported_scripts):
  """Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region

  supported_scripts is an iterable of script codes.  Languages none of whose
  scripts are supported are omitted, as is a fixed list of languages that
  are patched out at the end.

  When a language has no script used in any region, its likely script (if
  not 'Zzzz') is reported as its single used script.  Region data for
  script 'Kana' is remapped to 'Jpan'."""

  # Accept any iterable; the set intersection below needs a real set.
  supported_scripts = frozenset(supported_scripts)

  # Get additional scripts for a lang by using get_likely_subtags from script to
  # lang.  This might not be the same as the likely script for a lang, but it does
  # indicate the language can be written in the script, or so we assume.
  # Several scripts can resolve to the same lang, so keep all of them rather
  # than letting whichever was seen last win.
  lang_to_additional_scripts = collections.defaultdict(set)
  for script in sorted(supported_scripts):
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      lang_to_additional_scripts[lang].add(script)

  unsupported_scripts = set()
  lang_data = {}
  used_lang_scripts = collections.defaultdict(set)
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan' % lang_script)
        script = 'Jpan'
      if script not in supported_scripts:
        unsupported_scripts.add(script)
      # Unsupported scripts are reported but still recorded as used.
      used_lang_scripts[lang].add(script)

  if unsupported_scripts:
    print('used scripts that are not supported: %s' % ', '.join(sorted(unsupported_scripts)))

  known_langs = set(cldr_data.known_langs())
  for lang in sorted(lang_to_additional_scripts):
    if lang not in known_langs:
      print('lang %s not in known langs' % lang)
      known_langs.add(lang)

  for lang in sorted(known_langs):
    if lang in ('ryu', 'ain'):
      all_scripts = set(['Jpan'])
    else:
      all_scripts = set(cldr_data.lang_to_scripts(lang))

    # add additional scripts for lang
    for script in sorted(lang_to_additional_scripts.get(lang, ())):
      if script not in all_scripts:
        print('cldr data does not have script %s for lang %s' % (script, lang))
        all_scripts.add(script)

    if not all_scripts & supported_scripts:
      print('no supported scripts among %s for lang %s' % (all_scripts, lang))
      continue

    used_scripts = set(used_lang_scripts.get(lang, ()))
    if not used_scripts:
      # Fall back to the likely script; 'Zzzz' is treated as no script.
      script = cldr_data.get_likely_script(lang)
      if script != 'Zzzz':
        used_scripts = set([script])

    unused_scripts = all_scripts - used_scripts
    lang_data[lang] = (used_scripts, unused_scripts)

  # Patch out langs whose sample data Noto doesn't support
  # A bunch of these resolve to the same sample.  Would be easier to check if I just had
  # sample names independent of language names, but then harder to remove the languages.
  for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
    if lang not in lang_data:
      print('patched out lang %s not present' % lang)
    else:
      print('patch out lang %s' % lang)
      del lang_data[lang]

  return lang_data