def generate_table(filename):
  """Write an HTML table of the available sample texts to filename.

  The file is written as UTF-8.  Samples come from _get_script_to_samples()
  and are emitted under one heading row per script, in sorted script order.
  Each sample row holds the BCP tag with its script removed, the English
  language name, the sample type and the sample text.

  Args:
    filename: path of the HTML file to create or overwrite.
  """
  with codecs.open(filename, 'w', 'utf-8') as f:
    script_to_samples = _get_script_to_samples()
    f.write(_HTML_HEADER + '\n')
    f.write('<table>\n')
    f.write('<tr><th>Script<br/>BCP<th>name<th>type<th>text\n')

    for script, samples in sorted(script_to_samples.items()):
      script_en = cldr_data.get_english_script_name(script)
      f.write('<tr><th colspan=4>%s\n' % script_en)
      for bcp, sample_type, sample_text in samples:
        # Default to the full tag so the row can still be written when the
        # tag cannot be parsed.  Without this the name was unbound (or left
        # over from the previous row) whenever parsing raised.
        bcp_no_script = bcp
        try:
          lsrv = cldr_data.loc_tag_to_lsrv(bcp)
          # Drop the script subtag; the script is already the group heading.
          lsrv = (lsrv[0], None, lsrv[2], lsrv[3])
          bcp_no_script = cldr_data.lsrv_to_loc_tag(lsrv)
          bcp_en = cldr_data.get_english_language_name(bcp_no_script)
          if not bcp_en:
            bcp_en = 'No name'
          if bcp_en == 'Unknown Language' and sample_type == 'chars':
            bcp_en = '(characters)'
        except Exception:
          # Best effort: report the problem and show the raw tag instead.
          print('could not get english name for %s' % bcp)
          bcp_en = bcp

        cols = ['<tr>', bcp_no_script, bcp_en, sample_type, sample_text]
        f.write('<td>'.join(cols) + '\n')
      # Spacer row between script groups.
      f.write('<tr><td colspan=4>&nbsp;\n')
    f.write('</table>\n')
    f.write(_HTML_FOOTER + '\n')
# Example #2
# 0
def generate_table(filename):
    """Generate an HTML page listing every sample text, grouped by script.

    The output is UTF-8 encoded.  The data comes from
    _get_script_to_samples(); scripts are visited in sorted order and each
    gets a heading row followed by one row per sample (script-less BCP tag,
    English language name, sample type, sample text).

    Args:
      filename: path of the HTML file to create or overwrite.
    """
    with codecs.open(filename, 'w', 'utf-8') as f:

        def emit(line):
            # One line of output, newline-terminated.
            f.write(line + '\n')

        script_to_samples = _get_script_to_samples()
        emit(_HTML_HEADER)
        emit('<table>')
        emit('<tr><th>Script<br/>BCP<th>name<th>type<th>text')

        for script, samples in sorted(script_to_samples.items()):
            script_en = cldr_data.get_english_script_name(script)
            emit('<tr><th colspan=4>%s' % script_en)
            for bcp, sample_type, sample_text in samples:
                # If the tag cannot be parsed, fall back to showing it
                # unchanged; otherwise this name would be undefined (or
                # carry the previous row's value) in the failure path.
                bcp_no_script = bcp
                try:
                    lang, _, region, variant = cldr_data.loc_tag_to_lsrv(bcp)[:4]
                    bcp_no_script = cldr_data.lsrv_to_loc_tag(
                        (lang, None, region, variant))
                    bcp_en = cldr_data.get_english_language_name(bcp_no_script)
                    if not bcp_en:
                        bcp_en = 'No name'
                    if bcp_en == 'Unknown Language' and sample_type == 'chars':
                        bcp_en = '(characters)'
                except Exception:
                    # Best effort: log and use the raw tag as the name.
                    print('could not get english name for %s' % bcp)
                    bcp_en = bcp

                emit('<td>'.join(
                    ['<tr>', bcp_no_script, bcp_en, sample_type, sample_text]))
            # Blank spacer row after each script group.
            emit('<tr><td colspan=4>&nbsp;')
        emit('</table>')
        emit(_HTML_FOOTER)
def generate_table(filename):
    """Write an HTML table of the available sample texts to filename.

    The file is written as UTF-8.  Samples come from
    _get_script_to_samples() and are emitted under one heading row per
    script, in sorted script order.  Each sample row holds the BCP tag with
    its script removed, the English language name, the sample type and the
    sample text.

    Args:
      filename: path of the HTML file to create or overwrite.
    """
    with codecs.open(filename, "w", "utf-8") as f:
        script_to_samples = _get_script_to_samples()
        f.write(_HTML_HEADER)
        f.write("<table>\n")
        f.write("<tr><th>Script<br/>BCP<th>name<th>type<th>text\n")

        for script, samples in sorted(script_to_samples.items()):
            script_en = cldr_data.get_english_script_name(script)
            f.write("<tr><th colspan=4>%s\n" % script_en)
            for bcp, sample_type, sample_text in samples:
                # Default to the full tag so the row can still be written
                # when the tag cannot be parsed.  Without this the name was
                # unbound (or left over from the previous row) whenever
                # parsing raised.
                bcp_no_script = bcp
                try:
                    lsrv = cldr_data.loc_tag_to_lsrv(bcp)
                    # Drop the script subtag; the script is the group heading.
                    lsrv = (lsrv[0], None, lsrv[2], lsrv[3])
                    bcp_no_script = cldr_data.lsrv_to_loc_tag(lsrv)
                    bcp_en = cldr_data.get_english_language_name(bcp_no_script)
                    if not bcp_en:
                        bcp_en = "No name"
                    if bcp_en == "Unknown Language" and sample_type == "chars":
                        bcp_en = "(characters)"
                except Exception:
                    # Best effort: report the problem and show the raw tag.
                    print("could not get english name for %s" % bcp)
                    bcp_en = bcp

                cols = ["<tr>", bcp_no_script, bcp_en, sample_type, sample_text]
                f.write("<td>".join(cols) + "\n")
            # Spacer row between script groups.
            f.write("<tr><td colspan=4>&nbsp;\n")
        f.write("</table>\n")
        f.write(_HTML_FOOTER + "\n")
def _init_lang_for_script_map():
  """Fill _lang_for_script_map with one language per script.

  Locale tags are taken in the order returned by
  cldr_data.get_lang_scrs_by_decreasing_global_lit_pop(); the language of
  the first tag seen for a script is the one recorded, later tags for the
  same script are ignored.
  """
  ranked_tags = [
      tag for _, tag in cldr_data.get_lang_scrs_by_decreasing_global_lit_pop()]
  for tag in ranked_tags:
    parsed = cldr_data.loc_tag_to_lsrv(tag)
    script = parsed[1]
    if script in _lang_for_script_map:
      continue
    _lang_for_script_map[script] = parsed[0]
# Example #5
# 0
def _init_lang_for_script_map():
  """Record, for each script, the language of the first locale that uses it.

  The locales come from cldr_data.get_lang_scrs_by_decreasing_global_lit_pop()
  and are processed in the order returned, so an earlier locale takes
  precedence over later ones with the same script.  Results are stored in
  the module-level _lang_for_script_map.
  """
  pairs = cldr_data.get_lang_scrs_by_decreasing_global_lit_pop()
  locs_in_order = [loc for _, loc in pairs]
  for lsrv in (cldr_data.loc_tag_to_lsrv(loc) for loc in locs_in_order):
    script = lsrv[1]
    already_known = script in _lang_for_script_map
    if not already_known:
      _lang_for_script_map[script] = lsrv[0]
# Example #6
# 0
def _get_script_to_samples():
    """Collect the sample texts under [tools]/sample_texts, grouped by script.

    Only files named '<bcp>_<type>.txt' are considered.  Other names are
    reported and skipped, as are tags that cldr_data.loc_tag_to_lsrv cannot
    parse.  Samples whose script is 'Latn' are left out.  When a tag has
    several sample types, the 'udhr' one is used if present, otherwise the
    first one in sorted file name order.

    Returns:
      A defaultdict mapping script to a list of
      (bcp, sample_type, sample_text) tuples ordered by bcp.
    """
    script_to_samples = collections.defaultdict(list)

    sample_dir = tool_utils.resolve_path('[tools]/sample_texts')
    for name in sorted(os.listdir(sample_dir)):
        base, ext = path.splitext(name)
        parts = base.split('_')
        # Require exactly one underscore: a name with more would otherwise
        # make the two-way unpack below raise ValueError and abort the scan.
        if ext != '.txt' or len(parts) != 2:
            print('skipping %s' % name)
            continue
        bcp, sample_type = parts
        try:
            _, script, _, _ = cldr_data.loc_tag_to_lsrv(bcp)
        except Exception:
            print('bcp %s did not parse as lsrv' % bcp)
            continue
        if script == 'Latn':
            continue
        script_to_samples[script].append((bcp, sample_type))

    # sorted() snapshots the items first, so rebinding values below is safe.
    for script, samples in sorted(script_to_samples.items()):
        # Pick one sample type per tag: first seen, overridden by 'udhr'.
        pref = {}
        for bcp, sample_type in samples:
            if bcp not in pref or sample_type == 'udhr':
                pref[bcp] = sample_type

        full_samples = []
        for bcp, sample_type in sorted(pref.items()):
            filepath = path.join(sample_dir, '%s_%s.txt' % (bcp, sample_type))
            with codecs.open(filepath, 'r', 'utf-8') as sample_file:
                sample_text = sample_file.read()
            full_samples.append((bcp, sample_type, sample_text))

        script_to_samples[script] = full_samples

    return script_to_samples
def _get_script_to_samples():
    """Collect the sample texts under [tools]/sample_texts, grouped by script.

    Only files named '<bcp>_<type>.txt' are considered.  Other names are
    reported and skipped, as are tags that cldr_data.loc_tag_to_lsrv cannot
    parse.  Samples whose script is 'Latn' are left out.  When a tag has
    several sample types, the 'udhr' one is used if present, otherwise the
    first one in sorted file name order.

    Returns:
      A defaultdict mapping script to a list of
      (bcp, sample_type, sample_text) tuples ordered by bcp.
    """
    script_to_samples = collections.defaultdict(list)

    sample_dir = tool_utils.resolve_path("[tools]/sample_texts")
    for name in sorted(os.listdir(sample_dir)):
        base, ext = path.splitext(name)
        parts = base.split("_")
        # Require exactly one underscore: a name with more would otherwise
        # make the two-way unpack below raise ValueError and abort the scan.
        if ext != ".txt" or len(parts) != 2:
            print("skipping", name)
            continue
        bcp, sample_type = parts
        try:
            _, script, _, _ = cldr_data.loc_tag_to_lsrv(bcp)
        except Exception:
            print("bcp %s did not parse as lsrv" % bcp)
            continue
        if script == "Latn":
            continue
        script_to_samples[script].append((bcp, sample_type))

    # sorted() snapshots the items first, so rebinding values below is safe.
    for script, samples in sorted(script_to_samples.items()):
        # Pick one sample type per tag: first seen, overridden by 'udhr'.
        pref = {}
        for bcp, sample_type in samples:
            if bcp not in pref or sample_type == "udhr":
                pref[bcp] = sample_type

        full_samples = []
        for bcp, sample_type in sorted(pref.items()):
            filepath = path.join(sample_dir, "%s_%s.txt" % (bcp, sample_type))
            with codecs.open(filepath, "r", "utf-8") as sample_file:
                sample_text = sample_file.read()
            full_samples.append((bcp, sample_type, sample_text))

        script_to_samples[script] = full_samples

    return script_to_samples
def _get_script_to_samples():
  """Return the sample texts found in [tools]/sample_texts, keyed by script.

  A sample file must be named '<bcp>_<type>.txt'; files with any other name
  are reported and skipped, and so are tags that cldr_data.loc_tag_to_lsrv
  fails on.  'Latn' samples are excluded.  One sample type is kept per tag:
  'udhr' when available, else the first in sorted file name order.

  Returns:
    A defaultdict mapping script to a list of
    (bcp, sample_type, sample_text) tuples ordered by bcp.
  """
  script_to_samples = collections.defaultdict(list)

  sample_dir = tool_utils.resolve_path('[tools]/sample_texts')
  for sample_name in sorted(os.listdir(sample_dir)):
    base, ext = path.splitext(sample_name)
    pieces = base.split('_')
    # Names with zero or several underscores do not match the expected
    # pattern; the latter used to raise ValueError on the unpack below.
    if ext != '.txt' or len(pieces) != 2:
      print('skipping %s' % sample_name)
      continue
    bcp, sample_type = pieces
    try:
      _, script, _, _ = cldr_data.loc_tag_to_lsrv(bcp)
    except Exception:
      print('bcp %s did not parse as lsrv' % bcp)
      continue
    if script == 'Latn':
      continue
    script_to_samples[script].append((bcp, sample_type))

  # sorted() copies the items up front, so replacing values here is safe.
  for script, samples in sorted(script_to_samples.items()):
    # First sample type seen for a tag wins unless a 'udhr' one shows up.
    pref = {}
    for bcp, sample_type in samples:
      if bcp not in pref or sample_type == 'udhr':
        pref[bcp] = sample_type

    full_samples = []
    for bcp, sample_type in sorted(pref.items()):
      filename = '%s_%s.txt' % (bcp, sample_type)
      with codecs.open(path.join(sample_dir, filename), 'r', 'utf-8') as infile:
        full_samples.append((bcp, sample_type, infile.read()))

    script_to_samples[script] = full_samples

  return script_to_samples
def get_script_to_exemplar_data_map():
  """Return a map from script to a dict of exemplar info keyed by locale tag.

  Each value in the per-script dict is a 3-tuple of:
    - locale tuple (lang, script, region, variant)
    - source of the exemplar data: the CLDR directory name joined with the
      file name, or '[extra locale data]/<loc_tag>'
    - tuple of the exemplar chars

  CLDR files from the 'common', 'seed' and 'exemplars' directories are read
  in that order and the first source seen for a locale tag wins.  Entries
  from extra_locale_data.EXEMPLARS only fill in tags that are still missing.
  """

  def is_letter_or_digit(cp):
    cat = unicode_data.category(cp)
    return cat[0] == 'L' or cat == 'Nd'

  script_map = collections.defaultdict(dict)
  for directory in ['common', 'seed', 'exemplars']:
    data_dir = path.join(directory, 'main')
    for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
      if not filename.endswith('.xml'):
        continue

      exemplar_list = cldr_data.get_exemplar_from_file(path.join(data_dir, filename))
      if not exemplar_list:
        if _VERBOSE:
          print('  no exemplar list for %s' % path.join(data_dir, filename))
        continue

      # Strip the '.xml' suffix to get the locale tag.
      lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
      if not lsrv:
        if _VERBOSE:
          print('  no lsrv for %s' % path.join(data_dir, filename))
        continue
      src = path.join(directory, filename)
      script = lsrv[1]
      if not script:
        if _VERBOSE:
          print('  no script for %s' % path.join(data_dir, filename))
        continue

      loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
      loc_to_exemplar_info = script_map[script]
      if loc_tag in loc_to_exemplar_info:
        if _VERBOSE:
          print('skipping %s, already have exemplars for %s from %s' % (
              src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
        continue

      # fix exemplars that look incorrect
      no_latin = script == 'Arab' and 'd' in exemplar_list
      if no_latin and _VERBOSE:
        print('found \'d\' in %s for %s' % (src, lsrv))

      # exclude exemplar strings, and restrict to letters and digits.
      # A list comprehension (rather than filter()) keeps this a real
      # sequence on Python 3 as well.
      filtered_exemplar_list = [
          cp for cp in exemplar_list
          if len(cp) == 1
          and is_letter_or_digit(cp)
          and not (no_latin and cp in 'df')]

      # some exemplar lists don't surround strings with curly braces, and end up
      # with duplicate characters.  Flag these
      exemplar_chars = set()
      dup_chars = set()
      fixed_exemplar_list = []
      for cp in filtered_exemplar_list:
        if cp in exemplar_chars:
          dup_chars.add(cp)
        else:
          exemplar_chars.add(cp)
          fixed_exemplar_list.append(cp)
      if dup_chars and _VERBOSE:
        print('duplicate exemplars in %s: %s' % (
            src, ', '.join([u'\u200e%s\u200e (%x)' % (cp, ord(cp)) for cp in dup_chars])))
      loc_to_exemplar_info[loc_tag] = (lsrv, src, tuple(fixed_exemplar_list))

  # supplement with extra locale data
  for loc_tag in extra_locale_data.EXEMPLARS:
    exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
    # Tags here are expected to be exactly '<lang>-<script>'.
    lang, script = loc_tag.split('-')
    lsrv = (lang, script, None, None)
    loc_to_exemplar_info = script_map[script]
    src = '[extra locale data]/%s' % loc_tag
    if loc_tag in loc_to_exemplar_info:
      if _VERBOSE:
        print('skipping %s, already have exemplars for %s from %s' % (
            src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
      continue

    # restrict to letters, except for zsym
    if 'Zsym' not in loc_tag:
      # Built as a list so that len() works on Python 3 too, where filter()
      # returns an iterator.
      filtered_exemplar_list = [
          cp for cp in exemplar_list if is_letter_or_digit(cp)]
      if len(filtered_exemplar_list) != len(exemplar_list) and _VERBOSE:
        print('filtered some characters from %s' % src)
    else:
      filtered_exemplar_list = exemplar_list
    loc_to_exemplar_info[loc_tag] = (lsrv, src, tuple(filtered_exemplar_list))

  return script_map
def get_script_to_exemplar_data_map():
    """Return a map from script to a dict of exemplar info keyed by locale tag.

    Each value in the per-script dict is a 3-tuple of:
    - locale tuple (lang, script, region, variant)
    - source of the exemplar data: the CLDR directory name joined with the
      file name, or '[extra locale data]/<loc_tag>'
    - tuple of the exemplar chars

    CLDR files from the 'common', 'seed' and 'exemplars' directories are
    read in that order and the first source seen for a locale tag wins.
    Entries from extra_locale_data.EXEMPLARS only fill in tags that are
    still missing.
    """

    script_map = collections.defaultdict(dict)
    for directory in ['common', 'seed', 'exemplars']:
        data_dir = path.join(directory, 'main')
        for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
            if not filename.endswith('.xml'):
                continue

            rel_path = path.join(data_dir, filename)
            exemplar_list = cldr_data.get_exemplar_from_file(rel_path)
            if not exemplar_list:
                if _VERBOSE:
                    print('  no exemplar list for %s' % rel_path)
                continue

            # Strip the '.xml' suffix to get the locale tag.
            lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
            if not lsrv:
                if _VERBOSE:
                    print('  no lsrv for %s' % rel_path)
                continue
            src = path.join(directory, filename)
            script = lsrv[1]
            if not script:
                if _VERBOSE:
                    print('  no script for %s' % rel_path)
                continue

            loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
            loc_to_exemplar_info = script_map[script]
            if loc_tag in loc_to_exemplar_info:
                if _VERBOSE:
                    print('skipping %s, already have exemplars for %s from %s' % (
                        src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
                continue

            # fix exemplars that look incorrect
            if script == 'Arab' and 'd' in exemplar_list:
                if _VERBOSE:
                    print('found \'d\' in %s for %s' % (src, lsrv))
                no_latin = True
            else:
                no_latin = False

            # exclude exemplar strings, and restrict to letters and digits
            def accept_cp(cp):
                if len(cp) != 1:
                    return False
                cat = unicode_data.category(cp)
                if cat[0] != 'L' and cat != 'Nd':
                    return False
                if no_latin and cp in 'df':
                    return False
                return True

            # some exemplar lists don't surround strings with curly braces, and end up
            # with duplicate characters.  Flag these
            exemplar_chars = set()
            dup_chars = set()
            fixed_exemplar_list = []
            # accept_cp is applied inline, in this same iteration, so it
            # always sees the no_latin value computed just above.
            for cp in exemplar_list:
                if not accept_cp(cp):
                    continue
                if cp in exemplar_chars:
                    dup_chars.add(cp)
                else:
                    exemplar_chars.add(cp)
                    fixed_exemplar_list.append(cp)
            if dup_chars and _VERBOSE:
                print('duplicate exemplars in %s: %s' % (src, ', '.join([
                    u'\u200e%s\u200e (%x)' % (cp, ord(cp)) for cp in dup_chars
                ])))
            loc_to_exemplar_info[loc_tag] = (lsrv, src,
                                             tuple(fixed_exemplar_list))

    # restrict to letters, except for zsym
    def accept_extra_cp(cp):
        cat = unicode_data.category(cp)
        return cat[0] == 'L' or cat == 'Nd'

    # supplement with extra locale data
    for loc_tag in extra_locale_data.EXEMPLARS:
        exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
        # Tags here are expected to be exactly '<lang>-<script>'.
        lang, script = loc_tag.split('-')
        lsrv = (lang, script, None, None)
        loc_to_exemplar_info = script_map[script]
        src = '[extra locale data]/%s' % loc_tag
        if loc_tag in loc_to_exemplar_info:
            if _VERBOSE:
                print('skipping %s, already have exemplars for %s from %s' % (
                    src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
            continue

        if 'Zsym' not in loc_tag:
            # Materialise as a list: on Python 3 filter() returns an
            # iterator, which has no len().
            filtered_exemplar_list = [
                cp for cp in exemplar_list if accept_extra_cp(cp)
            ]
            if len(filtered_exemplar_list) != len(exemplar_list) and _VERBOSE:
                print('filtered some characters from %s' % src)
        else:
            filtered_exemplar_list = exemplar_list
        loc_to_exemplar_info[loc_tag] = (lsrv, src,
                                         tuple(filtered_exemplar_list))

    return script_map