def get_script_to_exemplar_data_map():
  """Collect exemplar characters per script from CLDR files and extra data.

  Returns a defaultdict keyed by script.  Each value is a dict keyed by
  locale tag, whose values are 3-tuples of:
    - the locale tuple (lang, script, region, variant)
    - a label naming where the data came from: the CLDR subdirectory joined
      with the file name, or '[extra locale data]/<loc_tag>'
    - a tuple of the accepted exemplar chars

  The first source that supplies a locale tag wins; later ones are skipped."""

  # NOTE(review): this name is defined more than once in this file; the last
  # definition executed is the one callers actually get.

  def is_letter_or_digit(cp):
    # Accept a category starting with 'L', or exactly 'Nd' (letters/digits).
    cat = unicode_data.category(cp)
    return cat[0] == 'L' or cat == 'Nd'

  by_script = collections.defaultdict(dict)

  for directory in ['common', 'seed', 'exemplars']:
    data_dir = path.join(directory, 'main')
    for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
      if not filename.endswith('.xml'):
        continue

      rel_path = path.join(data_dir, filename)
      exemplar_list = cldr_data.get_exemplar_from_file(rel_path)
      if not exemplar_list:
        if _VERBOSE:
          print('  no exemplar list for %s' % rel_path)
        continue

      # The locale tag is the file name without its '.xml' suffix.
      lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
      if not lsrv:
        if _VERBOSE:
          print('  no lsrv for %s' % rel_path)
        continue

      src = path.join(directory, filename)
      script = lsrv[1]
      if not script:
        if _VERBOSE:
          print('  no script for %s' % rel_path)
        continue

      loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
      per_locale = by_script[script]
      if loc_tag in per_locale:
        if _VERBOSE:
          print('skipping %s, already have exemplars for %s from %s' % (
              src, loc_tag, per_locale[loc_tag][1]))
        continue

      # An Arab exemplar list containing 'd' looks incorrect; when that
      # happens both 'd' and 'f' are dropped below.
      no_latin = script == 'Arab' and 'd' in exemplar_list
      if no_latin and _VERBOSE:
        print('found \'d\' in %s for %s' % (src, lsrv))

      # One pass: drop multi-character exemplar strings and anything that is
      # not a letter or digit, then keep only the first occurrence of each
      # character while remembering which ones were repeated.  Some exemplar
      # lists don't surround strings with curly braces and so end up with
      # duplicate characters.
      seen = set()
      repeated = set()
      unique_chars = []
      for cp in exemplar_list:
        if len(cp) != 1:
          continue
        if not is_letter_or_digit(cp):
          continue
        if no_latin and cp in 'df':
          continue
        if cp in seen:
          repeated.add(cp)
          continue
        seen.add(cp)
        unique_chars.append(cp)

      if repeated and _VERBOSE:
        described = [u'\u200e%s\u200e (%x)' % (cp, ord(cp)) for cp in repeated]
        print('duplicate exemplars in %s: %s' % (src, ', '.join(described)))

      per_locale[loc_tag] = (lsrv, src, tuple(unique_chars))

  # supplement with extra locale data
  for loc_tag in extra_locale_data.EXEMPLARS:
    exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
    lang, script = loc_tag.split('-')
    per_locale = by_script[script]
    src = '[extra locale data]/%s' % loc_tag
    if loc_tag in per_locale:
      if _VERBOSE:
        print('skipping %s, already have exemplars for %s from %s' % (
            src, loc_tag, per_locale[loc_tag][1]))
      continue

    # Zsym data is taken as-is; everything else is restricted to letters and
    # digits.
    if 'Zsym' in loc_tag:
      kept = exemplar_list
    else:
      kept = [cp for cp in exemplar_list if is_letter_or_digit(cp)]
      if len(kept) != len(exemplar_list) and _VERBOSE:
        print('filtered some characters from %s' % src)

    per_locale[loc_tag] = ((lang, script, None, None), src, tuple(kept))

  return by_script
def get_script_to_exemplar_data_map():
    """Collect exemplar characters per script from CLDR files and extra data.

    Returns a defaultdict keyed by script.  Each value is a dict keyed by
    locale tag, whose values are 3-tuples of:
    - the locale tuple (lang, script, region, variant)
    - a label naming where the data came from: the CLDR subdirectory joined
      with the file name, or '[extra locale data]/<loc_tag>'
    - a tuple of the accepted exemplar chars

    The first source that supplies a locale tag wins; later ones are skipped."""

    # NOTE(review): this name is defined more than once in this file; the last
    # definition executed is the one callers actually get.

    def is_letter_or_digit(cp):
        # Accept a category starting with 'L', or exactly 'Nd'
        # (letters/digits).
        cat = unicode_data.category(cp)
        return cat[0] == 'L' or cat == 'Nd'

    by_script = collections.defaultdict(dict)

    for directory in ['common', 'seed', 'exemplars']:
        data_dir = path.join(directory, 'main')
        for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
            if not filename.endswith('.xml'):
                continue

            rel_path = path.join(data_dir, filename)
            exemplar_list = cldr_data.get_exemplar_from_file(rel_path)
            if not exemplar_list:
                if _VERBOSE:
                    print('  no exemplar list for %s' % rel_path)
                continue

            # The locale tag is the file name without its '.xml' suffix.
            lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
            if not lsrv:
                if _VERBOSE:
                    print('  no lsrv for %s' % rel_path)
                continue

            src = path.join(directory, filename)
            script = lsrv[1]
            if not script:
                if _VERBOSE:
                    print('  no script for %s' % rel_path)
                continue

            loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
            per_locale = by_script[script]
            if loc_tag in per_locale:
                if _VERBOSE:
                    print('skipping %s, already have exemplars for %s from %s' % (
                        src, loc_tag, per_locale[loc_tag][1]))
                continue

            # An Arab exemplar list containing 'd' looks incorrect; when that
            # happens both 'd' and 'f' are dropped below.
            no_latin = script == 'Arab' and 'd' in exemplar_list
            if no_latin and _VERBOSE:
                print('found \'d\' in %s for %s' % (src, lsrv))

            # One pass: drop multi-character exemplar strings and anything
            # that is not a letter or digit, then keep only the first
            # occurrence of each character while remembering which ones were
            # repeated.  Some exemplar lists don't surround strings with curly
            # braces and so end up with duplicate characters.
            seen = set()
            repeated = set()
            unique_chars = []
            for cp in exemplar_list:
                if len(cp) != 1:
                    continue
                if not is_letter_or_digit(cp):
                    continue
                if no_latin and cp in 'df':
                    continue
                if cp in seen:
                    repeated.add(cp)
                    continue
                seen.add(cp)
                unique_chars.append(cp)

            if repeated and _VERBOSE:
                described = [
                    u'\u200e%s\u200e (%x)' % (cp, ord(cp)) for cp in repeated
                ]
                print('duplicate exemplars in %s: %s' % (
                    src, ', '.join(described)))

            per_locale[loc_tag] = (lsrv, src, tuple(unique_chars))

    # supplement with extra locale data
    for loc_tag in extra_locale_data.EXEMPLARS:
        exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
        lang, script = loc_tag.split('-')
        per_locale = by_script[script]
        src = '[extra locale data]/%s' % loc_tag
        if loc_tag in per_locale:
            if _VERBOSE:
                print('skipping %s, already have exemplars for %s from %s' % (
                    src, loc_tag, per_locale[loc_tag][1]))
            continue

        # Zsym data is taken as-is; everything else is restricted to letters
        # and digits.
        if 'Zsym' in loc_tag:
            kept = exemplar_list
        else:
            kept = [cp for cp in exemplar_list if is_letter_or_digit(cp)]
            if len(kept) != len(exemplar_list) and _VERBOSE:
                print('filtered some characters from %s' % src)

        per_locale[loc_tag] = ((lang, script, None, None), src, tuple(kept))

    return by_script