def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
    """Report code points in the sequences that are not valid emoji cps.

    A code point is accepted when it is in unicode_data.get_emoji() (extended
    with unicode_data.proposed_emoji_cps() when unicode_version is None or at
    least unicode_data.PROPOSED_EMOJI_AGE, otherwise restricted to code points
    whose unicode_data.age() is no greater than unicode_version), or when it
    is one of the specific code points used in forming emoji sequences.

    This is a 'pre-check' that reports this specific problem: offending code
    points are listed on stderr; nothing is returned.

    Args:
      sorted_seq_to_filepath: mapping from a sequence of code points to the
        file path associated with it.
      unicode_version: value compared against unicode_data.age(cp), or None
        to apply no age restriction.
    """

    valid_cps = set(unicode_data.get_emoji())
    if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
        valid_cps |= unicode_data.proposed_emoji_cps()
    else:
        valid_cps = {
            cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version
        }
    valid_cps.add(0x200d)  # ZWJ
    valid_cps.add(0x20e3)  # combining enclosing keycap
    valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
    valid_cps.add(0xfe82b)  # PUA value for unknown flag
    valid_cps |= TAG_SET  # used in subregion tag sequences

    # Map each offending cp to the file paths of the sequences it occurs in
    # (one entry per occurrence).  items() rather than iteritems() so this
    # runs on Python 3 as well as Python 2.
    not_emoji = {}
    for seq, fp in sorted_seq_to_filepath.items():
        for cp in seq:
            if cp not in valid_cps:
                not_emoji.setdefault(cp, []).append(fp)

    if not not_emoji:
        return

    print('check valid emoji cps: %d non-emoji cp found' % len(not_emoji),
          file=sys.stderr)
    for cp in sorted(not_emoji):
        fps = not_emoji[cp]
        print('check valid emoji cps: %04x (in %d sequences)' %
              (cp, len(fps)),
              file=sys.stderr)
def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
  """Report code points in the sequences that are not valid emoji cps.

  Valid code points are those in unicode_data.get_emoji(), plus the proposed
  emoji when unicode_version is None or at least
  unicode_data.PROPOSED_EMOJI_AGE (otherwise only those whose
  unicode_data.age() does not exceed unicode_version), plus the specific
  code points used in forming emoji sequences.

  This is a 'pre-check' that reports this specific problem on stderr; it
  returns nothing.

  Args:
    sorted_seq_to_filepath: mapping from a sequence of code points to the
      file path associated with it.
    unicode_version: value compared against unicode_data.age(cp), or None to
      apply no age restriction.
  """

  valid_cps = set(unicode_data.get_emoji())
  if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
    valid_cps |= unicode_data.proposed_emoji_cps()
  else:
    valid_cps = set(
        cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version)
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag
  valid_cps |= TAG_SET  # used in subregion tag sequences

  # cp -> file paths of the sequences containing it, one per occurrence.
  # dict.items() is used because iteritems() is not available on Python 3.
  not_emoji = {}
  for seq, fp in sorted_seq_to_filepath.items():
    for cp in seq:
      if cp in valid_cps:
        continue
      not_emoji.setdefault(cp, []).append(fp)

  if not_emoji:
    print(
        'check valid emoji cps: %d non-emoji cp found' % len(not_emoji),
        file=sys.stderr)
    for cp in sorted(not_emoji):
      fps = not_emoji[cp]
      print(
          'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)),
          file=sys.stderr)
def _create_flag_sets(data_dir):
    """Build the map from flag name to a (cp_set, boolean) pair.

    A flag gets added to a codepoint name when the boolean equals the result
    of 'cp in cp_set': True means flag the cp if it is in the set, False
    means flag it if it is not.

    Args:
      data_dir: directory holding 'NotoSansSymbols2-Regular.ttf' and
        'notosanssymbols2_cmap.txt'.

    Returns:
      A dict with the keys 'ref only', 'emoji only' and 'add'.
    """
    # The inputs are hardcoded for now; it should become possible to specify
    # them on the command line... (TODO)

    # Proposal: support some emoji in Noto even when no text variation
    # sequence is proposed for them; they can be removed for Android if
    # there is disagreement.
    all_emoji = unicode_data.get_emoji()
    proposed_extra = unicode_data.get_unicode_emoji_variants('proposed_extra')
    emoji_only = all_emoji - proposed_extra

    # Code points the current font build covers.
    font_path = path.join(data_dir, 'NotoSansSymbols2-Regular.ttf')
    current_sym2 = CodeList.fromfontcmap(font_path).codeset()

    # Code points the font is expected to cover, read as int ranges.
    cmap_path = path.join(data_dir, 'notosanssymbols2_cmap.txt')
    with open(cmap_path, 'r') as cmap_file:
        cmap_text = cmap_file.read()
    expect_sym2 = tool_utils.parse_int_ranges(cmap_text)

    return {
        'ref only': (expect_sym2, False),
        'emoji only': (emoji_only, True),
        # expected but not yet in the current font
        'add': (expect_sym2 - current_sym2, True),
    }
def _create_codeset_from_expr(expr_list, flag_sets, data_dir, codelist_map):
  """Processes expr_list in order, building a codeset.
  See _read_flag_data_from_file for information on expr_list.
  This can modify flag_sets and codelist_map."""

  result = ()
  for op, exp in expr_list:
    if exp not in flag_sets:
      # its a codelist
      codes = _load_codelist(exp, data_dir, codelist_map).codeset()
    else:
      codes_or_spec = flag_sets[exp]
      if isinstance(codes_or_spec, (set, frozenset)):
        codes = codes_or_spec
      else:
        # replace the spec with the actual codes
        if codes_or_spec == None:
          # we only know about '_emoji_' and '_math_'
          if exp == '_emoji_':
            codes = (
                unicode_data.get_emoji() -
                unicode_data.get_unicode_emoji_variants('proposed_extra'))
          elif exp == '_math_':
            codes = unicode_data.chars_with_property('Math')
          else:
            raise Exception('unknown special codeset "%s"' % exp)
        else:
          codes = _load_codelist(
              codes_or_spec, data_dir, codelist_map).codeset()
        flag_sets[exp] = codes
    if op == '|':
      if not result:
        # it appers that python 'optimizes' |= by replacing the lhs by rhs if
        # lhs is an empty set, but this changes the type of lhs to frozenset...
        result = set(codes)
      else:
        result |= codes
    elif op == '&':
      result &= codes
    elif op == '-':
      result -= codes
    else:
      raise Exception('unknown op "%s"' % op)

  return result
def _check_valid_emoji(sorted_seqs):
  """Ensure all emoji are either valid emoji or specific chars.

  Every code point of every sequence must be in unicode_data.get_emoji(),
  in unicode_data.proposed_emoji_cps(), or be one of the specific code
  points added below.  Any other code points are listed on stderr, one per
  line in uppercase hex; nothing is returned.

  Args:
    sorted_seqs: iterable of sequences of code points.
  """

  valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag

  not_emoji = set()
  for seq in sorted_seqs:
    not_emoji.update(cp for cp in seq if cp not in valid_cps)

  if not_emoji:
    # sys.stderr.write produces the same output as the Python 2 only
    # 'print >> sys.stderr' statement and also works on Python 3.
    sys.stderr.write('%d non-emoji found:\n' % len(not_emoji))
    for cp in sorted(not_emoji):
      sys.stderr.write('%04X\n' % cp)
def _check_valid_emoji(sorted_seqs):
    """Ensure all emoji are either valid emoji or specific chars.

    A code point is valid when it is in unicode_data.get_emoji() or
    unicode_data.proposed_emoji_cps(), or is one of the specific code points
    added below.  Invalid code points are reported on stderr, one per line
    in uppercase hex; nothing is returned.

    Args:
      sorted_seqs: iterable of sequences of code points.
    """

    valid_cps = set(unicode_data.get_emoji()
                    | unicode_data.proposed_emoji_cps())
    valid_cps.add(0x200d)  # ZWJ
    valid_cps.add(0x20e3)  # combining enclosing keycap
    valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
    valid_cps.add(0xfe82b)  # PUA value for unknown flag

    not_emoji = {
        cp for seq in sorted_seqs for cp in seq if cp not in valid_cps
    }
    if not not_emoji:
        return

    # 'print >> sys.stderr, ...' only works as a Python 2 statement; writing
    # to the stream directly gives the same output on Python 2 and 3.
    sys.stderr.write('%d non-emoji found:\n' % len(not_emoji))
    for cp in sorted(not_emoji):
        sys.stderr.write('%04X\n' % cp)
Exemple #7
0
def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
    """Ensure all cps in these sequences are valid emoji cps or specific cps
    used in forming emoji sequences.

    This is a 'pre-check' that reports this specific problem.  Valid code
    points are those in unicode_data.get_emoji(), extended with
    unicode_data.proposed_emoji_cps() when unicode_version is None or at
    least unicode_data.PROPOSED_EMOJI_AGE, otherwise restricted to those
    whose unicode_data.age() does not exceed unicode_version, plus the
    specific code points added below.

    Args:
      sorted_seq_to_filepath: mapping from a sequence of code points to the
        file path associated with it.
      unicode_version: value compared against unicode_data.age(cp), or None
        to apply no age restriction.

    Raises:
      SystemExit: if any invalid code point was found, after each one has
        been reported on stderr.
    """

    valid_cps = set(unicode_data.get_emoji())
    if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
        valid_cps |= unicode_data.proposed_emoji_cps()
    else:
        valid_cps = {
            cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version
        }
    valid_cps.add(0x200d)  # ZWJ
    valid_cps.add(0x20e3)  # combining enclosing keycap
    valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
    valid_cps.add(0xfe82b)  # PUA value for unknown flag
    valid_cps |= TAG_SET  # used in subregion tag sequences

    # cp -> file paths of the sequences containing it, one per occurrence.
    not_emoji = {}
    for seq, fp in sorted_seq_to_filepath.items():
        for cp in seq:
            if cp not in valid_cps:
                not_emoji.setdefault(cp, []).append(fp)

    if not not_emoji:
        return

    print(f'check valid emoji cps: {len(not_emoji)} non-emoji cp found',
          file=sys.stderr)
    for cp in sorted(not_emoji):
        fps = not_emoji[cp]
        print(
            f'check the following cp: {cp} - {fps[0]} (in {len(fps)} sequences)',
            file=sys.stderr)

    # Raise SystemExit directly rather than calling exit(): that name is
    # injected by the site module for interactive use and is not guaranteed
    # to exist (e.g. under 'python -S').
    raise SystemExit(
        "Please fix the problems metioned above or run: make BYPASS_SEQUENCE_CHECK='True'"
    )
Exemple #8
0
def _check_valid_emoji(sorted_seq_to_filepath):
    """Ensure all emoji are either valid emoji or specific chars.

    A code point is valid when it is in unicode_data.get_emoji() or
    unicode_data.proposed_emoji_cps(), or is one of the specific code points
    added below.  Each invalid code point is reported on stderr in lowercase
    hex together with the comma-separated file paths of the sequences it
    occurs in; nothing is returned.

    Args:
      sorted_seq_to_filepath: mapping from a sequence of code points to the
        file path (a string) associated with it.
    """

    valid_cps = set(unicode_data.get_emoji()
                    | unicode_data.proposed_emoji_cps())
    valid_cps.add(0x200d)  # ZWJ
    valid_cps.add(0x20e3)  # combining enclosing keycap
    valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
    valid_cps.add(0xfe82b)  # PUA value for unknown flag

    # items() rather than iteritems() so this runs on Python 2 and 3.
    not_emoji = {}
    for seq, fp in sorted_seq_to_filepath.items():
        for cp in seq:
            if cp not in valid_cps:
                not_emoji.setdefault(cp, []).append(fp)

    if not not_emoji:
        return

    # 'print >> sys.stderr, ...' is a Python 2 only statement; writing to the
    # stream directly produces identical output on both major versions.
    sys.stderr.write('%d non-emoji found:\n' % len(not_emoji))
    for cp in sorted(not_emoji):
        sys.stderr.write(
            '%04x (in %s)\n' % (cp, ', '.join(not_emoji[cp])))
def _check_coverage(seq_to_filepath, unicode_version):
  """Ensure we have all and only the cps and sequences that we need for the
  font as of this version.

  Problems are printed to stdout, one 'coverage: ...' line each; nothing is
  returned.  As a side effect, every default emoji alias whose target is
  present (and which is not itself already present) is added to
  seq_to_filepath with the value 'alias:' + the target's file path.

  Args:
    seq_to_filepath: dict from a tuple of code points to a file path.
      Modified in place as described above.
    unicode_version: passed as 'age' to the unicode_data emoji queries.
  """

  age = unicode_version

  # Map each sequence, with the emoji variation selector stripped, back to
  # the form actually present so aliases can refer to either form.
  non_vs_to_canonical = {}
  for k in seq_to_filepath:
    if EMOJI_VS in k:
      non_vs = unicode_data.strip_emoji_vs(k)
      non_vs_to_canonical[non_vs] = k

  aliases = add_aliases.read_default_emoji_aliases()
  for k, v in sorted(aliases.items()):
    if v not in seq_to_filepath and v not in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print('coverage: alias %s missing target %s' % (alias_str, target_str))
      continue
    if k in seq_to_filepath or k in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print('coverage: alias %s already exists as %s (%s)' % (
          alias_str, target_str, seq_name(v)))
      continue
    filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
    seq_to_filepath[k] = 'alias:' + filename

  # check single emoji, this includes most of the special chars
  emoji = sorted(unicode_data.get_emoji(age=age))
  for cp in emoji:
    if (cp,) not in seq_to_filepath:
      print(
          'coverage: missing single %04x (%s)' % (
              cp, unicode_data.name(cp, '<no name>')))

  # special characters
  # all but combining enclosing keycap are currently marked as emoji
  emoji_set = set(emoji)  # constant-time membership for the check below
  # list(range(...)): a Python 3 range cannot be concatenated to a list
  special_cps = [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a))
  for cp in special_cps:
    if cp not in emoji_set and (cp,) not in seq_to_filepath:
      print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))

  # combining sequences
  comb_seq_to_name = sorted(
      unicode_data.get_emoji_combining_sequences(age=age).items())
  for seq, name in comb_seq_to_name:
    if seq not in seq_to_filepath:
      # strip vs and try again
      non_vs_seq = unicode_data.strip_emoji_vs(seq)
      if non_vs_seq not in seq_to_filepath:
        print('coverage: missing combining sequence %s (%s)' %
              (unicode_data.seq_to_string(seq), name))

  # flag sequences
  flag_seq_to_name = sorted(
      unicode_data.get_emoji_flag_sequences(age=age).items())
  for seq, name in flag_seq_to_name:
    if seq not in seq_to_filepath:
      print('coverage: missing flag sequence %s (%s)' %
            (unicode_data.seq_to_string(seq), name))

  # skin tone modifier sequences
  mod_seq_to_name = sorted(
      unicode_data.get_emoji_modifier_sequences(age=age).items())
  for seq, name in mod_seq_to_name:
    if seq not in seq_to_filepath:
      print('coverage: missing modifier sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # zwj sequences
  # some of ours include the emoji presentation variation selector and some
  # don't, and the same is true for the canonical sequences.  normalize all
  # of them to omit it to test coverage, but report the canonical sequence.
  zwj_seq_without_vs = set()
  for seq in seq_to_filepath:
    if ZWJ not in seq:
      continue
    if EMOJI_VS in seq:
      seq = tuple(cp for cp in seq if cp != EMOJI_VS)
    zwj_seq_without_vs.add(seq)

  for seq, name in sorted(
      unicode_data.get_emoji_zwj_sequences(age=age).items()):
    if EMOJI_VS in seq:
      test_seq = tuple(s for s in seq if s != EMOJI_VS)
    else:
      test_seq = seq
    if test_seq not in zwj_seq_without_vs:
      print('coverage: missing (canonical) zwj sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # check for 'unknown flag'
  # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
  # don't start with our prefix so 'unknown_flag' would be excluded by default.
  if (0xfe82b,) not in seq_to_filepath:
    print('coverage: missing unknown flag PUA fe82b')
def _check_coverage(seq_to_filepath, unicode_version):
    """Ensure we have all and only the cps and sequences that we need for the
    font as of this version.

    Each problem found is printed to stdout as a 'coverage: ...' line;
    nothing is returned.  As a side effect, default emoji aliases whose
    target is present (and which are not themselves already present) are
    added to seq_to_filepath with the value 'alias:' + the target's path.

    Args:
      seq_to_filepath: dict from a tuple of code points to a file path.
        Modified in place as described above.
      unicode_version: passed as 'age' to the unicode_data emoji queries.
    """

    age = unicode_version

    # Sequences with the emoji variation selector stripped, mapped back to
    # the form actually present, so aliases can refer to either form.
    non_vs_to_canonical = {}
    for k in seq_to_filepath:
        if EMOJI_VS in k:
            non_vs = unicode_data.strip_emoji_vs(k)
            non_vs_to_canonical[non_vs] = k

    aliases = add_aliases.read_default_emoji_aliases()
    for k, v in sorted(aliases.items()):
        if v not in seq_to_filepath and v not in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print('coverage: alias %s missing target %s' %
                  (alias_str, target_str))
            continue
        if k in seq_to_filepath or k in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print('coverage: alias %s already exists as %s (%s)' %
                  (alias_str, target_str, seq_name(v)))
            continue
        filename = seq_to_filepath.get(v) or seq_to_filepath[
            non_vs_to_canonical[v]]
        seq_to_filepath[k] = 'alias:' + filename

    # check single emoji, this includes most of the special chars
    emoji = sorted(unicode_data.get_emoji(age=age))
    for cp in emoji:
        if (cp,) not in seq_to_filepath:
            print('coverage: missing single %04x (%s)' %
                  (cp, unicode_data.name(cp, '<no name>')))

    # special characters
    # all but combining enclosing keycap are currently marked as emoji
    emoji_set = set(emoji)  # constant-time membership for the check below
    # list(range(...)): on Python 3 a range cannot be concatenated to a list
    special_cps = ([ord('*'), ord('#'), ord(u'\u20e3')] +
                   list(range(0x30, 0x3a)))
    for cp in special_cps:
        if cp not in emoji_set and (cp,) not in seq_to_filepath:
            print('coverage: missing special %04x (%s)' %
                  (cp, unicode_data.name(cp)))

    # combining sequences
    comb_seq_to_name = sorted(
        unicode_data.get_emoji_combining_sequences(age=age).items())
    for seq, name in comb_seq_to_name:
        if seq not in seq_to_filepath:
            # strip vs and try again
            non_vs_seq = unicode_data.strip_emoji_vs(seq)
            if non_vs_seq not in seq_to_filepath:
                print('coverage: missing combining sequence %s (%s)' %
                      (unicode_data.seq_to_string(seq), name))

    # flag sequences
    flag_seq_to_name = sorted(
        unicode_data.get_emoji_flag_sequences(age=age).items())
    for seq, name in flag_seq_to_name:
        if seq not in seq_to_filepath:
            print('coverage: missing flag sequence %s (%s)' %
                  (unicode_data.seq_to_string(seq), name))

    # skin tone modifier sequences
    mod_seq_to_name = sorted(
        unicode_data.get_emoji_modifier_sequences(age=age).items())
    for seq, name in mod_seq_to_name:
        if seq not in seq_to_filepath:
            print('coverage: missing modifier sequence %s (%s)' %
                  (unicode_data.seq_to_string(seq), name))

    # zwj sequences
    # some of ours include the emoji presentation variation selector and some
    # don't, and the same is true for the canonical sequences.  normalize all
    # of them to omit it to test coverage, but report the canonical sequence.
    zwj_seq_without_vs = set()
    for seq in seq_to_filepath:
        if ZWJ not in seq:
            continue
        if EMOJI_VS in seq:
            seq = tuple(cp for cp in seq if cp != EMOJI_VS)
        zwj_seq_without_vs.add(seq)

    for seq, name in sorted(
            unicode_data.get_emoji_zwj_sequences(age=age).items()):
        if EMOJI_VS in seq:
            test_seq = tuple(s for s in seq if s != EMOJI_VS)
        else:
            test_seq = seq
        if test_seq not in zwj_seq_without_vs:
            print('coverage: missing (canonical) zwj sequence %s (%s)' %
                  (unicode_data.seq_to_string(seq), name))

    # check for 'unknown flag'
    # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
    # don't start with our prefix so 'unknown_flag' would be excluded by default.
    if (0xfe82b,) not in seq_to_filepath:
        print('coverage: missing unknown flag PUA fe82b')
def _check_coverage(seq_to_filepath, unicode_version):
    """Ensure we have all and only the cps and sequences that we need for the
    font as of this version.

    Each problem found is printed to stdout as a 'coverage: ...' line.  As a
    side effect, default emoji aliases whose target is present (and which are
    not themselves already present) are added to seq_to_filepath with the
    value 'alias:' + the target's path.

    Args:
      seq_to_filepath: dict from a tuple of code points to a file path.
        Modified in place as described above.
      unicode_version: accepted for interface compatibility; this
        implementation does not consult it.

    Raises:
      SystemExit: if any coverage problem was reported.
    """

    coverage_pass = True

    # Sequences with the emoji variation selector stripped, mapped back to
    # the form actually present, so aliases can refer to either form.
    non_vs_to_canonical = {}
    for k in seq_to_filepath:
        if EMOJI_VS in k:
            non_vs = unicode_data.strip_emoji_vs(k)
            non_vs_to_canonical[non_vs] = k

    aliases = add_aliases.read_default_emoji_aliases()
    for k, v in sorted(aliases.items()):
        if v not in seq_to_filepath and v not in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print(f'coverage: alias {alias_str} missing target {target_str}')
            coverage_pass = False
            continue
        if k in seq_to_filepath or k in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print(
                f'coverage: alias {alias_str} already exists as {target_str} ({seq_name(v)})'
            )
            coverage_pass = False
            continue
        filename = seq_to_filepath.get(v) or seq_to_filepath[
            non_vs_to_canonical[v]]
        seq_to_filepath[k] = 'alias:' + filename

    # check single emoji, this includes most of the special chars
    emoji = sorted(unicode_data.get_emoji())
    for cp in emoji:
        if (cp,) not in seq_to_filepath:
            print(f'coverage: missing single {cp} ({unicode_data.name(cp)})')
            coverage_pass = False

    # special characters
    # all but combining enclosing keycap are currently marked as emoji
    emoji_set = set(emoji)  # constant-time membership for the check below
    for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
        if cp not in emoji_set and (cp,) not in seq_to_filepath:
            print(f'coverage: missing special {cp} ({unicode_data.name(cp)})')
            coverage_pass = False

    # combining sequences
    comb_seq_to_name = sorted(unicode_data._emoji_sequence_data.items())
    for seq, name in comb_seq_to_name:
        if seq not in seq_to_filepath:
            # strip vs and try again
            non_vs_seq = unicode_data.strip_emoji_vs(seq)
            if non_vs_seq not in seq_to_filepath:
                print(
                    f'coverage: missing combining sequence {unicode_data.seq_to_string(seq)} ({name})'
                )
                coverage_pass = False

    # check for 'unknown flag'
    # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
    # don't start with our prefix so 'unknown_flag' would be excluded by default.
    if (0xfe82b,) not in seq_to_filepath:
        print('coverage: missing unknown flag PUA fe82b')
        coverage_pass = False

    if not coverage_pass:
        # Raise SystemExit directly rather than calling exit(): that name is
        # injected by the site module for interactive use and is not
        # guaranteed to exist (e.g. under 'python -S').
        raise SystemExit(
            "Please fix the problems metioned above or run: make BYPASS_SEQUENCE_CHECK='True'"
        )