def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
  """Ensure all cps in these sequences are valid emoji cps or specific cps
  used in forming emoji sequences. This is a 'pre-check' that reports this
  specific problem.

  Args:
    sorted_seq_to_filepath: map from cp sequence (tuple of ints) to the
      filepath of its image.
    unicode_version: restrict valid cps to this Unicode age; None (or a
      version at/above PROPOSED_EMOJI_AGE) also accepts proposed emoji.
  """
  valid_cps = set(unicode_data.get_emoji())
  if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
    valid_cps |= unicode_data.proposed_emoji_cps()
  else:
    valid_cps = set(
        cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version)
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag
  valid_cps |= TAG_SET  # used in subregion tag sequences

  not_emoji = {}
  # dict.iteritems() is Python 2 only; items() is correct under Python 3
  # (this function already uses the Python 3 print function).
  for seq, fp in sorted_seq_to_filepath.items():
    for cp in seq:
      if cp not in valid_cps:
        not_emoji.setdefault(cp, []).append(fp)

  if len(not_emoji):
    print(
        'check valid emoji cps: %d non-emoji cp found' % len(not_emoji),
        file=sys.stderr)
    for cp in sorted(not_emoji):
      fps = not_emoji[cp]
      print(
          'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)),
          file=sys.stderr)
def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
  """Ensure all cps in these sequences are valid emoji cps or specific cps
  used in forming emoji sequences. This is a 'pre-check' that reports this
  specific problem.

  Args:
    sorted_seq_to_filepath: map from cp sequence (tuple of ints) to the
      filepath of its image.
    unicode_version: restrict valid cps to this Unicode age; None (or a
      version at/above PROPOSED_EMOJI_AGE) also accepts proposed emoji.
  """
  valid_cps = set(unicode_data.get_emoji())
  if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
    valid_cps |= unicode_data.proposed_emoji_cps()
  else:
    valid_cps = set(
        cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version)
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag
  valid_cps |= TAG_SET  # used in subregion tag sequences

  # collect every offending cp along with the files that use it;
  # iteritems() was Python 2 only, items() works under Python 3.
  not_emoji = {}
  for seq, fp in sorted_seq_to_filepath.items():
    for cp in seq:
      if cp not in valid_cps:
        if cp not in not_emoji:
          not_emoji[cp] = []
        not_emoji[cp].append(fp)

  if len(not_emoji):
    print(
        'check valid emoji cps: %d non-emoji cp found' % len(not_emoji),
        file=sys.stderr)
    for cp in sorted(not_emoji):
      fps = not_emoji[cp]
      print(
          'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)),
          file=sys.stderr)
def _create_flag_sets(data_dir):
  """Build the map from flag name to a (cp_set, in_set) pair.

  A flag gets added to a codepoint's name when the boolean in_set matches
  the result of 'cp in cp_set'.
  """
  # These are hardcoded for now, should be able to specify on
  # command line... (TODO)

  # I propose supporting some emoji in Noto even if they don't have text
  # variation sequences proposed, we can remove those for Android if they
  # disagree.
  emoji_only = (
      unicode_data.get_emoji()
      - unicode_data.get_unicode_emoji_variants('proposed_extra'))

  # codepoints currently in the shipped font's cmap
  font_path = path.join(data_dir, 'NotoSansSymbols2-Regular.ttf')
  cps_in_font = CodeList.fromfontcmap(font_path).codeset()

  # codepoints the font is expected to cover, per the checked-in cmap data
  cmap_path = path.join(data_dir, 'notosanssymbols2_cmap.txt')
  with open(cmap_path, 'r') as fh:
    cps_expected = tool_utils.parse_int_ranges(fh.read())

  cps_to_add = cps_expected - cps_in_font

  # True means set flag if cp in set, False means set if not in set
  return {
      'ref only': (cps_expected, False),
      'emoji only': (emoji_only, True),
      'add': (cps_to_add, True),
  }
def _create_codeset_from_expr(expr_list, flag_sets, data_dir, codelist_map): """Processes expr_list in order, building a codeset. See _read_flag_data_from_file for information on expr_list. This can modify flag_sets and codelist_map.""" result = () for op, exp in expr_list: if exp not in flag_sets: # its a codelist codes = _load_codelist(exp, data_dir, codelist_map).codeset() else: codes_or_spec = flag_sets[exp] if isinstance(codes_or_spec, (set, frozenset)): codes = codes_or_spec else: # replace the spec with the actual codes if codes_or_spec == None: # we only know about '_emoji_' and '_math_' if exp == '_emoji_': codes = ( unicode_data.get_emoji() - unicode_data.get_unicode_emoji_variants('proposed_extra')) elif exp == '_math_': codes = unicode_data.chars_with_property('Math') else: raise Exception('unknown special codeset "%s"' % exp) else: codes = _load_codelist( codes_or_spec, data_dir, codelist_map).codeset() flag_sets[exp] = codes if op == '|': if not result: # it appers that python 'optimizes' |= by replacing the lhs by rhs if # lhs is an empty set, but this changes the type of lhs to frozenset... result = set(codes) else: result |= codes elif op == '&': result &= codes elif op == '-': result -= codes else: raise Exception('unknown op "%s"' % op) return result
def _check_valid_emoji(sorted_seqs):
  """Ensure all emoji are either valid emoji or specific chars.

  Prints any codepoint that is neither a (current or proposed) emoji cp nor
  one of the sequence-forming chars to stderr.
  """
  valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag

  not_emoji = set()
  for seq in sorted_seqs:
    for cp in seq:
      if cp not in valid_cps:
        not_emoji.add(cp)

  if len(not_emoji):
    # 'print >> sys.stderr' is Python 2 only syntax (a SyntaxError under
    # Python 3); use the print function instead.
    print('%d non-emoji found:' % len(not_emoji), file=sys.stderr)
    for cp in sorted(not_emoji):
      print('%04X' % cp, file=sys.stderr)
def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
  """Ensure all cps in these sequences are valid emoji cps or specific cps
  used in forming emoji sequences. This is a 'pre-check' that reports this
  specific problem.

  Exits the process with an error message if any invalid cp is found.

  Args:
    sorted_seq_to_filepath: map from cp sequence (tuple of ints) to the
      filepath of its image.
    unicode_version: restrict valid cps to this Unicode age; None (or a
      version at/above PROPOSED_EMOJI_AGE) also accepts proposed emoji.
  """
  coverage_pass = True
  valid_cps = set(unicode_data.get_emoji())
  if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
    valid_cps |= unicode_data.proposed_emoji_cps()
  else:
    valid_cps = set(
        cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version)
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag
  valid_cps |= TAG_SET  # used in subregion tag sequences

  not_emoji = {}
  for seq, fp in sorted_seq_to_filepath.items():
    for cp in seq:
      if cp not in valid_cps:
        if cp not in not_emoji:
          not_emoji[cp] = []
        not_emoji[cp].append(fp)

  if len(not_emoji):
    print(f'check valid emoji cps: {len(not_emoji)} non-emoji cp found',
          file=sys.stderr)
    for cp in sorted(not_emoji):
      fps = not_emoji[cp]
      # format the cp as 4-digit hex for consistency with the other checks
      # (it previously printed in decimal); fps[0] is the first offending
      # file for this cp.
      print(
          f'check the following cp: {cp:04x} - {fps[0]} '
          f'(in {len(fps)} sequences)',
          file=sys.stderr)
      coverage_pass = False

  if not coverage_pass:
    # sys.exit instead of the bare exit() builtin, which is injected by the
    # site module and not guaranteed to exist in all environments; also
    # fixes the 'metioned' typo.
    sys.exit(
        "Please fix the problems mentioned above or run: "
        "make BYPASS_SEQUENCE_CHECK='True'")
def _check_valid_emoji(sorted_seq_to_filepath):
  """Ensure all emoji are either valid emoji or specific chars.

  Prints each invalid codepoint, with the files that use it, to stderr.
  """
  valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag

  not_emoji = {}
  # iteritems() was Python 2 only; items() works under Python 3.
  for seq, fp in sorted_seq_to_filepath.items():
    for cp in seq:
      if cp not in valid_cps:
        if cp not in not_emoji:
          not_emoji[cp] = []
        not_emoji[cp].append(fp)

  if len(not_emoji):
    # 'print >> sys.stderr' is Python 2 only syntax (a SyntaxError under
    # Python 3); use the print function instead.
    print('%d non-emoji found:' % len(not_emoji), file=sys.stderr)
    for cp in sorted(not_emoji):
      print('%04x (in %s)' % (cp, ', '.join(not_emoji[cp])), file=sys.stderr)
def _check_coverage(seq_to_filepath, unicode_version):
  """Ensure we have all and only the cps and sequences that we need for the
  font as of this version.

  Prints a 'coverage:' line for each missing item. As a side effect, adds
  'alias:' entries to seq_to_filepath for alias sequences whose target is
  present.

  Args:
    seq_to_filepath: map from cp sequence (tuple of ints) to image filepath;
      modified in place for aliases.
    unicode_version: Unicode age used to select the expected emoji data.
  """
  age = unicode_version

  # index sequences containing the emoji VS by their VS-stripped form
  non_vs_to_canonical = {}
  for k in seq_to_filepath:
    if EMOJI_VS in k:
      non_vs = unicode_data.strip_emoji_vs(k)
      non_vs_to_canonical[non_vs] = k

  aliases = add_aliases.read_default_emoji_aliases()
  for k, v in sorted(aliases.items()):
    if v not in seq_to_filepath and v not in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print('coverage: alias %s missing target %s' % (alias_str, target_str))
      continue
    if k in seq_to_filepath or k in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print('coverage: alias %s already exists as %s (%s)' % (
          alias_str, target_str, seq_name(v)))
      continue
    filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
    seq_to_filepath[k] = 'alias:' + filename

  # check single emoji, this includes most of the special chars
  emoji = sorted(unicode_data.get_emoji(age=age))
  for cp in emoji:
    if tuple([cp]) not in seq_to_filepath:
      print(
          'coverage: missing single %04x (%s)' % (
              cp, unicode_data.name(cp, '<no name>')))

  # special characters
  # all but combining enclosing keycap are currently marked as emoji
  # (list(range(...)) because Python 3's range is not a list and cannot be
  # concatenated with one)
  for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
    if cp not in emoji and tuple([cp]) not in seq_to_filepath:
      print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))

  # combining sequences
  # (iteritems() was Python 2 only; items() throughout)
  comb_seq_to_name = sorted(
      unicode_data.get_emoji_combining_sequences(age=age).items())
  for seq, name in comb_seq_to_name:
    if seq not in seq_to_filepath:
      # strip vs and try again
      non_vs_seq = unicode_data.strip_emoji_vs(seq)
      if non_vs_seq not in seq_to_filepath:
        print('coverage: missing combining sequence %s (%s)' % (
            unicode_data.seq_to_string(seq), name))

  # flag sequences
  flag_seq_to_name = sorted(
      unicode_data.get_emoji_flag_sequences(age=age).items())
  for seq, name in flag_seq_to_name:
    if seq not in seq_to_filepath:
      print('coverage: missing flag sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # skin tone modifier sequences
  mod_seq_to_name = sorted(
      unicode_data.get_emoji_modifier_sequences(age=age).items())
  for seq, name in mod_seq_to_name:
    if seq not in seq_to_filepath:
      print('coverage: missing modifier sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # zwj sequences
  # some of ours include the emoji presentation variation selector and some
  # don't, and the same is true for the canonical sequences. normalize all
  # of them to omit it to test coverage, but report the canonical sequence.
  zwj_seq_without_vs = set()
  for seq in seq_to_filepath:
    if ZWJ not in seq:
      continue
    if EMOJI_VS in seq:
      seq = tuple(cp for cp in seq if cp != EMOJI_VS)
    zwj_seq_without_vs.add(seq)

  for seq, name in sorted(
      unicode_data.get_emoji_zwj_sequences(age=age).items()):
    if EMOJI_VS in seq:
      test_seq = tuple(s for s in seq if s != EMOJI_VS)
    else:
      test_seq = seq
    if test_seq not in zwj_seq_without_vs:
      print('coverage: missing (canonical) zwj sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # check for 'unknown flag'
  # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
  # don't start with our prefix so 'unknown_flag' would be excluded by default.
  if tuple([0xfe82b]) not in seq_to_filepath:
    print('coverage: missing unknown flag PUA fe82b')
def _check_coverage(seq_to_filepath, unicode_version):
  """Ensure we have all and only the cps and sequences that we need for the
  font as of this version.

  Prints a 'coverage:' line for each missing item. As a side effect, adds
  'alias:' entries to seq_to_filepath for alias sequences whose target is
  present.

  Args:
    seq_to_filepath: map from cp sequence (tuple of ints) to image filepath;
      modified in place for aliases.
    unicode_version: Unicode age used to select the expected emoji data.
  """
  age = unicode_version

  # index sequences containing the emoji VS by their VS-stripped form
  non_vs_to_canonical = {}
  for k in seq_to_filepath:
    if EMOJI_VS in k:
      non_vs = unicode_data.strip_emoji_vs(k)
      non_vs_to_canonical[non_vs] = k

  aliases = add_aliases.read_default_emoji_aliases()
  for k, v in sorted(aliases.items()):
    if v not in seq_to_filepath and v not in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print('coverage: alias %s missing target %s' % (alias_str, target_str))
      continue
    if k in seq_to_filepath or k in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print('coverage: alias %s already exists as %s (%s)' % (
          alias_str, target_str, seq_name(v)))
      continue
    filename = seq_to_filepath.get(v) or seq_to_filepath[
        non_vs_to_canonical[v]]
    seq_to_filepath[k] = 'alias:' + filename

  # check single emoji, this includes most of the special chars
  emoji = sorted(unicode_data.get_emoji(age=age))
  for cp in emoji:
    if tuple([cp]) not in seq_to_filepath:
      print('coverage: missing single %04x (%s)' % (
          cp, unicode_data.name(cp, '<no name>')))

  # special characters
  # all but combining enclosing keycap are currently marked as emoji
  # (list(range(...)) because Python 3's range is not a list and cannot be
  # concatenated with one)
  for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
    if cp not in emoji and tuple([cp]) not in seq_to_filepath:
      print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))

  # combining sequences
  # (iteritems() was Python 2 only; items() throughout)
  comb_seq_to_name = sorted(
      unicode_data.get_emoji_combining_sequences(age=age).items())
  for seq, name in comb_seq_to_name:
    if seq not in seq_to_filepath:
      # strip vs and try again
      non_vs_seq = unicode_data.strip_emoji_vs(seq)
      if non_vs_seq not in seq_to_filepath:
        print('coverage: missing combining sequence %s (%s)' % (
            unicode_data.seq_to_string(seq), name))

  # flag sequences
  flag_seq_to_name = sorted(
      unicode_data.get_emoji_flag_sequences(age=age).items())
  for seq, name in flag_seq_to_name:
    if seq not in seq_to_filepath:
      print('coverage: missing flag sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # skin tone modifier sequences
  mod_seq_to_name = sorted(
      unicode_data.get_emoji_modifier_sequences(age=age).items())
  for seq, name in mod_seq_to_name:
    if seq not in seq_to_filepath:
      print('coverage: missing modifier sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # zwj sequences
  # some of ours include the emoji presentation variation selector and some
  # don't, and the same is true for the canonical sequences. normalize all
  # of them to omit it to test coverage, but report the canonical sequence.
  zwj_seq_without_vs = set()
  for seq in seq_to_filepath:
    if ZWJ not in seq:
      continue
    if EMOJI_VS in seq:
      seq = tuple(cp for cp in seq if cp != EMOJI_VS)
    zwj_seq_without_vs.add(seq)

  for seq, name in sorted(
      unicode_data.get_emoji_zwj_sequences(age=age).items()):
    if EMOJI_VS in seq:
      test_seq = tuple(s for s in seq if s != EMOJI_VS)
    else:
      test_seq = seq
    if test_seq not in zwj_seq_without_vs:
      print('coverage: missing (canonical) zwj sequence %s (%s)' % (
          unicode_data.seq_to_string(seq), name))

  # check for 'unknown flag'
  # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
  # don't start with our prefix so 'unknown_flag' would be excluded by default.
  if tuple([0xfe82b]) not in seq_to_filepath:
    print('coverage: missing unknown flag PUA fe82b')
def _check_coverage(seq_to_filepath, unicode_version):
  """Ensure we have all and only the cps and sequences that we need for the
  font as of this version.

  Prints a 'coverage:' line for each missing item and exits the process if
  anything was missing. As a side effect, adds 'alias:' entries to
  seq_to_filepath for alias sequences whose target is present.

  Args:
    seq_to_filepath: map from cp sequence (tuple of ints) to image filepath;
      modified in place for aliases.
    unicode_version: kept for interface compatibility; this variant does not
      filter the emoji data by age.
  """
  coverage_pass = True
  age = unicode_version

  # index sequences containing the emoji VS by their VS-stripped form
  non_vs_to_canonical = {}
  for k in seq_to_filepath:
    if EMOJI_VS in k:
      non_vs = unicode_data.strip_emoji_vs(k)
      non_vs_to_canonical[non_vs] = k

  aliases = add_aliases.read_default_emoji_aliases()
  for k, v in sorted(aliases.items()):
    if v not in seq_to_filepath and v not in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print(f'coverage: alias {alias_str} missing target {target_str}')
      coverage_pass = False
      continue
    if k in seq_to_filepath or k in non_vs_to_canonical:
      alias_str = unicode_data.seq_to_string(k)
      target_str = unicode_data.seq_to_string(v)
      print(
          f'coverage: alias {alias_str} already exists as {target_str} '
          f'({seq_name(v)})')
      coverage_pass = False
      continue
    filename = seq_to_filepath.get(v) or seq_to_filepath[
        non_vs_to_canonical[v]]
    seq_to_filepath[k] = 'alias:' + filename

  # check single emoji, this includes most of the special chars
  emoji = sorted(unicode_data.get_emoji())
  for cp in emoji:
    if tuple([cp]) not in seq_to_filepath:
      # format the cp as 4-digit hex for consistency with the other checks
      # (it previously printed in decimal)
      print(f'coverage: missing single {cp:04x} ({unicode_data.name(cp)})')
      coverage_pass = False

  # special characters
  # all but combining enclosing keycap are currently marked as emoji
  for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
    if cp not in emoji and tuple([cp]) not in seq_to_filepath:
      print(f'coverage: missing special {cp:04x} ({unicode_data.name(cp)})')
      coverage_pass = False

  # combining sequences
  # NOTE(review): this reaches into unicode_data's private
  # _emoji_sequence_data attribute; prefer a public accessor if one exists.
  comb_seq_to_name = sorted(unicode_data._emoji_sequence_data.items())
  for seq, name in comb_seq_to_name:
    if seq not in seq_to_filepath:
      # strip vs and try again
      non_vs_seq = unicode_data.strip_emoji_vs(seq)
      if non_vs_seq not in seq_to_filepath:
        print(
            f'coverage: missing combining sequence '
            f'{unicode_data.seq_to_string(seq)} ({name})')
        coverage_pass = False

  # check for 'unknown flag'
  # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
  # don't start with our prefix so 'unknown_flag' would be excluded by default.
  if tuple([0xfe82b]) not in seq_to_filepath:
    print('coverage: missing unknown flag PUA fe82b')
    coverage_pass = False

  if not coverage_pass:
    # sys.exit instead of the bare exit() builtin, which is injected by the
    # site module and not guaranteed to exist in all environments; also
    # fixes the 'metioned' typo.
    sys.exit(
        "Please fix the problems mentioned above or run: "
        "make BYPASS_SEQUENCE_CHECK='True'")