def _add_aliases(keys, aliases):
    """Return keys augmented with alias keys whose targets are present.

    For each alias k -> v, reports when k already has an image (and whether
    the target v exists), and when v has no image to borrow.  Only aliases
    whose key is absent and whose target is present are added.

    Args:
        keys: set of codepoint sequences that have images.
        aliases: dict mapping alias sequences to target sequences.

    Returns:
        A new set: keys plus the usable alias sequences.
    """
    # .items() instead of Py2-only .iteritems(); rest of file is Python 3.
    for k, v in sorted(aliases.items()):
        k_str = unicode_data.seq_to_string(k)
        v_str = unicode_data.seq_to_string(v)
        if k in keys:
            msg = '' if v in keys else ' but it\'s not present'
            print('have alias image %s, should use %s%s' % (k_str, v_str, msg))
        elif v not in keys:
            print('can\'t use alias %s, no image matching %s' % (k_str, v_str))
    to_add = {k for k, v in aliases.items() if k not in keys and v in keys}
    return keys | to_add
def _get_filepath(cp):
    """Return the image filepath for codepoint cp, or None if not found.

    Canonicalizes cp to a sequence, searches the directory infos for a
    matching file, and falls back to the sequence's alias target.

    NOTE(review): relies on dir_infos, basepaths, aliases, and key_tuple
    from an enclosing scope -- confirm this is defined as a closure.
    """
    def get_key_filepath(key):
        # Search each directory's filemap in order; first hit wins.
        for info, basepath in zip(dir_infos, basepaths):
            if key in info.filemap:
                return path.join(basepath, info.filemap[key])
        return None

    cp_key = tuple([cp])
    cp_key = unicode_data.get_canonical_emoji_sequence(cp_key) or cp_key
    fp = get_key_filepath(cp_key)
    if not fp:
        # No direct image; try the alias target.
        if cp_key in aliases:
            fp = get_key_filepath(aliases[cp_key])
        else:
            # print() function for Python 3 (was a Py2 print statement).
            print('no alias for %s' % unicode_data.seq_to_string(cp_key))
    if not fp:
        print('no part for %s in %s' % (
            unicode_data.seq_to_string(cp_key),
            unicode_data.seq_to_string(key_tuple)))
    return fp
def _get_name(key_tuple, annotated_tuples):
    """Return an HTML table cell ('<td...>') holding the sequence's name.

    The cell gets class "aname" when key_tuple is in annotated_tuples.
    """
    CELL_PREFIX = '<td%s>' % (
        '' if annotated_tuples is None or key_tuple not in annotated_tuples
        else ' class="aname"')
    seq_name = unicode_data.get_emoji_sequence_name(key_tuple)
    if seq_name is None:  # identity test; was '== None'
        # A couple of sequences have no Unicode name; special-case them.
        if key_tuple == (0x20e3,):
            seq_name = '(combining enlosing keycap)'  # sic
        elif key_tuple == (0xfe82b,):
            seq_name = '(unknown flag PUA codepoint)'
        else:
            # print() function for Python 3 (was a Py2 print statement).
            print('no name for %s' % unicode_data.seq_to_string(key_tuple))
            seq_name = '(oops)'
    return CELL_PREFIX + seq_name
def _get_name(key_tuple, annotations):
    """Return an HTML table cell ('<td...>') holding the sequence's name.

    The cell's class attribute is the annotation for key_tuple, if any.
    """
    annotation = None if annotations is None else annotations.get(key_tuple)
    CELL_PREFIX = '<td%s>' % (
        '' if annotation is None else ' class="%s"' % annotation)
    seq_name = unicode_data.get_emoji_sequence_name(key_tuple)
    if seq_name is None:  # identity test; was '== None'
        # A couple of sequences have no Unicode name; special-case them.
        if key_tuple == (0x20e3,):
            seq_name = '(combining enlosing keycap)'  # sic
        elif key_tuple == (0xfe82b,):
            seq_name = '(unknown flag PUA codepoint)'
        else:
            print('no name for %s' % unicode_data.seq_to_string(key_tuple))
            seq_name = '(oops)'
    return CELL_PREFIX + seq_name
def _parse_annotation_file(afile):
    """Parse file and return a map from sequences to one of 'ok', 'warning',
    or 'error'.

    The file format consists of two kinds of lines.  One defines the
    annotation to apply, it consists of the text 'annotation:' followed by
    one of 'ok', 'warning', or 'error'.  The other defines a sequence that
    should get the most recently defined annotation, this is a series of
    codepoints expressed in hex separated by spaces.  The initial default
    annotation is 'error'.  '#' starts a comment to end of line, blank
    lines are ignored.

    Raises:
        Exception: on an unparseable line or a duplicate sequence.
    """
    annotations = {}
    line_re = re.compile(r'annotation:\s*(ok|warning|error)|([0-9a-f ]+)')
    annotation = 'error'
    with open(afile, 'r') as f:
        for line in f:
            # Drop trailing comments too, as the docstring promises; the
            # old code only skipped lines *starting* with '#'.
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            # fullmatch so malformed lines (e.g. 'annotation: bogus',
            # which match() would silently parse as the hex sequence 'a')
            # are rejected instead of partially parsed.
            m = line_re.fullmatch(line)
            if not m:
                raise Exception('could not parse annotation "%s"' % line)
            new_annotation = m.group(1)
            if new_annotation:
                annotation = new_annotation
            else:
                seq = tuple(int(s, 16) for s in m.group(2).split())
                canonical_seq = unicode_data.get_canonical_emoji_sequence(seq)
                if canonical_seq:
                    seq = canonical_seq
                if seq in annotations:
                    raise Exception(
                        'duplicate sequence %s in annotations'
                        % unicode_data.seq_to_string(seq))
                annotations[seq] = annotation
    return annotations
def sequence_to_filename(seq, prefix, suffix):
    """Build an image filename: prefix + printable sequence + suffix."""
    return prefix + unicode_data.seq_to_string(seq) + suffix
def generate_names(
        src_dir, dst_dir, skip_limit=20, omit_groups=None,
        pretty_print=False, verbose=False):
    """Write data.json mapping emoji groups to name/image data.

    Scans src_dir for emoji png images, pairs them with the sequences in
    each emoji group, and writes the records to dst_dir/data.json.

    Args:
        src_dir: directory holding emoji_u*.png images.
        dst_dir: output directory (created if needed).
        skip_limit: max sequences without images before raising; negative
            means unlimited.
        omit_groups: optional list of emoji group names to skip.
        pretty_print: indent the JSON output for humans.
        verbose: log each skipped sequence.
    """
    srcdir = tool_utils.resolve_path(src_dir)
    if not path.isdir(srcdir):
        print('%s is not a directory' % src_dir, file=sys.stderr)
        return

    if omit_groups:
        unknown_groups = set(omit_groups) - set(
            unicode_data.get_emoji_groups())
        if unknown_groups:
            print(
                'did not recognize %d group%s: %s' % (
                    len(unknown_groups),
                    '' if len(unknown_groups) == 1 else 's',
                    ', '.join(
                        '"%s"' % g for g in omit_groups
                        if g in unknown_groups)),
                file=sys.stderr)
            print('valid groups are:\n %s' % (
                '\n '.join(g for g in unicode_data.get_emoji_groups())),
                file=sys.stderr)
            return
        print('omitting %d group%s: %s' % (
            len(omit_groups), '' if len(omit_groups) == 1 else 's',
            ', '.join('"%s"' % g for g in omit_groups)))
    else:
        # might be None
        print('keeping all groups')
        omit_groups = []

    # make sure the destination exists
    dstdir = tool_utils.ensure_dir_exists(tool_utils.resolve_path(dst_dir))

    # _get_image_data returns canonical cp sequences
    print('src dir:', srcdir)
    seq_to_file = generate_emoji_html._get_image_data(srcdir, 'png', 'emoji_u')
    print('seq to file has %d sequences' % len(seq_to_file))

    # Aliases add non-gendered versions using gendered images for the most
    # part.  But when we display the images, we don't distinguish genders in
    # the naming, we rely on the images-- so these look redundant.  So we
    # intentionally don't generate images for these.
    # However, the alias file also includes the flag aliases, which we do
    # want, and it also fails to exclude the unknown flag pua (since it
    # doesn't map to anything), so we need to adjust for this.
    canonical_aliases = generate_emoji_html._get_canonical_aliases()
    # set comprehension instead of set([listcomp])
    aliases = {
        cps for cps in canonical_aliases
        if not unicode_data.is_regional_indicator_seq(cps)}
    aliases.add((0xfe82b,))  # unknown flag PUA
    excluded = aliases | generate_emoji_html._get_canonical_excluded()

    # The flag aliases have distinct names, so we _do_ want to show them
    # multiple times.
    to_add = {}
    for seq in canonical_aliases:
        if unicode_data.is_regional_indicator_seq(seq):
            replace_seq = canonical_aliases[seq]
            if seq in seq_to_file:
                print('warning, alias %s has file %s' % (
                    unicode_data.regional_indicator_seq_to_string(seq),
                    seq_to_file[seq]))
                continue
            replace_file = seq_to_file.get(replace_seq)
            if replace_file:
                to_add[seq] = replace_file
    seq_to_file.update(to_add)

    data = []
    last_skipped_group = None
    skipcount = 0
    for group in unicode_data.get_emoji_groups():
        if group in omit_groups:
            continue
        name_data = []
        for seq in unicode_data.get_emoji_in_group(group):
            if seq in excluded:
                continue
            seq_file = seq_to_file.get(seq, None)
            if seq_file is None:
                skipcount += 1
                if verbose:
                    if group != last_skipped_group:
                        print('group %s' % group)
                        last_skipped_group = group
                    print(' %s (%s)' % (
                        unicode_data.seq_to_string(seq),
                        ', '.join(unicode_data.name(cp, 'x') for cp in seq)))
                if skip_limit >= 0 and skipcount > skip_limit:
                    raise Exception('skipped too many items')
            else:
                name_data.append(_name_data(seq, seq_file))
        data.append({'category': group, 'emojis': name_data})

    outfile = path.join(dstdir, 'data.json')
    with open(outfile, 'w') as f:
        indent = 2 if pretty_print else None
        separators = None if pretty_print else (',', ':')
        json.dump(data, f, indent=indent, separators=separators)
    print('wrote %s' % outfile)
def _check_coverage(seq_to_filepath, unicode_version):
    """Ensure we have all and only the cps and sequences that we need for
    the font as of this version.

    Reports (via print) missing singles, specials, combining, flag,
    modifier, and zwj sequences, plus alias problems; also installs
    'alias:<filename>' entries into seq_to_filepath for usable aliases.
    """
    age = unicode_version

    # Map each sequence stripped of the emoji VS back to its canonical
    # (VS-bearing) key so aliases can resolve either form.
    non_vs_to_canonical = {}
    for k in seq_to_filepath:
        if EMOJI_VS in k:
            non_vs = unicode_data.strip_emoji_vs(k)
            non_vs_to_canonical[non_vs] = k

    aliases = add_aliases.read_default_emoji_aliases()
    for k, v in sorted(aliases.items()):
        if v not in seq_to_filepath and v not in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print('coverage: alias %s missing target %s' % (
                alias_str, target_str))
            continue
        if k in seq_to_filepath or k in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print('coverage: alias %s already exists as %s (%s)' % (
                alias_str, target_str, seq_name(v)))
            continue
        filename = seq_to_filepath.get(v) or seq_to_filepath[
            non_vs_to_canonical[v]]
        seq_to_filepath[k] = 'alias:' + filename

    # check single emoji, this includes most of the special chars
    emoji = sorted(unicode_data.get_emoji(age=age))
    for cp in emoji:
        if tuple([cp]) not in seq_to_filepath:
            print('coverage: missing single %04x (%s)' % (
                cp, unicode_data.name(cp, '<no name>')))

    # special characters
    # all but combining enclosing keycap are currently marked as emoji
    # list(...) is required on Python 3, where list + range raises TypeError
    for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
        if cp not in emoji and tuple([cp]) not in seq_to_filepath:
            print('coverage: missing special %04x (%s)' % (
                cp, unicode_data.name(cp)))

    # combining sequences
    # .items() instead of Py2-only .iteritems() throughout
    comb_seq_to_name = sorted(
        unicode_data.get_emoji_combining_sequences(age=age).items())
    for seq, name in comb_seq_to_name:
        if seq not in seq_to_filepath:
            # strip vs and try again
            non_vs_seq = unicode_data.strip_emoji_vs(seq)
            if non_vs_seq not in seq_to_filepath:
                print('coverage: missing combining sequence %s (%s)' % (
                    unicode_data.seq_to_string(seq), name))

    # flag sequences
    flag_seq_to_name = sorted(
        unicode_data.get_emoji_flag_sequences(age=age).items())
    for seq, name in flag_seq_to_name:
        if seq not in seq_to_filepath:
            print('coverage: missing flag sequence %s (%s)' % (
                unicode_data.seq_to_string(seq), name))

    # skin tone modifier sequences
    mod_seq_to_name = sorted(
        unicode_data.get_emoji_modifier_sequences(age=age).items())
    for seq, name in mod_seq_to_name:
        if seq not in seq_to_filepath:
            print('coverage: missing modifier sequence %s (%s)' % (
                unicode_data.seq_to_string(seq), name))

    # zwj sequences
    # some of ours include the emoji presentation variation selector and some
    # don't, and the same is true for the canonical sequences.  normalize all
    # of them to omit it to test coverage, but report the canonical sequence.
    zwj_seq_without_vs = set()
    for seq in seq_to_filepath:
        if ZWJ not in seq:
            continue
        if EMOJI_VS in seq:
            seq = tuple(cp for cp in seq if cp != EMOJI_VS)
        zwj_seq_without_vs.add(seq)

    for seq, name in sorted(
            unicode_data.get_emoji_zwj_sequences(age=age).items()):
        if EMOJI_VS in seq:
            test_seq = tuple(s for s in seq if s != EMOJI_VS)
        else:
            test_seq = seq
        if test_seq not in zwj_seq_without_vs:
            print('coverage: missing (canonical) zwj sequence %s (%s)' % (
                unicode_data.seq_to_string(seq), name))

    # check for 'unknown flag'
    # this is either emoji_ufe82b or 'unknown_flag', but we filter out things
    # that don't start with our prefix so 'unknown_flag' would be excluded
    # by default.
    if tuple([0xfe82b]) not in seq_to_filepath:
        print('coverage: missing unknown flag PUA fe82b')
def _check_coverage(seq_to_filepath, unicode_version):
    """Ensure we have all and only the cps and sequences that we need for
    the font as of this version.

    Reports (via print) missing singles, specials, combining, flag,
    modifier, and zwj sequences, plus alias problems; also installs
    'alias:<filename>' entries into seq_to_filepath for usable aliases.
    """
    age = unicode_version

    # Map each sequence stripped of the emoji VS back to its canonical
    # (VS-bearing) key so aliases can resolve either form.
    non_vs_to_canonical = {}
    for k in seq_to_filepath:
        if EMOJI_VS in k:
            non_vs = unicode_data.strip_emoji_vs(k)
            non_vs_to_canonical[non_vs] = k

    aliases = add_aliases.read_default_emoji_aliases()
    for k, v in sorted(aliases.items()):
        if v not in seq_to_filepath and v not in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print('coverage: alias %s missing target %s' % (
                alias_str, target_str))
            continue
        if k in seq_to_filepath or k in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print('coverage: alias %s already exists as %s (%s)' % (
                alias_str, target_str, seq_name(v)))
            continue
        filename = seq_to_filepath.get(v) or seq_to_filepath[
            non_vs_to_canonical[v]]
        seq_to_filepath[k] = 'alias:' + filename

    # check single emoji, this includes most of the special chars
    emoji = sorted(unicode_data.get_emoji(age=age))
    for cp in emoji:
        if tuple([cp]) not in seq_to_filepath:
            print('coverage: missing single %04x (%s)' % (
                cp, unicode_data.name(cp, '<no name>')))

    # special characters
    # all but combining enclosing keycap are currently marked as emoji
    # list(...) is required on Python 3, where list + range raises TypeError
    for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
        if cp not in emoji and tuple([cp]) not in seq_to_filepath:
            print('coverage: missing special %04x (%s)' % (
                cp, unicode_data.name(cp)))

    # combining sequences
    # .items() instead of Py2-only .iteritems() throughout
    comb_seq_to_name = sorted(
        unicode_data.get_emoji_combining_sequences(age=age).items())
    for seq, name in comb_seq_to_name:
        if seq not in seq_to_filepath:
            # strip vs and try again
            non_vs_seq = unicode_data.strip_emoji_vs(seq)
            if non_vs_seq not in seq_to_filepath:
                print('coverage: missing combining sequence %s (%s)' % (
                    unicode_data.seq_to_string(seq), name))

    # flag sequences
    flag_seq_to_name = sorted(
        unicode_data.get_emoji_flag_sequences(age=age).items())
    for seq, name in flag_seq_to_name:
        if seq not in seq_to_filepath:
            print('coverage: missing flag sequence %s (%s)' % (
                unicode_data.seq_to_string(seq), name))

    # skin tone modifier sequences
    mod_seq_to_name = sorted(
        unicode_data.get_emoji_modifier_sequences(age=age).items())
    for seq, name in mod_seq_to_name:
        if seq not in seq_to_filepath:
            print('coverage: missing modifier sequence %s (%s)' % (
                unicode_data.seq_to_string(seq), name))

    # zwj sequences
    # some of ours include the emoji presentation variation selector and some
    # don't, and the same is true for the canonical sequences.  normalize all
    # of them to omit it to test coverage, but report the canonical sequence.
    zwj_seq_without_vs = set()
    for seq in seq_to_filepath:
        if ZWJ not in seq:
            continue
        if EMOJI_VS in seq:
            seq = tuple(cp for cp in seq if cp != EMOJI_VS)
        zwj_seq_without_vs.add(seq)

    for seq, name in sorted(
            unicode_data.get_emoji_zwj_sequences(age=age).items()):
        if EMOJI_VS in seq:
            test_seq = tuple(s for s in seq if s != EMOJI_VS)
        else:
            test_seq = seq
        if test_seq not in zwj_seq_without_vs:
            print('coverage: missing (canonical) zwj sequence %s (%s)' % (
                unicode_data.seq_to_string(seq), name))

    # check for 'unknown flag'
    # this is either emoji_ufe82b or 'unknown_flag', but we filter out things
    # that don't start with our prefix so 'unknown_flag' would be excluded
    # by default.
    if tuple([0xfe82b]) not in seq_to_filepath:
        print('coverage: missing unknown flag PUA fe82b')
def _check_coverage(seq_to_filepath, unicode_version):
    """Ensure we have all and only the cps and sequences that we need for
    the font as of this version.

    Prints every coverage problem found, installs 'alias:<filename>'
    entries for usable aliases, and exits the process if any problem was
    reported.

    NOTE(review): unicode_version is currently unused here -- the emoji
    and sequence data are taken at their default age; confirm this is
    intentional.
    """
    coverage_pass = True

    # Map each sequence stripped of the emoji VS back to its canonical
    # (VS-bearing) key so aliases can resolve either form.
    non_vs_to_canonical = {}
    for k in seq_to_filepath:
        if EMOJI_VS in k:
            non_vs = unicode_data.strip_emoji_vs(k)
            non_vs_to_canonical[non_vs] = k

    aliases = add_aliases.read_default_emoji_aliases()
    for k, v in sorted(aliases.items()):
        if v not in seq_to_filepath and v not in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print(f'coverage: alias {alias_str} missing target {target_str}')
            coverage_pass = False
            continue
        if k in seq_to_filepath or k in non_vs_to_canonical:
            alias_str = unicode_data.seq_to_string(k)
            target_str = unicode_data.seq_to_string(v)
            print(
                f'coverage: alias {alias_str} already exists as {target_str} '
                f'({seq_name(v)})'
            )
            coverage_pass = False
            continue
        filename = seq_to_filepath.get(v) or seq_to_filepath[
            non_vs_to_canonical[v]]
        seq_to_filepath[k] = 'alias:' + filename

    # check single emoji, this includes most of the special chars
    emoji = sorted(unicode_data.get_emoji())
    for cp in emoji:
        if tuple([cp]) not in seq_to_filepath:
            # {cp:04x} and a name default, matching the other report lines
            # (plain {cp} printed decimal and name() could raise).
            print(
                f"coverage: missing single {cp:04x} "
                f"({unicode_data.name(cp, '<no name>')})")
            coverage_pass = False

    # special characters
    # all but combining enclosing keycap are currently marked as emoji
    for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a)):
        if cp not in emoji and tuple([cp]) not in seq_to_filepath:
            print(f'coverage: missing special {cp:04x} '
                  f'({unicode_data.name(cp)})')
            coverage_pass = False

    # combining sequences
    # NOTE(review): reaches into unicode_data's private
    # _emoji_sequence_data -- confirm there is no public accessor.
    comb_seq_to_name = sorted(unicode_data._emoji_sequence_data.items())
    for seq, name in comb_seq_to_name:
        if seq not in seq_to_filepath:
            # strip vs and try again
            non_vs_seq = unicode_data.strip_emoji_vs(seq)
            if non_vs_seq not in seq_to_filepath:
                print(
                    f'coverage: missing combining sequence '
                    f'{unicode_data.seq_to_string(seq)} ({name})'
                )
                coverage_pass = False

    # check for 'unknown flag'
    # this is either emoji_ufe82b or 'unknown_flag', but we filter out things
    # that don't start with our prefix so 'unknown_flag' would be excluded
    # by default.
    if tuple([0xfe82b]) not in seq_to_filepath:
        print('coverage: missing unknown flag PUA fe82b')
        coverage_pass = False

    if not coverage_pass:
        # sys.exit rather than the site-provided exit(); also fixes the
        # 'metioned' typo in the message.
        sys.exit(
            "Please fix the problems mentioned above or run: "
            "make BYPASS_SEQUENCE_CHECK='True'"
        )
def generate_names(
        src_dir, dst_dir, skip_limit=20, omit_groups=None,
        pretty_print=False, verbose=False):
    """Collect per-group emoji name/image records and write them as JSON.

    Walks every emoji group, pairs each sequence with its image file found
    under src_dir, and writes the accumulated records to dst_dir/data.json.
    Groups listed in omit_groups are skipped; more than skip_limit missing
    images (if non-negative) aborts with an exception.
    """
    resolved_src = tool_utils.resolve_path(src_dir)
    if not path.isdir(resolved_src):
        print('%s is not a directory' % src_dir, file=sys.stderr)
        return

    if not omit_groups:
        # omit_groups might be None
        print('keeping all groups')
        omit_groups = []
    else:
        unknown_groups = set(omit_groups) - set(
            unicode_data.get_emoji_groups())
        if unknown_groups:
            plural = '' if len(unknown_groups) == 1 else 's'
            quoted = ', '.join(
                '"%s"' % g for g in omit_groups if g in unknown_groups)
            print('did not recognize %d group%s: %s' % (
                len(unknown_groups), plural, quoted), file=sys.stderr)
            print('valid groups are:\n %s' % (
                '\n '.join(g for g in unicode_data.get_emoji_groups())),
                file=sys.stderr)
            return
        plural = '' if len(omit_groups) == 1 else 's'
        print('omitting %d group%s: %s' % (
            len(omit_groups), plural,
            ', '.join('"%s"' % g for g in omit_groups)))

    # make sure the destination exists
    out_dir = tool_utils.ensure_dir_exists(tool_utils.resolve_path(dst_dir))

    # _get_image_data returns canonical cp sequences
    print('src dir:', resolved_src)
    seq_to_file = generate_emoji_html._get_image_data(
        resolved_src, 'png', 'emoji_u')
    print('seq to file has %d sequences' % len(seq_to_file))

    # Aliases mostly add non-gendered versions that reuse gendered images;
    # the display naming doesn't distinguish gender (the images do), so
    # those would look redundant and we deliberately skip them.  The alias
    # file does include the flag aliases, which we DO want, and it fails to
    # exclude the unknown flag pua (it maps to nothing), so adjust for both.
    canonical_aliases = generate_emoji_html._get_canonical_aliases()
    excluded_aliases = {
        cps for cps in canonical_aliases
        if not unicode_data.is_regional_indicator_seq(cps)}
    excluded_aliases.add((0xfe82b,))  # unknown flag PUA
    excluded = excluded_aliases | generate_emoji_html._get_canonical_excluded()

    # The flag aliases have distinct names, so we _do_ want to show them
    # multiple times.
    extra_files = {}
    for seq in canonical_aliases:
        if not unicode_data.is_regional_indicator_seq(seq):
            continue
        if seq in seq_to_file:
            print('warning, alias %s has file %s' % (
                unicode_data.regional_indicator_seq_to_string(seq),
                seq_to_file[seq]))
            continue
        target_file = seq_to_file.get(canonical_aliases[seq])
        if target_file:
            extra_files[seq] = target_file
    seq_to_file.update(extra_files)

    data = []
    last_skipped_group = None
    skipcount = 0
    for group in unicode_data.get_emoji_groups():
        if group in omit_groups:
            continue
        name_data = []
        for seq in unicode_data.get_emoji_in_group(group):
            if seq in excluded:
                continue
            seq_file = seq_to_file.get(seq)
            if seq_file is not None:
                name_data.append(_name_data(seq, seq_file))
                continue
            # No image for this sequence: count it, optionally log it,
            # and bail out if we've skipped too many.
            skipcount += 1
            if verbose:
                if group != last_skipped_group:
                    print('group %s' % group)
                    last_skipped_group = group
                print(' %s (%s)' % (
                    unicode_data.seq_to_string(seq),
                    ', '.join(unicode_data.name(cp, 'x') for cp in seq)))
            if skip_limit >= 0 and skipcount > skip_limit:
                raise Exception('skipped too many items')
        data.append({'category': group, 'emojis': name_data})

    outfile = path.join(out_dir, 'data.json')
    with open(outfile, 'w') as f:
        if pretty_print:
            json.dump(data, f, indent=2, separators=None)
        else:
            json.dump(data, f, indent=None, separators=(',', ':'))
    print('wrote %s' % outfile)