def main():
    """Command-line entry: collect noto font cmap data and write it as XML.

    With -o/--outfile, writes to that file (or a default name when the flag
    is given bare); otherwise prints the XML to stdout.
    """
    DEFAULT_OUTFILE = 'font_cmaps_temp.xml'
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-o', '--outfile',
        help='output file to write ("%s" if no name provided)' % DEFAULT_OUTFILE,
        metavar='name', nargs='?', default=None, const=DEFAULT_OUTFILE)
    parser.add_argument(
        '-p', '--paths',
        help='list of directory paths to search for noto fonts '
        '(default is standard noto phase2 paths)',
        metavar='path', nargs='*', default=None)
    args = parser.parse_args()

    cmapdata = font_cmap_data(args.paths)
    if args.outfile:
        cmap_data.write_cmap_data_file(cmapdata, args.outfile, pretty=True)
    else:
        # write_cmap_data returns UTF-8 bytes; .decode() replaces the
        # py2-only unicode() constructor and also works on python 3.
        print(cmap_data.write_cmap_data(cmapdata, pretty=True).decode("utf-8"))
def ttc_filenames(ttc, data):
    """Returns likely filenames for each ttc file.

    The filenames are based on the postscript name from the name table for
    each font.  When there is no information, the string '<unknown x>' is
    provided with either 'ttf' or 'otf' in place of 'x' depending on the
    info in the sfnt header.
    """
    names = []
    for font_entry in ttc.fonts:
        name_entry = None
        file_name = None
        # Locate this font's 'name' table entry, if it has one.
        for ix in font_entry.tables:
            if ttc.tables[ix].tag == 'name':
                name_entry = ttc.tables[ix]
                break
        if name_entry:
            offset = name_entry.offset
            limit = offset + name_entry.length
            name_table = NameTable()
            name_table.decompile(data[offset:limit], None)
            ps_name = None
            for r in name_table.names:
                # Postscript name record: nameID 6, platform 3, encoding 1,
                # language 0x409.
                if (r.nameID, r.platformID, r.platEncID, r.langID) == (
                        6, 3, 1, 0x409):
                    # .decode() replaces py2-only unicode() and works on py3.
                    ps_name = r.string.decode('UTF-16BE')
                    break
            if ps_name:
                file_name = ps_name
                if '-' not in ps_name:
                    file_name += '-Regular'
                file_name += '.' + font_entry.fmt
        names.append(file_name or ('<unknown %s>' % font_entry.fmt))
    return names
def sub(esc_match):
    """Expand an escape match into its character plus any trailing text.

    group(1) is the escape type ('x', 'u', or 'U'); group(2) is the hex
    digits (2, 4, or 6 respectively) plus any trailing text, which is
    passed through unchanged.

    Raises ValueError if the escape is too short or names a value above
    0x10FFFF.
    """
    esc_type = esc_match.group(1)
    esc_val = esc_match.group(2)
    if esc_type == 'x':
        esc_len = 2
    elif esc_type == 'u':
        esc_len = 4
    elif esc_type == 'U':
        esc_len = 6
    else:
        raise ValueError('internal error')
    if len(esc_val) < esc_len:
        error = 'Unicode escape too short: "%s"' % (esc_match.group(0))
        raise ValueError(error)
    unival = int(esc_val[:esc_len], 16)
    if unival > 0x10ffff:
        error = 'Unicode escape value too large: "%X"' % unival
        raise ValueError(error)
    # Build the character by decoding a \U escape.  This handles both BMP
    # and supplementary-plane values and replaces the py2-only
    # unichr()/unicode() pair, so it also runs on python 3.
    prefix = ('\\U%08X' % unival).encode('ascii').decode('unicode_escape')
    return prefix + esc_val[esc_len:]
def _xml_to_dict(element): result = {} for child in list(element): if 'alt' in child.attrib: continue key = child.get('type') key = key.replace('_', '-') result[key] = unicode(child.text) return result
def _xml_to_dict(element): result = {} for child in list(element): if "alt" in child.attrib: continue key = child.get("type") key = key.replace("_", "-") result[key] = unicode(child.text) return result
def get_name_records(font):
    """Get a font's 'name' table records as a dictionary of Unicode strings.

    Only records with (platformID, platEncID, langID) == (3, 1, 0x409) are
    returned, keyed by nameID.
    """
    name_table = font['name']
    names = {}
    for record in name_table.names:
        name_ids = (record.platformID, record.platEncID, record.langID)
        if name_ids != (3, 1, 0x409):
            continue
        # .decode() replaces the py2-only unicode() constructor and also
        # works on python 3.
        names[record.nameID] = record.string.decode('UTF-16BE')
    return names
def get_scripts(text):
    """Return the set of scripts in this text. Excludes some common chars.

    Returns a (scripts, zyyy_chars) pair: the set of script codes seen,
    plus the common/undetermined characters encountered.
    """
    # ignore these chars, we assume they are ok in any script
    exclusions = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    zyyy_chars = set()
    scripts = set()
    # .decode() replaces the py2-only unicode() constructor and also works
    # on python 3; assumes `text` is UTF-8 bytes.
    ustr = text.decode('utf8')
    for cp in ustr:
        if ord(cp) in exclusions:
            continue
        script = unicode_data.script(cp)
        if script == 'Zyyy':  # common/undetermined
            zyyy_chars.add(cp if cp < '\u00fe' else ord(cp))
        elif not script == 'Zinh':  # inherited
            scripts.add(script)
    return scripts, zyyy_chars
def main():
    """Command-line entry: build lint cmap data and write it as XML.

    With --outfile, writes to the named file (or a version-derived default
    when the flag is given bare); otherwise prints the XML to stdout.
    """
    DEFAULT_UNICODE_VERSION = 9.0
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--scripts',
        help='list of pseudo-script codes, empty for all phase 2 scripts',
        metavar='code', nargs='*')
    parser.add_argument(
        '--unicode_version',
        help='version of unicode to use (default %s)' % DEFAULT_UNICODE_VERSION,
        metavar='version', type=float, default=DEFAULT_UNICODE_VERSION)
    parser.add_argument(
        '--unicode_only',
        help='only use unicode data, not noto-specific data',
        action='store_true')
    parser.add_argument(
        '-p', '--phase', help='noto phase (default 3)', metavar='phase',
        type=int, default=3)
    parser.add_argument(
        '--outfile', help='write to output file, otherwise to stdout',
        metavar='fname', nargs='?', const='-default-')
    parser.add_argument(
        '--verbose', help='log to stderr as each script is complete',
        action='store_true')
    args = parser.parse_args()

    if not args.scripts:
        scripts = set(s.strip() for s in _PHASE_TWO_SCRIPTS.split(','))
    else:
        scripts = _check_scripts(args.scripts)

    cmapdata = get_cmap_data(
        scripts, args.unicode_version, args.phase, args.unicode_only,
        args.verbose)
    if args.outfile:
        if args.outfile == '-default-':
            args.outfile = 'lint_cmap_%s.xml' % args.unicode_version
        sys.stderr.write('writing %s\n' % args.outfile)
        cmap_data.write_cmap_data_file(cmapdata, args.outfile, pretty=True)
    else:
        # write_cmap_data returns UTF-8 bytes; .decode() replaces the
        # py2-only unicode() constructor and also works on python 3.
        print(cmap_data.write_cmap_data(cmapdata, pretty=True).decode("utf-8"))
def _get_language_name_from_file(language, cldr_file_path):
    """Look up the display name of `language` in a CLDR locale data file.

    Results (including missing-file misses) are memoized in
    _LANGUAGE_NAME_FROM_FILE_CACHE.  Returns None when the file cannot be
    read, has no languages element, or has no entry for `language`.
    """
    cache_key = (language, cldr_file_path)
    try:
        return _LANGUAGE_NAME_FROM_FILE_CACHE[cache_key]
    except KeyError:
        pass

    data_file = path.join(CLDR_DIR, cldr_file_path)
    try:
        root = ElementTree.parse(data_file).getroot()
    except IOError:
        # Cache the miss so a missing file is not re-parsed next time.
        _LANGUAGE_NAME_FROM_FILE_CACHE[cache_key] = None
        return None

    parent = root.find('.//languages')
    if parent is None:
        return None
    for tag in parent:
        assert tag.tag == 'language'
        # CLDR uses '_' where we use '-' in language tags.
        if tag.get('type').replace('_', '-') == language:
            # '%s' formatting yields a text string on both py2 and py3,
            # replacing the py2-only unicode() constructor.
            _LANGUAGE_NAME_FROM_FILE_CACHE[cache_key] = u'%s' % tag.text
            return _LANGUAGE_NAME_FROM_FILE_CACHE[cache_key]
    return None
def main():
    """Merge per-script noto fonts into combined regional families.

    For each merge target and weight, collects the source fonts that exist
    on disk, ensures each has a GSUB table, merges them, rewrites name
    table records to the target family name, and saves the result under
    combined/unhinted.
    """
    merge_table = {
        'Historic': [
            'Avestan',
            'Carian',
            'Egyptian Hieroglyphs',
            'Imperial Aramaic',
            'Pahlavi',  # Should be 'Inscriptional Pahlavi',
            'Parthian',  # Should be 'Inscriptional Parthian',
            'Linear B',
            'Lycian',
            'Lydian',
            'Mandaic',
            'Old Persian',
            'Old South Arabian',
            'Old Turkic',
            'Osmanya',
            'Phags-Pa',
            'Phoenician',
            'Samaritan',
            'Sumero-Akkadian Cuneiform',
            'Ugaritic',
        ],
        'South Asian': [
            'Devanagari',
            'Bengali',
            'Gurmukhi',
            'Gujarati',
            'Oriya',
            'Tamil',
            'Telugu',
            'Kannada',
            'Malayalam',
            'Sinhala',
            'Thaana',
            'Brahmi',
            'Kaithi',
            'Kharoshthi',  # Move to Historic?
            'Lepcha',
            'Limbu',
            'Meetei Mayek',
            'Ol Chiki',
            'Saurashtra',
            'Syloti Nagri',
        ],
        'Southeast Asian': [
            'Thai',
            'Lao',
            'Khmer',
            'Batak',
            'Buginese',
            'Buhid',
            'Cham',
            'Hanunoo',
            'Javanese',
            'Kayah Li',
            'New Tai Lue',
            'Rejang',
            'Sundanese',
            'Tagalog',
            'Tagbanwa',
            'Tai Le',
            'Tai Tham',
            'Tai Viet',
        ],
        '': [
            # LGC,
            'Armenian',
            'Bamum',
            'Canadian Aboriginal',
            'Cherokee',
            'Coptic',
            'Cypriot Syllabary',
            'Deseret',
            'Ethiopic',
            'Georgian',
            'Glagolitic',
            'Gothic',
            'Hebrew',
            'Lisu',
            'NKo',
            'Ogham',
            'Old Italic',
            'Runic',
            'Shavian',
            'Tifinagh',
            'Vai',
        ],
    }

    add_ui_alternative(merge_table, 'South Asian')
    add_ui_alternative(merge_table, 'Southeast Asian')

    for merge_target in sorted(merge_table):
        for weight in ['Regular', 'Bold']:
            merger = merge.Merger()
            source_fonts = merge_table[merge_target]
            if '' not in source_fonts:
                source_fonts = [''] + source_fonts  # The LGC font
            regular_sources = [
                make_font_file_name(script, weight) for script in source_fonts]
            regular_sources = [
                font for font in regular_sources if os.path.isfile(font)]
            if len(regular_sources) <= 1:
                continue

            print('Merging Noto Sans %s %s' % (merge_target, weight))
            # The merger requires a GSUB table in every source.
            for index, fontfile in enumerate(regular_sources):
                if not has_gsub_table(fontfile):
                    regular_sources[index] = add_gsub_to_font(fontfile)

            font = merger.merge(regular_sources)

            first_font = source_fonts[0]
            if first_font != merge_target:
                # Rewrite name records from the first source's family name
                # to the merge target's.
                for name_record in font['name'].names:
                    # .decode() replaces the py2-only unicode() constructor
                    # and also works on python 3.
                    name = name_record.string.decode('UTF-16BE')
                    name = name.replace(
                        make_font_name(first_font),
                        make_font_name(merge_target))
                    name = name.replace(
                        make_puncless_font_name(first_font),
                        make_puncless_font_name(merge_target))
                    name_record.string = name.encode('UTF-16BE')

            font.save(make_font_file_name(
                merge_target, weight, directory='combined/unhinted'))
def cp_to_str(cp):
    """Return the single character for codepoint `cp` (int)."""
    # Decode via a \U escape.  This handles both BMP and supplementary-plane
    # values and replaces the py2-only unichr()/unicode() pair, so it also
    # runs on python 3.
    return ('\\U%08X' % cp).encode('ascii').decode('unicode_escape')
def get_sample_from_sample_file(language, script):
    """Return sample text for language/script, or None if no sample file.

    Reads SAMPLE_TEXT_DIR/<language>-<script>.txt as UTF-8.
    """
    filepath = path.join(SAMPLE_TEXT_DIR, language + '-' + script + '.txt')
    if not path.exists(filepath):
        return None
    # `with` closes the file handle (the original leaked it); read as bytes
    # and decode explicitly, replacing the py2-only unicode() constructor.
    with open(filepath, 'rb') as f:
        return f.read().strip().decode('UTF-8')