Example #1
0
def main():
    """Print every entry that ``ls`` yields for each command-line argument.

    An argument that starts with ``@`` is first resolved through
    ``charset_path`` (with the ``@`` removed); any other argument is handed
    to ``ls`` unchanged.  When no arguments are given, ``'.'`` is listed.
    """
    # Defaulting to '.' lets one loop serve both cases: '.' does not start
    # with '@', so it reaches ls() as-is.
    targets = sys.argv[1:] or ['.']
    for target in targets:
        root = charset_path(target[1:]) if target[0] == '@' else target
        for entry in ls(root):
            print(entry)
Example #2
0
def _merge_font_data(fonts, font_data, url):
    """Record one font's data in *fonts*, merging with a same-named entry.

    *fonts* maps a font name to a ``(name, chars, vendor, url)`` tuple.
    *font_data* is indexed as ``(name, chars, vendor, ...)``.  When the name
    is already known, the existing character container is updated with the
    new characters, an existing vendor is kept unless it is ``None``, and the
    stored URL is replaced by *url*.
    """
    name, chars, vendor = font_data[0], font_data[1], font_data[2]
    if name not in fonts:
        fonts[name] = (name, chars, vendor, url)
        return
    known = fonts[name]
    # update() works in place and returns None, so keep the container itself
    # instead of the call's result (which would discard all characters).
    # NOTE(review): assumes chars is a set or dict - confirm against
    # get_font_file_data.
    merged_chars = known[1]
    merged_chars.update(chars)
    merged_vendor = vendor if known[2] is None else known[2]
    fonts[name] = (name, merged_chars, merged_vendor, url)


def _read_font(fonts, font_path, url):
    """Load one font file and fold its data into *fonts*, printing progress.

    Failures while reading are reported and the font is skipped, so a single
    bad file does not abort the whole scan.
    """
    print('Reading font data: %s' % font_path)
    try:
        font_data = get_font_file_data(font_path)
    except Exception as e:
        # Deliberately broad: this is a best-effort scan over many files.
        print('Error: %s' % e)
        return
    if font_data is None:
        print('Skipping this font because reasons.')
    elif font_data[0] is None:
        print('Error: Font has no name.')
    else:
        _merge_font_data(fonts, font_data, url)


def get_font_data():
    """Collect font data from the font-metadata directory and font plugins.

    Every path listed under ``charset_path('font-metadata')`` is read with a
    ``None`` URL.  Then every plugin listed under
    ``charset_path('acquisition', 'fonts')`` is loaded and each
    ``(name, path, url)`` it reports from ``list_fonts()`` is read with that
    URL.  Entries sharing a font name are merged.

    Returns:
        A list of ``(name, chars, vendor, url)`` tuples sorted by lower-cased
        font name.
    """
    fonts = {}
    for font_path in ls(charset_path('font-metadata')):
        _read_font(fonts, font_path, None)
    for modfile in ls(charset_path('acquisition', 'fonts')):
        mod = load_plugin(modfile)
        if mod is None:
            continue
        # The plugin-reported name is unused; the name stored comes from the
        # font file's own data.
        for _name, font_path, url in mod.list_fonts():
            _read_font(fonts, font_path, url)
    return sorted(fonts.values(), key=lambda font: font[0].lower())
Example #3
0
def main():
	"""Run ``generate`` over the mappings and puadata trees and index each output.

	For each tree, every path listed from inside the source directory is
	passed to ``generate`` together with the output location; a line is
	printed for each path that produced a non-``None`` result, and
	``create_indices`` is called on the output location afterwards.
	"""
	# (source directory, output name, report format) - mappings first.
	jobs = (
		('mappings', 'MAPPINGS', 'mappings/%s -> %s'),
		('puadata', 'PUADATA', 'puadata/%s -> %s'),
	)
	for source_dir, out_name, report in jobs:
		dest = charset_path('out', out_name)
		with cd(charset_path(source_dir)):
			for src in ls('.'):
				produced = generate(src, dest)
				if produced is None:
					continue
				# src[2:] drops the first two characters (presumably a
				# leading './' from ls('.') - confirm).
				print(report % (src[2:], produced))
		create_indices(dest)
Example #4
0
def main():
    """Print ``name -> path`` for every file reported by the unidata plugins.

    Plugins are the entries listed under ``charset_path('acquisition',
    'unidata')``; entries for which ``load_plugin`` returns ``None`` are
    skipped.
    """
    for modfile in ls(charset_path('acquisition', 'unidata')):
        plugin = load_plugin(modfile)
        if plugin is None:
            continue
        for name, location in plugin.list_files():
            print('%s -> %s' % (name, location))
def get_puadata():
	"""Yield ``(meta, chars, blocks)`` for each ``sources.txt`` under puadata.

	* ``meta`` maps stripped keys to stripped values, taken from the leading
	  ``key: value`` lines of ``sources.txt``; reading stops at the first
	  empty line.
	* ``chars`` maps a code point (first ``;``-separated field parsed as hex)
	  to the full list of fields of its line in the sibling
	  ``unicodedata.txt``.
	* ``blocks`` is a sorted list of ``(start, stop, name)`` tuples parsed
	  from ``start..stop; name`` lines of the sibling ``blocks.txt``.

	Lines that do not parse are silently ignored.
	"""
	with cd(charset_path('puadata')):
		for path in ls('.'):
			if os.path.basename(path) != 'sources.txt':
				continue
			folder = os.path.dirname(path)

			meta = {}
			for line in expand(path):
				if not line:
					break
				pair = strip_comment(line).split(':', 2)
				# Exactly one ':' is required; anything else is ignored.
				if len(pair) == 2:
					meta[pair[0].strip()] = pair[1].strip()

			chars = {}
			for line in expand(os.path.join(folder, 'unicodedata.txt')):
				record = line.split(';')
				try:
					cp = int(record[0], 16)
				except ValueError:
					continue
				chars[cp] = record

			blocks = []
			for line in expand(os.path.join(folder, 'blocks.txt')):
				parts = line.split(';')
				if len(parts) != 2:
					continue
				blockname = parts[1].strip()
				bounds = parts[0].split('..')
				if len(bounds) != 2:
					continue
				try:
					start = int(bounds[0], 16)
					stop = int(bounds[1], 16)
				except ValueError:
					continue
				blocks.append((start, stop, blockname))

			yield meta, chars, sorted(blocks)
Example #6
0
def main():
    """Print ``code point: entity`` for every entity the plugins report.

    Plugins are the entries listed under ``charset_path('acquisition',
    'entities')``; entries for which ``load_plugin`` returns ``None`` are
    skipped.
    """
    for modfile in ls(charset_path('acquisition', 'entities')):
        plugin = load_plugin(modfile)
        if plugin is None:
            continue
        for cp, entity in plugin.list_entities():
            print('%s: %s' % (cp, entity))
Example #7
0
def get_unidata():
	"""Read ``UnicodeData.txt`` from every unidata plugin.

	Returns:
		A ``(ranges, chars)`` pair.  ``chars`` maps a code point to the
		``;``-separated fields of its line.  ``ranges`` maps a range name to
		``[first_cp, last_cp, first_fields, last_fields]``, built from lines
		whose name field looks like ``<Name, First>`` / ``<Name, Last>``.

	Lines whose first field is not a hexadecimal number are skipped.
	"""
	ranges = {}
	chars = {}
	for modfile in ls(charset_path('acquisition', 'unidata')):
		mod = load_plugin(modfile)
		if mod is None:
			continue
		print('Reading Unicode data: %s' % modfile)
		for name, filepath in mod.list_files():
			if name != 'UnicodeData.txt':
				continue
			with open(filepath, 'r') as ucd:
				for line in ucd:
					fields = line.strip().split(';')
					try:
						cp = int(fields[0], 16)
					except ValueError:
						continue
					label = fields[1]
					# Range markers are written as '<..., First>' or
					# '<..., Last>'; both end in 'st>'.
					if not (label[:1] == '<' and label[-3:] == 'st>'):
						chars[cp] = fields
						continue
					range_name = label[1:-1].split(', ')
					key = range_name[0]
					if key not in ranges:
						# First sighting seeds both ends with this line.
						ranges[key] = [cp, cp, fields, fields]
					elif range_name[1] == 'First':
						entry = ranges[key]
						entry[0] = cp
						entry[2] = fields
					elif range_name[1] == 'Last':
						entry = ranges[key]
						entry[1] = cp
						entry[3] = fields
	return ranges, chars
def get_assertions():
    """Build the assertion table from the files under ``identifiers``.

    Each file is read line by line.  A line for which ``is_atline`` is true
    resets the column headers: its whitespace-separated fields (minus a
    leading ``@``, lower-cased) become the headers, and a ``...`` field ends
    the header list and marks the last header as repeating.  Every other
    line is a data row: each field that is not ``--`` is paired with its
    column header (or with the last header when the row is longer than the
    header list and ``...`` was seen).

    Returns:
        A dict mapping each ``(header, value)`` pair to the list of all
        pairs that appeared in the same rows as it (including itself).
    """
    assertions = {}
    with cd(charset_path('identifiers')):
        for path in ls('.'):
            headers = []
            dotdotdot = False
            for line in expand(path):
                if is_atline(line):
                    headers = []
                    dotdotdot = False
                    for field in strip_comment(line).split():
                        if field == '...':
                            dotdotdot = True
                            break
                        label = field[1:] if field.startswith('@') else field
                        headers.append(label.lower())
                    continue

                row = []
                for i, value in enumerate(strip_comment(line).split()):
                    has_header = i < len(headers)
                    if not (dotdotdot or has_header):
                        continue
                    if value == '--':
                        continue
                    # Columns past the header list reuse the last header.
                    header = headers[i] if has_header else headers[-1]
                    row.append((header, value.lower()))

                for pair in row:
                    assertions.setdefault(pair, []).extend(row)
    return assertions
Example #9
0
def main():
    """Print ``name (url) -> path`` for every font the font plugins report.

    Plugins are the entries listed under ``charset_path('acquisition',
    'fonts')``; entries for which ``load_plugin`` returns ``None`` are
    skipped.
    """
    for modfile in ls(charset_path('acquisition', 'fonts')):
        plugin = load_plugin(modfile)
        if plugin is None:
            continue
        for name, location, url in plugin.list_fonts():
            print('%s (%s) -> %s' % (name, url, location))
Example #10
0
def main():
    """Print every vendor record reported by the vendor plugins.

    Each record is printed as ``key: value`` lines in sorted key order,
    followed by an empty line.  Entries for which ``load_plugin`` returns
    ``None`` are skipped.
    """
    for modfile in ls(charset_path('acquisition', 'vendors')):
        plugin = load_plugin(modfile)
        if plugin is None:
            continue
        for vendor in plugin.list_vendors():
            for field in sorted(vendor.keys()):
                print('%s: %s' % (field, vendor[field]))
            # Blank separator between records.
            print()
Example #11
0
def main():
    """Run ``verify`` on every path under ``mappings`` and print the outcome.

    A ``None`` result prints nothing, an empty result prints ``PASSED``, and
    a non-empty result prints ``FAILED`` followed by its ``repr``.
    """
    with cd(charset_path('mappings')):
        for path in ls('.'):
            result = verify(path)
            if result is None:
                continue
            # path[2:] drops the first two characters (presumably a leading
            # './' from ls('.') - confirm).
            if len(result) == 0:
                print('mappings/%s: PASSED' % path[2:])
            else:
                print('mappings/%s: FAILED:\n%r' % (path[2:], result))
Example #12
0
def get_entities():
	"""Collect entities from every plugin under ``acquisition/entities``.

	Returns:
		A dict mapping each code point to the first entity reported for it;
		later reports for a code point already present are ignored.
	"""
	entities = {}
	for modfile in ls(charset_path('acquisition', 'entities')):
		plugin = load_plugin(modfile)
		if plugin is None:
			continue
		for cp, entity in plugin.list_entities():
			# setdefault keeps the earliest value for a code point.
			entities.setdefault(cp, entity)
	return entities
def main():
    """Check every path under ``mappings`` against the identifier assertions.

    The assertions come from ``get_assertions()``.  For each path, a
    ``None`` result from ``verify`` prints nothing, an empty result prints
    ``PASSED``, and a non-empty result prints ``FAILED`` followed by one
    line per error.
    """
    assertions = get_assertions()
    with cd(charset_path('mappings')):
        for path in ls('.'):
            errors = verify(path, assertions)
            if errors is None:
                continue
            if len(errors) == 0:
                print('mappings/%s: PASSED' % path[2:])
                continue
            print('mappings/%s: FAILED:' % path[2:])
            for problem in errors:
                # Each error supplies the four values the message needs.
                print('Encoding with %s %s must have %s %s.' % problem)
Example #14
0
def get_puadata():
	"""Yield ``(meta, chars)`` for each ``sources.txt`` under puadata.

	* ``meta`` maps stripped keys to stripped values, taken from the leading
	  ``key: value`` lines of ``sources.txt``; reading stops at the first
	  empty line.
	* ``chars`` maps a code point (first ``;``-separated field parsed as hex)
	  to the full list of fields of its line in the sibling
	  ``unicodedata.txt``.

	A progress line is printed for each ``sources.txt`` found.
	"""
	with cd(charset_path('puadata')):
		for path in ls('.'):
			if os.path.basename(path) != 'sources.txt':
				continue
			print('Reading Private Use Area data: %s' % path)

			meta = {}
			for line in expand(path):
				if not line:
					break
				pair = strip_comment(line).split(':', 2)
				# Exactly one ':' is required; anything else is ignored.
				if len(pair) == 2:
					meta[pair[0].strip()] = pair[1].strip()

			chars = {}
			data_file = os.path.join(os.path.dirname(path), 'unicodedata.txt')
			for line in expand(data_file):
				record = line.split(';')
				try:
					cp = int(record[0], 16)
				except ValueError:
					continue
				chars[cp] = record

			yield meta, chars
Example #15
0
def main():
	"""Build every encoding's output, then write the index page and redirect script.

	Steps:
	1. Load Unicode data and font data.
	2. For each path under ``mappings``, read the encoding, skip it when its
	   ``display`` or ``name`` is ``None``, call ``build_encoding`` for it,
	   and record its metadata in the lookup tables (by category, charset,
	   numeric identifiers, names and KTE file names).
	3. Write ``out/encoding/index.shtml`` listing encodings by category and
	   by each identifier.
	4. Write ``out/encoding.php``, which redirects ``file`` and ``name``
	   query parameters to the matching encoding.
	"""
	categories = {}
	by_charset = {}
	by_mibenum = {}
	by_codepage = {}
	by_cfstrenc = {}
	by_nsstrenc = {}
	by_name = {}
	by_kte = {}
	# Metadata keys whose values are indexed by their integer form.
	numeric_indexes = (
		('mibenum', by_mibenum),
		('codepage', by_codepage),
		('cfstringencoding', by_cfstrenc),
		('nsstringencoding', by_nsstrenc),
	)
	ranges, chars = get_unidata()
	fonts = get_font_data()
	with cd(charset_path('mappings')):
		for path in ls('.'):
			meta, root = read_encoding(path)
			if meta['display'] is None or meta['name'] is None:
				print('Skipping: Is a fragment or encoding has no name.')
				continue
			outdir = charset_path('out', 'encoding', meta['name'])
			build_encoding(ranges, chars, fonts, meta, root, outdir)
			categories.setdefault(meta['category'], []).append(meta)
			if 'charset' in meta:
				for cs in meta['charset']:
					by_charset[cs] = meta
			for key, index in numeric_indexes:
				if key not in meta:
					continue
				for raw in meta[key]:
					try:
						index[int(raw)] = meta
					except ValueError:
						# Non-numeric identifiers are left out of the index.
						pass
			by_name[meta['name']] = meta
			for alias in meta['name_other']:
				by_name[alias] = meta
			if 'filename-kte' in meta:
				for kte_name in meta['filename-kte']:
					by_kte[kte_name] = meta

	basedir = charset_path('out', 'encoding')
	if not os.path.exists(basedir):
		os.makedirs(basedir)
	path = os.path.join(basedir, 'index.shtml')
	print('Writing encoding index: %s' % path)
	table_open = '<div class="enclist-wrapper"><table class="enclist">'
	table_close = '</table></div>'
	# (heading, lookup table, row format) for the integer-keyed sections,
	# in page order.
	numeric_sections = (
		('<h2>By IANA MIBenum</h2>', by_mibenum, '<tr><td>%d</td><td>%s</td></tr>'),
		('<h2>By Code Page</h2>', by_codepage, '<tr><td>%03d</td><td>%s</td></tr>'),
		('<h2>By CFStringEncoding</h2>', by_cfstrenc, '<tr><td>%d</td><td>%s</td></tr>'),
		('<h2>By NSStringEncoding</h2>', by_nsstrenc, '<tr><td>%d</td><td>%s</td></tr>'),
	)
	with open(path, 'w') as f:
		def emit(line):
			print(line, file=f)

		emit('<!--#include virtual="/static/head.html"-->')
		emit('<title>Character Encodings - Legacy Encodings</title>')
		emit('<link rel="stylesheet" href="/charset/shared/enclist.css">')
		emit('<!--#include virtual="/static/body.html"-->')
		emit('<p class="breadcrumb"><a href="/charset/">Character Encodings</a> &raquo;</p>')
		emit('<h1>Legacy Encodings</h1>')
		# Categories without ' - ' are sorted as if prefixed with ' - '.
		ordered_categories = sorted(categories, key=lambda c: nat_key(c if ' - ' in c else ' - ' + c))
		for category in ordered_categories:
			emit('<h2>%s</h2>' % html_encode(category))
			emit(table_open)
			for m in sorted(categories[category], key=lambda m: nat_key(m['display'])):
				emit('<tr><td>%s</td></tr>' % encoding_link(m))
			emit(table_close)
		emit('<h2>By IANA Charset</h2>')
		emit(table_open)
		for cs in sorted(by_charset, key=nat_key):
			emit('<tr><td class="charset">%s</td><td>%s</td></tr>' % (cs, encoding_link(by_charset[cs])))
		emit(table_close)
		for heading, index, row in numeric_sections:
			emit(heading)
			emit(table_open)
			for number in sorted(index):
				emit(row % (number, encoding_link(index[number])))
			emit(table_close)
		emit('<!--#include virtual="/static/tail.html"-->')

	path = charset_path('out', 'encoding.php')
	print('Writing encoding redirect: %s' % path)
	# One PHP switch case per known key, redirecting to the encoding's page.
	case_line = '\t\tcase \'%s\': header(\'Location: /charset/encoding/%s\'); exit(0);'
	with open(path, 'w') as f:
		def emit(line):
			print(line, file=f)

		emit('<?php')
		emit('if (isset($_GET[\'file\'])) {')
		emit('\t$file = $_GET[\'file\'];')
		emit('\tswitch ($file) {')
		for k in sorted(by_kte):
			emit(case_line % (k, by_kte[k]['name']))
		emit('\t}')
		emit('}')
		emit('if (isset($_GET[\'name\'])) {')
		emit('\t$name = preg_replace(\'/[^A-Za-z0-9]+/\', \'\', $_GET[\'name\']);')
		emit('\tswitch ($name) {')
		for k in sorted(by_name):
			emit(case_line % (k, by_name[k]['name']))
		emit('\t}')
		emit('}')
		emit('header(\'Location: /charset/encoding/\');')