Example #1
def main():
    strip_atlines = False
    strip_comments = False
    prefix = None
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        i += 1
        if arg == '--help':
            print('usage: expand.py [<options>] <filename> [...]')
            print('  -a        strip metadata')
            print('  +a        keep metadata (default)')
            print('  -c        strip comments')
            print('  +c        keep comments (default)')
            print('  -d path   change directory before processing files')
        elif arg == '-a':
            strip_atlines = True
        elif arg == '+a':
            strip_atlines = False
        elif arg == '-c':
            strip_comments = True
        elif arg == '+c':
            strip_comments = False
        elif arg == '-d' and i < len(sys.argv):
            prefix = sys.argv[i]
            i += 1
        elif prefix is None:
            for line in expand(arg, strip_atlines, strip_comments):
                print(line)
        else:
            with cd(prefix):
                for line in expand(arg, strip_atlines, strip_comments):
                    print(line)
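
main() and the functions below rely on helpers from the surrounding project (expand, cd, ls, charset_path, strip_comment, and friends) that are not shown in these excerpts. As a point of reference only, a minimal sketch of what the cd() context manager used above could look like, assuming it merely switches the working directory and restores it afterwards:

import contextlib
import os

@contextlib.contextmanager
def cd(path):
    # Hypothetical stand-in for the project's cd() helper: change into `path`
    # for the duration of the with-block, then restore the old directory.
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)
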
def get_puadata():
	with cd(charset_path('puadata')):
		for path in ls('.'):
			if os.path.basename(path) == 'sources.txt':
				meta = {}
				chars = {}
				blocks = []
				for line in expand(path):
					if line:
						fields = strip_comment(line).split(':', 2)
						if len(fields) == 2:
							meta[fields[0].strip()] = fields[1].strip()
					else:
						break
				for line in expand(os.path.join(os.path.dirname(path), 'unicodedata.txt')):
					fields = line.split(';')
					try:
						cp = int(fields[0], 16)
						chars[cp] = fields
					except ValueError:
						continue
				for line in expand(os.path.join(os.path.dirname(path), 'blocks.txt')):
					fields = line.split(';')
					if len(fields) == 2:
						blockname = fields[1].strip()
						fields = fields[0].split('..')
						if len(fields) == 2:
							try:
								start = int(fields[0], 16)
								stop = int(fields[1], 16)
								blocks.append((start, stop, blockname))
							except ValueError:
								continue
				blocks.sort()
				yield meta, chars, blocks
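
get_puadata() is a generator: for every sources.txt it finds, it yields the key/value metadata, the per-code-point fields keyed by integer code point, and the sorted (start, stop, name) block ranges. A hedged consumer sketch; the metadata keys are not shown in the source, so the lookup below is only illustrative:

for meta, chars, blocks in get_puadata():
    # `meta` keys depend on the sources.txt being read; 'Source' is a guess.
    print('source: %s' % meta.get('Source', '<unknown>'))
    print('%d code points in %d blocks' % (len(chars), len(blocks)))
    for start, stop, name in blocks:
        print('U+%04X..U+%04X  %s' % (start, stop, name))
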
def get_assertions():
    assertions = {}
    with cd(charset_path('identifiers')):
        for path in ls('.'):
            headers = []
            dotdotdot = False
            for line in expand(path):
                if is_atline(line):
                    headers = []
                    dotdotdot = False
                    for field in strip_comment(line).split():
                        if field == '...':
                            dotdotdot = True
                            break
                        elif field[0] == '@':
                            headers.append(field[1:].lower())
                        else:
                            headers.append(field.lower())
                else:
                    aa = []
                    fields = strip_comment(line).split()
                    for i in range(0, len(fields)):
                        if dotdotdot or i < len(headers):
                            if fields[i] != '--':
                                aa.append(
                                    (headers[i if i < len(headers) else -1],
                                     fields[i].lower()))
                    for a in aa:
                        if a not in assertions:
                            assertions[a] = []
                        assertions[a].extend(aa)
    return assertions
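
Every data line in an identifiers file contributes its full set of (header, value) pairs to each pair on that line, so assertions[a] lists everything that must co-occur with a. A small, purely illustrative sketch of inspecting the resulting table:

assertions = get_assertions()
for (header, value), related in sorted(assertions.items()):
    others = sorted(set(related) - {(header, value)})
    print('%s=%s co-occurs with %s' % (header, value, others))
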
Example #4
def get_puadata():
	with cd(charset_path('puadata')):
		for path in ls('.'):
			if os.path.basename(path) == 'sources.txt':
				print('Reading Private Use Area data: %s' % path)
				meta = {}
				chars = {}
				for line in expand(path):
					if line:
						fields = strip_comment(line).split(':', 2)
						if len(fields) == 2:
							meta[fields[0].strip()] = fields[1].strip()
					else:
						break
				for line in expand(os.path.join(os.path.dirname(path), 'unicodedata.txt')):
					fields = line.split(';')
					try:
						cp = int(fields[0], 16)
						chars[cp] = fields
					except ValueError:
						continue
				yield meta, chars
def verify(path, assertions):
    atlines = []
    for line in expand(path):
        k, v = split_atline(line)
        if k is not None and v is not None:
            atlines.append((k.lower(), v.lower()))
    if len(atlines) == 0:
        return None
    else:
        errors = []
        for a in atlines:
            if a in assertions:
                for b in assertions[a]:
                    if b not in atlines:
                        errors.append((a[0], a[1], b[0], b[1]))
        return errors
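
verify() returns None for a file with no @-lines, otherwise a list of (key, value, expected_key, expected_value) tuples for every assertion the file violates. A hedged driver tying it to get_assertions(); the 'encodings' directory name is an assumption, not taken from the source:

assertions = get_assertions()
with cd(charset_path('encodings')):  # directory name is an assumption
    for path in ls('.'):
        errors = verify(path, assertions)
        if errors:
            for k, v, ek, ev in errors:
                print('%s: @%s %s also expects @%s %s' % (path, k, v, ek, ev))
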
Example #6
def generate(inpath, outpath):
	txt = None
	lines = []
	for line in expand(inpath):
		t = __generate_matcher.match(line)
		if t is not None:
			txt = t.group(1)
		elif not is_atline(line):
			lines.append(line)
	if txt is None:
		return None
	else:
		path = os.path.join(outpath, txt)
		parent = os.path.dirname(path)
		if not os.path.exists(parent):
			os.makedirs(parent)
		with open(path, 'w') as f:
			for line in lines:
				print(line, file=f)
		return path
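
generate() copies the non-@ lines of the expanded input into a file whose relative path comes from the first line matched by __generate_matcher, creating parent directories as needed, and returns the written path (or None when no such line is found). A hedged batch-usage sketch; the 'src' and 'out' directory names are assumptions:

for inpath in ls('src'):       # 'src' is an assumed input directory
    written = generate(inpath, 'out')
    if written is not None:
        print('wrote %s' % written)
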
Example #7
def read_encoding(path):
	meta = {}
	map = {}
	print('Reading encoding data: %s' % path)
	for line in expand(path):
		k, v = split_atline(line)
		if k is not None and v is not None:
			if k in meta:
				meta[k].append(v)
			else:
				meta[k] = [v]
			continue
		b, c, ba, ca = split_mapline(line)
		if b is not None and c is not None:
			if not ba or tuple(b) not in map:
				map[tuple(b)] = tuple(c)
			continue

	if 'category' in meta and len(meta['category']) > 0:
		category = meta['category'][0]
	else:
		category = 'Unsorted'
	meta['category'] = category

	display = []
	name = []
	if 'display' in meta:
		display += meta['display']
	if 'name' in meta:
		display += meta['name']
		name += meta['name']
	if 'display' in meta:
		name += meta['display']
	if 'alias' in meta:
		display += meta['alias']
		name += meta['alias']
	if 'charset' in meta:
		display += meta['charset']
		name += meta['charset']
	if len(display) > 0:
		display_other = display[1:]
		display = display[0]
	else:
		display_other = None
		display = None
	if len(name) > 0:
		name_other = [re.sub('[^A-Za-z0-9]+', '', n) for n in name[1:]]
		name = re.sub('[^A-Za-z0-9]+', '', name[0])
	else:
		name_other = None
		name = None
	meta['display'] = display
	meta['display_other'] = display_other
	meta['name'] = name
	meta['name_other'] = name_other

	def tree_insert(tree, k, v):
		if len(k) == 1:
			tree['leaf'][k[0]] = v
		if len(k) > 1:
			if k[0] not in tree['branch']:
				tree['branch'][k[0]] = {'leaf': {}, 'branch': {}}
			tree_insert(tree['branch'][k[0]], k[1:], v)

	root = {'leaf': {}, 'branch': {}}
	for k in map:
		tree_insert(root, k, map[k])
	return meta, root
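
read_encoding() returns the metadata plus a byte-sequence trie: each node keeps single-byte completions in 'leaf' and longer-sequence continuations in 'branch'. A minimal sketch of a greedy decoder over that structure, assuming the input is a sequence of integer byte values; this helper is not part of the original code:

def decode_bytes(root, data):
    # Walk the {'leaf': ..., 'branch': ...} trie built by read_encoding(),
    # always taking the longest matching byte sequence and yielding its
    # tuple of code points; unmapped bytes fall back to U+FFFD.
    i = 0
    while i < len(data):
        node, match, length = root, None, 0
        for j in range(i, len(data)):
            b = data[j]
            if b in node['leaf']:
                match, length = node['leaf'][b], j - i + 1
            if b in node['branch']:
                node = node['branch'][b]
            else:
                break
        if match is None:
            yield (0xFFFD,)
            i += 1
        else:
            yield match
            i += length

With meta, root = read_encoding(path), list(decode_bytes(root, b'\x41\x42')) would yield the code-point tuples those bytes map to under whatever mapping the file defines.
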
Example #8
def verify(path):
    url = None
    map = {}
    for line in expand(path):
        u = __verify_matcher.match(line)
        if u is not None:
            url = u.group(1)
        else:
            b, c, ba, ca = split_mapline(line)
            if b is not None and c is not None:
                if not ba or tuple(b) not in map:
                    map[tuple(b)] = tuple(c)
    if url is None:
        return None
    else:
        expmap = {}
        with open(acquire(url, 'local'), 'r') as f:
            for line in f:
                # Hacks for reference encodings: JIS X 0208 has three columns.
                if url.endswith('/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'):
                    m = __jis_x_0208_matcher.match(line)
                    if m is not None:
                        line = line[:m.start(1)] + m.group(2) + '\t' + m.group(
                            3) + line[m.end(3):]
                # Hacks for reference encodings: Adobe reverses bytes and chars.
                if '/MAPPINGS/VENDORS/ADOBE/' in url:
                    m = __adobe_line_matcher.match(line)
                    if m is not None:
                        if (int(m.group(2), 16), ) in expmap:
                            continue
                        elif url.endswith('/symbol.txt') and m.group(
                                1) == '00B5' and m.group(2) == '6D':
                            continue
                        else:
                            line = line[:m.start(1)] + '0x' + m.group(
                                2) + '\t0x' + m.group(1) + line[m.end(2):]
                # Hacks for reference encodings: Bytes delimited by +0x.
                m = __delimited_byte_matcher.match(line)
                if m is not None:
                    line = line[:m.start(1)] + __delimited_byte_sub.sub(
                        '', m.group(1)) + line[m.end(1):]
                # Hacks for reference encodings: Odd number of hex digits in bytes.
                line = __odd_hex_digits_sub.sub('\\g<1>0\\g<2>', line)
                # Hacks for reference encodings: Apple's <LR> and <RL> markup.
                line = __apple_lr_sub.sub('\\g<1>0x202D\\g<2>+0x202C', line)
                line = __apple_rl_sub.sub('\\g<1>0x202E\\g<2>+0x202C', line)
                # End hacks.
                b, c, ba, ca = split_mapline(line)
                if b is not None and c is not None:
                    expmap[tuple(b)] = tuple(c)
        if not url.endswith('/MAPPINGS/OBSOLETE/EASTASIA/OTHER/CNS11643.TXT'):
            # Hacks for reference encodings: No ASCII characters.
            if all((x, ) not in expmap and (x, x) not in expmap
                   for x in range(0, 128)):
                for x in range(0, 128):
                    expmap[(x, )] = (x, )
            # Hacks for reference encodings: No C0 control characters.
            if all((x, ) not in expmap and (x, x) not in expmap
                   for x in list(range(0, 32)) + [127]):
                for x in list(range(0, 32)) + [127]:
                    expmap[(x, )] = (x, )
            # Hacks for reference encodings: No C1 control characters.
            if '/MAPPINGS/VENDORS/ADOBE/' in url or '/MAPPINGS/OBSOLETE/EASTASIA/' in url:
                if any((x, ) in expmap
                       for x in range(160, 256)) or url.endswith('/BIG5.TXT'):
                    if all((x, ) not in expmap and (x, x) not in expmap
                           for x in range(128, 160)):
                        for x in range(128, 160):
                            expmap[(x, )] = (x, )
        # Hacks for reference encodings: Undefined characters mapped to U+FFFD.
        for k, v in list(expmap.items()):
            if v == (0xFFFD, ):
                del expmap[k]
        # End hacks.
        return set(map.items()) ^ set(expmap.items())
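
This verify() returns None when the file names no reference URL, otherwise the symmetric difference between the file's mapping entries and the reference's, so an empty set means the two agree. A hedged driver sketch; the 'encodings' directory name is again an assumption:

with cd(charset_path('encodings')):  # directory name is an assumption
    for path in ls('.'):
        diff = verify(path)
        if diff is None:
            print('%s: no reference URL' % path)
        elif diff:
            print('%s: %d entries differ from the reference' % (path, len(diff)))
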