def main():
    """Command-line entry point: expand each named file to standard output.

    Arguments are processed strictly left to right, so an option only
    affects the filenames that come after it:

    * ``--help`` prints the usage text (processing then continues).
    * ``-a`` / ``+a`` turn stripping of metadata lines on / off.
    * ``-c`` / ``+c`` turn stripping of comments on / off.
    * ``-d path`` makes later files be expanded from inside ``path``.

    Anything else is treated as a filename and passed to ``expand``; every
    line it produces is printed.  A ``-d`` that is the very last argument
    has no path to consume and is therefore treated as a filename too.
    """
    usage = (
        'usage: expand.py [<options>] <filename> [...]',
        ' -a strip metadata',
        ' +a keep metadata (default)',
        ' -c strip comments',
        ' +c keep comments (default)',
        ' -d path change directory before processing files',
    )
    # Command-line switch -> (setting it controls, value it assigns).
    switches = {
        '-a': ('atlines', True),
        '+a': ('atlines', False),
        '-c': ('comments', True),
        '+c': ('comments', False),
    }
    strip = {'atlines': False, 'comments': False}
    prefix = None

    def emit(filename):
        # Print the expansion of one file using the settings in force now.
        for line in expand(filename, strip['atlines'], strip['comments']):
            print(line)

    remaining = iter(sys.argv[1:])
    for arg in remaining:
        if arg == '--help':
            for text in usage:
                print(text)
            continue
        if arg in switches:
            setting, value = switches[arg]
            strip[setting] = value
            continue
        if arg == '-d':
            directory = next(remaining, None)
            if directory is not None:
                prefix = directory
                continue
            # No path follows: fall through and handle '-d' as a filename.
        if prefix is None:
            emit(arg)
        else:
            with cd(prefix):
                emit(arg)
def get_puadata():
    """Yield ``(meta, chars, blocks)`` for each ``sources.txt`` in the PUA data.

    Works from inside ``charset_path('puadata')`` and visits every path
    returned by ``ls('.')`` whose base name is ``sources.txt``.  For each one:

    * ``meta`` maps stripped keys to stripped values, read from the
      ``key: value`` lines at the top of ``sources.txt``; reading stops at
      the first empty line.
    * ``chars`` maps a code point (int) to the full list of ``;``-separated
      fields of its line in the sibling ``unicodedata.txt``.  Lines whose
      first field is not hexadecimal are skipped.
    * ``blocks`` is a sorted list of ``(start, stop, name)`` tuples read from
      ``start..stop; name`` lines in the sibling ``blocks.txt``.  Malformed
      lines are skipped.
    """
    with cd(charset_path('puadata')):
        for path in ls('.'):
            if os.path.basename(path) != 'sources.txt':
                continue
            folder = os.path.dirname(path)

            meta = {}
            for line in expand(path):
                if not line:
                    # An empty line ends the header section.
                    break
                # NOTE(review): with maxsplit=2 a line containing two or more
                # colons yields three parts and is ignored -- confirm that
                # this is intended.
                parts = strip_comment(line).split(':', 2)
                if len(parts) == 2:
                    key, value = parts
                    meta[key.strip()] = value.strip()

            chars = {}
            for line in expand(os.path.join(folder, 'unicodedata.txt')):
                record = line.split(';')
                try:
                    codepoint = int(record[0], 16)
                except ValueError:
                    continue
                chars[codepoint] = record

            blocks = []
            for line in expand(os.path.join(folder, 'blocks.txt')):
                record = line.split(';')
                if len(record) != 2:
                    continue
                bounds = record[0].split('..')
                if len(bounds) != 2:
                    continue
                try:
                    first = int(bounds[0], 16)
                    last = int(bounds[1], 16)
                except ValueError:
                    continue
                blocks.append((first, last, record[1].strip()))
            blocks.sort()

            yield meta, chars, blocks
def get_assertions():
    """Build the table of identifier assertions.

    Every file listed by ``ls('.')`` inside ``charset_path('identifiers')``
    is read as a sequence of header lines and data lines:

    * A line accepted by ``is_atline`` is a header line.  Its
      whitespace-separated fields (a leading ``@`` removed, lower-cased)
      become the current column headers.  A ``...`` field ends the header
      list and marks it as open-ended.
    * Any other line is a data line.  Each field is paired with the header
      of its column; with an open-ended header list, surplus fields are
      paired with the last header, otherwise they are dropped.  Fields
      equal to ``--`` are skipped.

    Returns a dict mapping each ``(header, value)`` pair to a list of all
    pairs that occurred on the same data line as it (itself included,
    accumulated across lines and files).

    Raises ``IndexError`` if a data line follows an open-ended header line
    that named no headers, because there is no last header to fall back on.
    """
    assertions = {}
    with cd(charset_path('identifiers')):
        for path in ls('.'):
            headers = []
            open_ended = False
            for line in expand(path):
                if is_atline(line):
                    headers = []
                    open_ended = False
                    for token in strip_comment(line).split():
                        if token == '...':
                            open_ended = True
                            break
                        bare = token[1:] if token[0] == '@' else token
                        headers.append(bare.lower())
                    continue

                group = []
                for column, token in enumerate(strip_comment(line).split()):
                    has_header = column < len(headers)
                    if not (open_ended or has_header) or token == '--':
                        continue
                    header = headers[column] if has_header else headers[-1]
                    group.append((header, token.lower()))
                for pair in group:
                    assertions.setdefault(pair, []).extend(group)
    return assertions
def get_puadata():
    """Yield ``(meta, chars)`` for each ``sources.txt`` in the PUA data.

    Works from inside ``charset_path('puadata')`` and visits every path
    returned by ``ls('.')`` whose base name is ``sources.txt``, printing a
    progress message for each.

    * ``meta`` maps stripped keys to stripped values, read from the
      ``key: value`` lines at the top of ``sources.txt``; reading stops at
      the first empty line.
    * ``chars`` maps a code point (int) to the full list of ``;``-separated
      fields of its line in the sibling ``unicodedata.txt``.  Lines whose
      first field is not hexadecimal are skipped.
    """
    with cd(charset_path('puadata')):
        for path in ls('.'):
            if os.path.basename(path) != 'sources.txt':
                continue
            print('Reading Private Use Area data: %s' % path)

            meta = {}
            for line in expand(path):
                if not line:
                    # An empty line ends the header section.
                    break
                # NOTE(review): with maxsplit=2 a line containing two or more
                # colons yields three parts and is ignored -- confirm that
                # this is intended.
                pieces = strip_comment(line).split(':', 2)
                if len(pieces) == 2:
                    meta[pieces[0].strip()] = pieces[1].strip()

            chars = {}
            data_path = os.path.join(os.path.dirname(path), 'unicodedata.txt')
            for line in expand(data_path):
                record = line.split(';')
                try:
                    codepoint = int(record[0], 16)
                except ValueError:
                    continue
                chars[codepoint] = record

            yield meta, chars
def verify(path, assertions):
    """Check the metadata lines of one file against the assertion table.

    Every line of ``expand(path)`` for which ``split_atline`` returns both a
    key and a value contributes a lower-cased ``(key, value)`` pair.

    Returns ``None`` when the file has no such pairs.  Otherwise returns a
    list of ``(key, value, expected_key, expected_value)`` tuples: one for
    each pair that ``assertions`` associates with a pair present in the file
    but that is itself absent from the file.  An empty list means every
    assertion held.
    """
    found = []
    for line in expand(path):
        key, value = split_atline(line)
        if key is None or value is None:
            continue
        found.append((key.lower(), value.lower()))

    if not found:
        return None

    return [
        (present[0], present[1], implied[0], implied[1])
        for present in found
        if present in assertions
        for implied in assertions[present]
        if implied not in found
    ]
def generate(inpath, outpath):
    """Write the expanded body of ``inpath`` to a file below ``outpath``.

    Each line of ``expand(inpath)`` is offered to ``__generate_matcher``.
    A non-``None`` match result names the output file, relative to
    ``outpath``; the last such line wins.  (The result is passed straight to
    ``os.path.join``, so the matcher presumably returns a path string --
    confirm against its definition.)  Lines that neither match nor satisfy
    ``is_atline`` make up the output body.

    Returns the path of the file written, or ``None`` (writing nothing) when
    no line named an output file.  Missing parent directories are created.
    """
    txt = None
    lines = []
    for line in expand(inpath):
        t = __generate_matcher.match(line)
        if t is not None:
            txt = t
        elif not is_atline(line):
            lines.append(line)
    if txt is None:
        return None

    path = os.path.join(outpath, txt)
    parent = os.path.dirname(path)
    # ``parent`` is '' when the output lands in the current directory;
    # os.makedirs('') would raise, so only create a directory that is named.
    if parent and not os.path.exists(parent):
        try:
            os.makedirs(parent)
        except OSError:
            # Another process may have created it between the check and the
            # call; only a directory that is still missing is an error.
            if not os.path.isdir(parent):
                raise
    with open(path, 'w') as f:
        for line in lines:
            print(line, file=f)
    return path
def read_encoding(path):
    """Read one encoding definition and return ``(meta, root)``.

    Prints a progress message, then classifies every line of
    ``expand(path)``:

    * If ``split_atline`` yields a key and a value, the value is appended to
      the list stored under that key in ``meta``.
    * Otherwise, if ``split_mapline`` yields both a byte sequence and a
      character sequence, the mapping is recorded.  When the third value
      returned by ``split_mapline`` is truthy, an entry already recorded for
      the same bytes is kept rather than replaced.

    ``meta`` is then normalised:

    * ``category``: the first category given, or ``'Unsorted'``.
    * ``display`` / ``display_other``: first and remaining entries of the
      display, name, alias and charset values (in that order), or ``None``
      for both when there are none.
    * ``name`` / ``name_other``: the same for the name, display, alias and
      charset values (in that order), each with every run of
      non-alphanumeric ASCII characters removed.

    ``root`` is a trie over the byte sequences.  Every node is a dict
    ``{'leaf': {...}, 'branch': {...}}``: ``leaf`` maps a final byte to its
    character tuple, ``branch`` maps a leading byte to the child node.
    """
    print('Reading encoding data: %s' % path)

    meta = {}
    mapping = {}
    for line in expand(path):
        key, value = split_atline(line)
        if key is not None and value is not None:
            meta.setdefault(key, []).append(value)
            continue
        b, c, ba, ca = split_mapline(line)
        if b is None or c is None:
            continue
        source = tuple(b)
        if not ba or source not in mapping:
            mapping[source] = tuple(c)

    categories = meta.get('category')
    meta['category'] = categories[0] if categories else 'Unsorted'

    def listed(field):
        # All values given for a metadata key, or nothing if it never occurred.
        return meta.get(field, [])

    def squeeze(text):
        # Drop everything except ASCII letters and digits.
        return re.sub('[^A-Za-z0-9]+', '', text)

    # Both candidate lists are built before any of their source keys is
    # overwritten below.
    displays = (listed('display') + listed('name')
                + listed('alias') + listed('charset'))
    names = (listed('name') + listed('display')
             + listed('alias') + listed('charset'))

    if displays:
        meta['display'] = displays[0]
        meta['display_other'] = displays[1:]
    else:
        meta['display'] = None
        meta['display_other'] = None

    if names:
        meta['name'] = squeeze(names[0])
        meta['name_other'] = [squeeze(n) for n in names[1:]]
    else:
        meta['name'] = None
        meta['name_other'] = None

    root = {'leaf': {}, 'branch': {}}
    for source, target in mapping.items():
        if not source:
            # An empty byte sequence has nowhere to live in the trie.
            continue
        node = root
        for byte in source[:-1]:
            node = node['branch'].setdefault(byte, {'leaf': {}, 'branch': {}})
        node['leaf'][source[-1]] = target
    return meta, root
def verify(path):
    """Compare a local mapping table with the reference table it cites.

    Each line of ``expand(path)`` is offered to ``__verify_matcher``; a
    non-``None`` result is taken as the URL of the reference table (the last
    one wins).  The result is used as a string below (``endswith``, ``in``),
    so the matcher presumably returns the URL text -- confirm against its
    definition.  Every other line is parsed with ``split_mapline`` into the
    local map; when the third value it returns is truthy, an entry already
    present for the same bytes is kept.

    The reference file is obtained with ``acquire(url, 'local')`` and parsed
    line by line after a series of source-specific rewrites ("hacks") that
    normalise it into a form ``split_mapline`` accepts.  Ranges that some
    references omit (ASCII, C0 controls, C1 controls) are then filled in
    with identity mappings, and mappings to U+FFFD are discarded.

    Returns ``None`` if the file cites no reference URL; otherwise the set
    of ``(bytes, chars)`` items present in exactly one of the two maps (the
    symmetric difference), which is empty when they agree.
    """
    url = None
    actmap = {}
    for line in expand(path):
        u = __verify_matcher.match(line)
        if u is not None:
            url = u
        else:
            b, c, ba, ca = split_mapline(line)
            if b is not None and c is not None:
                if not ba or tuple(b) not in actmap:
                    actmap[tuple(b)] = tuple(c)
    if url is None:
        return None

    expmap = {}
    with open(acquire(url, 'local'), 'r') as f:
        for line in f:
            # Hacks for reference encodings: JIS X 0208 has three columns.
            if url.endswith('/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'):
                m = __jis_x_0208_matcher.match(line)
                if m is not None:
                    # Keep only groups 2 and 3, separated by a tab.
                    line = (line[:m.start(1)] + m.group(2) + '\t'
                            + m.group(3) + line[m.end(3):])
            # Hacks for reference encodings: Adobe reverses bytes and chars.
            if '/MAPPINGS/VENDORS/ADOBE/' in url:
                m = __adobe_line_matcher.match(line)
                if m is not None:
                    if (int(m.group(2), 16), ) in expmap:
                        # The first mapping seen for a byte wins.
                        continue
                    elif (url.endswith('/symbol.txt')
                          and m.group(1) == '00B5' and m.group(2) == '6D'):
                        # This one entry of symbol.txt is ignored.
                        continue
                    else:
                        # Swap the two columns and add the 0x prefixes.
                        line = (line[:m.start(1)] + '0x' + m.group(2)
                                + '\t0x' + m.group(1) + line[m.end(2):])
            # Hacks for reference encodings: Bytes delimited by +0x.
            m = __delimited_byte_matcher.match(line)
            if m is not None:
                line = (line[:m.start(1)]
                        + __delimited_byte_sub.sub('', m.group(1))
                        + line[m.end(1):])
            # Hacks for reference encodings: Odd number of hex digits in bytes.
            line = __odd_hex_digits_sub.sub('\\g<1>0\\g<2>', line)
            # Hacks for reference encodings: Apple's <LR> and <RL> markup.
            line = __apple_lr_sub.sub('\\g<1>0x202D\\g<2>+0x202C', line)
            line = __apple_rl_sub.sub('\\g<1>0x202E\\g<2>+0x202C', line)
            # End hacks.
            b, c, ba, ca = split_mapline(line)
            if b is not None and c is not None:
                expmap[tuple(b)] = tuple(c)

    def unmapped(codes):
        # True when none of the codes occurs as a one-byte key or as a
        # doubled two-byte key in the reference map.
        return all((x, ) not in expmap and (x, x) not in expmap
                   for x in codes)

    def map_to_self(codes):
        # Add an identity mapping for every code.
        for x in codes:
            expmap[(x, )] = (x, )

    if not url.endswith('/MAPPINGS/OBSOLETE/EASTASIA/OTHER/CNS11643.TXT'):
        # Hacks for reference encodings: No ASCII characters.
        ascii_codes = range(0, 128)
        if unmapped(ascii_codes):
            map_to_self(ascii_codes)
        # Hacks for reference encodings: No C0 control characters.
        # A list is built explicitly: on Python 3 ``range(...) + [127]``
        # raises TypeError because range is no longer a list.
        c0_codes = list(range(0, 32)) + [127]
        if unmapped(c0_codes):
            map_to_self(c0_codes)
        # Hacks for reference encodings: No C1 control characters.
        if ('/MAPPINGS/VENDORS/ADOBE/' in url
                or '/MAPPINGS/OBSOLETE/EASTASIA/' in url):
            if (any((x, ) in expmap for x in range(160, 256))
                    or url.endswith('/BIG5.TXT')):
                c1_codes = range(128, 160)
                if unmapped(c1_codes):
                    map_to_self(c1_codes)
    # Hacks for reference encodings: Undefined characters mapped to U+FFFD.
    # The keys are collected first: deleting from a dict while iterating
    # over it raises RuntimeError on Python 3.
    undefined = [k for k, v in expmap.items() if v == (0xFFFD, )]
    for k in undefined:
        del expmap[k]
    # End hacks.
    return set(actmap.items()) ^ set(expmap.items())