def generate_match_table3(ranges):
    """Current match table format.

    Yet another attempt, similar to generate_match_table2 except in
    packing format.  Total match size (at time of writing): 1194 bytes.
    This is the current encoding format used in duk_lexer.c.

    'ranges' is an iterable of (start, end) pairs.  For each pair two
    values are emitted: the distance from the previous range end to
    'start' (expected to be 1 or above, i.e. no unjoined ranges), and
    'end - start' (0 or above).  A final zero value is the end marker.

    Each value is written with a variable-length encoding:

        0x0000 ... 0x000e       4 bits:  value
        0x000f ... 0x010c      12 bits:  0x0f, 8-bit (value - 0x0f)
        0x010d ... 0x110c      24 bits:  0x0f, 0xfe, 12-bit (value - 0x10d)
        0x110d ... 0x10ffff    36 bits:  0x0f, 0xff, 24-bit (value - 0x110d)

    Returns (data, freq): 'data' is the result of the bit encoder's
    getBytes(), 'freq' is an informative list indexed by value which
    counts how many times each value was encoded.

    Raises Exception if a value to encode is negative or above 0x10ffff.
    """

    max_value = 0x10ffff

    be = dukutil.BitEncoder()
    freq = [0] * (max_value + 1)  # informative

    def enc(x):
        # Validate before touching 'freq': a negative value would otherwise
        # silently update a slot counted from the end of the list.
        if x < 0 or x > max_value:
            raise Exception('cannot encode: %r' % (x,))
        freq[x] += 1

        if x <= 0x0e:
            # 4-bit encoding
            be.bits(x, 4)
            return

        x -= 0x0e + 1
        if x <= 0xfd:
            # 12-bit encoding
            be.bits(0x0f, 4)
            be.bits(x, 8)
            return

        x -= 0xfd + 1
        if x <= 0xfff:
            # 24-bit encoding
            be.bits(0x0f, 4)
            be.bits(0xfe, 8)
            be.bits(x, 12)
            return

        # 36-bit encoding; the range check above guarantees that the
        # remainder fits into 24 bits.
        x -= 0xfff + 1
        be.bits(0x0f, 4)
        be.bits(0xff, 8)
        be.bits(x, 24)

    prev_re = 0
    for rs, re in ranges:
        enc(rs - prev_re)  # 1 or above (no unjoined ranges)
        enc(re - rs)       # 0 or above
        prev_re = re

    enc(0)  # end marker

    return be.getBytes(), freq
def gen_strings_data_bitpacked(strlist):
    """Pack the strings of 'strlist' into a bitstream of 5-bit symbols.

    'strlist' is a sequence of two-element entries; only the first
    element (the string) is encoded, the second one is not used here.

    Every string is written as a 5-bit length followed by its characters.
    A string begins in lowercase mode and recognizes the following 5-bit
    symbols:

        0-25    'a' ... 'z'
        26      '_'
        27      0x00 (actually decoded to 0xff, internal marker)
        28      reserved
        29      switch to uppercase for one character
                (next 5-bit symbol must be in range 0-25)
        30      switch to uppercase
        31      read a 7-bit character verbatim

    Uppercase mode is the same except codes 29 and 30 switch to
    lowercase.

    Returns (res, maxlen): the encoder's byte string and the length of
    the longest input string.  A one-line summary is printed as well.
    """

    UNDERSCORE = 26
    ZERO = 27
    SWITCH1 = 29
    SWITCH = 30
    SEVENBIT = 31

    LOWER = 'lowercase'
    UPPER = 'uppercase'

    def classify(ch):
        # Return (mode, symbol) for an ASCII letter, (None, None) otherwise.
        code = ord(ch)
        if ord('a') <= code <= ord('z'):
            return LOWER, code - ord('a')
        if ord('A') <= code <= ord('Z'):
            return UPPER, code - ord('A')
        return None, None

    be = dukutil.BitEncoder()
    maxlen = 0
    stats = {'optimal': 0, 'switch1': 0, 'switch': 0, 'sevenbit': 0}

    for s, _ignored in strlist:
        strlen = len(s)
        be.bits(strlen, 5)
        maxlen = max(maxlen, strlen)

        # 5-bit characters, mode specific.  This encoder is not that
        # optimal, but good enough for now.
        mode = LOWER
        for idx, c in enumerate(s):
            char_mode, symbol = classify(c)
            if idx + 1 < strlen:
                next_mode = classify(s[idx + 1])[0]
            else:
                next_mode = None

            if c == '_':
                be.bits(UNDERSCORE, 5)
                stats['optimal'] += 1
            elif c == '\x00':
                be.bits(ZERO, 5)
                stats['optimal'] += 1
            elif char_mode is None:
                # Not a letter: emit verbatim as a 7-bit character.
                assert (ord(c) >= 0 and ord(c) <= 127)
                be.bits(SEVENBIT, 5)
                be.bits(ord(c), 7)
                stats['sevenbit'] += 1
            elif char_mode == mode:
                be.bits(symbol, 5)
                stats['optimal'] += 1
            elif next_mode == char_mode:
                # The next character has the same case too, so a
                # permanent mode switch is used.
                be.bits(SWITCH, 5)
                be.bits(symbol, 5)
                mode = char_mode
                stats['switch'] += 1
            else:
                # Switch mode for this one character only.
                be.bits(SWITCH1, 5)
                be.bits(symbol, 5)
                stats['switch1'] += 1

    # end marker not necessary, C code knows length from define

    res = be.getByteString()

    print('%d strings, %d bytes of string init data, %d maximum string length, '
          'encoding: optimal=%d,switch1=%d,switch=%d,sevenbit=%d' %
          (len(strlist), len(res), maxlen,
           stats['optimal'], stats['switch1'], stats['switch'],
           stats['sevenbit']))

    return res, maxlen
def generate_tables(convmap):
    """Generate bit-packed case conversion table for a given conversion map.

    The bitstream encoding is based on manual inspection for whatever
    regularity the Unicode case conversion rules have.

    'convmap' is a full description of case conversions which does not
    cover all codepoints; unmapped codepoints convert to themselves.
    The map is scanned for range-to-range mappings with skips 1...6, and
    the remaining conversions (1:1 and 1:n) are then output on a per
    codepoint basis.  Entries are deleted from 'convmap' as they are
    consumed, so the caller's map is modified.

    This is very slow because scanning always starts from scratch, but
    it is the most reliable and simple way to scan.

    Bitstream layout:

        for each skip 1...6:
            6-bit number of ranges with that skip, then for each range:
            16-bit start_i, 16-bit start_o, 7-bit count
        6-bit 0x3f (maximum count value = end of skips)
        7-bit number of 1:1 mappings, then for each mapping:
            16-bit input codepoint, 16-bit output codepoint
        7-bit number of 1:n mappings, then for each mapping:
            16-bit input codepoint, 2-bit output length,
            16 bits for each output character

    Returns (bytes, nbits) as given by the bit encoder's getBytes() and
    getNumBits().  Progress and statistics are printed along the way.
    """

    ranges = []   # range mappings: [start_i, start_o, count, skip]
    singles = []  # 1:1 character mappings: [codepoint, codepoint]
    multis = []   # 1:n character mappings: [codepoint, string]

    # Ranges with skips; skips 1...6 are useful.
    # NOTE(review): the loop relies on find_first_range_with_skip()
    # removing the range it reports from convmap -- confirm against its
    # definition.
    for skip in range(1, 6 + 1):
        start_i, start_o, count = find_first_range_with_skip(convmap, skip)
        while start_i is not None:
            print('skip %d: %d %d %d' % (skip, start_i, start_o, count))
            ranges.append([start_i, start_o, count, skip])
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)

    # 1:1 conversions
    for cp in sorted(convmap.keys()):
        target = convmap[cp]
        if len(target) > 1:
            continue
        singles.append([cp, ord(target)])
        del convmap[cp]

    # There are many mappings to 2-char sequences with latter char being
    # U+0399, for example:
    #
    #   [8064L, u'\u1f08\u0399']
    #   [8065L, u'\u1f09\u0399']
    #   [8066L, u'\u1f0a\u0399']
    #   ...
    #
    # These could be handled as a special case, but we don't do that
    # right now.  An earlier sketch moved these mappings into a temporary
    # map and ran the skip=1 range search over it; it was left disabled
    # because the remaining mappings (12 of them) would still need to be
    # put back into convmap.

    # 1:n conversions
    for cp in sorted(convmap.keys()):
        multis.append([cp, convmap.pop(cp)])

    for entry in singles + multis:
        print(repr(entry))

    print('range mappings: %d' % len(ranges))
    print('single character mappings: %d' % len(singles))
    print('complex mappings (1:n): %d' % len(multis))
    print('remaining (should be zero): %d' % len(convmap))

    # XXX: opportunities for diff encoding skip=3 ranges?
    prev = None
    for cur in [r for r in ranges if r[3] == 3]:
        if prev is None:
            print('start: %d %d' % (cur[0], cur[1]))
        else:
            print('%d %d' % (cur[0] - prev[0], cur[1] - prev[1]))
        prev = cur

    # bit packed encoding
    be = dukutil.BitEncoder()

    for curr_skip in range(1, 7):  # 1...6
        selected = [r for r in ranges if r[3] == curr_skip]
        be.bits(len(selected), 6)
        print('encode: skip=%d, count=%d' % (curr_skip, len(selected)))

        for range_start_i, range_start_o, r_count, _skip in selected:
            be.bits(range_start_i, 16)
            be.bits(range_start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)  # maximum count value = end of skips

    be.bits(len(singles), 7)
    for cp_i, cp_o in singles:
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    be.bits(len(multis), 7)
    for cp_i, str_o in multis:
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for ch in str_o:
            be.bits(ord(ch), 16)

    return be.getBytes(), be.getNumBits()