Esempio n. 1
0
def main():
    """Write the case conversion tables as a C source file and a C header.

    Command line options:
      --unicode-data, --special-casing: values handed to UnicodeData() and
          SpecialCasing() respectively.
      --out-source: path of the C source file (array definitions).
      --out-header: path of the C header file (extern declarations).
      --table-name-lc, --table-name-uc: C identifiers of the generated
          arrays (defaults 'caseconv_lc' and 'caseconv_uc').
    """
    parser = optparse.OptionParser()
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc',
                      dest='table_name_lc',
                      default='caseconv_lc')
    parser.add_option('--table-name-uc',
                      dest='table_name_uc',
                      default='caseconv_uc')
    # Positional arguments are not used.
    opts, _ = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    # tc is only passed on to update_special_casings(); no table is
    # generated from it here.
    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

    # ascii is handled with 'fast path' so not needed here
    t = clonedict(uc)
    remove_ascii_part(t)
    uc_bytes, _ = generate_tables(t)  # second return value is unused

    t = clonedict(lc)
    remove_ascii_part(t)
    lc_bytes, _ = generate_tables(t)  # second return value is unused

    # Generate C source and header files
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_caseconv.py')
    genc.emitArray(uc_bytes,
                   opts.table_name_uc,
                   size=len(uc_bytes),
                   typename='duk_uint8_t',
                   intvalues=True,
                   const=True)
    genc.emitArray(lc_bytes,
                   opts.table_name_lc,
                   size=len(lc_bytes),
                   typename='duk_uint8_t',
                   intvalues=True,
                   const=True)
    # 'with' closes the file even if getString() or write() raises.
    with open(opts.out_source, 'wb') as f:
        f.write(genc.getString())

    genc = dukutil.GenerateC()
    genc.emitHeader('extract_caseconv.py')
    genc.emitLine('extern const duk_uint8_t %s[%d];' %
                  (opts.table_name_uc, len(uc_bytes)))
    genc.emitLine('extern const duk_uint8_t %s[%d];' %
                  (opts.table_name_lc, len(lc_bytes)))
    with open(opts.out_header, 'wb') as f:
        f.write(genc.getString())
Esempio n. 2
0
def main():
    """Build a character match table and write it out as C source/header.

    Command line options:
      --unicode-data: value passed to read_unicode_data() as its first
          argument.
      --special-casing: accepted but not used by this function.
      --include-categories: comma separated list; missing or empty means
          an empty include list.
      --exclude-categories: comma separated list, default 'NONE' (empty
          exclude list). The pseudocategories 'ASCII' and 'NONBMP' enable
          the value filters x <= 0x7f and x >= 0x10000 respectively.
      --out-source, --out-header, --out-png: optional output paths; each
          output is skipped when its option is not given.
      --table-name: C identifier of the generated array (default
          'match_table').

    Diagnostic output (raw result, ranges, match table and encoding
    frequencies) is printed to stdout.
    """
    parser = optparse.OptionParser()
    parser.add_option('--unicode-data', dest='unicode_data')      # UnicodeData.txt
    parser.add_option('--special-casing', dest='special_casing')  # SpecialCasing.txt
    parser.add_option('--include-categories', dest='include_categories')
    parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--out-png', dest='out_png')
    parser.add_option('--table-name', dest='table_name', default='match_table')
    # Positional arguments are not used.
    opts, _ = parser.parse_args()

    unidata = opts.unicode_data
    # optparse leaves an option without a default as None when it is not
    # given; treat that like an empty string instead of failing on
    # None.split().
    catsinc = []
    if opts.include_categories:
        catsinc = opts.include_categories.split(',')
    catsexc = []
    if opts.exclude_categories != 'NONE':
        catsexc = opts.exclude_categories.split(',')

    # Single-argument print(...) behaves the same as the print statement.
    print('CATSEXC: %s' % repr(catsexc))
    print('CATSINC: %s' % repr(catsinc))

    # pseudocategories
    filter_ascii = ('ASCII' in catsexc)
    filter_nonbmp = ('NONBMP' in catsexc)

    # Read raw result
    def filter1(x):
        # NOTE(review): x is presumably a code point supplied by
        # read_unicode_data() -- confirm against that function.
        if filter_ascii and x <= 0x7f:
            # exclude ascii
            return False
        if filter_nonbmp and x >= 0x10000:
            # exclude non-bmp
            return False
        return True

    res = read_unicode_data(unidata, catsinc, catsexc, filter1)

    # Raw output
    print('RAW OUTPUT:')
    print('===========')
    print('\n'.join(res))

    # Scan ranges
    print('')
    print('RANGES:')
    print('=======')
    ranges = scan_ranges(res)
    for i in ranges:
        if i[0] == i[1]:
            print('0x%04x' % i[0])
        else:
            print('0x%04x ... 0x%04x' % (i[0], i[1]))
    print('')
    print('%d ranges total' % len(ranges))

    # Generate match table
    print('')
    print('MATCH TABLE:')
    print('============')
    #matchtable1 = generate_match_table1(ranges)
    #matchtable2 = generate_match_table2(ranges)
    matchtable3, freq = generate_match_table3(ranges)
    print('match table: %s' % repr(matchtable3))
    print('match table length: %d bytes' % len(matchtable3))
    print('encoding freq:')
    # Only non-zero frequency entries are listed.
    for i, count in enumerate(freq):
        if count == 0:
            continue
        print('  %6d: %d' % (i, count))

    print('')
    print('MATCH C TABLE -> file %s' % repr(opts.out_header))

    # Create C source and header files
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitArray(matchtable3, opts.table_name, bytesize=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)
    if opts.out_source is not None:
        # 'with' closes the file even if getString() or write() raises.
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))
    if opts.out_header is not None:
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())

    # Image (for illustrative purposes only)
    if opts.out_png is not None:
        generate_png(res, opts.out_png)
Esempio n. 3
0
def main():
    """Write case conversion data as a C source file and a C header.

    Command line options:
      --command: 'caseconv_bitpacked' (default) writes the uppercase and
          lowercase tables produced by generate_tables();
          're_canon_lookup' writes the table produced by
          generate_regexp_canonicalize_lookup().
      --unicode-data, --special-casing: values handed to UnicodeData() and
          SpecialCasing() respectively.
      --out-source: path of the C source file (array definitions).
      --out-header: path of the C header file (extern declarations).
      --table-name-lc, --table-name-uc, --table-name-re-canon-lookup:
          C identifiers of the generated arrays.

    Raises:
      Exception: if --command is not one of the two supported values.
    """
    parser = optparse.OptionParser()
    parser.add_option('--command',
                      dest='command',
                      default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc',
                      dest='table_name_lc',
                      default='caseconv_lc')
    parser.add_option('--table-name-uc',
                      dest='table_name_uc',
                      default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup',
                      dest='table_name_re_canon_lookup',
                      default='caseconv_re_canon_lookup')
    # Positional arguments are not used.
    opts, _ = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    # tc is only passed on to update_special_casings(); no table is
    # generated from it here.
    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

        # ascii is handled with 'fast path' so not needed here
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, _ = generate_tables(t)  # second return value is unused

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, _ = generate_tables(t)  # second return value is unused

        # Generate C source and header files
        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(uc_bytes,
                       opts.table_name_uc,
                       size=len(uc_bytes),
                       typename='duk_uint8_t',
                       intvalues=True,
                       const=True)
        genc.emitArray(lc_bytes,
                       opts.table_name_lc,
                       size=len(lc_bytes),
                       typename='duk_uint8_t',
                       intvalues=True,
                       const=True)
        # 'with' closes the file even if getString() or write() raises.
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint8_t %s[%d];' %
                      (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' %
                      (opts.table_name_lc, len(lc_bytes)))
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())
    elif opts.command == 're_canon_lookup':
        # direct canonicalization lookup for case insensitive regexps, includes ascii part
        t = clonedict(uc)
        re_canon_lookup = generate_regexp_canonicalize_lookup(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_lookup,
                       opts.table_name_re_canon_lookup,
                       size=len(re_canon_lookup),
                       typename='duk_uint16_t',
                       intvalues=True,
                       const=True)
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint16_t %s[%d];' %
                      (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())
    else:
        raise Exception('invalid command: %r' % opts.command)