def compressed_cp_lines(cps):
    """Encode code points as bytes, LZW-compress them, and format as lines.

    Args:
        cps: A sized sequence of hexadecimal strings; each one is parsed
            with ``int(cp, 16)`` and appended to a byte list through
            ``lzw.add_cp``.

    Returns:
        Whatever ``lzw.compressed_bytes_to_lines`` returns for the
        compressed data at 12 values per line.

    Side effects:
        Prints two size-report lines to stdout.
    """
    values_per_line = 12

    bytes_ = []
    for cp in cps:
        lzw.add_cp(bytes_, int(cp, 16))
    compressed_bytes = lzw.compress(bytes_)

    # A parenthesized single-argument print produces the same output under
    # Python 2's print statement and Python 3's print function, so this
    # function no longer fails to parse on Python 3.
    print('rewrote {} * 32 = {} bits as {} * 8 = {} bits'.format(
        len(cps), len(cps) * 32, len(bytes_), len(bytes_) * 8))
    print('compressed to {} * 16 = {} bits'.format(
        len(compressed_bytes), len(compressed_bytes) * 16))

    return lzw.compressed_bytes_to_lines(compressed_bytes, values_per_line)
def compressed_case_mapping_lines(mappings):
    """Encode case-mapping records as bytes, LZW-compress, and format as lines.

    Args:
        mappings: A sized sequence of records ``t`` where ``t[0]`` is a
            hexadecimal string (parsed with ``int(t[0], 16)`` and written
            with ``lzw.add_cp``) and ``t[1]`` is indexable with at least two
            elements, each written with ``lzw.add_short``.
            NOTE(review): the meaning of the two ``t[1]`` values is not
            visible here -- confirm against the caller.

    Returns:
        Whatever ``lzw.compressed_bytes_to_lines`` returns for the
        compressed data at 12 values per line.

    Side effects:
        Prints two size-report lines to stdout.
    """
    values_per_line = 12

    bytes_ = []
    for t in mappings:
        lzw.add_cp(bytes_, int(t[0], 16))
        lzw.add_short(bytes_, t[1][0])
        lzw.add_short(bytes_, t[1][1])
    compressed_bytes = lzw.compress(bytes_)

    # A parenthesized single-argument print produces the same output under
    # Python 2's print statement and Python 3's print function, so this
    # function no longer fails to parse on Python 3.
    print('rewrote {} * 64 = {} bits as {} * 8 = {} bits'.format(
        len(mappings), len(mappings) * 64, len(bytes_), len(bytes_) * 8))
    print('compressed to {} * 16 = {} bits'.format(
        len(compressed_bytes), len(compressed_bytes) * 16))

    return lzw.compressed_bytes_to_lines(compressed_bytes, values_per_line)
def uncompressed_prop_bytes(cp_prop_pairs):
    """Serialize (code point, property) pairs into one flat list.

    For every element of ``cp_prop_pairs``, in input order, element ``[0]``
    is handed to ``lzw.add_cp`` and element ``[1]`` to ``lzw.add_byte``,
    both targeting the same freshly created list. No compression is applied
    here; the accumulated list is returned as-is.
    """
    encoded = []
    for entry in cp_prop_pairs:
        # Code point first, then its property value, matching record order.
        code_point = entry[0]
        lzw.add_cp(encoded, code_point)
        prop_value = entry[1]
        lzw.add_byte(encoded, prop_value)
    return encoded
# Fragment: the enclosing definition and the loop that binds `cp`,
# `canonical_decomp` and `compatible_decomp` are not visible in this span.
#
# Per-code-point part:
#   * `ccc` defaults to 0 and is replaced by `cccs_dict[cp]` when present.
#   * Each of the NFD / NFKD / NFC / NFKC quick-check values defaults to
#     'quick_check::yes' and is replaced by `quick_check_maps[<form>][cp]`
#     when that map contains `cp`.
#   * One record is appended to `prop_bytes_`: the code point (add_cp), the
#     two elements of `canonical_decomp` and the two elements of
#     `compatible_decomp` (add_short each), `int(ccc)` (add_byte), then two
#     bytes from `quick_checks_to_byte` -- (NFD, NFKD) first, (NFC, NFKC)
#     second.
#
# Trailing part: `prop_bytes_` is LZW-compressed and formatted at 12 values
# per line; `lzw.compressed_bytes_to_lines` is unpacked into
# `props_lines, num_shorts`.
# NOTE(review): presumably the compression step runs once, after the
# per-code-point loop -- the original indentation is not visible; confirm.
ccc = 0 if cp in cccs_dict: ccc = cccs_dict[cp] nfd_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFD']: nfd_quick_check = quick_check_maps['NFD'][cp] nfkd_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFKD']: nfkd_quick_check = quick_check_maps['NFKD'][cp] nfc_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFC']: nfc_quick_check = quick_check_maps['NFC'][cp] nfkc_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFKC']: nfkc_quick_check = quick_check_maps['NFKC'][cp] lzw.add_cp(prop_bytes_, cp) lzw.add_short(prop_bytes_, canonical_decomp[0]) lzw.add_short(prop_bytes_, canonical_decomp[1]) lzw.add_short(prop_bytes_, compatible_decomp[0]) lzw.add_short(prop_bytes_, compatible_decomp[1]) lzw.add_byte(prop_bytes_, int(ccc)) lzw.add_byte(prop_bytes_, \ quick_checks_to_byte(nfd_quick_check, nfkd_quick_check)) lzw.add_byte(prop_bytes_, \ quick_checks_to_byte(nfc_quick_check, nfkc_quick_check)) value_per_line = 12 compressed_bytes = lzw.compress(prop_bytes_) props_lines, num_shorts = lzw.compressed_bytes_to_lines( compressed_bytes, value_per_line) #print 'rewrote {} * 144 = {} bits as {} * 8 = {} bits'.format(len(all_cps), len(all_cps)*144, len(prop_bytes_), len(prop_bytes_)*8)
# Writes 'collation_data_0.cpp' and prepares the key/value data for the
# `fcc_cet` table.
#
#   1. Opens 'collation_data_0.cpp' for writing and writes
#      `collation_data_0_file_form` filled in with the implicit-weight
#      segments, reorder groups and collation-element lines, each paired
#      with a count.
#      NOTE(review): no close of `cpp_file` is visible in this span --
#      confirm it is closed (or flushed) later in the file.
#   2. Walks `fcc_cet` items sorted by `original_order[key]`. For each key
#      `k` it appends `len(k)` (add_byte) followed by every element of `k`
#      (add_cp) to `key_bytes`, and appends the text "{v[0], v[1]}" to
#      `value_strings`.
#   3. LZW-compresses only the keys; the values stay as plain strings (the
#      in-line comment explains that compressing them made things worse).
#   4. Formats the compressed keys as hex strings through `values_to_lines`
#      with type name 'uint16_t' and the argument 2500.
cpp_file = open('collation_data_0.cpp', 'w') cpp_file.write( collation_data_0_file_form.format(implicit_weights_segments_str, len(implicit_weights_segments), reorder_group_str, len(reorder_group_strings), ce_lines, len(compressed_ces), len(collation_elements))) key_bytes = [] #value_bytes = [] value_strings = [] for k, v in sorted(fcc_cet.items(), key=lambda x: original_order[x[0]]): lzw.add_byte(key_bytes, len(k)) for x in k: lzw.add_cp(key_bytes, x) value_strings.append('{{{}, {}}}'.format(v[0], v[1])) #lzw.add_short(value_bytes, v[0]) #lzw.add_short(value_bytes, v[1]) compressed_keys = lzw.compress(key_bytes) # The other data sets are optimized by LZW compression. This one is # heavily pessimized. # compressed_values = lzw.compress(value_bytes) #print 'rewrote {} * 128 = {} bits as {} * 8 = {} bits'.format(len(fcc_cet), len(fcc_cet)*128, len(key_bytes), len(key_bytes)*8) #print 'compressed to {} * 16 = {} bits'.format(len(compressed_keys), len(compressed_keys) * 16) key_lines = values_to_lines(map(lambda x: hex(x), compressed_keys), 'uint16_t', 2500) #print 'rewrote {} * 32 = {} bits as {} * 8 = {} bits'.format(len(fcc_cet), len(fcc_cet)*32, len(value_bytes), len(value_bytes)*8) #print 'compressed to {} * 16 = {} bits'.format(len(compressed_values), len(compressed_values) * 16)