def WriteVariantInfo(variant_info, output_variant_types,
                     output_variant_tokens, output_variant_strings):
  """Writes single kanji variants info.

  The token output is an array of uint32s, where array[3 * i],
  array[3 * i + 1] and array[3 * i + 2] are the index of target, index of
  original, and variant type ID.  See rewriter/single_kanji_rewriter.cc.
  """
  variant_types, variant_items = variant_info
  serialized_string_array_builder.SerializeToFile(variant_types,
                                                  output_variant_types)
  strings = []
  with open(output_variant_tokens, 'wb') as stream:
    for i, (target, original, variant_type) in enumerate(variant_items):
      # Target lands at strings[2 * i], original at strings[2 * i + 1].
      stream.write(struct.pack('<III', 2 * i, 2 * i + 1, variant_type))
      strings.extend((target, original))
  serialized_string_array_builder.SerializeToFile(strings,
                                                  output_variant_strings)
def OutputData(emoji_data_list, token_dict, token_array_file,
               string_array_file):
  """Output token and string arrays to files."""
  sorted_token_dict = sorted(six.iteritems(token_dict))

  # Gather every string that must appear in the string array, then assign
  # each one its index in sorted order.
  strings = {}
  for reading, _ in sorted_token_dict:
    strings[reading] = 0
  for item in emoji_data_list:
    (emoji, _, utf8_description, docomo_description,
     softbank_description, kddi_description) = item
    for s in (emoji, utf8_description, docomo_description,
              softbank_description, kddi_description):
      strings[s] = 0
  sorted_strings = sorted(strings)
  for index, s in enumerate(sorted_strings):
    strings[s] = index

  # Each token is seven little-endian uint32s referencing the string array
  # (android_pua is stored directly, not as a string index).
  with open(token_array_file, 'wb') as f:
    for reading, value_list in sorted_token_dict:
      reading_index = strings[reading]
      for value_index in value_list:
        (emoji, android_pua, utf8_description, docomo_description,
         softbank_description, kddi_description) = emoji_data_list[value_index]
        for number in (reading_index, strings[emoji], android_pua,
                       strings[utf8_description], strings[docomo_description],
                       strings[softbank_description],
                       strings[kddi_description]):
          f.write(struct.pack('<I', number))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  string_array_file)
def WriteData(input_path, output_value_array_path, output_error_array_path,
              output_correction_array_path):
  """Reads the correction TSV and writes three parallel string arrays."""
  entries = []
  with codecs.open(input_path, 'r', encoding='utf-8') as input_stream:
    stream = code_generator_util.SkipLineComment(input_stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=3)
    # ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき")
    entries = [[value, error, correction]
               for value, error, correction in stream]
  # Sort by |error| (ties broken by |value|) so that entries can be looked
  # up via binary search on |error|.
  entries.sort(key=lambda e: (e[1], e[0]))
  serialized_string_array_builder.SerializeToFile(
      [e[0] for e in entries], output_value_array_path)
  serialized_string_array_builder.SerializeToFile(
      [e[1] for e in entries], output_error_array_path)
  serialized_string_array_builder.SerializeToFile(
      [e[2] for e in entries], output_correction_array_path)
def main():
  """Builds key/value string arrays and a (lid, rid, cost) token array.

  Each input line is "key<TAB>lid<TAB>rid<TAB>cost<TAB>value".  Entries are
  sorted by key before serialization.
  """
  opts = _ParseOptions()
  result = []
  with open(opts.input, 'r') as stream:
    for line in stream:
      fields = line.rstrip('\r\n').split('\t')
      key = fields[0]
      lid = int(fields[1])
      rid = int(fields[2])
      cost = int(fields[3])
      value = fields[4]
      # Store an empty value when it equals the key to save space.
      if key == value:
        value = ''
      result.append((key, value, lid, rid, cost))

  # Sort entries in ascending order of key.
  result.sort(key=lambda e: e[0])

  # Write keys to serialized string array.
  serialized_string_array_builder.SerializeToFile(
      [entry[0] for entry in result], opts.output_key_array)

  # Write values to serialized string array.
  serialized_string_array_builder.SerializeToFile(
      [entry[1] for entry in result], opts.output_value_array)

  # Write a sequence of (lid, rid, cost) to uint32 array:
  # {lid[0], rid[0], cost[0], lid[1], rid[1], cost[1], ...}
  # So the final array has 3 * len(result) elements.
  with open(opts.output_token_array, 'wb') as f:
    for _, _, lid, rid, cost in result:
      f.write(struct.pack('<I', lid))
      # Bug fix: the original wrote |lid| twice and never emitted |rid|,
      # corrupting the token array layout documented above.
      f.write(struct.pack('<I', rid))
      f.write(struct.pack('<I', cost))
def main():
  """Builds user POS data files from the POS definition sources."""
  options = ParseOptions()

  # Assemble the POS database and inflection map from their source files.
  pos_database = pos_util.PosDataBase()
  pos_database.Parse(options.id_file, options.special_pos_file)
  inflection_map = pos_util.InflectionMap()
  inflection_map.Parse(options.cforms_file)

  # Parse the user POS definitions against the database, then serialize.
  user_pos = pos_util.UserPos(pos_database, inflection_map)
  user_pos.Parse(options.user_pos_file)
  OutputUserPosData(user_pos.data, options.output_token_array,
                    options.output_string_array)

  if options.output_pos_list:
    pos_list = [pos for (pos, _) in user_pos.data]
    serialized_string_array_builder.SerializeToFile(pos_list,
                                                    options.output_pos_list)
def WriteSingleKanji(single_kanji_dic, output_tokens, output_string_array):
  """Writes single kanji list for readings.

  The token output is an array of uint32s, where array[2 * i] and
  array[2 * i + 1] are the indices of key and value in the string array.
  See rewriter/single_kanji_rewriter.cc.
  """
  strings = []
  with open(output_tokens, 'wb') as stream:
    for i, (key, value) in enumerate(single_kanji_dic):
      # Key lands at strings[2 * i], value at strings[2 * i + 1].
      stream.write(struct.pack('<II', 2 * i, 2 * i + 1))
      strings.extend((key, value))
  serialized_string_array_builder.SerializeToFile(strings, output_string_array)
def WriteZeroQueryData(zero_query_dict, output_token_array,
                       output_string_array):
  """Serializes zero query data to a token array and a string array.

  Each token is (key index: uint32, value index: uint32, entry_type: uint16,
  emoji_type: uint16, emoji_android_pua: uint32), where the indices point
  into the sorted string array.
  """
  # Collect all the strings and assign indices in ascending order.
  # Bug fix: dict.iteritems() does not exist in Python 3; items() behaves
  # identically here on both Python 2 and 3.
  string_index = {}
  for key, entry_list in zero_query_dict.items():
    string_index[key] = 0
    for entry in entry_list:
      string_index[entry.value] = 0
  sorted_strings = sorted(string_index)
  for i, s in enumerate(sorted_strings):
    string_index[s] = i

  # Emit tokens sorted by key so lookups can binary-search the array.
  with open(output_token_array, 'wb') as f:
    for key in sorted(zero_query_dict):
      for entry in zero_query_dict[key]:
        f.write(struct.pack('<I', string_index[key]))
        f.write(struct.pack('<I', string_index[entry.value]))
        f.write(struct.pack('<H', entry.entry_type))
        f.write(struct.pack('<H', entry.emoji_type))
        f.write(struct.pack('<I', entry.emoji_android_pua))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  output_string_array)
def OutputUserPosData(user_pos_data, output_token_array, output_string_array):
  """Serializes user POS data to a token array and a string array.

  One token is serialized to 8 bytes: four little-endian uint16 components
  (user_pos index, value_suffix index, key_suffix index, conjugation_id),
  where the indices point into the sorted string array.
  """
  # Assign every distinct string an index by its sorted position.
  string_index = {}
  for user_pos, conjugation_list in user_pos_data:
    string_index[ToString(user_pos)] = 0
    for value_suffix, key_suffix, _ in conjugation_list:
      string_index[ToString(value_suffix)] = 0
      string_index[ToString(key_suffix)] = 0
  sorted_strings = sorted(string_index)
  for index, s in enumerate(sorted_strings):
    string_index[s] = index

  with open(output_token_array, 'wb') as stream:
    for user_pos, conjugation_list in sorted(user_pos_data):
      pos_index = string_index[ToString(user_pos)]
      for value_suffix, key_suffix, conjugation_id in conjugation_list:
        stream.write(struct.pack('<HHHH', pos_index,
                                 string_index[ToString(value_suffix)],
                                 string_index[ToString(key_suffix)],
                                 conjugation_id))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  output_string_array)
def main():
  """Builds the counter suffix data file from the given dictionary files."""
  options, args = ParseOptions()
  pos_ids = ReadCounterSuffixPosIds(options.id_file)
  suffix_list = ReadCounterSuffixes(args, pos_ids)
  serialized_string_array_builder.SerializeToFile(suffix_list, options.output)