Esempio n. 1
0
def WriteVariantInfo(variant_info, output_variant_types, output_variant_tokens,
                     output_variant_strings):
    """Writes the single kanji variant tables to three output files.

    Args:
      variant_info: a pair (variant_types, variant_items).  variant_items is
        iterated as (target, original, variant_type) triples.
      output_variant_types: path handed to SerializeToFile together with
        variant_types.
      output_variant_tokens: path of the binary token file.  For the i-th
        item, three little-endian uint32s are written: 2 * i (index of
        target), 2 * i + 1 (index of original) and the variant type ID.
      output_variant_strings: path handed to SerializeToFile together with
        the interleaved [target, original, target, original, ...] list.

    See rewriter/single_kanji_rewriter.cc.
    """
    type_names, items = variant_info

    serialized_string_array_builder.SerializeToFile(type_names,
                                                    output_variant_types)

    string_table = []
    with open(output_variant_tokens, 'wb') as token_file:
        for position, item in enumerate(items):
            target, original, type_id = item
            # target and original occupy consecutive slots of string_table,
            # so their indices are derived from the item position.
            for field in (2 * position, 2 * position + 1, type_id):
                token_file.write(struct.pack('<I', field))
            string_table.extend((target, original))

    serialized_string_array_builder.SerializeToFile(string_table,
                                                    output_variant_strings)
Esempio n. 2
0
def OutputData(emoji_data_list, token_dict,
               token_array_file, string_array_file):
  """Writes the emoji token array and its string array to files.

  Every reading in token_dict and every emoji / description string in
  emoji_data_list is collected, sorted, and numbered by its position in the
  sorted order.  For each (reading, value index) pair, taken in ascending
  order of reading, seven little-endian uint32s are written to
  token_array_file: the string indices of the reading and the emoji, the
  android_pua value itself, then the string indices of the utf8, docomo,
  softbank and kddi descriptions.  The sorted strings are handed to
  SerializeToFile together with string_array_file.
  """
  tokens_by_reading = sorted(six.iteritems(token_dict))

  # Gather every distinct string that a token may refer to.
  unique_strings = set()
  for reading, _ in tokens_by_reading:
    unique_strings.add(reading)
  for record in emoji_data_list:
    (emoji, android_pua, utf8_description, docomo_description,
     softbank_description, kddi_description) = record
    unique_strings.update((emoji, utf8_description, docomo_description,
                           softbank_description, kddi_description))

  sorted_strings = sorted(unique_strings)
  index_of = {}
  for position, text in enumerate(sorted_strings):
    index_of[text] = position

  with open(token_array_file, 'wb') as out:
    for reading, value_list in tokens_by_reading:
      reading_index = index_of[reading]
      for value_index in value_list:
        (emoji, android_pua, utf8_description, docomo_description,
         softbank_description, kddi_description) = emoji_data_list[value_index]
        fields = [
            reading_index,
            index_of[emoji],
            android_pua,
            index_of[utf8_description],
            index_of[docomo_description],
            index_of[softbank_description],
            index_of[kddi_description],
        ]
        for field in fields:
          out.write(struct.pack('<I', field))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  string_array_file)
Esempio n. 3
0
def WriteData(input_path, output_value_array_path, output_error_array_path,
              output_correction_array_path):
    """Splits a 3-column UTF-8 file into three parallel string arrays.

    The input is read through code_generator_util.SkipLineComment and
    code_generator_util.ParseColumnStream (num_column=3), yielding
    (value, error, correction) rows, e.g.
    ("雰囲気", "ふいんき", "ふんいき").  Rows are sorted by (error, value) and
    each column is handed to SerializeToFile with its own output path.
    """
    with codecs.open(input_path, 'r', encoding='utf-8') as source:
        rows = code_generator_util.ParseColumnStream(
            code_generator_util.SkipLineComment(source), num_column=3)
        entries = [[value, error, correction]
                   for value, error, correction in rows]

    # Entries are looked up via |error| with binary search, so order them by
    # error first (value breaks ties).
    entries.sort(key=lambda entry: (entry[1], entry[0]))

    column_outputs = (
        (0, output_value_array_path),
        (1, output_error_array_path),
        (2, output_correction_array_path),
    )
    for column, path in column_outputs:
        serialized_string_array_builder.SerializeToFile(
            [entry[column] for entry in entries], path)
Esempio n. 4
0
def main():
    """Converts a tab-separated entry file into key, value and token arrays.

    Each line of opts.input is split on tabs into five columns, in order:
    key, lid, rid, cost, value.  A value equal to its key is replaced by the
    empty string.  Entries are sorted in ascending order of key, then:

      * keys are handed to SerializeToFile with opts.output_key_array,
      * values are handed to SerializeToFile with opts.output_value_array,
      * (lid, rid, cost) triples are written to opts.output_token_array as
        little-endian uint32s.
    """
    opts = _ParseOptions()

    result = []
    with open(opts.input, 'r') as stream:
        for line in stream:
            line = line.rstrip('\r\n')
            fields = line.split('\t')
            key = fields[0]
            lid = int(fields[1])
            rid = int(fields[2])
            cost = int(fields[3])
            value = fields[4]

            # An empty value stands for "same as key".
            if key == value:
                value = ''

            result.append((key, value, lid, rid, cost))

    # Sort entries in ascending order of key.
    result.sort(key=lambda e: e[0])

    # Write keys to serialized string array.
    serialized_string_array_builder.SerializeToFile(
        [entry[0] for entry in result], opts.output_key_array)

    # Write values to serialized string array.
    serialized_string_array_builder.SerializeToFile(
        [entry[1] for entry in result], opts.output_value_array)

    # Write a sequence of (lid, rid, cost) to uint32 array:
    #   {lid[0], rid[0], cost[0], lid[1], rid[1], cost[1], ...}
    # So the final array has 3 * len(result) elements.
    with open(opts.output_token_array, 'wb') as f:
        for _, _, lid, rid, cost in result:
            f.write(struct.pack('<I', lid))
            # The second slot is rid; writing lid here again would silently
            # drop every right ID from the output.
            f.write(struct.pack('<I', rid))
            f.write(struct.pack('<I', cost))
Esempio n. 5
0
def main():
    """Builds the user POS data files from the option-specified inputs.

    Parses the ID / special POS files and the conjugation-form file, uses
    them to parse the user POS file, and passes the result to
    OutputUserPosData.  When options.output_pos_list is set, the first
    element of every user POS entry is additionally handed to SerializeToFile
    with that path.
    """
    options = ParseOptions()

    database = pos_util.PosDataBase()
    database.Parse(options.id_file, options.special_pos_file)

    inflections = pos_util.InflectionMap()
    inflections.Parse(options.cforms_file)

    parsed_user_pos = pos_util.UserPos(database, inflections)
    parsed_user_pos.Parse(options.user_pos_file)

    OutputUserPosData(parsed_user_pos.data, options.output_token_array,
                      options.output_string_array)

    # The POS list is optional output.
    if not options.output_pos_list:
        return
    pos_names = [pos for (pos, _) in parsed_user_pos.data]
    serialized_string_array_builder.SerializeToFile(pos_names,
                                                    options.output_pos_list)
def WriteSingleKanji(single_kanji_dic, output_tokens, output_string_array):
  """Writes the single kanji (key, value) list and its token array.

  single_kanji_dic is iterated as (key, value) pairs.  Keys and values are
  stored interleaved in one string list, so for the i-th pair the token file
  receives two little-endian uint32s: 2 * i (index of the key) and 2 * i + 1
  (index of the value).  The string list is handed to SerializeToFile with
  output_string_array.  See rewriter/single_kanji_rewriter.cc.
  """
  string_table = []
  with open(output_tokens, 'wb') as token_file:
    for position, pair in enumerate(single_kanji_dic):
      key, value = pair
      for string_index in (2 * position, 2 * position + 1):
        token_file.write(struct.pack('<I', string_index))
      string_table.extend((key, value))
  serialized_string_array_builder.SerializeToFile(string_table,
                                                  output_string_array)
Esempio n. 7
0
def WriteZeroQueryData(zero_query_dict, output_token_array,
                       output_string_array):
  """Writes zero query entries as a token array plus a string array.

  Args:
    zero_query_dict: mapping from key string to a list of entries.  Each
      entry is read through its value, entry_type, emoji_type and
      emoji_android_pua attributes.
    output_token_array: path of the binary token file.  For every entry, in
      ascending order of key, one little-endian record is written:
      uint32 key string index, uint32 value string index, uint16 entry_type,
      uint16 emoji_type, uint32 emoji_android_pua.
    output_string_array: path handed to SerializeToFile together with the
      sorted list of all keys and entry values.
  """
  # Collect all the strings and assign indices in ascending order.
  # Plain iteration plus indexing is used rather than dict.iteritems(), which
  # does not exist on Python 3; this form works on both major versions.
  string_index = {}
  for key in zero_query_dict:
    string_index[key] = 0
    for entry in zero_query_dict[key]:
      string_index[entry.value] = 0
  sorted_strings = sorted(string_index)
  for i, s in enumerate(sorted_strings):
    string_index[s] = i

  with open(output_token_array, 'wb') as f:
    for key in sorted(zero_query_dict):
      for entry in zero_query_dict[key]:
        f.write(struct.pack('<I', string_index[key]))
        f.write(struct.pack('<I', string_index[entry.value]))
        f.write(struct.pack('<H', entry.entry_type))
        f.write(struct.pack('<H', entry.emoji_type))
        f.write(struct.pack('<I', entry.emoji_android_pua))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  output_string_array)
Esempio n. 8
0
def OutputUserPosData(user_pos_data, output_token_array, output_string_array):
    """Writes user POS conjugation tokens and the strings they refer to.

    user_pos_data is iterated as (user_pos, conjugation_list) pairs, where
    each conjugation is (value_suffix, key_suffix, conjugation_id).  All POS
    names and suffixes are converted with ToString, sorted, and numbered by
    their position in the sorted order.  The token file receives, for every
    conjugation of every entry of sorted(user_pos_data), four little-endian
    uint16s: POS index, value suffix index, key suffix index and
    conjugation_id.  The sorted strings are handed to SerializeToFile with
    output_string_array.
    """
    # Gather every distinct string referenced by the tokens.
    unique_strings = set()
    for user_pos, conjugation_list in user_pos_data:
        unique_strings.add(ToString(user_pos))
        for value_suffix, key_suffix, _ in conjugation_list:
            unique_strings.add(ToString(value_suffix))
            unique_strings.add(ToString(key_suffix))

    sorted_strings = sorted(unique_strings)
    index_of = {}
    for position, text in enumerate(sorted_strings):
        index_of[text] = position

    with open(output_token_array, 'wb') as token_file:
        for user_pos, conjugation_list in sorted(user_pos_data):
            pos_index = index_of[ToString(user_pos)]
            for value_suffix, key_suffix, conjugation_id in conjugation_list:
                # One entry is serialized to 8 bytes (four uint16 components).
                token_file.write(struct.pack('<H', pos_index))
                value_index = index_of[ToString(value_suffix)]
                token_file.write(struct.pack('<H', value_index))
                key_index = index_of[ToString(key_suffix)]
                token_file.write(struct.pack('<H', key_index))
                token_file.write(struct.pack('<H', conjugation_id))

    serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                    output_string_array)
Esempio n. 9
0
def main():
  """Reads counter suffixes and serializes them to options.output.

  The POS IDs come from ReadCounterSuffixPosIds(options.id_file); together
  with the positional arguments they are passed to ReadCounterSuffixes, whose
  result is handed to SerializeToFile.
  """
  options, input_args = ParseOptions()
  counter_suffix_ids = ReadCounterSuffixPosIds(options.id_file)
  counter_suffixes = ReadCounterSuffixes(input_args, counter_suffix_ids)
  serialized_string_array_builder.SerializeToFile(counter_suffixes,
                                                  options.output)