Ejemplo n.º 1
0
def ReadSymbolTsv(stream):
    """Builds zero query data for emoji-like symbols found in a symbol TSV.

    Comment lines are dropped via code_generator_util.SkipLineComment and the
    remainder is parsed as tab-separated columns. A row only contributes
    entries when its symbol (column 1) decodes from UTF-8 to exactly one
    character whose code point lies in the range 0x2600..0x2767 inclusive.

    Args:
        stream: Input handed to code_generator_util.SkipLineComment. The
            symbol column must support .decode('utf-8').

    Returns:
        A defaultdict(list). Its keys are the readings (column 2, split on
        runs of ASCII / full-width spaces) plus, when present and non-empty,
        the description (column 3) and the additional description (column 4).
        Each key maps to a list of util.ZeroQueryEntry objects for the symbol.
    """
    result = defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for row in rows:
        if len(row) < 3:
            # Malformed rows are reported and skipped, not treated as fatal.
            logging.warning('format error: %s', '\t'.join(row))
            continue

        symbol, readings = row[1], row[2]

        decoded = symbol.decode('utf-8')
        if len(decoded) != 1:
            continue

        # Selects emoji symbols from symbol dictionary.
        # TODO(toshiyuki): Update the range if we need.
        # from "☀"(black sun with rays) to "❧"(rotated floral heart).
        code_point = ord(decoded)
        if code_point < 0x2600 or code_point > 0x2767:
            continue

        # \xe3\x80\x80 is a full-width space
        keys = [
            piece
            for piece in re.split(r'(?: |\xe3\x80\x80)+', readings.strip())
            if piece
        ]
        # Optional trailing columns: description ("天気", etc.) followed by
        # additional_description ("傘", etc.). Empty or missing ones are
        # ignored.
        keys.extend(extra for extra in row[3:5] if extra)

        for key in keys:
            result[key].append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, symbol,
                                    util.EMOJI_TYPE_NONE, 0))

    return result
Ejemplo n.º 2
0
def ReadZeroQueryRuleData(input_stream):
    """Parses zero query rule lines into a key -> entries mapping.

    Lines starting with '#' and lines that are empty after stripping the
    trailing CR/LF are ignored. Every other line is split on tabs: the first
    field is the key and the second field is a comma-separated list of
    values. One util.ZeroQueryEntry is recorded per value.

    Args:
        input_stream: Iterable of text lines.

    Returns:
        A collections.defaultdict(list) mapping each key to its entries in
        input order.

    Raises:
        IndexError: If a data line contains no tab-separated second field.
    """
    rules = collections.defaultdict(list)

    for raw_line in input_stream:
        if raw_line.startswith('#'):
            continue
        stripped = raw_line.rstrip('\r\n')
        if not stripped:
            continue

        fields = stripped.split('\t')
        key, value_field = fields[0], fields[1]
        rules[key].extend(
            util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, candidate,
                                util.EMOJI_TYPE_NONE, 0)
            for candidate in value_field.split(','))
    return rules
Ejemplo n.º 3
0
def ReadZeroQueryNumberData(input_stream):
    """Parses zero query number lines (bytes) into a key -> entries mapping.

    Lines starting with b'#' and lines that are empty after stripping the
    trailing CR/LF are ignored. Every other line is split on tabs: the first
    field is the key and the second field is a comma-separated list of
    values. One util.ZeroQueryEntry of type ZERO_QUERY_TYPE_NUMBER_SUFFIX is
    recorded per value.

    Args:
        input_stream: Iterable of bytes lines.

    Returns:
        A defaultdict(list) mapping each key to its entries in input order.

    Raises:
        IndexError: If a data line contains no tab-separated second field.
    """
    suffixes = defaultdict(list)

    for raw_line in input_stream:
        if raw_line.startswith(b'#'):
            continue
        stripped = raw_line.rstrip(b'\r\n')
        if not stripped:
            continue

        fields = stripped.split(b'\t')
        key, value_field = fields[0], fields[1]
        suffixes[key].extend(
            util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NUMBER_SUFFIX, candidate,
                                util.EMOJI_TYPE_NONE, 0)
            for candidate in value_field.split(b','))
    return suffixes
Ejemplo n.º 4
0
def ReadEmoticonTsv(stream):
  """Builds zero query data from an emoticon TSV stream.

  Comment lines are dropped via code_generator_util.SkipLineComment and the
  remainder is parsed as tab-separated columns. Each row must have exactly
  three columns: the emoticon comes first and its readings come last; the
  middle column is not used here.

  Args:
    stream: Input handed to code_generator_util.SkipLineComment.

  Returns:
    A collections.defaultdict(list) mapping each non-empty reading (the
    readings column split with RE_SPLIT) to a list of util.ZeroQueryEntry
    objects of type ZERO_QUERY_TYPE_EMOTICON.

  Exits the process with status 1 after logging a critical message when a
  row does not have exactly three columns.
  """
  emoticon_dict = collections.defaultdict(list)
  rows = code_generator_util.ParseColumnStream(
      code_generator_util.SkipLineComment(stream), delimiter='\t')
  for row in rows:
    if len(row) != 3:
      logging.critical('format error: %s', '\t'.join(row))
      sys.exit(1)

    emoticon, _, readings = row

    # filter(None, ...) discards the empty pieces the split can produce.
    for reading in filter(None, re.split(RE_SPLIT, readings.strip())):
      emoticon_dict[reading].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOTICON,
                              emoticon, util.EMOJI_TYPE_NONE, 0))

  return emoticon_dict
Ejemplo n.º 5
0
def ReadEmojiTsv(stream):
    """Builds zero query data from an emoji TSV stream.

    Comment lines are dropped via code_generator_util.SkipLineComment and the
    remainder is parsed as tab-separated columns; every row must have exactly
    13 columns. Rows without an Android PUA code point, or whose first column
    lists more than one code point, are skipped.

    Args:
        stream: Input handed to code_generator_util.SkipLineComment.

    Returns:
        A collections.defaultdict(list) mapping each distinct non-empty
        reading to a list of util.ZeroQueryEntry objects of type
        ZERO_QUERY_TYPE_EMOJI. Readings come from the readings column and
        from GetReadingsFromDescription applied to the Japanese name and the
        three carrier descriptions. Each list is sorted by
        (value, emoji_android_pua).

    Exits the process with status 1 after logging a critical message when a
    row does not have exactly 13 columns.
    """
    emoji_dict = collections.defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for row in rows:
        if len(row) != 13:
            logging.critical('format error: %s', '\t'.join(row))
            sys.exit(1)

        code_points = row[0].split(' ')
        # Emoji code point.
        emoji = row[1]
        android_pua = ParseCodePoint(row[2])
        readings = row[6]
        japanese_name = row[8]
        docomo_description = row[9]
        softbank_description = row[10]
        kddi_description = row[11]

        if len(code_points) > 1 or not android_pua:
            # Skip some emoji, which is not supported on old devices.
            # - Unicode 6.1 or later emoji which doesn't have PUA code point.
            # - Composite emoji which has multiple code point.
            # NOTE: Some Unicode 6.0 emoji don't have PUA, and it is also
            # omitted.
            # TODO(hsumita): Check the availability of such emoji and enable
            # it.
            logging.info('Skip %s', ' '.join(code_points))
            continue

        reading_list = [
            piece
            for piece in re.split(RE_SPLIT, NormalizeString(readings))
            if piece
        ]
        for description_text in (japanese_name, docomo_description,
                                 softbank_description, kddi_description):
            reading_list.extend(GetReadingsFromDescription(description_text))

        # OR together one flag per non-empty source for this emoji.
        emoji_type = util.EMOJI_TYPE_NONE
        if emoji:
            emoji_type |= util.EMOJI_TYPE_UNICODE
        if docomo_description:
            emoji_type |= util.EMOJI_TYPE_DOCOMO
        if softbank_description:
            emoji_type |= util.EMOJI_TYPE_SOFTBANK
        if kddi_description:
            emoji_type |= util.EMOJI_TYPE_KDDI

        # set() ensures a reading is registered at most once per row.
        for reading in set(reading_list):
            if reading:
                emoji_dict[reading].append(
                    util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOJI, emoji,
                                        emoji_type, android_pua))

    # Sort emoji for each reading.
    for entries in emoji_dict.values():
        entries.sort(key=lambda e: (e.value, e.emoji_android_pua))

    return emoji_dict