def romanize_search_query(word):
    """
    Collects every romanization of a search-query word.

    Candidates are gathered in this order:
      1. the result of romanize_japanese_word(word, for_index=False),
         only when has_kanji(word) is true;
      2. the jautils romaji of the normalized word, only when
         jautils.should_normalize(word) is true;
      3. the stripped unidecode transliteration, which is always appended.

    Args:
        word: a single query word (expected to be script varianted already)
    Returns:
        [romanized_word, ...]; an empty list when word is empty.
    """
    if not word:
        return []

    # NOTE: the list returned by romanize_japanese_word is extended in place
    # below; it is not copied first.
    candidates = (romanize_japanese_word(word, for_index=False)
                  if has_kanji(word) else [])

    if jautils.should_normalize(word):
        candidates.append(
            jautils.hiragana_to_romaji(jautils.normalize(word)))

    candidates.append(unidecode(word).strip())
    return candidates
Example #2
0
def romanize_search_query(word):
    """
    Collects every romanization of a search-query word.

    Candidates are gathered in this order:
      1. the result of romanize_japanese_word(word, for_index=False),
         only when has_kanji(word) is true;
      2. the jautils romaji of the normalized word, only when
         jautils.should_normalize(word) is true;
      3. the first entry returned by romanize_chinese_name(word), only when
         it is non-empty and differs from the unidecode result;
      4. the stripped unidecode transliteration, which is always appended.

    Args:
        word: a single query word (expected to be script varianted already)
    Returns:
        [romanized_word, ...]; an empty list when word is empty.
    """
    if not word:
        return []

    # NOTE: the list returned by romanize_japanese_word is extended in place
    # below; it is not copied first.
    results = (romanize_japanese_word(word, for_index=False)
               if has_kanji(word) else [])

    if jautils.should_normalize(word):
        results.append(jautils.hiragana_to_romaji(jautils.normalize(word)))

    via_unidecode = unidecode(word).strip()
    chinese_candidates = romanize_chinese_name(word)
    if chinese_candidates:
        # Only the first Chinese romanization is considered, and it is kept
        # only when it adds something the unidecode form does not.
        via_chinese = chinese_candidates[0]
        if via_chinese and via_chinese != via_unidecode:
            results.append(via_chinese)
    results.append(via_unidecode)

    return results
def romanize_search_query(word):
    """
    Returns all romanized forms of a search-query word.

    The result starts with the output of
    romanize_japanese_word(word, for_index=False) when has_kanji(word) is
    true, then gains the jautils romaji of the normalized word when
    jautils.should_normalize(word) is true, and always ends with the
    stripped unidecode transliteration.

    Args:
        word: a single query word (expected to be script varianted already)
    Returns:
        [romanized_word, ...]; an empty list when word is empty.
    """
    if not word:
        return []

    if has_kanji(word):
        # The helper's list is reused (and appended to) as the result list.
        found = romanize_japanese_word(word, for_index=False)
    else:
        found = []

    if jautils.should_normalize(word):
        normalized = jautils.normalize(word)
        romaji = jautils.hiragana_to_romaji(normalized)
        found.append(romaji)

    found.append(unidecode(word).strip())
    return found
def romanize_japanese_location(word):
    """
    Romanizes a Japanese location via JAPANESE_LOCATION_DICTIONARY.

    The dictionary value for word is passed to jautils.hiragana_to_romaji.
    An empty word, or a word missing from the dictionary, is returned
    unchanged.
    """
    if not word or word not in JAPANESE_LOCATION_DICTIONARY:
        return word

    return jautils.hiragana_to_romaji(JAPANESE_LOCATION_DICTIONARY[word])
def romanize_word_by_unidecode(word):
    """
    Romanizes a word with jautils or unidecode.

    When jautils.should_normalize(word) is true the word is normalized and
    converted with jautils.hiragana_to_romaji; otherwise it is
    transliterated by unidecode and stripped.

    Args:
        word: the word to romanize (expected to be script varianted already)
    Returns:
        a one-element list [romanized_word]; [''] when word is empty.
    """
    if not word:
        return ['']

    if not jautils.should_normalize(word):
        return [unidecode(word).strip()]

    return [jautils.hiragana_to_romaji(jautils.normalize(word))]
def romanize_word_by_unidecode(word):
    """
    Produces a single romanization of word, wrapped in a list.

    Words for which jautils.should_normalize(word) is true go through
    jautils.normalize and jautils.hiragana_to_romaji; every other word is
    transliterated by unidecode and stripped of surrounding whitespace.

    Args:
        word: the word to romanize (expected to be script varianted already)
    Returns:
        a one-element list [romanized_word]; [''] when word is empty.
    """
    if not word:
        return ['']

    if jautils.should_normalize(word):
        romanized = jautils.hiragana_to_romaji(jautils.normalize(word))
    else:
        romanized = unidecode(word).strip()
    return [romanized]
Example #7
0
def romanize_japanese_location(word):
    """
    Romanizes a Japanese location via JAPANESE_LOCATION_DICTIONARY.

    Each reading stored for word in the dictionary is converted with
    jautils.hiragana_to_romaji, so one location can yield several
    romanizations. A word missing from the dictionary is returned as is.

    Returns:
        [romanized_jp_location, ...]; [''] when word is empty and [word]
        when word is not in the dictionary.
    """
    if not word:
        return ['']

    if word not in JAPANESE_LOCATION_DICTIONARY:
        return [word]

    romanized = []
    for reading in JAPANESE_LOCATION_DICTIONARY[word]:
        romanized.append(jautils.hiragana_to_romaji(reading))
    return romanized
def romanize_japanese_location(word):
    """
    Looks word up in JAPANESE_LOCATION_DICTIONARY and romanizes its readings.

    Every reading listed for word is passed through
    jautils.hiragana_to_romaji; the same location may therefore produce
    more than one romanization. Unknown words are not romanized.

    Returns:
        [romanized_jp_location, ...]; [''] when word is empty and [word]
        when word is not in the dictionary.
    """
    if not word:
        return ['']

    if word not in JAPANESE_LOCATION_DICTIONARY:
        return [word]

    results = []
    readings = JAPANESE_LOCATION_DICTIONARY[word]
    for yomigana in readings:
        results.append(jautils.hiragana_to_romaji(yomigana))
    return results
def romanize_single_japanese_word(word):
    """
    Romanizes one Japanese word via JAPANESE_NAME_LOCATION_DICTIONARY.

    Each reading stored for word is converted with
    jautils.hiragana_to_romaji, so a single word can yield several
    romanizations. A word missing from the dictionary is returned as is.
    Only a whole-word lookup is performed; a full name is not split into
    first/last names.

    Returns:
        [romanized_jp_word, ...]; [''] when word is empty and [word] when
        word is not in the dictionary.
    """
    if not word:
        return ['']

    if word not in JAPANESE_NAME_LOCATION_DICTIONARY:
        return [word]

    romanized = []
    for reading in JAPANESE_NAME_LOCATION_DICTIONARY[word]:
        romanized.append(jautils.hiragana_to_romaji(reading))
    return romanized
Example #10
0
def romanize_single_japanese_word(word):
    """
    Looks one word up in JAPANESE_NAME_LOCATION_DICTIONARY and romanizes it.

    Every reading listed for word is passed through
    jautils.hiragana_to_romaji; the same word may therefore produce more
    than one romanization. Unknown words are returned untouched. The lookup
    is on the whole word only, so full names are not decomposed into
    first/last names.

    Returns:
        [romanized_jp_word, ...]; [''] when word is empty and [word] when
        word is not in the dictionary.
    """
    if not word:
        return ['']

    if word not in JAPANESE_NAME_LOCATION_DICTIONARY:
        return [word]

    results = []
    readings = JAPANESE_NAME_LOCATION_DICTIONARY[word]
    for yomigana in readings:
        results.append(jautils.hiragana_to_romaji(yomigana))
    return results
Example #11
0
 def test_hiragana_to_romaji(self):
     """Checks jautils.hiragana_to_romaji against fixed input/output pairs.

     Hiragana is expected to come back as upper-case romaji, while ASCII,
     kanji and katakana input is expected to come back unchanged.
     """
     # (input, expected) pairs, evaluated in order.
     cases = (
         (u'abc', u'abc'),
         (u'漢字', u'漢字'),
         (u'ひらがな', u'HIRAGANA'),
         (u'カタカナ', u'カタカナ'),
         (u'カタカナ', u'カタカナ'),
         (u'abc', u'abc'),
         (u'きゃらめる', u'KYARAMERU'),
         (u'はーどる', u'HA-DORU'),
         (u'かんだしょうたろう', u'KANDASHOTARO'),
         (u'えんどういちお', u'ENDOICHIO'),
         (u'ひらがな カタカナ', u'HIRAGANA カタカナ'),
     )
     for source, expected in cases:
         assert jautils.hiragana_to_romaji(source) == expected
Example #12
0
 def test_hiragana_to_romaji(self):
     """Verifies the fixed conversions expected from hiragana_to_romaji.

     The table pairs each input with the exact string
     jautils.hiragana_to_romaji must return: upper-case romaji for
     hiragana, and the input itself for ASCII, kanji and katakana.
     """
     expectations = [
         (u'abc', u'abc'),
         (u'漢字', u'漢字'),
         (u'ひらがな', u'HIRAGANA'),
         (u'カタカナ', u'カタカナ'),
         (u'カタカナ', u'カタカナ'),
         (u'abc', u'abc'),
         (u'きゃらめる', u'KYARAMERU'),
         (u'はーどる', u'HA-DORU'),
         (u'かんだしょうたろう', u'KANDASHOTARO'),
         (u'えんどういちお', u'ENDOICHIO'),
         (u'ひらがな カタカナ', u'HIRAGANA カタカナ'),
     ]
     # Pairs are checked in table order, so the first mismatch fails the test.
     for text, romaji in expectations:
         assert jautils.hiragana_to_romaji(text) == romaji
    If word is hiragana or katakana, it is romanized by jautils.
    Args:
        word: should be script varianted
    Returns:
        script varianted word
    """
    if not word:
        return word

    if re.match(ur'([\u3400-\u9fff])', word):
        word = romanize_japanese_name_by_name_dict(word)
        word = romanize_japanese_location(word)

    if jautils.should_normalize(word):
        hiragana_word = jautils.normalize(word)
        return jautils.hiragana_to_romaji(hiragana_word)
    romanized_word = unidecode(word)
    return romanized_word.strip()


def romanize_text(query_txt):
    """
    Romanizes a query one word at a time.

    The text is split on single spaces, each piece is passed to
    romanize_word, and the results are joined back with single spaces.

    Args:
        query_txt: Search query
    Returns:
        the query with every space-separated word replaced by
        romanize_word's result for it
    """
    romanized = []
    # Split on ' ' explicitly (not on arbitrary whitespace) so consecutive
    # spaces produce empty pieces that are still passed to romanize_word.
    for token in query_txt.split(' '):
        romanized.append(romanize_word(token))
    return ' '.join(romanized)