Example #1
0
def _phrase_to_words(phrase, phrase_info):
    """Render each word of one phrase as a "features,yomi,accents" string."""
    rendered = []
    for word in get_phrase(phrase, phrase_info)['word']:
        accent = [syllable['accent'] for syllable in word['syllable']]
        features = word['info'].split(',')
        # The last comma-separated field of the word info is used as the reading.
        yomi = "_".join(get_yomi(features[-1]))
        if yomi == "*":
            # No usable reading in the feature field: derive one from the surface.
            yomi = "_".join(get_yomi(jctconv.hira2kata(word['sur'])))
        rendered.append("%s,%s,%s" % (','.join(features[:-1]), yomi, '_'.join(accent)))
    return rendered


def parse(text):
    """Parse ``text`` and return one formatted string per word.

    The output of ``cabocha_tagger.parseToString`` is read line by line.
    Blank lines and ``EOS`` are skipped, a line whose first space-separated
    token is ``*`` starts a new phrase (its remaining tokens become the
    phrase info), and every other line is collected as a morpheme of the
    current phrase.

    Args:
        text: The text handed to ``cabocha_tagger.parseToString``.

    Returns:
        A list of strings of the form ``"<features>,<yomi>,<accents>"``,
        where ``<features>`` is the word info without its last field, and
        ``<yomi>`` / ``<accents>`` are joined with ``_``.
        The list is empty when the parser output contains no morphemes.
    """
    phrase = []
    # Initialised up front so morphemes that precede any "*" header (or an
    # input with no header at all) do not hit an unbound local.
    phrase_info = []
    words = []
    for morph in cabocha_tagger.parseToString(text).split("\n"):
        stripped = morph.strip()
        if stripped == "" or stripped == "EOS":
            continue
        if stripped.split(" ")[0] == "*":
            # A new phrase header: flush the phrase collected so far.
            if phrase:
                words.extend(_phrase_to_words(phrase, phrase_info))
                phrase = []
            phrase_info = morph.split(" ")[1:]
        else:
            phrase.append(morph)
    # Flush the trailing phrase, if any morphemes were collected for it.
    if phrase:
        words.extend(_phrase_to_words(phrase, phrase_info))
    return words
Example #2
0
def add_verb(term, lemma, yomi):
    """Yield dictionary entries for the inflected forms of a godan verb.

    Two verb classes are handled, as decided by ``is_godan_waonbin`` and
    ``is_godan_magyou``; for any other lemma nothing is yielded.

    Args:
        term: Surface form of the verb; its last character is replaced by
            each suffix in the waonbin case.
        lemma: Base form; selects the verb class and, in the ma-row case,
            supplies the stem.
        yomi: Reading of the verb; its last character is replaced by the
            katakana form of each suffix.

    Yields:
        Whatever ``write_dic(pos, term, lemma, yomi)`` returns for each
        generated form.
    """
    if is_godan_waonbin(lemma):
        suffixes = GODAN_SUFFIXES[0] if lemma.endswith('う') else GODAN_SUFFIXES[1]
        # Compute the stems once. Re-slicing the previously generated form on
        # every iteration only works when every suffix is exactly one
        # character long.
        term_stem = term[:-1]
        yomi_stem = yomi[:-1]
        for (pos, suffix) in zip(GODAN_WAONBIN, suffixes):
            new_term = term_stem + suffix
            new_yomi = yomi_stem + jctconv.hira2kata(suffix)
            yield write_dic(pos, new_term, lemma, new_yomi)
    elif is_godan_magyou(lemma):
        # In this branch the stem is taken from the lemma, not from ``term``.
        term_stem = lemma[:-1]
        yomi_stem = yomi[:-1]
        for (suffix, pos) in GODAN_MA:
            new_term = term_stem + suffix
            new_yomi = yomi_stem + jctconv.hira2kata(suffix)
            yield write_dic(pos, new_term, lemma, new_yomi)
Example #3
0
    def translate(self, text):
        """Translate ``text`` node by node and return the joined result.

        The text is tokenised with ``self._tagger``. Nodes whose ``stat`` is
        2 or greater are skipped (presumably BOS/EOS markers -- confirm
        against the MeCab documentation). A run of consecutive nodes whose
        second feature is ``数`` is concatenated, converted with
        ``self._ja2int`` and rendered with ``self._translate_int``; every
        other node is rendered with ``self._translate_node``.
        """
        node = self._tagger.parseToNode(encodeMeCab(text))
        pieces = []
        while node:
            if node.stat >= 2:
                node = node.next
                continue

            surface = decodeMeCab(node.surface)
            features = decodeMeCab(node.feature).split(',')
            # For stat == 0 the reading comes from feature index 7;
            # otherwise the surface itself is used as the reading.
            yomi = jctconv.hira2kata(features[7] if node.stat == 0 else surface)

            if features[1] != u'数':
                pieces.append(self._translate_node(surface, yomi, features))
                node = node.next
                continue

            # Collect the whole run of numeral nodes, starting with this one.
            # NOTE(review): relies on a non-numeral node following the run
            # before the node chain ends -- confirm.
            digits = []
            while True:
                surface = decodeMeCab(node.surface)
                features = decodeMeCab(node.feature).split(',')
                if features[1] != u'数':
                    break
                digits.append(surface)
                node = node.next
            value = self._ja2int(u''.join(digits))
            pieces.append(self._translate_int(value))

        # Convert long-vowel marks and geminate consonants (sokuon).
        joined = u''.join(pieces)
        joined = self.re_long.sub(u'\\1\\1', joined)
        return self.re_ltu.sub(u'\\1\\1', joined)
Example #4
0
    def hira2kana(strings):
        """
        Convert full-width hiragana to full-width katakana.

        All other characters are left as they are.

        http://d.hatena.ne.jp/mohayonao/20101213/1292237816
        """
        as_unicode = MultiBytes.convert2unicode(strings)
        converted = jctconv.hira2kata(as_unicode)
        return converted
Example #5
0
def get_yomi(word):
    """Return a katakana reading for ``word``, or ``'*'`` if none is found.

    Symbols matched by ``re_symbol`` are removed first. The direct
    hiragana-to-katakana conversion is tried, then ``mecab.to_yomi``; a
    candidate is accepted only if ``re_katakana`` matches it.
    """
    cleaned = re_symbol.sub('', word)

    direct = jctconv.hira2kata(cleaned)
    if not re_katakana.match(direct):
        # The plain conversion was not accepted: ask MeCab for a reading.
        via_mecab = ''.join(mecab.to_yomi(cleaned))
        if not re_katakana.match(via_mecab):
            return '*'
        return via_mecab
    return direct
Example #6
0
def get_kanji_part_yomi(surface, reading):
    """Get yomi of kanji part of the string. Assume only one kanji part.

    The first non-kanji span of ``surface`` (as located by
    ``non_kanji_pos``) is converted to katakana and removed from
    ``reading``; if a second non-kanji span follows, it is removed as well.

    Args:
        surface: Surface string.
        reading: Reading of ``surface``; the katakana form of the
            non-kanji spans is removed from it.

    Returns:
        The remaining reading converted with ``jctconv.kata2hira``, or
        ``None`` when ``non_kanji_pos(surface)`` returns ``None``.
    """
    nkidx = non_kanji_pos(surface)
    if nkidx is None:
        return None

    non_kanji = surface[nkidx[0]:nkidx[1]]
    # Literal removal: the span comes from arbitrary surface text, so it must
    # not be interpreted as a regular expression.
    yomi = reading.replace(jctconv.hira2kata(non_kanji), '')

    # case of two separated hiragana part
    surface2 = surface[nkidx[1]:]
    nkidx2 = non_kanji_pos(surface2)
    if nkidx2 is not None:
        non_kanji2 = surface2[nkidx2[0]:nkidx2[1]]
        yomi = yomi.replace(jctconv.hira2kata(non_kanji2), '')

    return jctconv.kata2hira(yomi)
def expand_string(string):
	"""
	input: a string
	output: list of expanded strings
	For now the only expansion is hiragana to katakana.
	"""
	# hiragana to katakana
	katakana = jctconv.hira2kata(string)
	return [katakana]
Example #8
0
def add_shii(term, lemma, yomi):
    """Yield dictionary entries for every suffix listed in ``SHII``.

    ``term`` is first normalised to end in ``し`` and ``yomi`` to end in
    ``シ``: a trailing ``しい`` / ``シイ`` loses its last character, and a
    value without the ending gets it appended. Each ``(suffix, pos)`` pair
    of ``SHII`` is then attached and passed to ``write_dic``.
    """
    if term.endswith('しい'):
        term_stem = term[:-1]
    elif term.endswith('し'):
        term_stem = term
    else:
        term_stem = term + 'し'

    if yomi.endswith('シイ'):
        yomi_stem = yomi[:-1]
    elif yomi.endswith('シ'):
        yomi_stem = yomi
    else:
        yomi_stem = yomi + 'シ'

    for (suffix, pos) in SHII:
        yield write_dic(pos,
                        term_stem + suffix,
                        lemma,
                        yomi_stem + jctconv.hira2kata(suffix))
Example #9
0
def convert(mozc_map, mozc_dir, output_dir):
    """Convert the Mozc OSS dictionary files into ``mozc.csv``.

    Every ``src/data/dictionary_oss/dictionary*.txt`` file below
    ``mozc_dir`` is read as tab-separated UTF-8 text with the columns
    ``yomi, lid, rid, cost, surface``. Rows whose ``lid`` is not a key of
    ``mozc_map`` are dropped; the others are written as one CSV line each.

    Args:
        mozc_map: Mapping from ``lid`` to a ``(new_id, pos)`` pair.
        mozc_dir: Root directory of the Mozc source tree.
        output_dir: Directory in which ``mozc.csv`` is created.
    """
    # Local import: io.open offers the same encoding-aware text API on
    # Python 2 and Python 3, so no manual decode()/encode() is needed.
    import io

    out_path = os.path.join(output_dir, "mozc.csv")
    pattern = os.path.join(mozc_dir, "src/data/dictionary_oss/dictionary*.txt")
    with io.open(out_path, "w", encoding="utf8", errors="replace") as out_fd:
        for f in glob.glob(pattern):
            with io.open(f, encoding="utf8") as in_fd:
                for l in in_fd:
                    fields = l.strip().split("\t")
                    if len(fields) < 5:
                        # Blank or truncated line: nothing to convert.
                        continue
                    (yomi, lid, rid, cost, surface) = fields[:5]
                    if lid not in mozc_map:
                        continue
                    (new_id, pos) = mozc_map[lid]
                    yomi = jctconv.hira2kata(yomi)
                    # new_id fills both id columns and the cost column is a
                    # constant "0"; the source rid and cost are not carried over.
                    line = ",".join([surface, new_id, new_id, "0", pos, surface, yomi, yomi])
                    line += "\n"
                    out_fd.write(line)
Example #10
0
def parseline(line):
	"""Build a dictionary CSV row from one ``/``-separated line.

	The first field is the reading; each following field is a candidate of
	the form ``word;annotation``. The first candidate whose annotation
	contains one of 名詞, 組織名, 地名 or 人名 produces the returned row.
	If no candidate qualifies, ``None`` is returned.
	"""
	fields = line.split('/')
	yomi = fields[0].strip()
	for candidate in fields[1:]:
		if candidate == '':
			continue
		parts = candidate.split(';')
		if len(parts) <= 1:
			continue
		if not re.search('(名詞|組織名|地名|人名)', parts[1]):
			continue

		# All matching categories (including person and place names) are
		# emitted with the same part of speech.
		hinshi = '名詞'
		keyword = parts[0].replace(',', '')
		kana = jctconv.hira2kata(yomi)
		return ('%s,0,0,%i,%s,%s,*,*,*,*,%s,%s,%s' % (keyword, __cost, hinshi, '一般', yomi, kana, kana))
	return None
Example #11
0
def convert(mozc_map, mozc_dir, output_dir):
    """Convert the Mozc OSS dictionary files into ``mozc.csv``.

    Every ``src/data/dictionary_oss/dictionary*.txt`` file below
    ``mozc_dir`` is read as tab-separated UTF-8 text with the columns
    ``yomi, lid, rid, cost, surface``. Rows whose ``lid`` is not a key of
    ``mozc_map`` are dropped; the others are written as one CSV line each.

    Args:
        mozc_map: Mapping from ``lid`` to a ``(new_id, pos)`` pair.
        mozc_dir: Root directory of the Mozc source tree.
        output_dir: Directory in which ``mozc.csv`` is created.
    """
    # Local import: io.open offers the same encoding-aware text API on
    # Python 2 and Python 3, so no manual decode()/encode() is needed.
    import io

    out_path = os.path.join(output_dir, 'mozc.csv')
    pattern = os.path.join(mozc_dir,
                           'src/data/dictionary_oss/dictionary*.txt')
    with io.open(out_path, 'w', encoding='utf8', errors='replace') as out_fd:
        for f in glob.glob(pattern):
            with io.open(f, encoding='utf8') as in_fd:
                for l in in_fd:
                    fields = l.strip().split('\t')
                    if len(fields) < 5:
                        # Blank or truncated line: nothing to convert.
                        continue
                    (yomi, lid, rid, cost, surface) = fields[:5]
                    if lid not in mozc_map:
                        continue
                    (new_id, pos) = mozc_map[lid]
                    yomi = jctconv.hira2kata(yomi)
                    # new_id fills both id columns and the cost column is a
                    # constant '0'; the source rid and cost are not carried over.
                    line = ','.join([
                        surface, new_id, new_id, '0', pos, surface, yomi, yomi
                    ])
                    line += '\n'
                    out_fd.write(line)
Example #12
0
def split_at_hiragana(surface, reading):
    """Split at hiragana.

    When ``surface`` contains two kanji parts separated by a non-kanji run,
    both ``surface`` and ``reading`` are cut right after that run.

    Args:
        surface: Surface string.
        reading: Reading of ``surface``; it is searched for the katakana
            form of the separating run.

    Returns:
        ``[surface, reading]`` unchanged when there is nothing to split
        (no kanji, a single kanji part, a ``々``/``ヶ`` separator, or the
        separator cannot be found in ``reading``); otherwise
        ``[head_surface, head_reading, tail_surface, tail_reading]``.
    """
    kidx = kanji_pos(surface)
    if kidx is None:  # only non-kanji
        return [surface, reading]

    rest = surface[kidx[1]:]
    kidx2 = kanji_pos(rest)

    if kidx2 is None:  # only one kanji + hiragana
        return [surface, reading]

    hira = rest[:kidx2[0]]  # multiple kanji
    if hira in ["々", "ヶ"]:  # since this character is somehow defined as non-kanji
        return [surface, reading]

    # The separator is arbitrary surface text, so escape it before embedding
    # it in the pattern.
    pattern = r'(.*?' + re.escape(jctconv.hira2kata(hira)) + ')(.*)'
    m = re.search(pattern, reading)
    if m is None:
        # The reading does not contain the separator: leave the pair unsplit.
        return [surface, reading]

    # Cut after the whole separator run, not after a single character, so the
    # two surface parts never overlap when the run is longer than one character.
    head_end = kidx[1] + len(hira)
    return [surface[:head_end], m.group(1), surface[head_end:], m.group(2)]
Example #13
0
def get_phrase(phrase, phrase_info):
    """Build the nested phrase / word / syllable structure for one phrase.

    Args:
        phrase: Morpheme lines of the form ``"surface\\tfeatures"``.
        phrase_info: Tokens describing the phrase; stored space-joined.

    Returns:
        A dict with the keys ``"sur"``, ``"info"`` and ``"word"``. Each word
        is a dict with ``"sur"``, ``"kana"``, ``"info"`` and ``"syllable"``,
        and each syllable a dict with ``"sur"``, ``"roma"`` and ``"accent"``.
    """
    surfaces = [morph.split("\t")[0] for morph in phrase]
    phrase_lyrics = {"sur": " ".join(surfaces),
                     "info": " ".join(phrase_info),
                     "word": []}
    accent = get_accent("".join(surfaces))
    mora = 0
    acc_idx = 0
    for word in phrase:
        sur = word.split("\t")[0]
        if en_p.match(sur):
            # Surfaces matched by en_p are replaced by parse_eng_word's output.
            word = parse_eng_word(sur)
        last_feature = word.split("\t")[1].split(",")[-1]
        kana = "".join(get_yomi(last_feature))
        if kana == "*":
            kana = jctconv.hira2kata(sur)
        mora += get_mora(kana)
        # ッ -> q, drop ー, ン -> N before splitting into syllables.
        kana = kana.replace("ッ", "q").replace("ー", "").replace("ン", "N")

        syllables = []
        for w_a in split_alpha(kana):
            syllable_lyrics = {"sur": w_a[0], "roma": w_a[1]}
            if w_a[1] == "*":
                syllable_lyrics["accent"] = "*"
            else:
                # Use the last accent value when the accent list is no longer
                # than the mora count so far, or the index has run off its end.
                if len(accent) <= mora or acc_idx + 1 > len(accent):
                    syllable_lyrics["accent"] = accent[-1]
                else:
                    syllable_lyrics["accent"] = accent[acc_idx]
                acc_idx += 1
            syllables.append(syllable_lyrics)

        phrase_lyrics["word"].append(
            {"sur": sur, "kana": kana, "info": word, "syllable": syllables})
    return phrase_lyrics
Example #14
0
def test_hira2kata():
    """Check hira2kata on a plain word, with ``ignore``, and on the full table."""
    plain = jctconv.hira2kata('ともえまみ')
    assert_equal(plain, 'トモエマミ')

    # Characters listed in ``ignore`` must be left unconverted.
    partially_ignored = jctconv.hira2kata('まどまぎ', ignore='ど')
    assert_equal(partially_ignored, 'マどマギ')

    _compare(jctconv.hira2kata, HIRAGANA, FULL_KANA)
Example #15
0
def test_hira2kata():
    """Check hira2kata on a sample word and on the whole hiragana table."""
    sample = jctconv.hira2kata(u'ともえまみ')
    assert_equal(sample, u'トモエマミ')

    whole_table = jctconv.hira2kata(HIRAGANA)
    assert_equal(whole_table, FULL_KANA)
Example #16
0
 def normalize_yomi(self, yomi):
     """Return ``yomi`` in katakana, with ``ウ゛`` merged to ``ヴ`` and spaces removed."""
     katakana = jctconv.hira2kata(yomi)
     merged = katakana.replace('ウ゛', 'ヴ')
     return merged.replace(' ', '')
Example #17
0
 def normalize(s):
     """Normalise ``s``: katakana, no ``・``, no symbols, ``re_tyouon`` matches as ``ー``."""
     katakana = jctconv.hira2kata(s)
     without_dots = katakana.replace('・', '')
     without_symbols = re_symbol.sub('', without_dots)
     return re_tyouon.sub('ー', without_symbols)