def prob(self, grapheme, reading, alt_reading):
    """
    Estimate the probability of `alt_reading` for `grapheme` by
    interpolating two frequency estimates:

        alpha * raw_freq(alt_reading | grapheme)
        + (1 - alpha) * normalized_freq(reading | grapheme)
                      * alternation_freq(alt_reading | reading)

    where alpha is read from settings.ALTERNATION_ALPHA.

    Returns 1.0 outright when the grapheme and the alternate reading are
    the same string once both are converted to hiragana.
    """
    same_kana = (
        scripts.to_hiragana(grapheme) == scripts.to_hiragana(alt_reading)
    )
    if same_kana:
        # Phonetic segment: the grapheme already spells out the reading.
        return 1.0

    # Anything beyond this point must be a segment made purely of kanji.
    kanji_only = {scripts.Script.Kanji}
    assert scripts.script_types(grapheme) == kanji_only

    alpha = settings.ALTERNATION_ALPHA
    assert 0 <= alpha <= 1

    # A grapheme missing from the raw distribution contributes nothing.
    try:
        raw_p = self.raw_freq_dist[grapheme].freq(alt_reading)
    except KeyError:
        raw_p = 0.0

    normalized_p = self.normalized_freq_dist[grapheme].freq(reading)
    alternation_p = self.alternation_dist[reading].freq(alt_reading)

    # Keep the left-to-right multiplication order so the floating point
    # result is unchanged.
    return alpha*raw_p + (1-alpha)*normalized_p*alternation_p
def test_fetch_scripts(self):
    """
    Check that the hiragana and katakana character sets can be fetched,
    and that converting one set to the other gives the matching set.
    """
    expected_hiragana = 'ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ'  # nopep8
    expected_katakana = 'ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ'  # nopep8

    # Fetching each script by type.
    fetched_hiragana = scripts.get_script(Script.Hiragana)
    self.assertEqual(fetched_hiragana, expected_hiragana)

    fetched_katakana = scripts.get_script(Script.Katakana)
    self.assertEqual(fetched_katakana, expected_katakana)

    # Converting in both directions.
    as_hiragana = scripts.to_hiragana(expected_katakana)
    self.assertEqual(as_hiragana, expected_hiragana)

    as_katakana = scripts.to_katakana(expected_hiragana)
    self.assertEqual(as_katakana, expected_katakana)
def __init__(self):
    """
    Populate this conditional frequency distribution from the aligned
    edict file.

    Each line of the file is parsed into an alignment of
    (grapheme, phoneme) pairs. For every pair whose grapheme contains
    kanji, the count of the phoneme (converted to hiragana) is
    incremented under that grapheme.
    """
    ConditionalFreqDist.__init__(self)

    kanji_script = scripts.Script.Kanji
    i_stream = sopen(_edict_aligned_file, 'r')
    try:
        for line in i_stream:
            alignment = Alignment.from_line(line)
            for (g, p) in alignment:
                if scripts.contains_script(kanji_script, g):
                    self[g].inc(scripts.to_hiragana(p))
    finally:
        # Release the file handle even when parsing or counting raises;
        # previously the stream was only closed on the success path.
        i_stream.close()
def expand_long_vowels(kana_string):
    """
    Replace each long-vowel match in a kana string with the vowel
    looked up from the character that precedes it.

    The string is split at script boundaries. Hiragana and katakana
    segments are expanded (katakana is converted to hiragana for the
    expansion and converted back afterwards); segments of any other
    script are copied through unchanged.

        >>> a = expand_long_vowels(u'すー')
        >>> b = u'すう'
        >>> a == b
        True
    """
    # How to map an expanded hiragana segment back to its source script.
    restore_by_script = {
        scripts.Script.Hiragana: lambda x: x,
        scripts.Script.Katakana: scripts.to_katakana,
    }
    table = kana_table.KanaTable.get_cached()

    pieces = []
    for segment in scripts.script_boundaries(kana_string):
        if not len(segment):
            continue

        script = scripts.script_type(segment)
        if script not in restore_by_script:
            # Not kana: pass through untouched.
            pieces.append(segment)
            continue

        restore = restore_by_script[script]
        expanded = scripts.to_hiragana(segment)

        # Matches are located on the segment as it was before any
        # substitution; each match position is then overwritten in the
        # working copy.
        for match in _long_finder.finditer(expanded):
            pos = match.start()
            vowel = table.to_vowel_line(expanded[pos - 1])
            expanded = expanded[:pos] + vowel + expanded[pos + 1:]

        pieces.append(restore(expanded))

    return ''.join(pieces)
def expand_long_vowels(kana_string):
    """
    Expand long vowels in a kana string wherever a match is found.

    Works segment by segment along script boundaries: kana segments have
    each long-vowel match substituted with the vowel derived from the
    preceding character, while non-kana segments are kept verbatim.
    Katakana segments are processed as hiragana and converted back.

        >>> a = expand_long_vowels(u'すー')
        >>> b = u'すう'
        >>> a == b
        True
    """
    def _identity(text):
        return text

    # Per-script function that undoes the conversion to hiragana.
    undo_conversion = {
        scripts.Script.Hiragana: _identity,
        scripts.Script.Katakana: scripts.to_katakana,
    }
    table = kana_table.KanaTable.get_cached()

    result = ''
    for chunk in scripts.script_boundaries(kana_string):
        if len(chunk) == 0:
            continue

        chunk_script = scripts.script_type(chunk)
        if chunk_script not in undo_conversion:
            # Neither hiragana nor katakana, so there is nothing to expand.
            result += chunk
            continue

        undo = undo_conversion[chunk_script]
        working = scripts.to_hiragana(chunk)

        # Match positions come from the pre-substitution text.
        for hit in _long_finder.finditer(working):
            at = hit.start()
            replacement = table.to_vowel_line(working[at-1])
            working = working[:at] + replacement + working[at+1:]

        result += undo(working)

    return result
def _clean_readings(reading_list):
    """
    Return the set of distinct readings after cleaning each entry: keep
    only the text before the first '.', then convert it to hiragana.
    """
    cleaned = set()
    for raw_reading in reading_list:
        # partition() yields the same leading part as split('.')[0].
        stem = raw_reading.partition('.')[0]
        cleaned.add(scripts.to_hiragana(stem))
    return cleaned
def _clean_readings(reading_list):
    """
    Build the set of cleaned readings: each reading is cut at its first
    '.' and the leading part is converted to hiragana. Duplicates
    collapse because the result is a set.
    """
    # Lazily strip everything from the first '.' onwards.
    stems = (reading.split('.')[0] for reading in reading_list)
    return set(map(scripts.to_hiragana, stems))