Beispiel #1
0
    def test_canonical_forms(self):
        # Surface reading under test: the first segment ends in a small tsu
        # and the second segment starts with a voiced kana.
        surface = (u'ゆっ', u'ぐり')

        # Every form expected for each segment, including the surface form
        # itself (the trailing u'っ' in the suffix string yields u'ゆっ').
        first_options = [u'ゆ' + suffix for suffix in u'いちりきつくっ']
        second_options = [u'くり', u'ぐり']
        wanted = set(combinations(first_options, second_options))

        produced = set(alternations.canonical_forms(surface))
        self.assertEqual(produced, wanted)
Beispiel #2
0
def canonical_forms(kana_segments):
    """
    Enumerate the candidate canonical (underlying) forms for a sequence of
    reading segments, i.e. the forms before sequential voicing and sound
    euphony were applied.

    For each segment the candidates are built in this order:

      1. the segment itself;
      2. if the segment is not the last one, is longer than one character
         and ends in a small tsu, the segment with that final character
         replaced by each of the kana in u'いちりきつく';
      3. if the segment is not the first one and its first kana is voiced
         according to the kana table, every candidate collected so far
         with its first kana mapped through from_voiced.

    @param kana_segments: Reading segments in their surface form.
    @return: The result of combinations() applied to the per-segment
        candidate lists.
    """
    table = kana_table.KanaTable.get_cached()
    last_index = len(kana_segments) - 1

    per_segment = []
    for index, segment in enumerate(kana_segments):
        forms = [segment]

        has_following_segment = index < last_index
        if (has_following_segment and len(segment) > 1
                and segment.endswith(u'っ')):
            # Undo sound euphony: swap the final small tsu for each kana
            # it may have replaced.
            stem = segment[:-1]
            for restored in u'いちりきつく':
                forms.append(stem + restored)

        if index > 0 and table.is_voiced(segment[0]):
            # Undo sequential voicing on every form gathered so far. The
            # devoiced list is built in full before being appended, so
            # the new entries are not themselves revisited.
            devoiced = [from_voiced[form[0]] + form[1:] for form in forms]
            forms += devoiced

        per_segment.append(forms)

    return combinations(*per_segment)
Beispiel #3
0
def surface_forms(reading_segments):
    """
    The counterpart of canonical_forms(). Takes a correct reading, and
    determines how it could be erroneously modified into various surface
    forms.

    The first segment contributes its onbin_variants(); each later segment
    contributes its rendaku_variants().

    @param reading_segments: Segments of the correct reading; must contain
        at least one segment.
    @return: The result of combinations() applied to the per-segment
        variant collections.
    """
    # Variants for the leading segment are computed first, then the
    # remaining segments in order.
    per_segment = [onbin_variants(reading_segments[0])]
    per_segment += [
        rendaku_variants(segment) for segment in reading_segments[1:]
    ]

    return combinations(*per_segment)