Exemple #1
0
def _resegment_chars_jyutping_data(json_filename):
    """Re-segment multi-character entries of a chars-to-jyutping JSON file.

    Loads ``json_filename`` (a {characters: jyutping} mapping located in
    this module's directory), splits each entry found in the
    re-segmentation word list into its smaller words while distributing
    the jyutping syllables across them, and overwrites the JSON file in
    place with the updated mapping.

    Parameters
    ----------
    json_filename : str
        Filename of the JSON file, relative to this module's directory.
    """
    json_path = os.path.join(_THIS_DIR, json_filename)

    with open(json_path, encoding="utf8") as f:
        chars_to_jyutping = json.load(f)

    # Build {unsegmented_chars: space-delimited segmentation} from the
    # re-segmentation file, skipping blank lines, comment lines, and
    # lines without spaces (nothing to re-segment in those).
    with open(os.path.join(_THIS_DIR, _RESEGMENTED_FILENAME),
              encoding="utf8") as f:
        resegmented = {}
        for line in f:
            line = line.strip()
            if not line or line.startswith("# ") or " " not in line:
                continue
            resegmented[line.replace(" ", "")] = line

    new_chars_to_jyutping = {}

    for chars, jp in chars_to_jyutping.items():
        if chars in resegmented:
            chars_split = split_characters_with_alphanum(chars)
            jp_split = parse_jyutping(jp)

            # Don't bother if we can't match each jyutping syllable
            # with each Cantonese character.
            if len(chars_split) != len(jp_split):
                new_chars_to_jyutping[chars] = jp

            else:
                new_words = resegmented[chars].split()
                # `i` indexes into jp_split, advancing by the number of
                # characters consumed from each new word so that every
                # word picks up exactly its own syllables.
                i = 0
                for new_word in new_words:

                    # If this new word already exists in the original
                    # mapping, don't re-add it to the new map, or else
                    # we risk altering this word's jyutping representation
                    # (some Cantonese words/characters have multiple
                    # pronunciations, and we've already chosen the more
                    # frequent one according to the rime-cantonese source).
                    if new_word in chars_to_jyutping:
                        i += len(split_characters_with_alphanum(new_word))
                        continue

                    # Concatenate one parsed syllable per character of
                    # this new word.
                    new_jp_for_word = ""
                    for _ in range(
                            len(split_characters_with_alphanum(new_word))):
                        new_jp_for_word += "".join(jp_split[i])
                        i += 1
                    new_chars_to_jyutping[new_word] = new_jp_for_word
        else:
            # Not in the re-segmentation list: keep the entry unchanged.
            new_chars_to_jyutping[chars] = jp

    with open(json_path, "w", encoding="utf8") as f:
        json.dump(new_chars_to_jyutping, f, indent=4, ensure_ascii=False)
Exemple #2
0
    def __init__(
        self,
        *,
        max_word_length=_MAX_WORD_LENGTH,
        allow=None,
        disallow=None,
    ):
        """Initialize a Segmenter object.

        Parameters
        ----------
        max_word_length : int, optional
            Maximum word length this model allows.
        allow : iterable[str], optional
            Words to allow in word segmentation.
        disallow : iterable[str], optional
            Words to disallow in word segmentation.
        """
        super(Segmenter, self).__init__(max_word_length=max_word_length)

        # Train with HKCanCor data.
        self.fit(hkcancor().sents())

        # Train with rime-cantonese data.
        self._words |= CHARS_TO_JYUTPING.keys()
        self._words |= LETTERED.keys()

        # Adjust with the allowed and disallowed words. Wrap the user
        # input in set() so that any iterable (not only a set) works, as
        # the docstring promises -- the in-place set operators |= and -=
        # raise TypeError for non-set operands such as lists.
        self._words |= set(allow or ())
        self._words -= set(disallow or ())

        # Turn everything from strings to tuples due to alphanumeric chars.
        self._words = {split_characters_with_alphanum(x) for x in self._words}
Exemple #3
0
def _get_words_characters_to_jyutping():
    """Collect word- and character-level jyutping mappings.

    Counts jyutping pronunciations observed in HKCanCor, picks the most
    frequent one per word and per character, then layers the
    rime-cantonese data underneath as fallbacks.
    """
    word_counters = defaultdict(Counter)
    char_counters = defaultdict(Counter)

    for word, _, jyutping, _ in hkcancor().tagged_words():
        if not jyutping or not word:
            continue
        try:
            parsed = parse_jyutping(jyutping)
        except ValueError:
            continue
        # Only keep entries where syllables align one-to-one with chars.
        if len(word) != len(parsed):
            continue
        word_counters[word][jyutping] += 1
        for char, syllable in zip(word, parsed):
            char_counters[char]["".join(syllable)] += 1

    # Keep only the most frequently observed pronunciation for each key.
    words_to_jyutping = {
        word: counter.most_common(1)[0][0]
        for word, counter in word_counters.items()
    }
    characters_to_jyutping = {
        char: counter.most_common(1)[0][0]
        for char, counter in char_counters.items()
    }

    # The ordering of the merged dicts matters. The rime-cantonese data
    # may contain what's been re-segmented by this repo, and may contain
    # jyutping pronunciations for particular characters that are only
    # used in those contexts. The data from HKCanCor comes last to act
    # as the default and override such cases.
    multi_char_lettered = {
        k: v
        for k, v in LETTERED.items()
        if len(split_characters_with_alphanum(k)) > 1
    }
    multi_char_rime = {k: v for k, v in CHARS_TO_JYUTPING.items() if len(k) > 1}
    words_to_jyutping = {
        **multi_char_lettered,
        **multi_char_rime,
        **words_to_jyutping,
    }

    # TODO: Extract characters from CHARS_TO_JYUTPING and LETTERED
    #    and add them to characters_to_jyutping
    single_char_lettered = {k: v for k, v in LETTERED.items() if len(k) == 1}
    single_char_rime = {k: v for k, v in CHARS_TO_JYUTPING.items() if len(k) == 1}
    characters_to_jyutping = {
        **single_char_lettered,
        **single_char_rime,
        **characters_to_jyutping,
    }

    return words_to_jyutping, characters_to_jyutping
Exemple #4
0
def test_split_characters_with_english(chars, expected):
    """Splitting characters mixed with alphanumeric runs matches expectation."""
    actual = split_characters_with_alphanum(chars)
    assert actual == expected
Exemple #5
0
 def _predict_sent(self, sent_str):
     """Segment one sentence string into a list of word strings."""
     char_tuples = split_characters_with_alphanum(sent_str)
     predicted = super(Segmenter, self)._predict_sent(char_tuples)
     # The parent works on tuples (due to alphanumeric chars); collapse
     # each predicted tuple back into a plain word string.
     return ["".join(word) for word in predicted]