コード例 #1
0
def parse_same_word_file(filename, words):
    """Parse the "same word, different meaning" CSV file into `words`.

    Each valid row yields one objects.Entry added to the `words` collection.
    """
    for row in read_csv(filename):
        # Skip the header row and any malformed rows.
        if len(row) != 2 or row[0] == "詞彙":
            continue

        traditional = row[0]
        simplified = HanziConv.toSimplified(traditional)

        syllables = lazy_pinyin(
            traditional,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        pinyin = " ".join(syllables).lower().strip().replace("v", "u:")

        jyutping = pinyin_jyutping_sentence.jyutping(
            traditional, tone_numbers=True, spaces=True)
        frequency = zipf_frequency(traditional, "zh")

        # Zero-width spaces between segmented words allow reverse-searching
        # the definition text.
        definitions = [
            objects.DefinitionTuple("​".join(jieba.cut(row[1])),
                                    "臺陸用法和差異", [])
        ]

        words.add(
            objects.Entry(traditional, simplified, pinyin, jyutping,
                          freq=frequency, defs=definitions))
コード例 #2
0
ファイル: geo-transcript-srv.py プロジェクト: giggls/osml10n
def cantonese_transcript(inpstr):
    """Return `inpstr` with CJK segments replaced by jyutping transcriptions.

    Non-CJK segments are passed through unchanged.  Returns None if any CJK
    segment cannot be transcribed (an error message is written to stderr).
    """
    segments = split_by_alphabet(inpstr)

    parts = []
    for segment in segments:
        # Unicode character names of CJK ideographs all start with "CJK".
        if unicodedata.name(segment[0]).split(' ')[0] == 'CJK':
            try:
                transcript = pinyin_jyutping_sentence.jyutping(segment,
                                                               spaces=True)
            except Exception:
                # Was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit; Exception is broad enough
                # for a best-effort library call.
                sys.stderr.write(
                    "pinyin_jyutping_sentence error transcribing >%s<\n" %
                    segment)
                return None
            parts.append(transcript)
        else:
            parts.append(segment)
    # join() avoids quadratic repeated string concatenation.
    return ''.join(parts)
コード例 #3
0
def parse_file(filename, words):
    """Parse a MoEDict-style JSON dictionary file.

    For every heteronym of every item in the JSON, builds an objects.Entry
    (traditional, simplified, pinyin, jyutping, frequency, definitions with
    examples/quotes) and appends it to `words`.

    filename -- path to the JSON file to parse
    words -- list to which parsed objects.Entry instances are appended
    """
    with open(filename) as f:
        data = json.load(f)

        # Counter used only for periodic progress output.
        items_parsed = 0

        # Each item in the JSON correspond to one or more entries in the dictionary
        # Most items map 1:1 to entries, e.g. "物質" is a single entry
        # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng)
        #
        # In the vocabulary of the MoEDict, each item may correspond to multiple heteronyms,
        # and each heteronym maps to a single entry.
        for item in data:
            # For now, ignore variant characters that aren't properly encoded in Unicode
            if re.match(EXCLUDE_VARIANT_REGEX_PATTERN, item["title"]):
                continue

            # These do not change no matter the heteronym
            trad = item["title"]
            simp = HanziConv.toSimplified(trad)
            jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                     tone_numbers=True,
                                                     spaces=True)
            freq = zipf_frequency(trad, "zh")

            # Build up a list of definitions for each heteronym
            defs = []

            # Distinguish between heteronyms by their pinyin – if the pinyin of the
            # current heteronym does not match the old pinyin, then a new heteronym
            # must be created
            last_heteronym_pin = ""

            # Go through each heteronym, creating Entry objects for each one
            for heteronym in item["heteronyms"]:
                if "pinyin" not in heteronym:
                    logging.debug(
                        f'Could not find pinyin for heteronym of word {trad} with definitions {heteronym["definitions"]}'
                    )
                    continue

                # Strip colloquial/literary/alternate pronunciation markers,
                # convert each syllable to tone-numbered form, then truncate
                # to at most one syllable per character of the word.
                pin = PINYIN_COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                    "", heteronym["pinyin"])
                pin = PINYIN_LITERARY_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
                pin = PINYIN_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
                pin = PINYIN_SECOND_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub(
                    "", pin)
                pin = pin.split()
                pin = [
                    pinyin_to_tone_numbers(syllable, trad).split()
                    for syllable in pin
                ]
                pin = list(itertools.chain.from_iterable(pin))
                pin = pin[:len(trad)]
                pin = " ".join(pin)
                if last_heteronym_pin != "" and pin != last_heteronym_pin:
                    # Different pinyin means that we are now processing a new heteronym.
                    # We must create an Entry object for the definitions of the old heteronym
                    # and add it to the list of entries before processing the new one.
                    entry = objects.Entry(trad,
                                          simp,
                                          last_heteronym_pin,
                                          jyut,
                                          freq=freq,
                                          defs=defs)
                    words.append(entry)

                    # Reset the definitions list
                    defs = []

                for definition in heteronym["definitions"]:
                    label = definition["type"] if "type" in definition else ""

                    # Insert zero-width spaces so that we can reverse-search the definition
                    def_tuple = objects.DefinitionTuple(
                        "​".join(jieba.cut(definition["def"])), label, [])

                    # Parse and add examples to this definition
                    if "example" in definition:
                        for example in definition["example"]:
                            if EXAMPLE_REGEX_PATTERN.match(example):
                                # Every example is surrounded by "如:<example>", so only keep the example
                                example = EXAMPLE_REGEX_PATTERN.match(
                                    example).group(1)
                                # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                                example_texts = (
                                    INDIVIDUAL_EXAMPLE_REGEX_PATTERN.findall(
                                        example))
                            else:
                                logging.warning(
                                    f"Found example that does not fit the normal example regex pattern: {trad}, {example}"
                                )
                                # Fall back to splitting on Chinese enumeration comma
                                example_texts = example.split("、")

                            for example_text in example_texts:
                                # Strip out weird whitespace
                                example_text = WHITESPACE_REGEX_PATTERN.sub(
                                    "", example_text)

                                # Joining and splitting separates series of full-width punctuation marks
                                # into separate items,  which is necessary so that lazy_pinyin() returns
                                # separate items for each full-width punctuation mark in the list it returns
                                #
                                # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                                # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                                # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                                # (Notice how "》:「"" is now split up into three different items)
                                example_pinyin = lazy_pinyin(
                                    " ".join(example_text).split(),
                                    style=Style.TONE3,
                                    neutral_tone_with_five=True,
                                )
                                example_pinyin = " ".join(
                                    example_pinyin).lower()
                                example_pinyin = example_pinyin.strip(
                                ).replace("v", "u:")

                                # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                                # given in the heteronym, attempt to replace pinyin corresponding to the
                                # characters in this heteronym with the pinyin provided by the JSON file.
                                #
                                # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                                # trad = "重", phrase_pinyin = "chong2"
                                # means that we should convert "zhong4 xin1" to "chong2 xin1"

                                # Strip out variant pronunciations for conversion purposes
                                phrase_pinyin = pin
                                phrase_pinyin = VARIANT_PRONUNCIATION_REGEX_PATTERN.sub(
                                    "",
                                    phrase_pinyin,
                                )
                                phrase_pinyin = (
                                    COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                                        "",
                                        phrase_pinyin,
                                    ))

                                # Do not try to match entries formatted like "那搭(Namibia)"
                                if not STRANGE_ENTRY_REGEX_PATTERN.match(trad):
                                    try:
                                        example_pinyin = change_pinyin_to_match_phrase(
                                            example_text,
                                            example_pinyin,
                                            trad,
                                            phrase_pinyin,
                                        )
                                    except Exception as e:
                                        logging.warning(
                                            f"Couldn't change pinyin in example for word {trad}: "
                                            f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                            f"{e}")
                                        traceback.print_exc()

                                # NOTE(review): examples are tagged "cmn" here
                                # but quotes below are tagged "zho" — confirm
                                # the distinction is intentional.
                                def_tuple.examples.append(
                                    objects.ExampleTuple(
                                        "cmn", example_pinyin, example_text))

                    # Parse and add quotes to this definition
                    if "quote" in definition:
                        for quote in definition["quote"]:
                            quote_text = re.sub(WHITESPACE_REGEX_PATTERN, "",
                                                quote)

                            quote_pinyin = lazy_pinyin(
                                " ".join(quote_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            quote_pinyin = " ".join(quote_pinyin).lower()
                            quote_pinyin = quote_pinyin.strip().replace(
                                "v", "u:")

                            # Same variant-pronunciation stripping as for examples above.
                            phrase_pinyin = pin
                            phrase_pinyin = re.sub(
                                VARIANT_PRONUNCIATION_REGEX_PATTERN, "",
                                phrase_pinyin)
                            phrase_pinyin = re.sub(
                                COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                "",
                                phrase_pinyin,
                            )

                            if not re.match(STRANGE_ENTRY_REGEX_PATTERN, trad):
                                try:
                                    quote_pinyin = change_pinyin_to_match_phrase(
                                        quote_text, quote_pinyin, trad,
                                        phrase_pinyin)
                                except Exception as e:
                                    logging.warning(
                                        f"Couldn't change pinyin in quote for word {trad}: "
                                        f"{''.join(quote_text)}, {quote_pinyin}, {pin} "
                                        f"{e}")
                                    traceback.print_exc()
                            def_tuple.examples.append(
                                objects.ExampleTuple("zho", quote_pinyin,
                                                     quote_text))

                    # We currently ignore synonyms, antonyms, and "see also" links, because they are
                    # linked to definitions and we have no way to display that data...

                    defs.append(def_tuple)

                last_heteronym_pin = pin

            # Create an entry for the final heteronym of this item.
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
            words.append(entry)

            # Progress output every 500 items.
            items_parsed += 1
            if not items_parsed % 500:
                print(f"Parsed entry #{items_parsed}")
コード例 #4
0
def parse_file(filename, words):
    """Parse a CSLD (Cross-Strait Language Database) JSON dictionary file.

    For every heteronym of every item, builds up to two objects.Entry
    instances — one for the Taiwan pronunciation and, if different, one for
    the Mainland pronunciation — and appends them to `words`.

    filename -- path to the JSON file to parse
    words -- list to which parsed objects.Entry instances are appended
    """
    with open(filename) as f:
        data = json.load(f)

        # Counter used only for periodic progress output.
        items_parsed = 0

        # Each item in the JSON correspond to one or more entries in the dictionary
        # Most items map 1:1 to entries, e.g. "物質" is a single entry
        # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng)
        #
        # In the vocabulary of the CSLD, each item may correspond to multiple heteronyms,
        # and each heteronym maps to a single entry.
        for item in data:
            # These do not change no matter the heteronym
            trad = item["title"]
            simp = HanziConv.toSimplified(trad)
            jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                     tone_numbers=True,
                                                     spaces=True)
            freq = zipf_frequency(trad, "zh")

            # Some items have multiple pronunciations (one for Taiwan, one for Mainland China)
            taiwan_pin = mainland_pin = ""

            # Build up a list of definitions for each heteronym
            taiwan_defs = []
            mainland_defs = []

            # Distinguish between heteronyms by their pinyin – if the pinyin of the
            # current heteronym does not match the old pinyin, then a new heteronym
            # must be created
            last_heteronym_pin = ""
            last_taiwan_pin = last_mainland_pin = ""

            # Go through each heteronym, creating Entry objects for each one
            for heteronym in item["heteronyms"]:
                if "pinyin" not in heteronym:
                    logging.debug(
                        f'Could not find pinyin for heteronym of word {trad} with definitions {heteronym["definitions"]}'
                    )
                    continue

                # Filter out known bad pinyin
                if (trad in KNOWN_INVALID_SYLLABLES and heteronym["pinyin"]
                        in KNOWN_INVALID_SYLLABLES[trad]):
                    pins = KNOWN_INVALID_SYLLABLES[trad][heteronym["pinyin"]]
                else:
                    pins = heteronym["pinyin"].split("<br>陸⃝")

                    # Some weird a's cause dragonmapper to break, so replace them with standard a's.
                    pins = list(map(lambda x: x.replace("ɑ", "a"), pins))

                    # Remove dashes in pinyin
                    pins = list(map(lambda x: x.replace("-", " "), pins))

                    # Remove commas in pinyin
                    pins = list(map(lambda x: x.replace(",", ""), pins))

                    # Remove weird characters
                    pins = list(map(lambda x: x.replace("陸⃟", ""), pins))

                    # Dragonmapper cannot handle some erhua
                    pins = list(
                        map(lambda x: x.replace("diǎr", "diǎn er"), pins))
                    pins = list(
                        map(lambda x: x.replace("biār", "biān er"), pins))

                    try:
                        # Converting from pinyin -> zhuyin inserts spaces between characters
                        # Converting from zhuyin -> pinyin conserves these spaces
                        pins = [
                            transcriptions.zhuyin_to_pinyin(
                                transcriptions.pinyin_to_zhuyin(x),
                                accented=False) for x in pins
                        ]

                        for x in pins:
                            if x.count(" ") >= len(trad):
                                # This means that there was an extra space inserted somewhere; the pinyin is not valid
                                raise ValueError(
                                    "Too many spaces in parsed Pinyin!")
                    except Exception as e:
                        # Try parsing zhuyin as a backup
                        pins = heteronym["bopomofo"].split("<br>陸⃝")

                        # Remove weird spaces in zhuyin
                        pins = list(map(lambda x: x.replace(" ", " "), pins))

                        try:
                            pins = [
                                transcriptions.zhuyin_to_pinyin(x,
                                                                accented=False)
                                for x in pins
                            ]
                        except Exception as e:
                            logging.error(
                                f"Unable to split up Pinyin for word {trad}: {e}, skipping word..."
                            )
                            continue

                # pins[0] is the Taiwan reading; pins[1], when present, is the
                # Mainland reading.
                if len(pins) > 1:
                    taiwan_pin = pins[0]
                    mainland_pin = pins[1]
                else:
                    taiwan_pin = mainland_pin = pins[0]

                if (last_heteronym_pin != ""
                        and heteronym["pinyin"] != last_heteronym_pin):
                    # A new different pinyin means that we are now processing a new heteronym.
                    # We must create an Entry object for the definitions of the old heteronym
                    # and add it to the list of entries before processing the new one.
                    entry = objects.Entry(trad,
                                          simp,
                                          last_taiwan_pin,
                                          jyut,
                                          freq=freq,
                                          defs=taiwan_defs)
                    words.append(entry)

                    # Only emit a separate Mainland entry when it differs.
                    if last_mainland_pin != last_taiwan_pin:
                        entry = objects.Entry(
                            trad,
                            simp,
                            last_mainland_pin,
                            jyut,
                            freq=freq,
                            defs=mainland_defs,
                        )
                        words.append(entry)

                    # Reset the definitions list
                    taiwan_defs = []
                    mainland_defs = []

                for definition in heteronym["definitions"]:
                    # Label each side only when pronunciations actually differ.
                    taiwan_label = "臺" if taiwan_pin != mainland_pin else ""
                    mainland_label = "陸" if mainland_pin != taiwan_pin else ""

                    definition_text = definition["def"]

                    # Take out parts of definitions that should be in labels
                    for pattern in LABEL_REGEX_PATTERNS:
                        if re.match(pattern, definition_text):
                            definition_label, definition_text = re.match(
                                pattern, definition_text).group(1, 2)
                            taiwan_label += ("、" +
                                             definition_label if taiwan_label
                                             else definition_label)
                            mainland_label += ("、" + definition_label
                                               if mainland_label else
                                               definition_label)

                    # Remove 臺⃝ and 陸⃝ from definitions, since Qt cannot display them
                    definition_text = definition_text.replace("臺⃝", "臺:")
                    definition_text = definition_text.replace("陸⃝", "陸:")

                    # Insert zero-width spaces so that we can reverse-search the definition
                    taiwan_def_tuple = objects.DefinitionTuple(
                        "​".join(jieba.cut(definition_text)), taiwan_label, [])
                    mainland_def_tuple = objects.DefinitionTuple(
                        "​".join(jieba.cut(definition_text)), mainland_label,
                        [])

                    # Parse and add examples to this definition
                    if "example" in definition:
                        for example in definition["example"]:
                            if re.match(EXAMPLE_REGEX_PATTERN, example):
                                # Every example is surrounded by "如:<example>", so only keep the example
                                example = re.match(EXAMPLE_REGEX_PATTERN,
                                                   example).group(1)
                                # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                                example_texts = re.findall(
                                    INDIVIDUAL_EXAMPLE_REGEX_PATTERN, example)
                            else:
                                logging.warning(
                                    f"Found example that does not fit the normal example regex pattern: {trad}, {example}"
                                )
                                # Fall back to splitting on Chinese enumeration comma
                                example_texts = example.split("、")

                            for example_text in example_texts:
                                # Strip out weird whitespace
                                example_text = re.sub(WHITESPACE_REGEX_PATTERN,
                                                      "", example_text)

                                # Joining and splitting separates series of full-width punctuation marks
                                # into separate items,  which is necessary so that lazy_pinyin() returns
                                # separate items for each full-width punctuation mark in the list it returns
                                #
                                # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                                # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                                # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                                # (Notice how "》:「"" is now split up into three different items)
                                example_pinyin = lazy_pinyin(
                                    " ".join(example_text).split(),
                                    style=Style.TONE3,
                                    neutral_tone_with_five=True,
                                )
                                example_pinyin = " ".join(
                                    example_pinyin).lower()
                                example_pinyin = example_pinyin.strip(
                                ).replace("v", "u:")

                                # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                                # given in the heteronym, attempt to replace pinyin corresponding to the
                                # characters in this heteronym with the pinyin provided by the JSON file.
                                #
                                # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                                # trad = "重", phrase_pinyin = "chong2"
                                # means that we should convert "zhong4 xin1" to "chong2 xin1"

                                # Strip out variant pronunciations for conversion purposes
                                # index 0 = Taiwan reading, index 1 = Mainland reading.
                                for index, pin in enumerate(
                                    [taiwan_pin, mainland_pin]):
                                    phrase_pinyin = pin
                                    phrase_pinyin = re.sub(
                                        VARIANT_PRONUNCIATION_REGEX_PATTERN,
                                        "",
                                        phrase_pinyin,
                                    )
                                    phrase_pinyin = re.sub(
                                        COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                        "",
                                        phrase_pinyin,
                                    )

                                    # Do not try to match entries formatted like "那搭(Namibia)"
                                    if not re.match(
                                            STRANGE_ENTRY_REGEX_PATTERN, trad):
                                        try:
                                            example_pinyin = (
                                                change_pinyin_to_match_phrase(
                                                    example_text,
                                                    example_pinyin,
                                                    trad,
                                                    phrase_pinyin,
                                                ))
                                        except Exception as e:
                                            logging.warning(
                                                f"Couldn't change pinyin in example for word {trad}: "
                                                f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                                f"{e}")
                                            traceback.print_exc()

                                    if index == 0:
                                        taiwan_def_tuple.examples.append(
                                            objects.ExampleTuple(
                                                "zho", example_pinyin,
                                                example_text))
                                    elif index == 1:
                                        mainland_def_tuple.examples.append(
                                            objects.ExampleTuple(
                                                "zho", example_pinyin,
                                                example_text))

                    taiwan_defs.append(taiwan_def_tuple)
                    mainland_defs.append(mainland_def_tuple)

                last_heteronym_pin = heteronym["pinyin"]
                last_taiwan_pin = taiwan_pin
                last_mainland_pin = mainland_pin

            # Create entries for the final heteronym of this item.
            entry = objects.Entry(trad,
                                  simp,
                                  taiwan_pin,
                                  jyut,
                                  freq=freq,
                                  defs=taiwan_defs)
            words.append(entry)

            if mainland_pin != taiwan_pin:
                entry = objects.Entry(trad,
                                      simp,
                                      mainland_pin,
                                      jyut,
                                      freq=freq,
                                      defs=mainland_defs)
                words.append(entry)

            # Progress output every 500 items.
            items_parsed += 1
            if not items_parsed % 500:
                print(f"Parsed entry #{items_parsed}")
コード例 #5
0
def parse_same_meaning_file(filename, words):
    """Parse the "same meaning, different word" CSV file into `words`.

    Each valid row groups equivalent terms from four regions (Taiwan,
    Mainland, Hong Kong, Macau); one objects.Entry is added per term.
    """
    for row in read_csv(filename):
        # Skip the header row and any malformed rows.
        if len(row) != 17 or row[0] == "總分類":
            continue

        # Columns 4-15 hold up to three terms for each of the four regions.
        terms = defaultdict(set)
        for location, columns in (
            ("臺", (4, 5, 6)),
            ("陸", (7, 8, 9)),
            ("香", (10, 11, 12)),
            ("澳", (13, 14, 15)),
        ):
            for column in columns:
                if row[column]:
                    terms[location].add(row[column])

        explanation = None
        if row[16]:
            # Zero-width spaces between segmented words allow reverse lookup.
            explanation = objects.DefinitionTuple(
                "​".join(jieba.cut(row[16])), "差異說明", [])

        for location in terms:
            for term in terms[location]:
                trad = term
                simp = HanziConv.toSimplified(trad)
                if term == row[4] and row[2]:
                    # Use the provided pinyin, which always corresponds at
                    # least to the first Taiwan term
                    pin = transcriptions.zhuyin_to_pinyin(
                        row[2].replace(" ", " "), accented=False)
                else:
                    syllables = lazy_pinyin(
                        trad,
                        style=Style.TONE3,
                        neutral_tone_with_five=True,
                    )
                    pin = " ".join(syllables).lower().strip().replace(
                        "v", "u:")
                jyut = pinyin_jyutping_sentence.jyutping(
                    trad, tone_numbers=True, spaces=True)
                freq = zipf_frequency(trad, "zh")

                # One definition per region, listing that region's terms.
                defs = [
                    objects.DefinitionTuple("、".join(terms[x]),
                                            row[1] + ":" + x, [])
                    for x in terms
                ]
                if explanation:
                    defs.append(explanation)

                words.add(
                    objects.Entry(trad,
                                  simp,
                                  pin,
                                  jyut,
                                  freq=freq,
                                  defs=defs))
コード例 #6
0
def parse_sentence_file(
    filename,
    source,
    target,
    sentences,
    nonchinese_sentences,
    intermediate_ids,
    enable_jyutping,
    enable_pinyin,
):
    """Parse a tab-separated sentence file (id, language, sentence text).

    Sentences whose language matches `source` are stored in `sentences` as
    objects.ChineseSentence; those matching `target` are stored in
    `nonchinese_sentences` as objects.NonChineseSentence; all other ids are
    collected in `intermediate_ids`.
    """
    print("Parsing sentence file...")
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            # Skip empty lines and comment lines.
            if not line or line.startswith("#"):
                continue

            fields = line.split()
            sentence_id = fields[0]
            lang = fields[1]
            # The sentence text begins right after the second tab.
            first_tab = line.index("\t")
            text_start = line.index("\t", first_tab + 1) + 1

            if lang == source:
                text = line[text_start:]
                # NOTE(review): unlike the target branch below, this text is
                # not strip()ped, so it keeps its trailing newline — confirm
                # that is intentional.
                if hanzidentifier.is_simplified(text):
                    simp = text
                    trad = HanziConv.toTraditional(text)
                else:
                    trad = text
                    simp = HanziConv.toSimplified(text)

                pin = ""
                if enable_pinyin:
                    syllables = lazy_pinyin(
                        trad, style=Style.TONE3, neutral_tone_with_five=True
                    )
                    pin = " ".join(syllables).lower().strip().replace(
                        "v", "u:")

                jyut = ""
                if enable_jyutping:
                    jyut = pinyin_jyutping_sentence.jyutping(
                        trad, tone_numbers=True, spaces=True
                    )

                sentences[sentence_id] = objects.ChineseSentence(
                    sentence_id,
                    trad,
                    simp,
                    pin,
                    jyut,
                    lang,
                )
            elif lang == target:
                translation = line[text_start:].strip()
                nonchinese_sentences[sentence_id] = objects.NonChineseSentence(
                    sentence_id, translation, lang
                )
            else:
                # Neither source nor target language: remember the id so the
                # caller can resolve indirect translations.
                intermediate_ids.add(sentence_id)