Example #1
0
def parse_variants(filename, words_traditional, words_simplified):
    """Register bare entries that link traditional and simplified variants.

    Rows come from ``read_csv(filename)``. Only three-column rows that are not
    comments (first cell starting with "#") and whose second column is
    ``kSimplifiedVariant`` or ``kTraditionalVariant`` are used. The first
    column and every whitespace-separated item of the third column are code
    points written as hex after a two-character prefix (presumably "U+").

    For every (traditional, simplified) pair not seen before in this call, one
    ``objects.Entry`` with empty pronunciation fields is appended to both
    ``words_traditional[traditional]`` and ``words_simplified[simplified]``.

    Args:
        filename: Path handed to ``read_csv``.
        words_traditional: Mapping from traditional form to a list of entries;
            must allow ``mapping[key].append(...)`` for keys not yet present
            (presumably a ``defaultdict(list)`` -- confirm against caller).
        words_simplified: Same, keyed by simplified form.
    """
    seen_pairs = set()

    for row in read_csv(filename):
        if len(row) != 3 or row[0].startswith("#"):
            continue

        codepoint, fieldname, content = row

        if fieldname == "kTraditionalVariant":
            # The listed variants are the traditional forms of this character
            character_is_traditional = False
        elif fieldname == "kSimplifiedVariant":
            # The listed variants are the simplified forms of this character
            character_is_traditional = True
        else:
            continue

        character = chr(int(codepoint[2:], 16))
        variants = [chr(int(item[2:], 16)) for item in content.split()]

        for variant in variants:
            if character_is_traditional:
                trad, simp = character, variant
            else:
                trad, simp = variant, character

            pair = (trad, simp)
            if pair in seen_pairs:
                # Already inserted from the opposite direction or a repeated row
                continue

            entry = objects.Entry(trad, simp, "", "")
            words_traditional[trad].append(entry)
            words_simplified[simp].append(entry)
            seen_pairs.add(pair)
Example #2
0
def parse_cc_cedict_canto_readings(filename, entries):
    """Read Cantonese readings for CC-CEDICT words into ``entries``.

    Every data line is expected to look like
    ``TRAD SIMP [pinyin] {jyutping}``: the first two whitespace-separated
    tokens are the traditional and simplified forms, the text between ``[``
    and ``]`` is the pinyin and the text between ``{`` and ``}`` is the
    Jyutping. Both readings are lower-cased, and "v" in the pinyin is
    rewritten as "u:".

    Blank (or whitespace-only) lines and lines starting with "#" are skipped.

    Args:
        filename: Path of the UTF-8 encoded text file to read.
        entries: Dict mapping a traditional form to a list of
            ``objects.Entry``. It is updated in place; an entry is only added
            if no existing entry for the same traditional form has the same
            simplified form, pinyin and Jyutping.

    Raises:
        IndexError: If a data line has fewer than two tokens.
        ValueError: If a data line lacks the ``[...]`` or ``{...}`` section.
    """
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            # Lines yielded by a file object keep their trailing newline, so a
            # blank line is "\n" and never has length 0; test the stripped text.
            if not line.strip() or line[0] == "#":
                continue

            fields = line.split()
            trad = fields[0]
            simp = fields[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            jyut = line[line.index("{") + 1:line.index("}")].lower()

            entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut)

            if trad not in entries:
                entries[trad] = [entry]
                continue

            # Only add the entry if an identical reading is not already stored
            already_known = any(
                existing_entry.simplified == simp
                and existing_entry.pinyin == pin
                and existing_entry.jyutping == jyut
                for existing_entry in entries[trad])
            if not already_known:
                entries[trad].append(entry)
Example #3
0
def parse_same_word_file(filename, words):
    """Add one entry to ``words`` for each two-column row of ``filename``.

    Rows come from ``read_csv(filename)``. Rows that do not have exactly two
    columns, or whose first cell is the literal "詞彙" (presumably the header
    row), are skipped. The first column is the traditional form of the word;
    the second column becomes the text of its single definition, labelled
    "臺陸用法和差異".

    The simplified form, pinyin, Jyutping and frequency are all generated from
    the traditional form by the respective helper libraries.

    Args:
        filename: Path handed to ``read_csv``.
        words: Collection exposing ``add``; receives one ``objects.Entry`` per
            accepted row.
    """
    for row in read_csv(filename):
        if len(row) != 2 or row[0] == "詞彙":
            continue

        trad, explanation = row
        simp = HanziConv.toSimplified(trad)

        syllables = lazy_pinyin(
            trad,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        # Normalise to lower case and spell "v" as "u:"
        pin = " ".join(syllables).lower().strip().replace("v", "u:")

        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")

        # The join separator is an invisible zero-width character placed
        # between the segments produced by jieba
        segmented = "​".join(jieba.cut(explanation))
        definition = objects.DefinitionTuple(segmented, "臺陸用法和差異", [])

        words.add(
            objects.Entry(trad, simp, pin, jyut, freq=freq,
                          defs=[definition]))
Example #4
0
def parse_cc_canto(filename, entries):
    """Read a CC-Canto style file into ``entries``.

    Every data line is expected to look like
    ``TRAD SIMP [pinyin] {jyutping}``: the first two whitespace-separated
    tokens are the traditional and simplified forms, the text between ``[``
    and ``]`` is the pinyin and the text between ``{`` and ``}`` is the
    Jyutping. Both readings are lower-cased, and "v" in the pinyin is
    rewritten as "u:".

    Blank (or whitespace-only) lines and lines starting with "#" are skipped.

    Args:
        filename: Path of the UTF-8 encoded text file to read.
        entries: Dict mapping a traditional form to a list of
            ``objects.Entry``. It is updated in place.

    Raises:
        IndexError: If a data line has fewer than two tokens.
        ValueError: If a data line lacks the ``[...]`` or ``{...}`` section.
    """
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            # Lines yielded by a file object keep their trailing newline, so a
            # blank line is "\n" and never has length 0; test the stripped text.
            if not line.strip() or line[0] == "#":
                continue

            fields = line.split()  # Splits by whitespace
            trad = fields[0]
            simp = fields[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            jyut = line[line.index("{") + 1:line.index("}")].lower()
            entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut)

            # First time this traditional form is seen: start its list
            if trad not in entries:
                entries[trad] = [entry]
                continue

            # Otherwise make sure this is a new entry before adding it to the
            # list: same simplified form, pinyin and Jyutping means duplicate
            already_known = any(
                existing_entry.simplified == simp
                and existing_entry.pinyin == pin
                and existing_entry.jyutping == jyut
                for existing_entry in entries[trad])
            if not already_known:
                entries[trad].append(entry)
Example #5
0
def parse_readings(filename, words_traditional, words_simplified):
    """Attach per-character readings and definitions to the word tables.

    Rows come from ``read_csv(filename)``. Only three-column rows that are not
    comments (first cell starting with "#") and whose second column is
    ``kCantonese``, ``kMandarin`` or ``kDefinition`` are used. The first
    column is a code point written as hex after a two-character prefix
    (presumably "U+").

    For each accepted row:

    * If the character is a key of ``words_traditional``, every entry stored
      under it receives the character's frequency plus the row's content
      (Jyutping, tone-numbered pinyin, or the ";"-separated definitions).
    * Otherwise, if it is a key of ``words_simplified``, the row is ignored.
    * Otherwise a new ``objects.Entry`` whose traditional and simplified forms
      are both the character is appended to both tables.

    Args:
        filename: Path handed to ``read_csv``.
        words_traditional: Mapping from traditional form to a list of entries;
            must allow ``mapping[key].append(...)`` for keys not yet present
            (presumably a ``defaultdict(list)`` -- confirm against caller).
        words_simplified: Same, keyed by simplified form.
    """
    handled_fields = ("kCantonese", "kMandarin", "kDefinition")

    for row in read_csv(filename):
        if len(row) != 3 or row[0].startswith("#"):
            continue

        codepoint, fieldname, content = row
        if fieldname not in handled_fields:
            continue

        character = chr(int(codepoint[2:], 16))

        if character in words_traditional:
            freq = zipf_frequency(character, "zh")

            for existing in words_traditional[character]:
                existing.add_freq(freq)
                if fieldname == "kCantonese":
                    existing.add_jyutping(content)
                elif fieldname == "kMandarin":
                    existing.add_pinyin(
                        convert_pinyin_to_tone_numbers(content, character))
                else:
                    # Only kDefinition is left at this point
                    existing.add_defs([("", part.strip())
                                       for part in content.split(";")])
            continue

        if character in words_simplified:
            # Ignore simplified characters
            continue

        # Unknown character: create a standalone entry carrying this one field
        freq = zipf_frequency(character, "zh")
        jyut = ""
        pin = ""
        defs = []
        if fieldname == "kCantonese":
            jyut = content
        elif fieldname == "kMandarin":
            pin = convert_pinyin_to_tone_numbers(content, character)
        else:
            defs = [("", part.strip()) for part in content.split(";")]

        entry = objects.Entry(character,
                              character,
                              pin,
                              jyut,
                              freq=freq,
                              defs=defs)
        words_traditional[character].append(entry)
        words_simplified[character].append(entry)
Example #6
0
def parse_file(filename, entries):
    """Read a CC-CEDICT style dictionary file into ``entries``.

    Every data line is expected to look like
    ``TRAD SIMP [pinyin] /definition 1/definition 2/``: the first two
    whitespace-separated tokens are the traditional and simplified forms, the
    text between ``[`` and ``]`` is the pinyin (lower-cased, with "v"
    rewritten as "u:"), and everything after the first ``/`` is the
    ``/``-separated list of definitions.

    Blank (or whitespace-only) lines and lines starting with "#" are skipped.

    Args:
        filename: Path of the UTF-8 encoded text file to read.
        entries: Dict mapping a traditional form to a list of
            ``objects.Entry``; one entry per data line is appended in place.

    Raises:
        IndexError: If a data line has fewer than two tokens.
        ValueError: If a data line lacks the ``[...]`` section or any ``/``.
    """
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            # Lines yielded by a file object keep their trailing newline, so a
            # blank line is "\n" and never has length 0; test the stripped text.
            stripped = line.strip()
            if not stripped or line[0] == "#":
                continue

            fields = stripped.split()
            trad = fields[0]
            simp = fields[1]
            pin = stripped[stripped.index("[") +
                           1:stripped.index("]")].lower().replace("v", "u:")

            # Drop the closing "/" explicitly instead of slicing off a fixed
            # two characters, which corrupts the last definition when the
            # final line of the file has no trailing newline.
            definition_text = stripped[stripped.index("/") + 1:]
            if definition_text.endswith("/"):
                definition_text = definition_text[:-1]
            definitions = definition_text.split("/")

            entry = objects.Entry(trad=trad,
                                  simp=simp,
                                  pin=pin,
                                  defs=definitions)

            entries.setdefault(trad, []).append(entry)
Example #7
0
def parse_file(filename, words):
    """Parse a JSON dictionary dump and append one Entry per heteronym to ``words``.

    Each item in the JSON corresponds to one or more entries in the dictionary.
    Most items map 1:1 to entries, e.g. "物質" is a single entry. Some items are
    多音字 (characters with several readings), so they map to multiple entries
    (e.g. 重 -> zhòng and chóng). In the vocabulary of the MoEDict, each item
    may correspond to multiple heteronyms, and each heteronym maps to a single
    entry.

    Consecutive heteronyms that resolve to the same pinyin are merged into one
    entry. Heteronyms without a "pinyin" key are skipped. Progress is printed
    every 500 items.

    Args:
        filename: Path of a UTF-8 JSON file holding a list of items; each item
            is read through its "title" and "heteronyms" keys.
        words: List that the created ``objects.Entry`` instances are appended
            to.
    """
    # The data is CJK text: decode it as UTF-8 explicitly rather than with the
    # platform's default encoding. The file is only needed for loading.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    items_parsed = 0

    for item in data:
        # For now, ignore variant characters that aren't properly encoded in Unicode
        if re.match(EXCLUDE_VARIANT_REGEX_PATTERN, item["title"]):
            continue

        # These do not change no matter the heteronym
        trad = item["title"]
        simp = HanziConv.toSimplified(trad)
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")

        # Do not try to match entries formatted like "那搭(Namibia)"
        can_match_phrase = not re.match(STRANGE_ENTRY_REGEX_PATTERN, trad)

        # Build up a list of definitions for each heteronym
        defs = []

        # Distinguish between heteronyms by their pinyin – if the pinyin of the
        # current heteronym does not match the old pinyin, then a new heteronym
        # must be created
        last_heteronym_pin = ""

        # Reset per item: if every heteronym below is skipped, the final entry
        # must not inherit the pinyin of the previous item (or hit a NameError
        # on the very first item).
        pin = ""

        # Go through each heteronym, creating Entry objects for each one
        for heteronym in item["heteronyms"]:
            if "pinyin" not in heteronym:
                logging.debug(
                    f'Could not find pinyin for heteronym of word {trad} with definitions {heteronym["definitions"]}'
                )
                continue

            # Strip pronunciation annotations, convert every syllable to tone
            # numbers and keep at most one syllable per character of the word
            raw_pin = PINYIN_COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                "", heteronym["pinyin"])
            raw_pin = PINYIN_LITERARY_PRONUNCIATION_REGEX_PATTERN.sub(
                "", raw_pin)
            raw_pin = PINYIN_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub(
                "", raw_pin)
            raw_pin = PINYIN_SECOND_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub(
                "", raw_pin)
            syllables = [
                pinyin_to_tone_numbers(syllable, trad).split()
                for syllable in raw_pin.split()
            ]
            syllables = list(itertools.chain.from_iterable(syllables))
            pin = " ".join(syllables[:len(trad)])

            if last_heteronym_pin != "" and pin != last_heteronym_pin:
                # Different pinyin means that we are now processing a new heteronym.
                # We must create an Entry object for the definitions of the old heteronym
                # and add it to the list of entries before processing the new one.
                entry = objects.Entry(trad,
                                      simp,
                                      last_heteronym_pin,
                                      jyut,
                                      freq=freq,
                                      defs=defs)
                words.append(entry)

                # Reset the definitions list
                defs = []

            # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
            # given in the heteronym, we attempt to replace pinyin corresponding to the
            # characters in this heteronym with the pinyin provided by the JSON file.
            #
            # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
            # trad = "重", phrase_pinyin = "chong2"
            # means that we should convert "zhong4 xin1" to "chong2 xin1"
            #
            # Strip out variant pronunciations for conversion purposes. This only
            # depends on the heteronym, so compute it once for all examples/quotes.
            phrase_pinyin = re.sub(VARIANT_PRONUNCIATION_REGEX_PATTERN, "",
                                   pin)
            phrase_pinyin = re.sub(COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN, "",
                                   phrase_pinyin)

            for definition in heteronym["definitions"]:
                label = definition.get("type", "")

                # Insert zero-width spaces so that we can reverse-search the definition
                def_tuple = objects.DefinitionTuple(
                    "​".join(jieba.cut(definition["def"])), label, [])

                # Parse and add examples to this definition
                if "example" in definition:
                    for example in definition["example"]:
                        example_match = EXAMPLE_REGEX_PATTERN.match(example)
                        if example_match:
                            # Every example is surrounded by "如:<example>", so only keep the example
                            example = example_match.group(1)
                            # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                            example_texts = (
                                INDIVIDUAL_EXAMPLE_REGEX_PATTERN.findall(
                                    example))
                        else:
                            logging.warning(
                                f"Found example that does not fit the normal example regex pattern: {trad}, {example}"
                            )
                            # Fall back to splitting on Chinese enumeration comma
                            example_texts = example.split("、")

                        for example_text in example_texts:
                            # Strip out weird whitespace
                            example_text = WHITESPACE_REGEX_PATTERN.sub(
                                "", example_text)

                            # Joining and splitting separates series of full-width punctuation marks
                            # into separate items,  which is necessary so that lazy_pinyin() returns
                            # separate items for each full-width punctuation mark in the list it returns
                            #
                            # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                            # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                            # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                            # (Notice how "》:「"" is now split up into three different items)
                            example_pinyin = lazy_pinyin(
                                " ".join(example_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            example_pinyin = " ".join(example_pinyin).lower()
                            example_pinyin = example_pinyin.strip().replace(
                                "v", "u:")

                            if can_match_phrase:
                                try:
                                    example_pinyin = change_pinyin_to_match_phrase(
                                        example_text,
                                        example_pinyin,
                                        trad,
                                        phrase_pinyin,
                                    )
                                except Exception as e:
                                    # Best effort: keep the generated pinyin
                                    logging.warning(
                                        f"Couldn't change pinyin in example for word {trad}: "
                                        f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                        f"{e}")
                                    traceback.print_exc()

                            def_tuple.examples.append(
                                objects.ExampleTuple("cmn", example_pinyin,
                                                     example_text))

                # Parse and add quotes to this definition
                if "quote" in definition:
                    for quote in definition["quote"]:
                        quote_text = re.sub(WHITESPACE_REGEX_PATTERN, "",
                                            quote)

                        quote_pinyin = lazy_pinyin(
                            " ".join(quote_text).split(),
                            style=Style.TONE3,
                            neutral_tone_with_five=True,
                        )
                        quote_pinyin = " ".join(quote_pinyin).lower()
                        quote_pinyin = quote_pinyin.strip().replace(
                            "v", "u:")

                        if can_match_phrase:
                            try:
                                quote_pinyin = change_pinyin_to_match_phrase(
                                    quote_text, quote_pinyin, trad,
                                    phrase_pinyin)
                            except Exception as e:
                                # Best effort: keep the generated pinyin
                                logging.warning(
                                    f"Couldn't change pinyin in quote for word {trad}: "
                                    f"{''.join(quote_text)}, {quote_pinyin}, {pin} "
                                    f"{e}")
                                traceback.print_exc()
                        def_tuple.examples.append(
                            objects.ExampleTuple("zho", quote_pinyin,
                                                 quote_text))

                # We currently ignore synonyms, antonyms, and "see also" links, because they are
                # linked to definitions and we have no way to display that data...

                defs.append(def_tuple)

            last_heteronym_pin = pin

        # Flush the definitions collected for the last heteronym of this item
        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.append(entry)

        items_parsed += 1
        if not items_parsed % 500:
            print(f"Parsed entry #{items_parsed}")
Example #8
0
def parse_word_file(file_name, words):
    """Parse one word page (HTML) and append the resulting entry to ``words``.

    The word is taken from the ``span.ChiCharFix`` element and must match the
    file's base name (both passed through ``converter.convert_string`` and with
    private-use-area characters replaced); if they differ, a warning is logged
    and nothing is added. Jyutping comes from the page, pinyin is generated
    with ``lazy_pinyin``, and each meaning on the page becomes one
    ``(label, text)`` definition. A non-empty remark is added as an extra
    definition labelled "備註".

    Args:
        file_name: Path of the HTML file; its base name (without extension) is
            expected to be the word itself.
        words: List that the created ``objects.Entry`` is appended to.
    """
    with open(file_name, "r") as file:
        soup = BeautifulSoup(file, "html.parser")

        # Extract the word on the page
        trad = converter.convert_string(
            soup.find("span", class_="ChiCharFix").get_text()
        )
        if re.search(PRIVATE_USE_AREA_REGEX, trad):
            logging.warning(
                f"Hmm, looks like the word {trad} contains nonstandard characters, replacing with squares..."
            )
            trad = re.sub(
                PRIVATE_USE_AREA_REGEX, PRIVATE_USE_AREA_REPLACEMENT_STRING, trad
            )
        simp = HanziConv.toSimplified(trad)

        # The file name must agree with the word found on the page
        word = os.path.splitext(os.path.basename(file_name))[0]
        word = converter.convert_string(word)
        if re.search(PRIVATE_USE_AREA_REGEX, word):
            word = re.sub(
                PRIVATE_USE_AREA_REGEX, PRIVATE_USE_AREA_REPLACEMENT_STRING, word
            )
        if trad != word:
            logging.warning(
                f"Hmm, looks like the parsed word {trad} doesn't match the filename {word}"
            )
            return

        freq = zipf_frequency(trad, "zh")

        # Get the type of word
        label = soup.find("span", id=LABEL_REGEX).get_text()

        # Get the pronunciation, which is split up into the letter portion and the number portion
        jyutping_letters = soup.find("span", id=JYUTPING_LETTERS_ID_REGEX).get_text()
        jyutping_letters = jyutping_letters.split()

        jyutping_numbers = soup.find("span", id=JYUTPING_NUMBERS_ID_REGEX).get_text()
        jyutping_numbers = JYUTPING_NUMBERS_REGEX.findall(jyutping_numbers)
        jyutping_numbers = [
            JYUTPING_MAP[x] if x in JYUTPING_MAP else x for x in jyutping_numbers
        ]  # Replacement is needed because CUHK uses 7-8-9 notation for checked tones instead of 1-3-6

        # Pair each syllable with its tone number; extra items on either side are dropped by zip
        jyut = " ".join(
            letters + number
            for letters, number in zip(jyutping_letters, jyutping_numbers)
        )

        # Automatically generate pinyin
        pin = (
            " ".join(lazy_pinyin(trad, style=Style.TONE3, neutral_tone_with_five=True))
            .lower()
            .replace("v", "u:")
        )

        # Extract the meanings
        meaning_elements = soup.find_all("span", id=MEANING_REGEX)
        defs = [
            (label, meaning_element.get_text()) for meaning_element in meaning_elements
        ]

        # Add remarks, if one exists on the page. find() returns None when the
        # page has no remark element, so guard before reading its text.
        remark_element = soup.find("span", id=REMARK_REGEX)
        remark = remark_element.get_text() if remark_element is not None else ""
        if remark:
            defs.append(("備註", remark))

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.append(entry)
Example #9
0
def parse_file(filename, words):
    with open(filename) as f:
        data = json.load(f)

        items_parsed = 0

        # Each item in the JSON correspond to one or more entries in the dictionary
        # Most items map 1:1 to entries, e.g. "物質" is a single entry
        # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng)
        #
        # In the vocabulary of the the CSLD, each item may correspond to multiple heteronyms,
        # and each heteronym maps to a single entry.
        for item in data:
            # These do not change no matter the heteronym
            trad = item["title"]
            simp = HanziConv.toSimplified(trad)
            jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                     tone_numbers=True,
                                                     spaces=True)
            freq = zipf_frequency(trad, "zh")

            # Some items have multiple pronunciations (one for Taiwan, one for Mainland China)
            taiwan_pin = mainland_pin = ""

            # Build up a list of definitions for each heteronym
            taiwan_defs = []
            mainland_defs = []

            # Distinguish between heteronyms by their pinyin – if the pinyin of the
            # current heteronym does not match the old pinyin, then a new heteronym
            # must be created
            last_heteronym_pin = ""
            last_taiwan_pin = last_mainland_pin = ""

            # Go through each heteronym, creating Entry objects for each one
            for heteronym in item["heteronyms"]:
                if "pinyin" not in heteronym:
                    logging.debug(
                        f'Could not find pinyin for heteronym of word {trad} with definitions {heteronym["definitions"]}'
                    )
                    continue

                # Filter out known bad pinyin
                if (trad in KNOWN_INVALID_SYLLABLES and heteronym["pinyin"]
                        in KNOWN_INVALID_SYLLABLES[trad]):
                    pins = KNOWN_INVALID_SYLLABLES[trad][heteronym["pinyin"]]
                else:
                    pins = heteronym["pinyin"].split("<br>陸⃝")

                    # Some weird a's cause dragonmapper to break, so replace them with standard a's.
                    pins = list(map(lambda x: x.replace("ɑ", "a"), pins))

                    # Remove dashes in pinyin
                    pins = list(map(lambda x: x.replace("-", " "), pins))

                    # Remove commas in pinyin
                    pins = list(map(lambda x: x.replace(",", ""), pins))

                    # Remove weird characters
                    pins = list(map(lambda x: x.replace("陸⃟", ""), pins))

                    # Dragonmapper cannot handle some erhua
                    pins = list(
                        map(lambda x: x.replace("diǎr", "diǎn er"), pins))
                    pins = list(
                        map(lambda x: x.replace("biār", "biān er"), pins))

                    try:
                        # Converting from pinyin -> zhuyin inserts spaces between characters
                        # Converting from zhuyin -> pinyin conserves these spaces
                        pins = [
                            transcriptions.zhuyin_to_pinyin(
                                transcriptions.pinyin_to_zhuyin(x),
                                accented=False) for x in pins
                        ]

                        for x in pins:
                            if x.count(" ") >= len(trad):
                                # This means that there was an extra space inserted somewhere; the pinyin is not valid
                                raise ValueError(
                                    "Too many spaces in parsed Pinyin!")
                    except Exception as e:
                        # Try parsing zhuyin as a backup
                        pins = heteronym["bopomofo"].split("<br>陸⃝")

                        # Remove weird spaces in zhuyin
                        pins = list(map(lambda x: x.replace(" ", " "), pins))

                        try:
                            pins = [
                                transcriptions.zhuyin_to_pinyin(x,
                                                                accented=False)
                                for x in pins
                            ]
                        except Exception as e:
                            logging.error(
                                f"Unable to split up Pinyin for word {trad}: {e}, skipping word..."
                            )
                            continue

                if len(pins) > 1:
                    taiwan_pin = pins[0]
                    mainland_pin = pins[1]
                else:
                    taiwan_pin = mainland_pin = pins[0]

                if (last_heteronym_pin != ""
                        and heteronym["pinyin"] != last_heteronym_pin):
                    # A new different pinyin means that we are now processing a new heteronym.
                    # We must create an Entry object for the definitions of the old heteronym
                    # and add it to the list of entries before processing the new one.
                    entry = objects.Entry(trad,
                                          simp,
                                          last_taiwan_pin,
                                          jyut,
                                          freq=freq,
                                          defs=taiwan_defs)
                    words.append(entry)

                    if last_mainland_pin != last_taiwan_pin:
                        entry = objects.Entry(
                            trad,
                            simp,
                            last_mainland_pin,
                            jyut,
                            freq=freq,
                            defs=mainland_defs,
                        )
                        words.append(entry)

                    # Reset the definitions list
                    taiwan_defs = []
                    mainland_defs = []

                for definition in heteronym["definitions"]:
                    taiwan_label = "臺" if taiwan_pin != mainland_pin else ""
                    mainland_label = "陸" if mainland_pin != taiwan_pin else ""

                    definition_text = definition["def"]

                    # Take out parts of definitions that should be in labels
                    for pattern in LABEL_REGEX_PATTERNS:
                        if re.match(pattern, definition_text):
                            definition_label, definition_text = re.match(
                                pattern, definition_text).group(1, 2)
                            taiwan_label += ("、" +
                                             definition_label if taiwan_label
                                             else definition_label)
                            mainland_label += ("、" + definition_label
                                               if mainland_label else
                                               definition_label)

                    # Remove 臺⃝ and 陸⃝ from definitions, since Qt cannot display them
                    definition_text = definition_text.replace("臺⃝", "臺:")
                    definition_text = definition_text.replace("陸⃝", "陸:")

                    # Insert zero-width spaces so that we can reverse-search the definition
                    taiwan_def_tuple = objects.DefinitionTuple(
                        "​".join(jieba.cut(definition_text)), taiwan_label, [])
                    mainland_def_tuple = objects.DefinitionTuple(
                        "​".join(jieba.cut(definition_text)), mainland_label,
                        [])

                    # Parse and add examples to this definition
                    if "example" in definition:
                        for example in definition["example"]:
                            if re.match(EXAMPLE_REGEX_PATTERN, example):
                                # Every example is surrounded by "如:<example>", so only keep the example
                                example = re.match(EXAMPLE_REGEX_PATTERN,
                                                   example).group(1)
                                # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                                example_texts = re.findall(
                                    INDIVIDUAL_EXAMPLE_REGEX_PATTERN, example)
                            else:
                                logging.warning(
                                    f"Found example that does not fit the normal example regex pattern: {trad}, {example}"
                                )
                                # Fall back to splitting on Chinese enumeration comma
                                example_texts = example.split("、")

                            for example_text in example_texts:
                                # Strip out weird whitespace
                                example_text = re.sub(WHITESPACE_REGEX_PATTERN,
                                                      "", example_text)

                                # Joining and splitting separates series of full-width punctuation marks
                                # into separate items,  which is necessary so that lazy_pinyin() returns
                                # separate items for each full-width punctuation mark in the list it returns
                                #
                                # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                                # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                                # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                                # (Notice how "》:「"" is now split up into three different items)
                                example_pinyin = lazy_pinyin(
                                    " ".join(example_text).split(),
                                    style=Style.TONE3,
                                    neutral_tone_with_five=True,
                                )
                                example_pinyin = " ".join(
                                    example_pinyin).lower()
                                example_pinyin = example_pinyin.strip(
                                ).replace("v", "u:")

                                # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                                # given in the heteronym, attempt to replace pinyin corresponding to the
                                # characters in this heteronym with the pinyin provided by the JSON file.
                                #
                                # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                                # trad = "重", phrase_pinyin = "chong2"
                                # means that we should convert "zhong4 xin1" to "chong2 xin1"

                                # Strip out variant pronunciations for conversion purposes
                                for index, pin in enumerate(
                                    [taiwan_pin, mainland_pin]):
                                    phrase_pinyin = pin
                                    phrase_pinyin = re.sub(
                                        VARIANT_PRONUNCIATION_REGEX_PATTERN,
                                        "",
                                        phrase_pinyin,
                                    )
                                    phrase_pinyin = re.sub(
                                        COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                        "",
                                        phrase_pinyin,
                                    )

                                    # Do not try to match entries formatted like "那搭(Namibia)"
                                    if not re.match(
                                            STRANGE_ENTRY_REGEX_PATTERN, trad):
                                        try:
                                            example_pinyin = (
                                                change_pinyin_to_match_phrase(
                                                    example_text,
                                                    example_pinyin,
                                                    trad,
                                                    phrase_pinyin,
                                                ))
                                        except Exception as e:
                                            logging.warning(
                                                f"Couldn't change pinyin in example for word {trad}: "
                                                f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                                f"{e}")
                                            traceback.print_exc()

                                    if index == 0:
                                        taiwan_def_tuple.examples.append(
                                            objects.ExampleTuple(
                                                "zho", example_pinyin,
                                                example_text))
                                    elif index == 1:
                                        mainland_def_tuple.examples.append(
                                            objects.ExampleTuple(
                                                "zho", example_pinyin,
                                                example_text))

                    taiwan_defs.append(taiwan_def_tuple)
                    mainland_defs.append(mainland_def_tuple)

                last_heteronym_pin = heteronym["pinyin"]
                last_taiwan_pin = taiwan_pin
                last_mainland_pin = mainland_pin

            entry = objects.Entry(trad,
                                  simp,
                                  taiwan_pin,
                                  jyut,
                                  freq=freq,
                                  defs=taiwan_defs)
            words.append(entry)

            if mainland_pin != taiwan_pin:
                entry = objects.Entry(trad,
                                      simp,
                                      mainland_pin,
                                      jyut,
                                      freq=freq,
                                      defs=mainland_defs)
                words.append(entry)

            items_parsed += 1
            if not items_parsed % 500:
                print(f"Parsed entry #{items_parsed}")
Example #10
0
def parse_same_meaning_file(filename, words):
    """Parse a CSV of same-meaning regional terms and add one entry per term.

    Only rows with exactly 17 columns are used. In each row:

    * column 1 is the prefix of every definition label,
    * column 2 is a Zhuyin reading that applies to the term in column 4,
    * columns 4-6, 7-9, 10-12 and 13-15 hold up to three terms for the
      "臺", "陸", "香" and "澳" labels respectively,
    * column 16 is an optional explanation of the differences.

    Every term produces one entry whose definitions list the terms of all
    labels present in the row, followed by the explanation (if any).

    Args:
        filename: path of the CSV file, handed to read_csv().
        words: collection that receives each entry through its add() method.
    """
    # Label -> indices of the columns that hold the terms for that label
    region_columns = (
        ("臺", (4, 5, 6)),
        ("陸", (7, 8, 9)),
        ("香", (10, 11, 12)),
        ("澳", (13, 14, 15)),
    )

    for line in read_csv(filename):
        # Skip rows of the wrong width and the row whose first cell is
        # "總分類" (presumably the header row)
        if len(line) != 17 or line[0] == "總分類":
            continue

        # dict.fromkeys() removes duplicate terms but, unlike a set, keeps
        # them in column order. The joined definition text below is therefore
        # the same on every run instead of depending on string hash order.
        terms = {}
        for region, columns in region_columns:
            region_terms = list(
                dict.fromkeys(line[column] for column in columns
                              if line[column]))
            if region_terms:
                terms[region] = region_terms

        explanation = None
        if line[16]:
            # Segment the explanation with jieba before storing it
            explanation = objects.DefinitionTuple(
                "​".join(jieba.cut(line[16])), "差異說明", [])

        for region_terms in terms.values():
            for trad in region_terms:
                simp = HanziConv.toSimplified(trad)
                if trad == line[4] and line[2]:
                    # Use the provided pinyin, which always corresponds at least to the first Taiwan term
                    # NOTE(review): both replace() arguments render as a
                    # space; confirm the first one is the intended character.
                    pin = transcriptions.zhuyin_to_pinyin(
                        line[2].replace(" ", " "), accented=False)
                else:
                    pin = lazy_pinyin(
                        trad,
                        style=Style.TONE3,
                        neutral_tone_with_five=True,
                    )
                    pin = " ".join(pin).lower()
                    pin = pin.strip().replace("v", "u:")
                jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                         tone_numbers=True,
                                                         spaces=True)
                freq = zipf_frequency(trad, "zh")

                # Build a fresh list for every entry so that entries never
                # share a definitions list
                defs = [
                    objects.DefinitionTuple("、".join(terms[region]),
                                            line[1] + ":" + region, [])
                    for region in terms
                ]
                if explanation:
                    defs.append(explanation)

                entry = objects.Entry(trad,
                                      simp,
                                      pin,
                                      jyut,
                                      freq=freq,
                                      defs=defs)
                words.add(entry)
Example #11
0
def _split_chinese_forms(text):
    """Split "traditional / simplified" text into a (trad, simp) tuple.

    When the text holds a single form, the simplified form is derived from it
    with HanziConv.toSimplified().
    """
    forms = [form.strip() for form in text.split(" / ")]
    trad = forms[0]
    if len(forms) > 1:
        simp = forms[1]
    else:
        # Cantodict sometimes reports that there is no simplified variant, which is sometimes incorrect
        simp = HanziConv.toSimplified(trad)
    return trad, simp


def parse_word_file(file_name, words):
    """Parse one saved CantoDict word page and append its entries to words.

    The page is expected to contain the traditional/simplified forms, the
    Jyutping and Pinyin readings and a "wordmeaning" cell. Meanings are split
    into (label, definition) tuples; whenever the meaning text announces a new
    pronunciation, the meanings collected so far are stored as their own entry.

    Args:
        file_name: path of the HTML file. Its base name (without extension)
            must equal the traditional form found in the page, otherwise the
            file is skipped.
        words: list that receives the objects.Entry instances.

    Returns:
        None. The function logs and returns early when the forms cannot be
        located or when the page does not match the file name.
    """
    # Read as UTF-8 explicitly (like the other parsers in this file) instead
    # of relying on the platform's default encoding
    with open(file_name, "r", encoding="utf8") as file:
        file_text = file.read()

    # In pages with latin script, the title messes with BeautifulSoup's HTML parsing
    # So remove the title and replace it with "CantoDict"
    file_text = re.sub(TITLE_REGEX_PATTERN, "<title>CantoDict</title>", file_text)
    soup = BeautifulSoup(file_text, "html.parser")

    # Extract the traditional and simplified forms.
    # `except Exception` keeps the best-effort fallback but no longer swallows
    # KeyboardInterrupt / SystemExit the way a bare `except:` does.
    try:
        trad, simp = _split_chinese_forms(
            soup.find("td", class_="chinesebig").get_text()
        )
    except Exception:
        # If a character has latin script in it, it may not have a class called "chinesebig"
        try:
            trad, simp = _split_chinese_forms(
                soup.select("span.word.script")[0].get_text()
            )
        except Exception:
            logging.error(
                f"Couldn't find traditional and simplified forms in file {file_name}"
            )
            return

    # The page must describe the word named by the file
    word = os.path.splitext(os.path.basename(file_name))[0].strip()
    if trad != word:
        if trad == HanziConv.toTraditional(word) or word == HanziConv.toSimplified(
            trad
        ):
            logging.debug(
                f"File name {word} appears to be a simplified variant "
                f"of {trad}. Ignoring..."
            )
        elif len(trad) == 1:
            logging.error(
                f"Hmm, looks like the parsed word {trad} doesn't "
                f"match the file name {word}. If they are simplified "
                "or traditional variants of each other, this error "
                "can be safely ignored."
            )
        else:
            logging.error(
                f"Hmm, looks like the parsed word {trad} doesn't "
                f"match the file name {word}."
            )
        return

    freq = zipf_frequency(trad, "zh")

    # Extract the pronunciations
    # CantoDict indicates differences in literary/colloquial pronunciation with *, but we don't support that
    # So remove the stars
    jyut_element = soup.find("span", class_="cardjyutping")
    jyut = jyut_element.get_text() if jyut_element else ""
    jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "", jyut)
    jyut = jyut.strip()

    pin_element = soup.find("span", class_="cardpinyin")
    pin = pin_element.get_text() if pin_element else ""
    # CantoDict also indicates tone sandhi in pinyin with *, but we don't support that either
    pin = re.sub(LITERARY_PINYIN_READING_REGEX_PATTERN, "", pin)
    if not pin:
        pin = " ".join(
            lazy_pinyin(trad, style=Style.TONE3, neutral_tone_with_five=True)
        ).lower()
    # Replace 'v' in Pinyin with the u: that CEDICT uses
    pin = pin.strip().replace("v", "u:")
    # Remove the zero-width spaces that sometimes show up
    pin = pin.replace("​", "")

    # CantoDict may have multiple pronunciations for an entry
    # Check for multiple pronunciations in Jyutping
    variant_jyutping = []
    if len(trad) == 1 and len(jyut.split(" ")) > len(trad):
        variant_jyutping = jyut.split(" ")
    elif ";" in jyut:
        variant_jyutping = [x.strip() for x in jyut.split("; ")]

    # Check for differences in PRC and Taiwan pronunciation.
    # variant_pinyin holds (prc, tw) tuples.
    variant_pinyin = []
    for pattern in MANDARIN_PRONUNCIATION_VARIANT_REGEX_PATTERNS:
        match = pattern.match(pin)
        if match:
            # Make sure the matched groups match the length of the characters
            if len(match.group("prc").split(" ")) == len(trad) and len(
                match.group("tw").split(" ")
            ) == len(trad):
                variant_pinyin.append((match.group("prc"), match.group("tw")))
    if not variant_pinyin:
        for pattern in MANDARIN_PRONUNCIATION_PARTIAL_VARIANT_REGEX_PATTERNS:
            match = pattern.match(pin)
            if match:
                prc_pin = pattern.sub(match.group("prc"), pin)
                tw_pin = pattern.sub(match.group("tw"), pin)
                # Make sure the found pronunciations match the length of the characters
                if len(prc_pin.split(" ")) == len(trad) and len(
                    tw_pin.split(" ")
                ) == len(trad):
                    variant_pinyin.append((prc_pin, tw_pin))
    # Also check for multiple pronunciations of single-character words;
    # these are stored without a Taiwan counterpart
    if len(trad) == 1 and len(pin.split(" ")) > len(trad):
        variant_pinyin.extend((x, None) for x in pin.split(" "))

    # Some entries give different meanings for different pronunciation;
    # assume they don't but mark True if yes
    variants_handled = False

    # Extract the meaning element
    meaning_element = soup.find("td", class_="wordmeaning")

    # Check for special labels in compound words (brandname, idiom, placename, etc.)
    special_label = ""
    special_pos_elem = meaning_element.find("img", class_="flagicon")
    if special_pos_elem:
        special_label = special_pos_elem["alt"]

    # The layout of compound word pages is different from single-character pages
    real_meaning_element = meaning_element.find("div", class_=None)
    if real_meaning_element:
        meaning_element = real_meaning_element

    # Remove children (these usually contain useless fluff that interfere with definition parsing)
    children = meaning_element.find_all("div")
    children += meaning_element.find_all("span")
    for child in children:
        child.decompose()

    # Parse the meanings from the meaning element
    meanings = []
    # CantoDict puts some weird stuff in the meanings div, and the only way to separate
    # them out is to replace the <br> tags with "\n"
    for br in soup.find_all("br"):
        br.replace_with("\n")

    strings = DEFINITION_SPLITTING_REGEX_PATTERN.split(meaning_element.get_text())
    for string in strings:
        string = string.strip()
        if not string or any(x in string for x in illegal_strings):
            continue

        # A string that announces a new Jyutping + Pinyin pair closes the
        # entry collected so far and starts a new one
        pronunciation_found = False
        for pattern in JYUTPING_PINYIN_REGEX_PATTERNS:
            result = re.search(pattern, string)
            if not result:
                continue

            if meanings:
                entry = objects.Entry(
                    trad,
                    simp,
                    pin,
                    jyut,
                    freq=freq,
                    defs=meanings,
                )
                words.append(entry)

            # Then, extract the new pinyin and jyutping
            # and reset the meanings tuple
            jyut = result.group(1)
            jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "", jyut)
            pin = result.group(2) or ""
            meanings = []
            variants_handled = True
            pronunciation_found = True
            break

        if pronunciation_found:
            continue

        # Same as above for strings that only announce a new Jyutping
        for pattern in JYUTPING_ONLY_REGEX_PATTERNS:
            result = re.search(pattern, string)
            if not result:
                continue

            if meanings:
                entry = objects.Entry(
                    trad,
                    simp,
                    pin,
                    jyut,
                    freq=freq,
                    defs=meanings,
                )
                words.append(entry)

            # Then, extract the new jyutping (but keep the old pinyin!)
            # and reset the meanings tuple
            jyut = result.group(1)
            jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "", jyut)
            meanings = []
            variants_handled = True
            pronunciation_found = True
            break

        if pronunciation_found:
            continue

        # Try to isolate one or more labels (usually a POS or [華]: indicating Mandarin-only usage or [粵]: indicating Cantonese-only usage)
        labels = []
        result = re.search(LABEL_REGEX_PATTERN, string)
        # Filter out bad non-standard strings that are completely enclosed in square braces
        if not result and string[0] == "[" and string[-1] == "]":
            continue

        while result:
            labels.extend(result.group(1).strip().split(","))
            string = re.sub(LABEL_REGEX_PATTERN, "", string)
            result = re.search(LABEL_REGEX_PATTERN, string)

        # At this point, all the labels enclosed in square braces (possibly followed by whitespace)
        # should be stripped out of the beginning of the string.
        # Therefore, we can now assume the contents of the string are the definition
        definition = string
        if not definition:
            continue

        # Translate known part-of-speech abbreviations, keep anything else as-is
        label = ", ".join(pos_labels.get(x.lower(), x) for x in labels)
        if not label and special_label:
            label = special_label

        meanings.append((label, definition))

    if meanings:
        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=meanings)

        if variants_handled or not (variant_jyutping or variant_pinyin):
            # Either there are no variants at all, or the meanings were already
            # split by pronunciation above; in the latter case this last group
            # belongs to the last pronunciation found. It used to be silently
            # dropped whenever the header also listed variants.
            words.append(entry)
        else:
            # Every variant pronunciation shares the same meanings
            for jyutping in variant_jyutping:
                variant_entry = copy.deepcopy(entry)
                variant_entry.add_jyutping(jyutping)
                words.append(variant_entry)

            for prc, tw in variant_pinyin:
                prc_variant = copy.deepcopy(entry)
                prc_variant.add_pinyin(prc)
                words.append(prc_variant)

                if tw:
                    tw_variant = copy.deepcopy(entry)
                    tw_variant.add_pinyin(tw)
                    words.append(tw_variant)
Example #12
0
def process_entry(line):
    """Convert one row of the source data into a list of entries.

    Args:
        line: sequence in which line[1] is the entry header and line[2] is the
            entry content. The header is a comma-separated list of variants,
            each written as "characters:jyutping[:jyutping...]". The content
            holds explanations separated by "<explanation>" and "----", each
            optionally followed by "<eg>"-separated examples.

    Returns:
        A list of objects.Entry, one per (variant, Jyutping) pair. A variant
        without Jyutping yields a single entry with an empty Jyutping. All
        entries receive a deep copy of the parsed definitions; when the header
        has several variants, each entry also gets a "參看" definition listing
        the other variants.
    """
    entries = []

    # Parse the entry header
    header = line[1].strip('"')
    variants = header.split(",")

    for variant in variants:
        trad, *pronunciations = variant.split(":")
        simp = HanziConv.toSimplified(trad)
        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))
        freq = zipf_frequency(trad, "zh")

        # Parse all the Jyutping; fall back to one entry with empty Jyutping
        for jyut in pronunciations or [""]:
            # Non-standard Jyutping starts with an exclamation mark, e.g. !sdet1 or !sdaaf1
            # Remove the exclamation mark
            if jyut.startswith("!"):
                jyut = jyut[1:]

            entries.append(
                objects.Entry(trad=trad,
                              simp=simp,
                              jyut=jyut,
                              pin=pin,
                              freq=freq))

    # Parse the entry content: explanations, examples
    content = line[2]
    if content.startswith("未有內容"):
        for entry in entries:
            entry.append_to_defs(objects.Definition(definition="x"))
        return entries

    definitions = []
    entry_labels = []
    near_synonyms = []
    antonyms = []

    # Explanations are separated by both '<explanation>' tags and '----' tags
    explanations = list(
        chain.from_iterable(
            part.split("----") for part in content.split("<explanation>")))

    for explanation_index, explanation in enumerate(explanations):
        if not explanation.strip():
            continue

        if explanation_index == 0:
            # The first item contains metadata about the entry
            entry_labels.extend(re.findall(PART_OF_SPEECH_REGEX, explanation))
            entry_labels.extend(re.findall(LABEL_REGEX, explanation))
            near_synonyms.extend(re.findall(NEAR_SYNONYM_REGEX, explanation))
            antonyms.extend(re.findall(ANTONYM_REGEX, explanation))

            # However, for some items, such as id 89764, the first item also contains the explanation
            explanation_start = explanation.find("yue:")
            if explanation_start == -1:
                continue
            explanation = explanation[explanation_start:]

        definition = objects.Definition(label="、".join(entry_labels),
                                        examples=[])

        # Subsequent items contain explanations
        for index, item in enumerate(explanation.split("<eg>")):
            if index == 0:
                # The first item contains the explanation
                # Translations in different languages are separated by newlines
                translated_lines = []
                for text in item.strip().split("\n"):
                    # Strip out links (re.sub leaves text without links as-is)
                    text = re.sub(LINK_REGEX, r"\1", text).replace("#", "")
                    colon = text.find(":")
                    lang, body = text[:colon], text[colon + 1:]
                    # Segment the Chinese explanations so they show up in the FTS index
                    if lang in ("yue", "zho"):
                        body = "​".join(jieba.cut(body))
                    translated_lines.append(body)
                definition.definition = "\n".join(translated_lines)
                continue

            # Subsequent items contain examples for this explanation
            definition.examples.append([])
            for translation in item.strip().split("\n"):
                # Strip out links
                translation = translation.replace("#", "")
                if not translation or translation == "----":
                    # Ignore lines that are not translations
                    continue

                colon = translation.find(":")
                lang = translation[:colon]
                if lang in ("yue", "zho"):
                    # Example content ends before the first space with an opening parenthesis after it
                    # (which indicates the start of a romanization)
                    # but some example don't have romanization, so filter for that
                    romanization_start = translation.find(" (")
                    if romanization_start >= 0:
                        example_text = translation[colon + 1:romanization_start]
                        pron = translation[romanization_start +
                                           1:].strip("()")
                    else:
                        # No romanization: previously a space anywhere in the
                        # example made the whole line leak into `pron`
                        example_text = translation[colon + 1:]
                        pron = ""

                    if lang == "yue":
                        # Cantonese examples go first in their group
                        definition.examples[-1].insert(
                            0,
                            objects.Example(lang=lang,
                                            pron=pron,
                                            content=example_text))
                    else:
                        definition.examples[-1].append(
                            objects.Example(lang=lang, content=example_text))
                else:
                    # "x" is stored when the translation has no text
                    example_text = translation[colon + 1:] or "x"
                    definition.examples[-1].append(
                        objects.Example(lang=lang, content=example_text))

        definitions.append(definition)

    # Add synonyms, antonyms to list of definitions
    if near_synonyms:
        definitions.append(
            objects.Definition(definition="、".join(near_synonyms),
                               label="近義詞",
                               examples=[]))
    if antonyms:
        definitions.append(
            objects.Definition(definition="、".join(antonyms),
                               label="反義詞",
                               examples=[]))

    # Assign definitions to each entry
    for entry in entries:
        entry.add_defs(copy.deepcopy(definitions))

    # Add variants to the definitions of an entry; these are unique for each entry
    if len(variants) > 1:
        # De-duplicate while keeping header order, so the "see also" text is
        # the same on every run (a set's order depends on string hashing)
        variant_forms = list(
            dict.fromkeys(variant.split(":")[0] for variant in variants))
        for entry in entries:
            # Do not add variants whose Chinese characters match the current entry's characters into the current entry's "see also" section
            see_also = [
                form for form in variant_forms if form != entry.traditional
            ]
            entry.append_to_defs(
                objects.Definition(
                    definition="、".join(see_also),
                    label="參看",
                    examples=[],
                ))

    return entries
Example #13
0
def parse_file(filename_traditional, filename_simplified_jyutping, entries):
    """Merge the two Kaifangcidian (KFCD) data files into entries.

    Args:
        filename_traditional: tab-separated file (the KFCD Yale edition). Its
            first nine rows are skipped; in the remaining rows column 0 is the
            entry, column 1 its space-separated pronunciation and column 2 its
            definitions (possibly empty).
        filename_simplified_jyutping: file whose last line is a Python list
            literal containing, back to back for every entry, the simplified
            form, one Jyutping syllable per item and optionally one item with
            the definitions.
        entries: dict mapping a traditional form to the list of objects.Entry
            parsed for it; updated in place.
    """
    with open(filename_traditional, "r", encoding="utf8") as f:
        traditional = list(csv.reader(f, delimiter="\t"))

    # The Kaifangcidian data for jyutping is horrible.
    # The entire data is on a single line, printed like a flat Python list.
    # The entry may be a single item in the array, or multiple items.
    # The Jyutping pronunciation is a separate item for each character in the entry.
    # The translations to Mandarin may, or may not follow the Jyutping!
    # And there is no separator between data for different entries :)
    with open(filename_simplified_jyutping, "r", encoding="utf8") as f:
        last_line = f.readlines()[-1]
    simplified = ast.literal_eval(last_line)

    # Position of the current entry inside the flat `simplified` list
    index = 0
    # The first nine rows are comments and headers
    for row in traditional[9:]:
        trad = row[0]

        # Horrible data workaround 1:
        # In KFCD Jyutping data, when the entry has Chinese characters in it,
        # the entry is presented as a single string in the array. (This is sane.)
        # If it does not (e.g. the word 'pat pat'), each series of characters, delineated
        # by a space, is a separate entry in the array ('pat pat' => ["pat", "pat"])
        trad_len = len(trad.split(" "))
        if hanzidentifier.has_chinese(trad):
            simp = simplified[index]
        else:
            simp = "".join(simplified[index:index + trad_len])

        # Horrible data workaround 2:
        # In KFCD Jyutping data, the Jyutping for each word in an entry
        # is presented as a separate string.
        # To find the indices that correspond to the entry we just extracted,
        # use the data from the KFCD Yale edition (which is formatted as a CSV) to
        # determine how many items comprise the Jyutping pronunciation.
        # One cannot use the string length of the entry, as it may contain punctuation
        # (e.g. ',') that has no corresponding Jyutping syllable, AND the entry
        # may be split up into multiple items (as described in horrible
        # workaround #1).
        jyut_len = len(row[1].split(" "))
        jyut_start = index + trad_len
        jyut = " ".join(simplified[jyut_start:jyut_start + jyut_len])

        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))

        # Horrible data workaround 3:
        # In the KFCD Yale data, all the definitions are listed as a single item, separated
        # by ','. Some entries have definitions, and some do not.
        # In the KFCD Jyutping edition, the definitions are also listed all as a single item.
        # However, many words do not have definitions; if there are no definitions then
        # we do NOT need to advance the index by 1 more item (which would have been
        # the definitions).
        # NOTE(review): confirm that ',' below is the separator actually used
        # in the data files (it may be the full-width comma).
        if row[2]:
            defs_traditional = row[2].split(",")
            defs_simplified = simplified[jyut_start + jyut_len].split(",")
            # Show both forms when the traditional and simplified definitions differ
            definitions = [
                def_traditional if def_traditional == def_simplified else
                def_traditional + " – " + def_simplified
                for def_traditional, def_simplified in zip(
                    defs_traditional, defs_simplified)
            ]
            index += trad_len + jyut_len + 1
        else:
            definitions = ["(沒有對應漢語詞彙)"]
            index += trad_len + jyut_len

        entry = objects.Entry(trad=trad,
                              simp=simp,
                              pin=pin,
                              jyut=jyut,
                              defs=definitions)

        # Group entries that share the same traditional form
        entries.setdefault(trad, []).append(entry)