def parse_variants(filename, words_traditional, words_simplified):
    variant_pairs = set()

    for line in read_csv(filename):
        if len(line) != 3 or line[0].startswith("#"):
            continue

        codepoint = line[0]
        fieldname = line[1]
        content = line[2]

        if fieldname not in ("kSimplifiedVariant", "kTraditionalVariant"):
            continue

        character = chr(int(codepoint[2:], 16))
        variants = [chr(int(item[2:], 16)) for item in content.split()]

        if fieldname == "kTraditionalVariant":
            for variant in variants:
                if (variant, character) not in variant_pairs:
                    # This character has a traditional variant - insert into words
                    entry = objects.Entry(variant, character, "", "")
                    words_traditional[variant].append(entry)
                    words_simplified[character].append(entry)
                    variant_pairs.add((variant, character))
        elif fieldname == "kSimplifiedVariant":
            for variant in variants:
                if (character, variant) not in variant_pairs:
                    # This character has a simplified variant - insert into words
                    entry = objects.Entry(character, variant, "", "")
                    words_traditional[character].append(entry)
                    words_simplified[variant].append(entry)
                    variant_pairs.add((character, variant))
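# Usage sketch (hypothetical helper, not part of the original source):
# parse_variants mutates dict-like containers that map characters to lists of
# entries, so defaultdict(list) is a natural fit. "Unihan_Variants.txt" is an
# illustrative filename; the codepoint decoding shown is the same chr/int
# slicing used above.
def _demo_parse_variants_containers():
    from collections import defaultdict
    words_traditional = defaultdict(list)
    words_simplified = defaultdict(list)
    # A kTraditionalVariant row like ("U+4E07", "kTraditionalVariant", "U+842C")
    # decodes its codepoints to characters like this:
    assert chr(int("U+4E07"[2:], 16)) == "万"
    assert chr(int("U+842C"[2:], 16)) == "萬"
    # parse_variants("Unihan_Variants.txt", words_traditional, words_simplified)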
def parse_cc_cedict_canto_readings(filename, entries):
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            if len(line) == 0 or line[0] == "#":
                continue

            split = line.split()
            trad = split[0]
            simp = split[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            jyut = line[line.index("{") + 1:line.index("}")].lower()

            entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut)

            if trad in entries:
                new_entry = True
                for existing_entry in entries[trad]:
                    if (existing_entry.simplified == simp
                            and existing_entry.pinyin == pin
                            and existing_entry.jyutping == jyut):
                        new_entry = False
                        break
                if new_entry:
                    entries[trad].append(entry)
            else:
                entries[trad] = [entry]
def parse_same_word_file(filename, words):
    for line in read_csv(filename):
        if len(line) != 2 or line[0] == "詞彙":
            continue

        trad = line[0]
        simp = HanziConv.toSimplified(trad)
        pin = lazy_pinyin(
            trad,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        pin = " ".join(pin).lower()
        pin = pin.strip().replace("v", "u:")
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")
        # Join the segmented definition with zero-width spaces so that it can
        # be reverse-searched
        defs = [
            objects.DefinitionTuple("\u200b".join(jieba.cut(line[1])),
                                    "臺陸用法和差異", [])
        ]

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.add(entry)
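# Sketch of the pinyin normalization used throughout these parsers
# (hypothetical demo function; exact pypinyin output depends on the installed
# version): pypinyin writes ü as "v" in numbered styles, while CEDICT-style
# sources write "u:", hence the replace("v", "u:") step.
def _demo_pinyin_normalization():
    from pypinyin import lazy_pinyin, Style
    pin = lazy_pinyin("女", style=Style.TONE3, neutral_tone_with_five=True)
    pin = " ".join(pin).lower().strip().replace("v", "u:")
    print(pin)  # expected to print "nu:3"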
def parse_cc_canto(filename, entries):
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            if len(line) == 0 or line[0] == "#":
                continue

            split = line.split()  # Splits by whitespace
            trad = split[0]
            simp = split[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            jyut = line[line.index("{") + 1:line.index("}")].lower()

            entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut)

            # Check if the entry is already in the dictionary
            if trad in entries:
                # If the traditional form is already in the dictionary, make
                # sure this is a new entry before adding it to the list
                new_entry = True
                for existing_entry in entries[trad]:
                    if (existing_entry.simplified == simp
                            and existing_entry.pinyin == pin
                            and existing_entry.jyutping == jyut):
                        new_entry = False
                        break
                if new_entry:
                    entries[trad].append(entry)
            else:
                entries[trad] = [entry]
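# Illustrative sketch (made-up sample line, following the
# "TRAD SIMP [pinyin] {jyutping} /definitions/" layout that the two functions
# above slice apart) of the bracket slicing used in parse_cc_canto and
# parse_cc_cedict_canto_readings:
def _demo_cc_canto_line_slicing():
    line = "你好 你好 [ni3 hao3] {nei5 hou2} /hello/\n"
    split = line.split()
    trad, simp = split[0], split[1]
    pin = line[line.index("[") + 1:line.index("]")].lower().replace("v", "u:")
    jyut = line[line.index("{") + 1:line.index("}")].lower()
    assert (trad, simp, pin, jyut) == ("你好", "你好", "ni3 hao3", "nei5 hou2")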
def parse_readings(filename, words_traditional, words_simplified):
    for line in read_csv(filename):
        if len(line) != 3 or line[0].startswith("#"):
            continue

        codepoint = line[0]
        fieldname = line[1]
        content = line[2]

        if fieldname not in ("kCantonese", "kMandarin", "kDefinition"):
            continue

        character = chr(int(codepoint[2:], 16))
        entry_added = False

        if character in words_traditional:
            freq = zipf_frequency(character, "zh")
            for entry in words_traditional[character]:
                entry.add_freq(freq)
                if fieldname == "kCantonese":
                    entry.add_jyutping(content)
                elif fieldname == "kMandarin":
                    pin = convert_pinyin_to_tone_numbers(content, character)
                    entry.add_pinyin(pin)
                elif fieldname == "kDefinition":
                    entry.add_defs([("", x.strip())
                                    for x in content.split(";")])
            entry_added = True

        if character in words_simplified:
            # Ignore simplified characters
            entry_added = True

        if not entry_added:
            trad = simp = character
            freq = zipf_frequency(trad, "zh")
            jyut = content if fieldname == "kCantonese" else ""
            pin = (convert_pinyin_to_tone_numbers(content, trad)
                   if fieldname == "kMandarin" else "")
            defs = ([("", x.strip()) for x in content.split(";")]
                    if fieldname == "kDefinition" else [])
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
            words_traditional[trad].append(entry)
            words_simplified[simp].append(entry)
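# Illustrative sketch of the Unihan field handling above: "U+XXXX" codepoints
# decode to characters, and kDefinition values split on semicolons into
# unlabeled (label, definition) pairs.
def _demo_unihan_fields():
    assert chr(int("U+4E2D"[2:], 16)) == "中"
    content = "middle; center; within"
    defs = [("", x.strip()) for x in content.split(";")]
    assert defs == [("", "middle"), ("", "center"), ("", "within")]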
def parse_file(filename, entries):
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            if len(line) == 0 or line[0] == "#":
                continue

            split = line.split()
            trad = split[0]
            simp = split[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            definitions = line[line.index("/") + 1:-2].split("/")

            entry = objects.Entry(trad=trad,
                                  simp=simp,
                                  pin=pin,
                                  defs=definitions)

            if trad in entries:
                entries[trad].append(entry)
            else:
                entries[trad] = [entry]
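# Illustrative sketch (made-up sample line) of the definition slicing in
# parse_file above: everything between the first "/" and the trailing "/\n"
# is split into individual definitions.
def _demo_cedict_definition_slicing():
    line = "你好 你好 [ni3 hao3] /hello/hi/\n"
    definitions = line[line.index("/") + 1:-2].split("/")
    assert definitions == ["hello", "hi"]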
def parse_file(filename, words):
    with open(filename) as f:
        data = json.load(f)

    items_parsed = 0

    # Each item in the JSON corresponds to one or more entries in the dictionary.
    # Most items map 1:1 to entries, e.g. "物質" is a single entry.
    # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng).
    #
    # In the vocabulary of the MoEDict, each item may correspond to multiple heteronyms,
    # and each heteronym maps to a single entry.
    for item in data:
        # For now, ignore variant characters that aren't properly encoded in Unicode
        if re.match(EXCLUDE_VARIANT_REGEX_PATTERN, item["title"]):
            continue

        # These do not change no matter the heteronym
        trad = item["title"]
        simp = HanziConv.toSimplified(trad)
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")

        # Build up a list of definitions for each heteronym
        defs = []

        # Distinguish between heteronyms by their pinyin – if the pinyin of the
        # current heteronym does not match the old pinyin, then a new heteronym
        # must be created
        last_heteronym_pin = ""

        # Go through each heteronym, creating Entry objects for each one
        for heteronym in item["heteronyms"]:
            if "pinyin" not in heteronym:
                logging.debug(
                    f"Could not find pinyin for heteronym of word {trad} "
                    f"with definitions {heteronym['definitions']}")
                continue

            pin = PINYIN_COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                "", heteronym["pinyin"])
            pin = PINYIN_LITERARY_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
            pin = PINYIN_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
            pin = PINYIN_SECOND_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub(
                "", pin)
            pin = pin.split()
            pin = [
                pinyin_to_tone_numbers(syllable, trad).split()
                for syllable in pin
            ]
            pin = list(itertools.chain.from_iterable(pin))
            pin = pin[:len(trad)]
            pin = " ".join(pin)

            if last_heteronym_pin != "" and pin != last_heteronym_pin:
                # Different pinyin means that we are now processing a new heteronym.
                # We must create an Entry object for the definitions of the old heteronym
                # and add it to the list of entries before processing the new one.
                entry = objects.Entry(trad,
                                      simp,
                                      last_heteronym_pin,
                                      jyut,
                                      freq=freq,
                                      defs=defs)
                words.append(entry)

                # Reset the definitions list
                defs = []

            for definition in heteronym["definitions"]:
                label = definition["type"] if "type" in definition else ""

                # Insert zero-width spaces so that we can reverse-search the definition
                def_tuple = objects.DefinitionTuple(
                    "\u200b".join(jieba.cut(definition["def"])), label, [])

                # Parse and add examples to this definition
                if "example" in definition:
                    for example in definition["example"]:
                        if EXAMPLE_REGEX_PATTERN.match(example):
                            # Every example is surrounded by "如:<example>", so only keep the example
                            example = EXAMPLE_REGEX_PATTERN.match(
                                example).group(1)
                            # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                            example_texts = (
                                INDIVIDUAL_EXAMPLE_REGEX_PATTERN.findall(
                                    example))
                        else:
                            logging.warning(
                                f"Found example that does not fit the normal "
                                f"example regex pattern: {trad}, {example}")
                            # Fall back to splitting on Chinese enumeration comma
                            example_texts = example.split("、")

                        for example_text in example_texts:
                            # Strip out weird whitespace
                            example_text = WHITESPACE_REGEX_PATTERN.sub(
                                "", example_text)

                            # Joining and splitting separates series of full-width punctuation marks
                            # into separate items, which is necessary so that lazy_pinyin() returns
                            # separate items for each full-width punctuation mark in the list it returns
                            #
                            # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                            # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                            # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                            # (Notice how "》:「" is now split up into three different items)
                            example_pinyin = lazy_pinyin(
                                " ".join(example_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            example_pinyin = " ".join(example_pinyin).lower()
                            example_pinyin = example_pinyin.strip().replace(
                                "v", "u:")

                            # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                            # given in the heteronym, attempt to replace pinyin corresponding to the
                            # characters in this heteronym with the pinyin provided by the JSON file.
                            #
                            # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                            # trad = "重", phrase_pinyin = "chong2"
                            # means that we should convert "zhong4 xin1" to "chong2 xin1"

                            # Strip out variant pronunciations for conversion purposes
                            phrase_pinyin = pin
                            phrase_pinyin = VARIANT_PRONUNCIATION_REGEX_PATTERN.sub(
                                "",
                                phrase_pinyin,
                            )
                            phrase_pinyin = (
                                COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                                    "",
                                    phrase_pinyin,
                                ))

                            # Do not try to match entries formatted like "那搭(Namibia)"
                            if not STRANGE_ENTRY_REGEX_PATTERN.match(trad):
                                try:
                                    example_pinyin = change_pinyin_to_match_phrase(
                                        example_text,
                                        example_pinyin,
                                        trad,
                                        phrase_pinyin,
                                    )
                                except Exception as e:
                                    logging.warning(
                                        f"Couldn't change pinyin in example for word {trad}: "
                                        f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                        f"{e}")
                                    traceback.print_exc()

                            def_tuple.examples.append(
                                objects.ExampleTuple("cmn", example_pinyin,
                                                     example_text))

                # Parse and add quotes to this definition
                if "quote" in definition:
                    for quote in definition["quote"]:
                        quote_text = re.sub(WHITESPACE_REGEX_PATTERN, "",
                                            quote)
                        quote_pinyin = lazy_pinyin(
                            " ".join(quote_text).split(),
                            style=Style.TONE3,
                            neutral_tone_with_five=True,
                        )
                        quote_pinyin = " ".join(quote_pinyin).lower()
                        quote_pinyin = quote_pinyin.strip().replace("v", "u:")

                        phrase_pinyin = pin
                        phrase_pinyin = re.sub(
                            VARIANT_PRONUNCIATION_REGEX_PATTERN, "",
                            phrase_pinyin)
                        phrase_pinyin = re.sub(
                            COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                            "",
                            phrase_pinyin,
                        )

                        if not re.match(STRANGE_ENTRY_REGEX_PATTERN, trad):
                            try:
                                quote_pinyin = change_pinyin_to_match_phrase(
                                    quote_text, quote_pinyin, trad,
                                    phrase_pinyin)
                            except Exception as e:
                                logging.warning(
                                    f"Couldn't change pinyin in quote for word {trad}: "
                                    f"{''.join(quote_text)}, {quote_pinyin}, {pin} "
                                    f"{e}")
                                traceback.print_exc()

                        def_tuple.examples.append(
                            objects.ExampleTuple("zho", quote_pinyin,
                                                 quote_text))

                # We currently ignore synonyms, antonyms, and "see also" links, because they are
                # linked to definitions and we have no way to display that data...
                defs.append(def_tuple)

            last_heteronym_pin = pin

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.append(entry)

        items_parsed += 1
        if not items_parsed % 500:
            print(f"Parsed entry #{items_parsed}")
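# Illustrative sketch (hypothetical helper, not part of the original source) of
# the " ".join(...).split() trick used in the example handling above: it yields
# one item per character, so full-width punctuation becomes its own list
# element before being handed to lazy_pinyin().
def _demo_character_splitting():
    text = "《儒林外史》"
    assert " ".join(text).split() == ["《", "儒", "林", "外", "史", "》"]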
def parse_word_file(file_name, words):
    with open(file_name, "r") as file:
        soup = BeautifulSoup(file, "html.parser")

        # Extract the word on the page
        trad = converter.convert_string(
            soup.find("span", class_="ChiCharFix").get_text()
        )
        if re.search(PRIVATE_USE_AREA_REGEX, trad):
            logging.warning(
                f"Hmm, looks like the word {trad} contains nonstandard "
                "characters, replacing with squares..."
            )
            trad = re.sub(
                PRIVATE_USE_AREA_REGEX, PRIVATE_USE_AREA_REPLACEMENT_STRING, trad
            )
        simp = HanziConv.toSimplified(trad)

        word = os.path.splitext(os.path.basename(file_name))[0]
        word = converter.convert_string(word)
        if re.search(PRIVATE_USE_AREA_REGEX, word):
            word = re.sub(
                PRIVATE_USE_AREA_REGEX, PRIVATE_USE_AREA_REPLACEMENT_STRING, word
            )
        if trad != word:
            logging.warning(
                f"Hmm, looks like the parsed word {trad} doesn't match the filename {word}"
            )
            return

        freq = zipf_frequency(trad, "zh")

        # Get the type of word
        label = soup.find("span", id=LABEL_REGEX).get_text()

        # Get the pronunciation, which is split up into the letter portion and the number portion
        jyutping_letters = soup.find("span", id=JYUTPING_LETTERS_ID_REGEX).get_text()
        jyutping_letters = jyutping_letters.split()
        jyutping_numbers = soup.find("span", id=JYUTPING_NUMBERS_ID_REGEX).get_text()
        jyutping_numbers = JYUTPING_NUMBERS_REGEX.findall(jyutping_numbers)
        # Replacement is needed because CUHK uses 7-8-9 notation for checked tones
        # instead of 1-3-6
        jyutping_numbers = [
            JYUTPING_MAP[x] if x in JYUTPING_MAP else x for x in jyutping_numbers
        ]
        jyut = [x[0] + x[1] for x in zip(jyutping_letters, jyutping_numbers)]
        jyut = " ".join(jyut)

        # Automatically generate pinyin
        pin = (
            " ".join(lazy_pinyin(trad, style=Style.TONE3, neutral_tone_with_five=True))
            .lower()
            .replace("v", "u:")
        )

        # Extract the meanings
        meaning_elements = soup.find_all("span", id=MEANING_REGEX)
        defs = [
            (label, meaning_element.get_text()) for meaning_element in meaning_elements
        ]

        # Add a remark, if one exists on the page
        remark = soup.find("span", id=REMARK_REGEX).get_text()
        if remark:
            defs.append(("備註", remark))

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.append(entry)
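# Illustrative sketch (hypothetical stand-in, not the original JYUTPING_MAP) of
# the checked-tone remapping above: CUHK's 7-8-9 entering tones correspond to
# Jyutping tones 1-3-6.
def _demo_checked_tone_remap():
    jyutping_map = {"7": "1", "8": "3", "9": "6"}
    letters, numbers = ["sik", "si"], ["7", "2"]
    remapped = [jyutping_map.get(x, x) for x in numbers]
    assert [l + n for l, n in zip(letters, remapped)] == ["sik1", "si2"]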
def parse_file(filename, words):
    with open(filename) as f:
        data = json.load(f)

    items_parsed = 0

    # Each item in the JSON corresponds to one or more entries in the dictionary.
    # Most items map 1:1 to entries, e.g. "物質" is a single entry.
    # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng).
    #
    # In the vocabulary of the CSLD, each item may correspond to multiple heteronyms,
    # and each heteronym maps to a single entry.
    for item in data:
        # These do not change no matter the heteronym
        trad = item["title"]
        simp = HanziConv.toSimplified(trad)
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")

        # Some items have multiple pronunciations (one for Taiwan, one for Mainland China)
        taiwan_pin = mainland_pin = ""

        # Build up a list of definitions for each heteronym
        taiwan_defs = []
        mainland_defs = []

        # Distinguish between heteronyms by their pinyin – if the pinyin of the
        # current heteronym does not match the old pinyin, then a new heteronym
        # must be created
        last_heteronym_pin = ""
        last_taiwan_pin = last_mainland_pin = ""

        # Go through each heteronym, creating Entry objects for each one
        for heteronym in item["heteronyms"]:
            if "pinyin" not in heteronym:
                logging.debug(
                    f"Could not find pinyin for heteronym of word {trad} "
                    f"with definitions {heteronym['definitions']}")
                continue

            # Filter out known bad pinyin
            if (trad in KNOWN_INVALID_SYLLABLES
                    and heteronym["pinyin"] in KNOWN_INVALID_SYLLABLES[trad]):
                pins = KNOWN_INVALID_SYLLABLES[trad][heteronym["pinyin"]]
            else:
                pins = heteronym["pinyin"].split("<br>陸⃝")

                # Some weird a's cause dragonmapper to break, so replace them with standard a's.
                pins = list(map(lambda x: x.replace("ɑ", "a"), pins))
                # Remove dashes in pinyin
                pins = list(map(lambda x: x.replace("-", " "), pins))
                # Remove commas in pinyin
                pins = list(map(lambda x: x.replace(",", ""), pins))
                # Remove weird characters
                pins = list(map(lambda x: x.replace("陸⃟", ""), pins))
                # Dragonmapper cannot handle some erhua
                pins = list(map(lambda x: x.replace("diǎr", "diǎn er"), pins))
                pins = list(map(lambda x: x.replace("biār", "biān er"), pins))

                try:
                    # Converting from pinyin -> zhuyin inserts spaces between characters
                    # Converting from zhuyin -> pinyin conserves these spaces
                    pins = [
                        transcriptions.zhuyin_to_pinyin(
                            transcriptions.pinyin_to_zhuyin(x),
                            accented=False) for x in pins
                    ]
                    for x in pins:
                        if x.count(" ") >= len(trad):
                            # This means that there was an extra space inserted
                            # somewhere; the pinyin is not valid
                            raise ValueError(
                                "Too many spaces in parsed Pinyin!")
                except Exception:
                    # Try parsing zhuyin as a backup
                    pins = heteronym["bopomofo"].split("<br>陸⃝")
                    # Remove full-width spaces in zhuyin
                    pins = list(map(lambda x: x.replace("\u3000", " "), pins))
                    try:
                        pins = [
                            transcriptions.zhuyin_to_pinyin(x, accented=False)
                            for x in pins
                        ]
                    except Exception as e:
                        logging.error(
                            f"Unable to split up Pinyin for word {trad}: {e}, "
                            "skipping word...")
                        continue

            if len(pins) > 1:
                taiwan_pin = pins[0]
                mainland_pin = pins[1]
            else:
                taiwan_pin = mainland_pin = pins[0]

            if (last_heteronym_pin != ""
                    and heteronym["pinyin"] != last_heteronym_pin):
                # A different pinyin means that we are now processing a new heteronym.
                # We must create an Entry object for the definitions of the old heteronym
                # and add it to the list of entries before processing the new one.
                entry = objects.Entry(trad,
                                      simp,
                                      last_taiwan_pin,
                                      jyut,
                                      freq=freq,
                                      defs=taiwan_defs)
                words.append(entry)

                if last_mainland_pin != last_taiwan_pin:
                    entry = objects.Entry(
                        trad,
                        simp,
                        last_mainland_pin,
                        jyut,
                        freq=freq,
                        defs=mainland_defs,
                    )
                    words.append(entry)

                # Reset the definitions lists
                taiwan_defs = []
                mainland_defs = []

            for definition in heteronym["definitions"]:
                taiwan_label = "臺" if taiwan_pin != mainland_pin else ""
                mainland_label = "陸" if mainland_pin != taiwan_pin else ""

                definition_text = definition["def"]

                # Take out parts of definitions that should be in labels
                for pattern in LABEL_REGEX_PATTERNS:
                    if re.match(pattern, definition_text):
                        definition_label, definition_text = re.match(
                            pattern, definition_text).group(1, 2)
                        taiwan_label += ("、" + definition_label
                                         if taiwan_label else definition_label)
                        mainland_label += ("、" + definition_label
                                           if mainland_label else
                                           definition_label)

                # Remove 臺⃝ and 陸⃝ from definitions, since Qt cannot display them
                definition_text = definition_text.replace("臺⃝", "臺:")
                definition_text = definition_text.replace("陸⃝", "陸:")

                # Insert zero-width spaces so that we can reverse-search the definition
                taiwan_def_tuple = objects.DefinitionTuple(
                    "\u200b".join(jieba.cut(definition_text)), taiwan_label,
                    [])
                mainland_def_tuple = objects.DefinitionTuple(
                    "\u200b".join(jieba.cut(definition_text)), mainland_label,
                    [])

                # Parse and add examples to this definition
                if "example" in definition:
                    for example in definition["example"]:
                        if re.match(EXAMPLE_REGEX_PATTERN, example):
                            # Every example is surrounded by "如:<example>", so only keep the example
                            example = re.match(EXAMPLE_REGEX_PATTERN,
                                               example).group(1)
                            # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                            example_texts = re.findall(
                                INDIVIDUAL_EXAMPLE_REGEX_PATTERN, example)
                        else:
                            logging.warning(
                                f"Found example that does not fit the normal "
                                f"example regex pattern: {trad}, {example}")
                            # Fall back to splitting on Chinese enumeration comma
                            example_texts = example.split("、")

                        for example_text in example_texts:
                            # Strip out weird whitespace
                            example_text = re.sub(WHITESPACE_REGEX_PATTERN,
                                                  "", example_text)

                            # Joining and splitting separates series of full-width punctuation marks
                            # into separate items, which is necessary so that lazy_pinyin() returns
                            # separate items for each full-width punctuation mark in the list it returns
                            #
                            # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                            # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                            # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                            # (Notice how "》:「" is now split up into three different items)
                            example_pinyin = lazy_pinyin(
                                " ".join(example_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            example_pinyin = " ".join(example_pinyin).lower()
                            example_pinyin = example_pinyin.strip().replace(
                                "v", "u:")

                            # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                            # given in the heteronym, attempt to replace pinyin corresponding to the
                            # characters in this heteronym with the pinyin provided by the JSON file.
                            #
                            # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                            # trad = "重", phrase_pinyin = "chong2"
                            # means that we should convert "zhong4 xin1" to "chong2 xin1"

                            # Strip out variant pronunciations for conversion purposes
                            for index, pin in enumerate(
                                [taiwan_pin, mainland_pin]):
                                phrase_pinyin = pin
                                phrase_pinyin = re.sub(
                                    VARIANT_PRONUNCIATION_REGEX_PATTERN,
                                    "",
                                    phrase_pinyin,
                                )
                                phrase_pinyin = re.sub(
                                    COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                    "",
                                    phrase_pinyin,
                                )

                                # Do not try to match entries formatted like "那搭(Namibia)"
                                if not re.match(STRANGE_ENTRY_REGEX_PATTERN,
                                                trad):
                                    try:
                                        example_pinyin = (
                                            change_pinyin_to_match_phrase(
                                                example_text,
                                                example_pinyin,
                                                trad,
                                                phrase_pinyin,
                                            ))
                                    except Exception as e:
                                        logging.warning(
                                            f"Couldn't change pinyin in example for word {trad}: "
                                            f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                            f"{e}")
                                        traceback.print_exc()

                                if index == 0:
                                    taiwan_def_tuple.examples.append(
                                        objects.ExampleTuple(
                                            "zho", example_pinyin,
                                            example_text))
                                elif index == 1:
                                    mainland_def_tuple.examples.append(
                                        objects.ExampleTuple(
                                            "zho", example_pinyin,
                                            example_text))

                taiwan_defs.append(taiwan_def_tuple)
                mainland_defs.append(mainland_def_tuple)

            last_heteronym_pin = heteronym["pinyin"]
            last_taiwan_pin = taiwan_pin
            last_mainland_pin = mainland_pin

        entry = objects.Entry(trad,
                              simp,
                              taiwan_pin,
                              jyut,
                              freq=freq,
                              defs=taiwan_defs)
        words.append(entry)

        if mainland_pin != taiwan_pin:
            entry = objects.Entry(trad,
                                  simp,
                                  mainland_pin,
                                  jyut,
                                  freq=freq,
                                  defs=mainland_defs)
            words.append(entry)

        items_parsed += 1
        if not items_parsed % 500:
            print(f"Parsed entry #{items_parsed}")
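# Sketch of the pinyin normalization round-trip above. The dragonmapper calls
# mirror the ones in parse_file; the sample word and the exact output format
# are assumptions and may vary by dragonmapper version.
def _demo_pinyin_round_trip():
    from dragonmapper import transcriptions
    zhuyin = transcriptions.pinyin_to_zhuyin("chóngxīn")
    # zhuyin_to_pinyin(..., accented=False) returns tone-numbered pinyin with
    # spaces between syllables, e.g. roughly "chong2 xin1"
    print(transcriptions.zhuyin_to_pinyin(zhuyin, accented=False))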
def parse_same_meaning_file(filename, words):
    for line in read_csv(filename):
        if len(line) != 17 or line[0] == "總分類":
            continue

        terms = defaultdict(set)
        for index in (4, 5, 6):
            if line[index]:
                terms["臺"].add(line[index])
        for index in (7, 8, 9):
            if line[index]:
                terms["陸"].add(line[index])
        for index in (10, 11, 12):
            if line[index]:
                terms["香"].add(line[index])
        for index in (13, 14, 15):
            if line[index]:
                terms["澳"].add(line[index])

        explanation = None
        if line[16]:
            explanation = objects.DefinitionTuple(
                "\u200b".join(jieba.cut(line[16])), "差異說明", [])

        for location in terms:
            for term in terms[location]:
                trad = term
                simp = HanziConv.toSimplified(trad)
                if term == line[4] and line[2]:
                    # Use the provided pinyin, which always corresponds at
                    # least to the first Taiwan term
                    pin = transcriptions.zhuyin_to_pinyin(
                        line[2].replace("\u3000", " "), accented=False)
                else:
                    pin = lazy_pinyin(
                        trad,
                        style=Style.TONE3,
                        neutral_tone_with_five=True,
                    )
                    pin = " ".join(pin).lower()
                    pin = pin.strip().replace("v", "u:")
                jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                         tone_numbers=True,
                                                         spaces=True)
                freq = zipf_frequency(trad, "zh")

                defs = terms.keys()
                defs = map(
                    lambda x: objects.DefinitionTuple(
                        "、".join(terms[x]), line[1] + ":" + x, []),
                    defs,
                )
                defs = list(defs)
                if explanation:
                    defs.append(explanation)

                entry = objects.Entry(trad,
                                      simp,
                                      pin,
                                      jyut,
                                      freq=freq,
                                      defs=defs)
                words.add(entry)
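# Illustrative sketch (made-up values) of the per-region definition
# construction above: each definition's label combines the category from
# line[1] with the region character.
def _demo_region_defs():
    terms = {"臺": {"計程車"}, "陸": {"出租車"}}
    category = "交通"  # stands in for line[1]
    defs = [("、".join(sorted(terms[x])), category + ":" + x) for x in terms]
    assert ("計程車", "交通:臺") in defs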
def parse_word_file(file_name, words):
    with open(file_name, "r") as file:
        # In pages with latin script, the title messes with BeautifulSoup's
        # HTML parsing, so remove the title and replace it with "CantoDict"
        file_text = file.read()
        file_text = re.sub(TITLE_REGEX_PATTERN, "<title>CantoDict</title>",
                           file_text)
        soup = BeautifulSoup(file_text, "html.parser")

        # Extract the traditional and simplified forms
        try:
            forms = [
                x.strip() for x in
                soup.find("td", class_="chinesebig").get_text().split(" / ")
            ]
            if len(forms) > 1:
                trad = forms[0].strip()
                simp = forms[1].strip()
            else:
                trad = forms[0].strip()
                # CantoDict sometimes reports that there is no simplified
                # variant, which is sometimes incorrect
                simp = HanziConv.toSimplified(trad)
        except Exception:
            # If a character has latin script in it, it may not have a class called "chinesebig"
            try:
                forms = [
                    x.strip() for x in
                    soup.select("span.word.script")[0].get_text().split(" / ")
                ]
                if len(forms) > 1:
                    trad = forms[0].strip()
                    simp = forms[1].strip()
                else:
                    trad = forms[0].strip()
                    # CantoDict sometimes reports that there is no simplified
                    # variant, which is sometimes incorrect
                    simp = HanziConv.toSimplified(trad)
            except Exception:
                logging.error(
                    f"Couldn't find traditional and simplified forms in file {file_name}"
                )
                return

        word = os.path.splitext(os.path.basename(file_name))[0].strip()
        if trad != word:
            if trad == HanziConv.toTraditional(word) or word == HanziConv.toSimplified(
                trad
            ):
                logging.debug(
                    f"File name {word} appears to be a simplified variant "
                    f"of {trad}. Ignoring..."
                )
                return
            else:
                if len(trad) == 1:
                    logging.error(
                        f"Hmm, looks like the parsed word {trad} doesn't "
                        f"match the file name {word}. If they are simplified "
                        "or traditional variants of each other, this error "
                        "can be safely ignored."
                    )
                else:
                    logging.error(
                        f"Hmm, looks like the parsed word {trad} doesn't "
                        f"match the file name {word}."
                    )
                return

        freq = zipf_frequency(trad, "zh")

        # Extract the pronunciations.
        # CantoDict indicates differences in literary/colloquial pronunciation
        # with *, but we don't support that, so remove the stars.
        jyut_element = soup.find("span", class_="cardjyutping")
        jyut = jyut_element.get_text() if jyut_element else ""
        jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "", jyut)
        jyut = jyut.strip()

        pin_element = soup.find("span", class_="cardpinyin")
        pin = pin_element.get_text() if pin_element else ""
        # CantoDict also indicates tone sandhi in pinyin with *, but we don't support that either
        pin = re.sub(LITERARY_PINYIN_READING_REGEX_PATTERN, "", pin)
        if not pin:
            pin = " ".join(
                lazy_pinyin(trad, style=Style.TONE3, neutral_tone_with_five=True)
            ).lower()
        # Replace 'v' in Pinyin with the u: that CEDICT uses
        pin = pin.strip().replace("v", "u:")
        # Remove the zero-width spaces that sometimes show up
        pin = pin.replace("\u200b", "")

        # CantoDict may have multiple pronunciations for an entry.
        # Check for multiple pronunciations in Jyutping:
        variant_jyutping = []
        if len(trad) == 1 and len(jyut.split(" ")) > len(trad):
            variant_jyutping = jyut.split(" ")
        elif ";" in jyut:
            variant_jyutping = [x.strip() for x in jyut.split("; ")]

        # Check for differences in PRC and Taiwan pronunciation
        variant_pinyin = []
        for pattern in MANDARIN_PRONUNCIATION_VARIANT_REGEX_PATTERNS:
            match = pattern.match(pin)
            if match:
                # Make sure the matched groups match the length of the characters
                if (len(match.group("prc").split(" ")) == len(trad)
                        and len(match.group("tw").split(" ")) == len(trad)):
                    variant_pinyin.append((match.group("prc"), match.group("tw")))
        if not variant_pinyin:
            for pattern in MANDARIN_PRONUNCIATION_PARTIAL_VARIANT_REGEX_PATTERNS:
                match = pattern.match(pin)
                if match:
                    prc_pin = pattern.sub(match.group("prc"), pin)
                    tw_pin = pattern.sub(match.group("tw"), pin)
                    # Make sure the found pronunciations match the length of the characters
                    if (len(prc_pin.split(" ")) == len(trad)
                            and len(tw_pin.split(" ")) == len(trad)):
                        variant_pinyin.append((prc_pin, tw_pin))

        # Also check for multiple pronunciations of single-character words
        if len(trad) == 1 and len(pin.split(" ")) > len(trad):
            for x in pin.split(" "):
                variant_pinyin.append((x, None))

        # Some entries give different meanings for different pronunciations;
        # assume this entry does not, and mark True if it does.
        variants_handled = False

        # Extract the meaning element
        meaning_element = soup.find("td", class_="wordmeaning")

        # Check for special labels in compound words (brandname, idiom, placename, etc.)
        special_label = ""
        special_pos_elem = meaning_element.find("img", class_="flagicon")
        if special_pos_elem:
            special_label = special_pos_elem["alt"]

        # The layout of compound word pages is different from single-character pages
        real_meaning_element = meaning_element.find("div", class_=None)
        if real_meaning_element:
            meaning_element = real_meaning_element

        # Remove children (these usually contain useless fluff that interferes
        # with definition parsing)
        children = meaning_element.find_all("div")
        children += meaning_element.find_all("span")
        for child in children:
            child.decompose()

        # Parse the meanings from the meaning element
        meanings = []
        # CantoDict puts some weird stuff in the meanings div, and the only way
        # to separate them out is to replace the <br> tags with "\n"
        for br in soup.find_all("br"):
            br.replace_with("\n")

        strings = DEFINITION_SPLITTING_REGEX_PATTERN.split(
            meaning_element.get_text())
        for string in strings:
            string = string.strip()
            if not string or any(x in string for x in illegal_strings):
                continue

            continue_parsing = True
            for pattern in JYUTPING_PINYIN_REGEX_PATTERNS:
                result = re.search(pattern, string)
                if result:
                    if meanings:
                        entry = objects.Entry(
                            trad,
                            simp,
                            pin,
                            jyut,
                            freq=freq,
                            defs=meanings,
                        )
                        words.append(entry)
                    # Then, extract the new pinyin and jyutping
                    # and reset the meanings tuple
                    jyut = result.group(1)
                    jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "",
                                  jyut)
                    pin = result.group(2) if result.group(2) else ""
                    meanings = []
                    continue_parsing = False
                    variants_handled = True
                    break
            if not continue_parsing:
                continue

            for pattern in JYUTPING_ONLY_REGEX_PATTERNS:
                result = re.search(pattern, string)
                if result:
                    if meanings:
                        entry = objects.Entry(
                            trad,
                            simp,
                            pin,
                            jyut,
                            freq=freq,
                            defs=meanings,
                        )
                        words.append(entry)
                    # Then, extract the new jyutping (but keep the old pinyin!)
                    # and reset the meanings tuple
                    jyut = result.group(1)
                    jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "",
                                  jyut)
                    meanings = []
                    variants_handled = True
                    continue_parsing = False
                    break
            if not continue_parsing:
                continue

            # Try to isolate one or more labels (usually a POS, or [華]:
            # indicating Mandarin-only usage, or [粵]: indicating Cantonese-only
            # usage)
            labels = []
            definition = string
            result = re.search(LABEL_REGEX_PATTERN, string)
            if not result:
                # Filter out bad non-standard strings that are completely
                # enclosed in square braces
                if string[0] == "[" and string[-1] == "]":
                    continue
            while result:
                labels.extend(result.group(1).strip().split(","))
                string = re.sub(LABEL_REGEX_PATTERN, "", string)
                result = re.search(LABEL_REGEX_PATTERN, string)

            # At this point, all the labels enclosed in square braces (possibly
            # followed by whitespace) should be stripped out of the beginning of
            # the string. Therefore, we can now assume the contents of the
            # string are the definition.
            definition = string
            if not definition:
                continue

            # Override black trying to add a trailing comma here
            # fmt: off
            labels = map(
                lambda x: pos_labels[x.lower()] if x.lower() in pos_labels else x,
                labels
            )
            # fmt: on
            label = ", ".join(labels)
            if not label and special_label:
                label = special_label

            meanings.append((label, definition))

        if meanings:
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=meanings)
            if not variant_jyutping and not variant_pinyin:
                words.append(entry)
            elif not variants_handled:
                for jyutping in variant_jyutping:
                    variant_entry = copy.deepcopy(entry)
                    variant_entry.add_jyutping(jyutping)
                    words.append(variant_entry)
                for prc, tw in variant_pinyin:
                    prc_variant = copy.deepcopy(entry)
                    prc_variant.add_pinyin(prc)
                    words.append(prc_variant)
                    if tw:
                        tw_variant = copy.deepcopy(entry)
                        tw_variant.add_pinyin(tw)
                        words.append(tw_variant)
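# Sketch (illustrative, using plain dicts instead of objects.Entry) of the
# variant fan-out at the end of parse_word_file above: each variant
# pronunciation gets its own deep copy so the entries don't share mutable
# state.
def _demo_variant_fanout():
    import copy
    base = {"trad": "重", "jyut": ""}
    entries = []
    for jyutping in ["cung4", "zung6"]:
        variant_entry = copy.deepcopy(base)
        variant_entry["jyut"] = jyutping
        entries.append(variant_entry)
    assert [e["jyut"] for e in entries] == ["cung4", "zung6"]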
def process_entry(line):
    entries = []

    # Parse the entry header
    header = line[1].strip('"')
    variants = header.split(",")
    for variant in variants:
        trad = variant.split(":")[0]
        simp = HanziConv.toSimplified(trad)
        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))
        freq = zipf_frequency(trad, "zh")

        # Parse all the Jyutping
        pronunciations = variant.split(":")[1:]
        if not pronunciations:
            entries.append(
                objects.Entry(trad=trad, simp=simp, jyut="", pin=pin,
                              freq=freq))
        else:
            for jyut in pronunciations:
                # Non-standard Jyutping starts with an exclamation mark,
                # e.g. !sdet1 or !sdaaf1. Remove the exclamation mark.
                if jyut.startswith("!"):
                    jyut = jyut[1:]
                entries.append(
                    objects.Entry(trad=trad,
                                  simp=simp,
                                  jyut=jyut,
                                  pin=pin,
                                  freq=freq))

    # Parse the entry content: explanations, examples
    content = line[2]
    if content.startswith("未有內容"):
        for entry in entries:
            entry.append_to_defs(objects.Definition(definition="x"))
        return entries

    definitions = []
    entry_labels = []
    near_synonyms = []
    antonyms = []

    # Explanations are separated by both '<explanation>' tags and '----' tags
    explanations = content.split("<explanation>")
    explanations = map(lambda x: x.split("----"), explanations)
    explanations = list(chain.from_iterable(explanations))

    for explanation_index, explanation in enumerate(explanations):
        if not explanation.strip():
            continue

        parse_explanation = True
        if explanation_index == 0:
            # The first item contains metadata about the entry
            parse_explanation = False
            for x in re.findall(PART_OF_SPEECH_REGEX, explanation):
                entry_labels.append(x)
            for x in re.findall(LABEL_REGEX, explanation):
                entry_labels.append(x)
            for x in re.findall(NEAR_SYNONYM_REGEX, explanation):
                near_synonyms.append(x)
            for x in re.findall(ANTONYM_REGEX, explanation):
                antonyms.append(x)

            # However, for some items, such as id 89764, the first item also
            # contains the explanation
            if explanation.find("yue:") != -1:
                # fmt: off
                explanation = explanation[explanation.find("yue:"):]
                # fmt: on
                parse_explanation = True

        if parse_explanation:
            definition = objects.Definition(label="、".join(entry_labels),
                                            examples=[])

            # Subsequent items contain explanations
            for index, item in enumerate(explanation.split("<eg>")):
                if index == 0:
                    # The first item contains the explanation.
                    # Translations in different languages are separated by newlines
                    explanation_translations = item.strip().split("\n")
                    # Strip out links
                    explanation_translations = map(
                        lambda x: (re.sub(LINK_REGEX, r"\1", x).replace("#", "")
                                   if re.search(LINK_REGEX, x) else
                                   x.replace("#", "")),
                        explanation_translations,
                    )
                    # fmt: off
                    # Segment the Chinese explanations (joined with zero-width
                    # spaces) so they show up in the FTS index
                    explanation_translations = map(
                        lambda x: (x[:x.find(":")], x[x.find(":") + 1:]),
                        explanation_translations)
                    explanation_translations = map(
                        lambda x: ("\u200b".join(jieba.cut(x[1]))
                                   if x[0] in ("yue", "zho") else x[1]),
                        explanation_translations)
                    # fmt: on
                    explanation = "\n".join(explanation_translations)
                    definition.definition = explanation
                else:
                    # Subsequent items contain examples for this explanation
                    definition.examples.append([])
                    example_translations = item.strip().split("\n")
                    # fmt: off
                    # Strip out links
                    example_translations = map(lambda x: x.replace("#", ""),
                                               example_translations)
                    # fmt: on
                    for translation in example_translations:
                        if not translation or translation == "----":
                            # Ignore lines that are not translations
                            continue
                        # fmt: off
                        lang = translation[:translation.find(":")]
                        if lang in ("yue", "zho"):
                            # Example content ends before the first space with
                            # an opening parenthesis after it (which indicates
                            # the start of a romanization), but some examples
                            # don't have romanization, so filter for that
                            if translation.find(" (") >= 0:
                                content = translation[translation.find(":") + 1:translation.find(" (")]
                            else:
                                content = translation[translation.find(":") + 1:]
                            pron = (translation[translation.find(" (") + 1:].strip("()")
                                    if len(translation.split()) >= 2 else "")
                            if lang == "yue":
                                definition.examples[-1].insert(
                                    0,
                                    objects.Example(lang=lang,
                                                    pron=pron,
                                                    content=content))
                            else:
                                definition.examples[-1].append(
                                    objects.Example(lang=lang,
                                                    content=content))
                        else:
                            content = translation[translation.find(":") + 1:]
                            if not content:
                                content = "x"
                            definition.examples[-1].append(
                                objects.Example(lang=lang, content=content))
                        # fmt: on

            definitions.append(definition)

    # Add synonyms, antonyms to list of definitions
    if near_synonyms:
        definitions.append(
            objects.Definition(definition="、".join(near_synonyms),
                               label="近義詞",
                               examples=[]))
    if antonyms:
        definitions.append(
            objects.Definition(definition="、".join(antonyms),
                               label="反義詞",
                               examples=[]))

    # Assign definitions to each entry
    for entry in entries:
        entry.add_defs(copy.deepcopy(definitions))

    # Add variants to the definitions of an entry; these are unique for each entry
    if len(variants) > 1:
        variants = set(map(lambda x: x.split(":")[0], variants))
        for entry in entries:
            # Do not add variants whose Chinese characters match the current
            # entry's characters into the current entry's "see also" section
            filtered_variants = filter(lambda x: x != entry.traditional,
                                       variants)
            entry.append_to_defs(
                objects.Definition(
                    definition="、".join(filtered_variants),
                    label="參看",
                    examples=[],
                ))

    return entries
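# Illustrative sketch of the explanation splitting in process_entry above. The
# sample content string is made up; only the "<explanation>"/"----" split logic
# is taken from the code.
def _demo_explanation_split():
    from itertools import chain
    content = "(pos:名詞)<explanation>yue:第一個解釋----yue:第二個解釋"
    explanations = content.split("<explanation>")
    explanations = map(lambda x: x.split("----"), explanations)
    explanations = list(chain.from_iterable(explanations))
    assert explanations == ["(pos:名詞)", "yue:第一個解釋", "yue:第二個解釋"]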
def parse_file(filename_traditional, filename_simplified_jyutping, entries):
    with open(filename_traditional, "r", encoding="utf8") as f:
        reader = csv.reader(f, delimiter=" ")
        traditional = list(reader)

    # The Kaifangcidian data for Jyutping is horrible.
    # The entire data is on a single line, printed like a flat Python list.
    # The entry may be a single item in the array, or multiple items.
    # The Jyutping pronunciation is a separate item for each character in the entry.
    # The translations to Mandarin may, or may not, follow the Jyutping!
    # And there is no separator between data for different entries :)
    with open(filename_simplified_jyutping, "r", encoding="utf8") as f:
        last_line = f.readlines()[-1]
    simplified = ast.literal_eval(last_line)

    index = 0
    for row in range(len(traditional)):
        if row < 9:
            # The first nine rows are comments and headers
            continue

        trad = traditional[row][0]

        # Horrible data workaround 1:
        # In KFCD Jyutping data, when the entry has Chinese characters in it,
        # the entry is presented as a single string in the array. (This is sane.)
        # If it does not (e.g. the word 'pat pat'), each series of characters,
        # delineated by a space, is a separate entry in the array
        # ('pat pat' => ["pat", "pat"])
        trad_len = len(trad.split(" "))
        if not hanzidentifier.has_chinese(trad):
            simp = "".join(simplified[index:index + trad_len])
        else:
            simp = simplified[index]

        # Horrible data workaround 2:
        # In KFCD Jyutping data, the Jyutping for each word in an entry is
        # presented as a separate string.
        # To find the indices that correspond to the entry we just extracted,
        # use the data from the KFCD Yale edition (which is formatted as a CSV)
        # to determine how many items comprise the Jyutping pronunciation.
        # One cannot use the string length of the entry, as it may contain
        # punctuation (e.g. ',') that has no corresponding Jyutping syllable,
        # AND the entry may be split up into multiple items (as described in
        # horrible workaround #1).
        jyut_len = len(traditional[row][1].split(" "))
        jyut = " ".join(simplified[index + trad_len:index + trad_len +
                                   jyut_len])

        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))

        # Horrible data workaround 3:
        # In the KFCD Yale data, all the definitions are listed as a single
        # item, separated by the wide-character '，'. Some entries have
        # definitions, and some do not.
        # In the KFCD Jyutping edition, the definitions are also listed all as
        # a single item. However, many words do not have definitions; if there
        # are no definitions, then we do NOT need to advance the index by 1
        # more item (which would have been the definitions).
        if traditional[row][2]:
            defs_traditional = traditional[row][2].split("，")
            defs_simplified = simplified[index + trad_len +
                                         jyut_len].split("，")
            definitions = []
            for (def_traditional, def_simplified) in zip(defs_traditional,
                                                         defs_simplified):
                if def_traditional != def_simplified:
                    definitions.append(def_traditional + " – " +
                                       def_simplified)
                else:
                    definitions.append(def_traditional)
            index += trad_len + jyut_len + 1
        else:
            definitions = ["(沒有對應漢語詞彙)"]
            index += trad_len + jyut_len

        entry = objects.Entry(trad=trad,
                              simp=simp,
                              pin=pin,
                              jyut=jyut,
                              defs=definitions)

        if trad in entries:
            entries[trad].append(entry)
        else:
            entries[trad] = [entry]
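# Worked sketch (illustrative data, not real KFCD content) of the index
# arithmetic above: for the entry "pat pat" with Jyutping "pat1 pat1" and no
# definitions, the flat list holds two character items plus two Jyutping
# items, so the cursor advances by 4.
def _demo_kfcd_index_advance():
    flat = ["pat", "pat", "pat1", "pat1", "乜嘢", "mat1", "je5", "什么"]
    index = 0
    trad = "pat pat"
    trad_len = len(trad.split(" "))
    simp = "".join(flat[index:index + trad_len])
    jyut_len = 2  # would come from the Yale CSV column for this entry
    jyut = " ".join(flat[index + trad_len:index + trad_len + jyut_len])
    index += trad_len + jyut_len  # no definitions item for this entry
    assert (simp, jyut, index) == ("patpat", "pat1 pat1", 4)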