def parse_variants(filename, words_traditional, words_simplified):
    variant_pairs = set()

    for line in read_csv(filename):
        if len(line) != 3 or line[0].startswith("#"):
            continue

        codepoint = line[0]
        fieldname = line[1]
        content = line[2]

        if fieldname not in ("kSimplifiedVariant", "kTraditionalVariant"):
            continue

        character = chr(int(codepoint[2:], 16))
        variants = [chr(int(item[2:], 16)) for item in content.split()]

        if fieldname == "kTraditionalVariant":
            for variant in variants:
                if (variant, character) not in variant_pairs:
                    # This character has a traditional variant - insert into words
                    entry = objects.Entry(variant, character, "", "")
                    words_traditional[variant].append(entry)
                    words_simplified[character].append(entry)
                    variant_pairs.add((variant, character))
        elif fieldname == "kSimplifiedVariant":
            for variant in variants:
                if (character, variant) not in variant_pairs:
                    # This character has a simplified variant - insert into words
                    entry = objects.Entry(character, variant, "", "")
                    words_traditional[character].append(entry)
                    words_simplified[variant].append(entry)
                    variant_pairs.add((character, variant))
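# Usage sketch (hypothetical helper, not part of the original source):
# parse_variants mutates dict-like containers that map characters to lists of
# entries, so defaultdict(list) is a natural fit. "Unihan_Variants.txt" is an
# illustrative filename; the codepoint decoding shown is the same chr/int
# slicing used above.
def _demo_parse_variants_containers():
    from collections import defaultdict
    words_traditional = defaultdict(list)
    words_simplified = defaultdict(list)
    # A kTraditionalVariant row like ("U+4E07", "kTraditionalVariant", "U+842C")
    # decodes its codepoints to characters like this:
    assert chr(int("U+4E07"[2:], 16)) == "万"
    assert chr(int("U+842C"[2:], 16)) == "萬"
    # parse_variants("Unihan_Variants.txt", words_traditional, words_simplified)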
def parse_cc_cedict_canto_readings(filename, entries):
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            if len(line) == 0 or line[0] == "#":
                continue

            split = line.split()
            trad = split[0]
            simp = split[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            jyut = line[line.index("{") + 1:line.index("}")].lower()

            entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut)

            if trad in entries:
                new_entry = True
                for existing_entry in entries[trad]:
                    if (existing_entry.simplified == simp
                            and existing_entry.pinyin == pin
                            and existing_entry.jyutping == jyut):
                        new_entry = False
                        break
                if new_entry:
                    entries[trad].append(entry)
            else:
                entries[trad] = [entry]
def parse_same_word_file(filename, words):
    for line in read_csv(filename):
        if len(line) != 2 or line[0] == "詞彙":
            continue

        trad = line[0]
        simp = HanziConv.toSimplified(trad)
        pin = lazy_pinyin(
            trad,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        pin = " ".join(pin).lower()
        pin = pin.strip().replace("v", "u:")
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")
        # Join the segmented definition with zero-width spaces so that it can
        # be reverse-searched
        defs = [
            objects.DefinitionTuple("\u200b".join(jieba.cut(line[1])),
                                    "臺陸用法和差異", [])
        ]

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.add(entry)
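# Sketch of the pinyin normalization used throughout these parsers
# (hypothetical demo function; exact pypinyin output depends on the installed
# version): pypinyin writes ü as "v" in numbered styles, while CEDICT-style
# sources write "u:", hence the replace("v", "u:") step.
def _demo_pinyin_normalization():
    from pypinyin import lazy_pinyin, Style
    pin = lazy_pinyin("女", style=Style.TONE3, neutral_tone_with_five=True)
    pin = " ".join(pin).lower().strip().replace("v", "u:")
    print(pin)  # expected to print "nu:3"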
def parse_cc_canto(filename, entries):
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            if len(line) == 0 or line[0] == "#":
                continue

            split = line.split()  # Splits by whitespace
            trad = split[0]
            simp = split[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            jyut = line[line.index("{") + 1:line.index("}")].lower()

            entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut)

            # Check if the entry is already in the dictionary
            if trad in entries:
                # If the traditional form is already in the dictionary, make
                # sure this is a new entry before adding it to the list
                new_entry = True
                for existing_entry in entries[trad]:
                    if (existing_entry.simplified == simp
                            and existing_entry.pinyin == pin
                            and existing_entry.jyutping == jyut):
                        new_entry = False
                        break
                if new_entry:
                    entries[trad].append(entry)
            else:
                entries[trad] = [entry]
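# Illustrative sketch (made-up sample line, following the
# "TRAD SIMP [pinyin] {jyutping} /definitions/" layout that the two functions
# above slice apart) of the bracket slicing used in parse_cc_canto and
# parse_cc_cedict_canto_readings:
def _demo_cc_canto_line_slicing():
    line = "你好 你好 [ni3 hao3] {nei5 hou2} /hello/\n"
    split = line.split()
    trad, simp = split[0], split[1]
    pin = line[line.index("[") + 1:line.index("]")].lower().replace("v", "u:")
    jyut = line[line.index("{") + 1:line.index("}")].lower()
    assert (trad, simp, pin, jyut) == ("你好", "你好", "ni3 hao3", "nei5 hou2")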
def parse_readings(filename, words_traditional, words_simplified):
    for line in read_csv(filename):
        if len(line) != 3 or line[0].startswith("#"):
            continue

        codepoint = line[0]
        fieldname = line[1]
        content = line[2]

        if fieldname not in ("kCantonese", "kMandarin", "kDefinition"):
            continue

        character = chr(int(codepoint[2:], 16))
        entry_added = False

        if character in words_traditional:
            freq = zipf_frequency(character, "zh")
            for entry in words_traditional[character]:
                entry.add_freq(freq)
                if fieldname == "kCantonese":
                    entry.add_jyutping(content)
                elif fieldname == "kMandarin":
                    pin = convert_pinyin_to_tone_numbers(content, character)
                    entry.add_pinyin(pin)
                elif fieldname == "kDefinition":
                    entry.add_defs([("", x.strip())
                                    for x in content.split(";")])
            entry_added = True

        if character in words_simplified:
            # Ignore simplified characters
            entry_added = True

        if not entry_added:
            trad = simp = character
            freq = zipf_frequency(trad, "zh")
            jyut = content if fieldname == "kCantonese" else ""
            pin = (convert_pinyin_to_tone_numbers(content, trad)
                   if fieldname == "kMandarin" else "")
            defs = ([("", x.strip()) for x in content.split(";")]
                    if fieldname == "kDefinition" else [])
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
            words_traditional[trad].append(entry)
            words_simplified[simp].append(entry)
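# Illustrative sketch of the Unihan field handling above: "U+XXXX" codepoints
# decode to characters, and kDefinition values split on semicolons into
# unlabeled (label, definition) pairs.
def _demo_unihan_fields():
    assert chr(int("U+4E2D"[2:], 16)) == "中"
    content = "middle; center; within"
    defs = [("", x.strip()) for x in content.split(";")]
    assert defs == [("", "middle"), ("", "center"), ("", "within")]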
def parse_file(filename, entries):
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            if len(line) == 0 or line[0] == "#":
                continue

            split = line.split()
            trad = split[0]
            simp = split[1]
            pin = line[line.index("[") + 1:line.index("]")].lower().replace(
                "v", "u:")
            definitions = line[line.index("/") + 1:-2].split("/")

            entry = objects.Entry(trad=trad,
                                  simp=simp,
                                  pin=pin,
                                  defs=definitions)

            if trad in entries:
                entries[trad].append(entry)
            else:
                entries[trad] = [entry]
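# Illustrative sketch (made-up sample line) of the definition slicing in
# parse_file above: everything between the first "/" and the trailing "/\n"
# is split into individual definitions.
def _demo_cedict_definition_slicing():
    line = "你好 你好 [ni3 hao3] /hello/hi/\n"
    definitions = line[line.index("/") + 1:-2].split("/")
    assert definitions == ["hello", "hi"]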
def parse_file(filename, words):
    with open(filename) as f:
        data = json.load(f)

    items_parsed = 0

    # Each item in the JSON corresponds to one or more entries in the dictionary.
    # Most items map 1:1 to entries, e.g. "物質" is a single entry.
    # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng).
    #
    # In the vocabulary of the MoEDict, each item may correspond to multiple heteronyms,
    # and each heteronym maps to a single entry.
    for item in data:
        # For now, ignore variant characters that aren't properly encoded in Unicode
        if re.match(EXCLUDE_VARIANT_REGEX_PATTERN, item["title"]):
            continue

        # These do not change no matter the heteronym
        trad = item["title"]
        simp = HanziConv.toSimplified(trad)
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")

        # Build up a list of definitions for each heteronym
        defs = []

        # Distinguish between heteronyms by their pinyin – if the pinyin of the
        # current heteronym does not match the old pinyin, then a new heteronym
        # must be created
        last_heteronym_pin = ""

        # Go through each heteronym, creating Entry objects for each one
        for heteronym in item["heteronyms"]:
            if "pinyin" not in heteronym:
                logging.debug(
                    f"Could not find pinyin for heteronym of word {trad} "
                    f"with definitions {heteronym['definitions']}")
                continue

            pin = PINYIN_COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                "", heteronym["pinyin"])
            pin = PINYIN_LITERARY_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
            pin = PINYIN_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
            pin = PINYIN_SECOND_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub(
                "", pin)
            pin = pin.split()
            pin = [
                pinyin_to_tone_numbers(syllable, trad).split()
                for syllable in pin
            ]
            pin = list(itertools.chain.from_iterable(pin))
            pin = pin[:len(trad)]
            pin = " ".join(pin)

            if last_heteronym_pin != "" and pin != last_heteronym_pin:
                # Different pinyin means that we are now processing a new heteronym.
                # We must create an Entry object for the definitions of the old heteronym
                # and add it to the list of entries before processing the new one.
                entry = objects.Entry(trad,
                                      simp,
                                      last_heteronym_pin,
                                      jyut,
                                      freq=freq,
                                      defs=defs)
                words.append(entry)

                # Reset the definitions list
                defs = []

            for definition in heteronym["definitions"]:
                label = definition["type"] if "type" in definition else ""

                # Insert zero-width spaces so that we can reverse-search the definition
                def_tuple = objects.DefinitionTuple(
                    "\u200b".join(jieba.cut(definition["def"])), label, [])

                # Parse and add examples to this definition
                if "example" in definition:
                    for example in definition["example"]:
                        if EXAMPLE_REGEX_PATTERN.match(example):
                            # Every example is surrounded by "如:<example>", so only keep the example
                            example = EXAMPLE_REGEX_PATTERN.match(
                                example).group(1)
                            # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                            example_texts = (
                                INDIVIDUAL_EXAMPLE_REGEX_PATTERN.findall(
                                    example))
                        else:
                            logging.warning(
                                f"Found example that does not fit the normal "
                                f"example regex pattern: {trad}, {example}")
                            # Fall back to splitting on Chinese enumeration comma
                            example_texts = example.split("、")

                        for example_text in example_texts:
                            # Strip out weird whitespace
                            example_text = WHITESPACE_REGEX_PATTERN.sub(
                                "", example_text)

                            # Joining and splitting separates series of full-width punctuation marks
                            # into separate items, which is necessary so that lazy_pinyin() returns
                            # separate items for each full-width punctuation mark in the list it returns
                            #
                            # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                            # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                            # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                            # (Notice how "》:「" is now split up into three different items)
                            example_pinyin = lazy_pinyin(
                                " ".join(example_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            example_pinyin = " ".join(example_pinyin).lower()
                            example_pinyin = example_pinyin.strip().replace(
                                "v", "u:")

                            # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                            # given in the heteronym, attempt to replace pinyin corresponding to the
                            # characters in this heteronym with the pinyin provided by the JSON file.
                            #
                            # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                            # trad = "重", phrase_pinyin = "chong2"
                            # means that we should convert "zhong4 xin1" to "chong2 xin1"

                            # Strip out variant pronunciations for conversion purposes
                            phrase_pinyin = pin
                            phrase_pinyin = VARIANT_PRONUNCIATION_REGEX_PATTERN.sub(
                                "",
                                phrase_pinyin,
                            )
                            phrase_pinyin = (
                                COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                                    "",
                                    phrase_pinyin,
                                ))

                            # Do not try to match entries formatted like "那搭(Namibia)"
                            if not STRANGE_ENTRY_REGEX_PATTERN.match(trad):
                                try:
                                    example_pinyin = change_pinyin_to_match_phrase(
                                        example_text,
                                        example_pinyin,
                                        trad,
                                        phrase_pinyin,
                                    )
                                except Exception as e:
                                    logging.warning(
                                        f"Couldn't change pinyin in example for word {trad}: "
                                        f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                        f"{e}")
                                    traceback.print_exc()

                            def_tuple.examples.append(
                                objects.ExampleTuple("cmn", example_pinyin,
                                                     example_text))

                # Parse and add quotes to this definition
                if "quote" in definition:
                    for quote in definition["quote"]:
                        quote_text = re.sub(WHITESPACE_REGEX_PATTERN, "",
                                            quote)
                        quote_pinyin = lazy_pinyin(
                            " ".join(quote_text).split(),
                            style=Style.TONE3,
                            neutral_tone_with_five=True,
                        )
                        quote_pinyin = " ".join(quote_pinyin).lower()
                        quote_pinyin = quote_pinyin.strip().replace("v", "u:")

                        phrase_pinyin = pin
                        phrase_pinyin = re.sub(
                            VARIANT_PRONUNCIATION_REGEX_PATTERN, "",
                            phrase_pinyin)
                        phrase_pinyin = re.sub(
                            COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                            "",
                            phrase_pinyin,
                        )

                        if not re.match(STRANGE_ENTRY_REGEX_PATTERN, trad):
                            try:
                                quote_pinyin = change_pinyin_to_match_phrase(
                                    quote_text, quote_pinyin, trad,
                                    phrase_pinyin)
                            except Exception as e:
                                logging.warning(
                                    f"Couldn't change pinyin in quote for word {trad}: "
                                    f"{''.join(quote_text)}, {quote_pinyin}, {pin} "
                                    f"{e}")
                                traceback.print_exc()

                        def_tuple.examples.append(
                            objects.ExampleTuple("zho", quote_pinyin,
                                                 quote_text))

                # We currently ignore synonyms, antonyms, and "see also" links, because they are
                # linked to definitions and we have no way to display that data...
                defs.append(def_tuple)

            last_heteronym_pin = pin

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.append(entry)

        items_parsed += 1
        if not items_parsed % 500:
            print(f"Parsed entry #{items_parsed}")
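# Illustrative sketch (hypothetical helper, not part of the original source) of
# the " ".join(...).split() trick used in the example handling above: it yields
# one item per character, so full-width punctuation becomes its own list
# element before being handed to lazy_pinyin().
def _demo_character_splitting():
    text = "《儒林外史》"
    assert " ".join(text).split() == ["《", "儒", "林", "外", "史", "》"]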
def parse_word_file(file_name, words):
    with open(file_name, "r") as file:
        soup = BeautifulSoup(file, "html.parser")

        # Extract the word on the page
        trad = converter.convert_string(
            soup.find("span", class_="ChiCharFix").get_text()
        )
        if re.search(PRIVATE_USE_AREA_REGEX, trad):
            logging.warning(
                f"Hmm, looks like the word {trad} contains nonstandard "
                "characters, replacing with squares..."
            )
            trad = re.sub(
                PRIVATE_USE_AREA_REGEX, PRIVATE_USE_AREA_REPLACEMENT_STRING, trad
            )
        simp = HanziConv.toSimplified(trad)

        word = os.path.splitext(os.path.basename(file_name))[0]
        word = converter.convert_string(word)
        if re.search(PRIVATE_USE_AREA_REGEX, word):
            word = re.sub(
                PRIVATE_USE_AREA_REGEX, PRIVATE_USE_AREA_REPLACEMENT_STRING, word
            )
        if trad != word:
            logging.warning(
                f"Hmm, looks like the parsed word {trad} doesn't match the filename {word}"
            )
            return

        freq = zipf_frequency(trad, "zh")

        # Get the type of word
        label = soup.find("span", id=LABEL_REGEX).get_text()

        # Get the pronunciation, which is split up into the letter portion and the number portion
        jyutping_letters = soup.find("span", id=JYUTPING_LETTERS_ID_REGEX).get_text()
        jyutping_letters = jyutping_letters.split()
        jyutping_numbers = soup.find("span", id=JYUTPING_NUMBERS_ID_REGEX).get_text()
        jyutping_numbers = JYUTPING_NUMBERS_REGEX.findall(jyutping_numbers)
        # Replacement is needed because CUHK uses 7-8-9 notation for checked tones
        # instead of 1-3-6
        jyutping_numbers = [
            JYUTPING_MAP[x] if x in JYUTPING_MAP else x for x in jyutping_numbers
        ]
        jyut = [x[0] + x[1] for x in zip(jyutping_letters, jyutping_numbers)]
        jyut = " ".join(jyut)

        # Automatically generate pinyin
        pin = (
            " ".join(lazy_pinyin(trad, style=Style.TONE3, neutral_tone_with_five=True))
            .lower()
            .replace("v", "u:")
        )

        # Extract the meanings
        meaning_elements = soup.find_all("span", id=MEANING_REGEX)
        defs = [
            (label, meaning_element.get_text()) for meaning_element in meaning_elements
        ]

        # Add a remark, if one exists on the page
        remark = soup.find("span", id=REMARK_REGEX).get_text()
        if remark:
            defs.append(("備註", remark))

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.append(entry)
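# Illustrative sketch (hypothetical stand-in, not the original JYUTPING_MAP) of
# the checked-tone remapping above: CUHK's 7-8-9 entering tones correspond to
# Jyutping tones 1-3-6.
def _demo_checked_tone_remap():
    jyutping_map = {"7": "1", "8": "3", "9": "6"}
    letters, numbers = ["sik", "si"], ["7", "2"]
    remapped = [jyutping_map.get(x, x) for x in numbers]
    assert [l + n for l, n in zip(letters, remapped)] == ["sik1", "si2"]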
def parse_file(filename, words):
    with open(filename) as f:
        data = json.load(f)

    items_parsed = 0

    # Each item in the JSON corresponds to one or more entries in the dictionary.
    # Most items map 1:1 to entries, e.g. "物質" is a single entry.
    # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng).
    #
    # In the vocabulary of the CSLD, each item may correspond to multiple heteronyms,
    # and each heteronym maps to a single entry.
    for item in data:
        # These do not change no matter the heteronym
        trad = item["title"]
        simp = HanziConv.toSimplified(trad)
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")

        # Some items have multiple pronunciations (one for Taiwan, one for Mainland China)
        taiwan_pin = mainland_pin = ""

        # Build up a list of definitions for each heteronym
        taiwan_defs = []
        mainland_defs = []

        # Distinguish between heteronyms by their pinyin – if the pinyin of the
        # current heteronym does not match the old pinyin, then a new heteronym
        # must be created
        last_heteronym_pin = ""
        last_taiwan_pin = last_mainland_pin = ""

        # Go through each heteronym, creating Entry objects for each one
        for heteronym in item["heteronyms"]:
            if "pinyin" not in heteronym:
                logging.debug(
                    f"Could not find pinyin for heteronym of word {trad} "
                    f"with definitions {heteronym['definitions']}")
                continue

            # Filter out known bad pinyin
            if (trad in KNOWN_INVALID_SYLLABLES
                    and heteronym["pinyin"] in KNOWN_INVALID_SYLLABLES[trad]):
                pins = KNOWN_INVALID_SYLLABLES[trad][heteronym["pinyin"]]
            else:
                pins = heteronym["pinyin"].split("<br>陸⃝")

                # Some weird a's cause dragonmapper to break, so replace them with standard a's.
                pins = list(map(lambda x: x.replace("ɑ", "a"), pins))
                # Remove dashes in pinyin
                pins = list(map(lambda x: x.replace("-", " "), pins))
                # Remove commas in pinyin
                pins = list(map(lambda x: x.replace(",", ""), pins))
                # Remove weird characters
                pins = list(map(lambda x: x.replace("陸⃟", ""), pins))
                # Dragonmapper cannot handle some erhua
                pins = list(map(lambda x: x.replace("diǎr", "diǎn er"), pins))
                pins = list(map(lambda x: x.replace("biār", "biān er"), pins))

                try:
                    # Converting from pinyin -> zhuyin inserts spaces between characters
                    # Converting from zhuyin -> pinyin conserves these spaces
                    pins = [
                        transcriptions.zhuyin_to_pinyin(
                            transcriptions.pinyin_to_zhuyin(x),
                            accented=False) for x in pins
                    ]
                    for x in pins:
                        if x.count(" ") >= len(trad):
                            # This means that there was an extra space inserted
                            # somewhere; the pinyin is not valid
                            raise ValueError(
                                "Too many spaces in parsed Pinyin!")
                except Exception:
                    # Try parsing zhuyin as a backup
                    pins = heteronym["bopomofo"].split("<br>陸⃝")
                    # Remove full-width spaces in zhuyin
                    pins = list(map(lambda x: x.replace("\u3000", " "), pins))
                    try:
                        pins = [
                            transcriptions.zhuyin_to_pinyin(x, accented=False)
                            for x in pins
                        ]
                    except Exception as e:
                        logging.error(
                            f"Unable to split up Pinyin for word {trad}: {e}, "
                            "skipping word...")
                        continue

            if len(pins) > 1:
                taiwan_pin = pins[0]
                mainland_pin = pins[1]
            else:
                taiwan_pin = mainland_pin = pins[0]

            if (last_heteronym_pin != ""
                    and heteronym["pinyin"] != last_heteronym_pin):
                # A different pinyin means that we are now processing a new heteronym.
                # We must create an Entry object for the definitions of the old heteronym
                # and add it to the list of entries before processing the new one.
                entry = objects.Entry(trad,
                                      simp,
                                      last_taiwan_pin,
                                      jyut,
                                      freq=freq,
                                      defs=taiwan_defs)
                words.append(entry)

                if last_mainland_pin != last_taiwan_pin:
                    entry = objects.Entry(
                        trad,
                        simp,
                        last_mainland_pin,
                        jyut,
                        freq=freq,
                        defs=mainland_defs,
                    )
                    words.append(entry)

                # Reset the definitions lists
                taiwan_defs = []
                mainland_defs = []

            for definition in heteronym["definitions"]:
                taiwan_label = "臺" if taiwan_pin != mainland_pin else ""
                mainland_label = "陸" if mainland_pin != taiwan_pin else ""

                definition_text = definition["def"]

                # Take out parts of definitions that should be in labels
                for pattern in LABEL_REGEX_PATTERNS:
                    if re.match(pattern, definition_text):
                        definition_label, definition_text = re.match(
                            pattern, definition_text).group(1, 2)
                        taiwan_label += ("、" + definition_label
                                         if taiwan_label else definition_label)
                        mainland_label += ("、" + definition_label
                                           if mainland_label else
                                           definition_label)

                # Remove 臺⃝ and 陸⃝ from definitions, since Qt cannot display them
                definition_text = definition_text.replace("臺⃝", "臺:")
                definition_text = definition_text.replace("陸⃝", "陸:")

                # Insert zero-width spaces so that we can reverse-search the definition
                taiwan_def_tuple = objects.DefinitionTuple(
                    "\u200b".join(jieba.cut(definition_text)), taiwan_label,
                    [])
                mainland_def_tuple = objects.DefinitionTuple(
                    "\u200b".join(jieba.cut(definition_text)), mainland_label,
                    [])

                # Parse and add examples to this definition
                if "example" in definition:
                    for example in definition["example"]:
                        if re.match(EXAMPLE_REGEX_PATTERN, example):
                            # Every example is surrounded by "如:<example>", so only keep the example
                            example = re.match(EXAMPLE_REGEX_PATTERN,
                                               example).group(1)
                            # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                            example_texts = re.findall(
                                INDIVIDUAL_EXAMPLE_REGEX_PATTERN, example)
                        else:
                            logging.warning(
                                f"Found example that does not fit the normal "
                                f"example regex pattern: {trad}, {example}")
                            # Fall back to splitting on Chinese enumeration comma
                            example_texts = example.split("、")

                        for example_text in example_texts:
                            # Strip out weird whitespace
                            example_text = re.sub(WHITESPACE_REGEX_PATTERN,
                                                  "", example_text)

                            # Joining and splitting separates series of full-width punctuation marks
                            # into separate items, which is necessary so that lazy_pinyin() returns
                            # separate items for each full-width punctuation mark in the list it returns
                            #
                            # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                            # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                            # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                            # (Notice how "》:「" is now split up into three different items)
                            example_pinyin = lazy_pinyin(
                                " ".join(example_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            example_pinyin = " ".join(example_pinyin).lower()
                            example_pinyin = example_pinyin.strip().replace(
                                "v", "u:")

                            # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                            # given in the heteronym, attempt to replace pinyin corresponding to the
                            # characters in this heteronym with the pinyin provided by the JSON file.
                            #
                            # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                            # trad = "重", phrase_pinyin = "chong2"
                            # means that we should convert "zhong4 xin1" to "chong2 xin1"

                            # Strip out variant pronunciations for conversion purposes
                            for index, pin in enumerate(
                                [taiwan_pin, mainland_pin]):
                                phrase_pinyin = pin
                                phrase_pinyin = re.sub(
                                    VARIANT_PRONUNCIATION_REGEX_PATTERN,
                                    "",
                                    phrase_pinyin,
                                )
                                phrase_pinyin = re.sub(
                                    COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                    "",
                                    phrase_pinyin,
                                )

                                # Do not try to match entries formatted like "那搭(Namibia)"
                                if not re.match(STRANGE_ENTRY_REGEX_PATTERN,
                                                trad):
                                    try:
                                        example_pinyin = (
                                            change_pinyin_to_match_phrase(
                                                example_text,
                                                example_pinyin,
                                                trad,
                                                phrase_pinyin,
                                            ))
                                    except Exception as e:
                                        logging.warning(
                                            f"Couldn't change pinyin in example for word {trad}: "
                                            f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                            f"{e}")
                                        traceback.print_exc()

                                if index == 0:
                                    taiwan_def_tuple.examples.append(
                                        objects.ExampleTuple(
                                            "zho", example_pinyin,
                                            example_text))
                                elif index == 1:
                                    mainland_def_tuple.examples.append(
                                        objects.ExampleTuple(
                                            "zho", example_pinyin,
                                            example_text))

                taiwan_defs.append(taiwan_def_tuple)
                mainland_defs.append(mainland_def_tuple)

            last_heteronym_pin = heteronym["pinyin"]
            last_taiwan_pin = taiwan_pin
            last_mainland_pin = mainland_pin

        entry = objects.Entry(trad,
                              simp,
                              taiwan_pin,
                              jyut,
                              freq=freq,
                              defs=taiwan_defs)
        words.append(entry)

        if mainland_pin != taiwan_pin:
            entry = objects.Entry(trad,
                                  simp,
                                  mainland_pin,
                                  jyut,
                                  freq=freq,
                                  defs=mainland_defs)
            words.append(entry)

        items_parsed += 1
        if not items_parsed % 500:
            print(f"Parsed entry #{items_parsed}")
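# Sketch of the pinyin normalization round-trip above. The dragonmapper calls
# mirror the ones in parse_file; the sample word and the exact output format
# are assumptions and may vary by dragonmapper version.
def _demo_pinyin_round_trip():
    from dragonmapper import transcriptions
    zhuyin = transcriptions.pinyin_to_zhuyin("chóngxīn")
    # zhuyin_to_pinyin(..., accented=False) returns tone-numbered pinyin with
    # spaces between syllables, e.g. roughly "chong2 xin1"
    print(transcriptions.zhuyin_to_pinyin(zhuyin, accented=False))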
def parse_same_meaning_file(filename, words):
    for line in read_csv(filename):
        if len(line) != 17 or line[0] == "總分類":
            continue

        terms = defaultdict(set)
        for index in (4, 5, 6):
            if line[index]:
                terms["臺"].add(line[index])
        for index in (7, 8, 9):
            if line[index]:
                terms["陸"].add(line[index])
        for index in (10, 11, 12):
            if line[index]:
                terms["香"].add(line[index])
        for index in (13, 14, 15):
            if line[index]:
                terms["澳"].add(line[index])

        explanation = None
        if line[16]:
            explanation = objects.DefinitionTuple(
                "\u200b".join(jieba.cut(line[16])), "差異說明", [])

        for location in terms:
            for term in terms[location]:
                trad = term
                simp = HanziConv.toSimplified(trad)
                if term == line[4] and line[2]:
                    # Use the provided pinyin, which always corresponds at
                    # least to the first Taiwan term
                    pin = transcriptions.zhuyin_to_pinyin(
                        line[2].replace("\u3000", " "), accented=False)
                else:
                    pin = lazy_pinyin(
                        trad,
                        style=Style.TONE3,
                        neutral_tone_with_five=True,
                    )
                    pin = " ".join(pin).lower()
                    pin = pin.strip().replace("v", "u:")
                jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                         tone_numbers=True,
                                                         spaces=True)
                freq = zipf_frequency(trad, "zh")

                defs = terms.keys()
                defs = map(
                    lambda x: objects.DefinitionTuple(
                        "、".join(terms[x]), line[1] + ":" + x, []),
                    defs,
                )
                defs = list(defs)
                if explanation:
                    defs.append(explanation)

                entry = objects.Entry(trad,
                                      simp,
                                      pin,
                                      jyut,
                                      freq=freq,
                                      defs=defs)
                words.add(entry)
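# Illustrative sketch (made-up values) of the per-region definition
# construction above: each definition's label combines the category from
# line[1] with the region character.
def _demo_region_defs():
    terms = {"臺": {"計程車"}, "陸": {"出租車"}}
    category = "交通"  # stands in for line[1]
    defs = [("、".join(sorted(terms[x])), category + ":" + x) for x in terms]
    assert ("計程車", "交通:臺") in defs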
def parse_word_file(file_name, words):
    with open(file_name, "r") as file:
        # In pages with latin script, the title messes with BeautifulSoup's
        # HTML parsing, so remove the title and replace it with "CantoDict"
        file_text = file.read()
        file_text = re.sub(TITLE_REGEX_PATTERN, "<title>CantoDict</title>",
                           file_text)
        soup = BeautifulSoup(file_text, "html.parser")

        # Extract the traditional and simplified forms
        try:
            forms = [
                x.strip() for x in
                soup.find("td", class_="chinesebig").get_text().split(" / ")
            ]
            if len(forms) > 1:
                trad = forms[0].strip()
                simp = forms[1].strip()
            else:
                trad = forms[0].strip()
                # CantoDict sometimes reports that there is no simplified
                # variant, which is sometimes incorrect
                simp = HanziConv.toSimplified(trad)
        except Exception:
            # If a character has latin script in it, it may not have a class called "chinesebig"
            try:
                forms = [
                    x.strip() for x in
                    soup.select("span.word.script")[0].get_text().split(" / ")
                ]
                if len(forms) > 1:
                    trad = forms[0].strip()
                    simp = forms[1].strip()
                else:
                    trad = forms[0].strip()
                    # CantoDict sometimes reports that there is no simplified
                    # variant, which is sometimes incorrect
                    simp = HanziConv.toSimplified(trad)
            except Exception:
                logging.error(
                    f"Couldn't find traditional and simplified forms in file {file_name}"
                )
                return

        word = os.path.splitext(os.path.basename(file_name))[0].strip()
        if trad != word:
            if trad == HanziConv.toTraditional(word) or word == HanziConv.toSimplified(
                trad
            ):
                logging.debug(
                    f"File name {word} appears to be a simplified variant "
                    f"of {trad}. Ignoring..."
                )
                return
            else:
                if len(trad) == 1:
                    logging.error(
                        f"Hmm, looks like the parsed word {trad} doesn't "
                        f"match the file name {word}. If they are simplified "
                        "or traditional variants of each other, this error "
                        "can be safely ignored."
                    )
                else:
                    logging.error(
                        f"Hmm, looks like the parsed word {trad} doesn't "
                        f"match the file name {word}."
                    )
                return

        freq = zipf_frequency(trad, "zh")

        # Extract the pronunciations.
        # CantoDict indicates differences in literary/colloquial pronunciation
        # with *, but we don't support that, so remove the stars.
        jyut_element = soup.find("span", class_="cardjyutping")
        jyut = jyut_element.get_text() if jyut_element else ""
        jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "", jyut)
        jyut = jyut.strip()

        pin_element = soup.find("span", class_="cardpinyin")
        pin = pin_element.get_text() if pin_element else ""
        # CantoDict also indicates tone sandhi in pinyin with *, but we don't support that either
        pin = re.sub(LITERARY_PINYIN_READING_REGEX_PATTERN, "", pin)
        if not pin:
            pin = " ".join(
                lazy_pinyin(trad, style=Style.TONE3, neutral_tone_with_five=True)
            ).lower()
        # Replace 'v' in Pinyin with the u: that CEDICT uses
        pin = pin.strip().replace("v", "u:")
        # Remove the zero-width spaces that sometimes show up
        pin = pin.replace("\u200b", "")

        # CantoDict may have multiple pronunciations for an entry.
        # Check for multiple pronunciations in Jyutping:
        variant_jyutping = []
        if len(trad) == 1 and len(jyut.split(" ")) > len(trad):
            variant_jyutping = jyut.split(" ")
        elif ";" in jyut:
            variant_jyutping = [x.strip() for x in jyut.split("; ")]

        # Check for differences in PRC and Taiwan pronunciation
        variant_pinyin = []
        for pattern in MANDARIN_PRONUNCIATION_VARIANT_REGEX_PATTERNS:
            match = pattern.match(pin)
            if match:
                # Make sure the matched groups match the length of the characters
                if (len(match.group("prc").split(" ")) == len(trad)
                        and len(match.group("tw").split(" ")) == len(trad)):
                    variant_pinyin.append((match.group("prc"), match.group("tw")))
        if not variant_pinyin:
            for pattern in MANDARIN_PRONUNCIATION_PARTIAL_VARIANT_REGEX_PATTERNS:
                match = pattern.match(pin)
                if match:
                    prc_pin = pattern.sub(match.group("prc"), pin)
                    tw_pin = pattern.sub(match.group("tw"), pin)
                    # Make sure the found pronunciations match the length of the characters
                    if (len(prc_pin.split(" ")) == len(trad)
                            and len(tw_pin.split(" ")) == len(trad)):
                        variant_pinyin.append((prc_pin, tw_pin))

        # Also check for multiple pronunciations of single-character words
        if len(trad) == 1 and len(pin.split(" ")) > len(trad):
            for x in pin.split(" "):
                variant_pinyin.append((x, None))

        # Some entries give different meanings for different pronunciations;
        # assume this entry does not, and mark True if it does.
        variants_handled = False

        # Extract the meaning element
        meaning_element = soup.find("td", class_="wordmeaning")

        # Check for special labels in compound words (brandname, idiom, placename, etc.)
        special_label = ""
        special_pos_elem = meaning_element.find("img", class_="flagicon")
        if special_pos_elem:
            special_label = special_pos_elem["alt"]

        # The layout of compound word pages is different from single-character pages
        real_meaning_element = meaning_element.find("div", class_=None)
        if real_meaning_element:
            meaning_element = real_meaning_element

        # Remove children (these usually contain useless fluff that interferes
        # with definition parsing)
        children = meaning_element.find_all("div")
        children += meaning_element.find_all("span")
        for child in children:
            child.decompose()

        # Parse the meanings from the meaning element
        meanings = []
        # CantoDict puts some weird stuff in the meanings div, and the only way
        # to separate them out is to replace the <br> tags with "\n"
        for br in soup.find_all("br"):
            br.replace_with("\n")

        strings = DEFINITION_SPLITTING_REGEX_PATTERN.split(
            meaning_element.get_text())
        for string in strings:
            string = string.strip()
            if not string or any(x in string for x in illegal_strings):
                continue

            continue_parsing = True
            for pattern in JYUTPING_PINYIN_REGEX_PATTERNS:
                result = re.search(pattern, string)
                if result:
                    if meanings:
                        entry = objects.Entry(
                            trad,
                            simp,
                            pin,
                            jyut,
                            freq=freq,
                            defs=meanings,
                        )
                        words.append(entry)
                    # Then, extract the new pinyin and jyutping
                    # and reset the meanings tuple
                    jyut = result.group(1)
                    jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "",
                                  jyut)
                    pin = result.group(2) if result.group(2) else ""
                    meanings = []
                    continue_parsing = False
                    variants_handled = True
                    break
            if not continue_parsing:
                continue

            for pattern in JYUTPING_ONLY_REGEX_PATTERNS:
                result = re.search(pattern, string)
                if result:
                    if meanings:
                        entry = objects.Entry(
                            trad,
                            simp,
                            pin,
                            jyut,
                            freq=freq,
                            defs=meanings,
                        )
                        words.append(entry)
                    # Then, extract the new jyutping (but keep the old pinyin!)
                    # and reset the meanings tuple
                    jyut = result.group(1)
                    jyut = re.sub(LITERARY_CANTONESE_READING_REGEX_PATTERN, "",
                                  jyut)
                    meanings = []
                    variants_handled = True
                    continue_parsing = False
                    break
            if not continue_parsing:
                continue

            # Try to isolate one or more labels (usually a POS, or [華]:
            # indicating Mandarin-only usage, or [粵]: indicating Cantonese-only
            # usage)
            labels = []
            definition = string
            result = re.search(LABEL_REGEX_PATTERN, string)
            if not result:
                # Filter out bad non-standard strings that are completely
                # enclosed in square braces
                if string[0] == "[" and string[-1] == "]":
                    continue
            while result:
                labels.extend(result.group(1).strip().split(","))
                string = re.sub(LABEL_REGEX_PATTERN, "", string)
                result = re.search(LABEL_REGEX_PATTERN, string)

            # At this point, all the labels enclosed in square braces (possibly
            # followed by whitespace) should be stripped out of the beginning of
            # the string. Therefore, we can now assume the contents of the
            # string are the definition.
            definition = string
            if not definition:
                continue

            # Override black trying to add a trailing comma here
            # fmt: off
            labels = map(
                lambda x: pos_labels[x.lower()] if x.lower() in pos_labels else x,
                labels
            )
            # fmt: on
            label = ", ".join(labels)
            if not label and special_label:
                label = special_label

            meanings.append((label, definition))

        if meanings:
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=meanings)
            if not variant_jyutping and not variant_pinyin:
                words.append(entry)
            elif not variants_handled:
                for jyutping in variant_jyutping:
                    variant_entry = copy.deepcopy(entry)
                    variant_entry.add_jyutping(jyutping)
                    words.append(variant_entry)
                for prc, tw in variant_pinyin:
                    prc_variant = copy.deepcopy(entry)
                    prc_variant.add_pinyin(prc)
                    words.append(prc_variant)
                    if tw:
                        tw_variant = copy.deepcopy(entry)
                        tw_variant.add_pinyin(tw)
                        words.append(tw_variant)
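# Sketch (illustrative, using plain dicts instead of objects.Entry) of the
# variant fan-out at the end of parse_word_file above: each variant
# pronunciation gets its own deep copy so the entries don't share mutable
# state.
def _demo_variant_fanout():
    import copy
    base = {"trad": "重", "jyut": ""}
    entries = []
    for jyutping in ["cung4", "zung6"]:
        variant_entry = copy.deepcopy(base)
        variant_entry["jyut"] = jyutping
        entries.append(variant_entry)
    assert [e["jyut"] for e in entries] == ["cung4", "zung6"]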
def process_entry(line):
    entries = []

    # Parse the entry header
    header = line[1].strip('"')
    variants = header.split(",")
    for variant in variants:
        trad = variant.split(":")[0]
        simp = HanziConv.toSimplified(trad)
        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))
        freq = zipf_frequency(trad, "zh")

        # Parse all the Jyutping
        pronunciations = variant.split(":")[1:]
        if not pronunciations:
            entries.append(
                objects.Entry(trad=trad, simp=simp, jyut="", pin=pin,
                              freq=freq))
        else:
            for jyut in pronunciations:
                # Non-standard Jyutping starts with an exclamation mark,
                # e.g. !sdet1 or !sdaaf1. Remove the exclamation mark.
                if jyut.startswith("!"):
                    jyut = jyut[1:]
                entries.append(
                    objects.Entry(trad=trad,
                                  simp=simp,
                                  jyut=jyut,
                                  pin=pin,
                                  freq=freq))

    # Parse the entry content: explanations, examples
    content = line[2]
    if content.startswith("未有內容"):
        for entry in entries:
            entry.append_to_defs(objects.Definition(definition="x"))
        return entries

    definitions = []
    entry_labels = []
    near_synonyms = []
    antonyms = []

    # Explanations are separated by both '<explanation>' tags and '----' tags
    explanations = content.split("<explanation>")
    explanations = map(lambda x: x.split("----"), explanations)
    explanations = list(chain.from_iterable(explanations))

    for explanation_index, explanation in enumerate(explanations):
        if not explanation.strip():
            continue

        parse_explanation = True
        if explanation_index == 0:
            # The first item contains metadata about the entry
            parse_explanation = False
            for x in re.findall(PART_OF_SPEECH_REGEX, explanation):
                entry_labels.append(x)
            for x in re.findall(LABEL_REGEX, explanation):
                entry_labels.append(x)
            for x in re.findall(NEAR_SYNONYM_REGEX, explanation):
                near_synonyms.append(x)
            for x in re.findall(ANTONYM_REGEX, explanation):
                antonyms.append(x)

            # However, for some items, such as id 89764, the first item also
            # contains the explanation
            if explanation.find("yue:") != -1:
                # fmt: off
                explanation = explanation[explanation.find("yue:"):]
                # fmt: on
                parse_explanation = True

        if parse_explanation:
            definition = objects.Definition(label="、".join(entry_labels),
                                            examples=[])

            # Subsequent items contain explanations
            for index, item in enumerate(explanation.split("<eg>")):
                if index == 0:
                    # The first item contains the explanation.
                    # Translations in different languages are separated by newlines
                    explanation_translations = item.strip().split("\n")
                    # Strip out links
                    explanation_translations = map(
                        lambda x: (re.sub(LINK_REGEX, r"\1", x).replace("#", "")
                                   if re.search(LINK_REGEX, x) else
                                   x.replace("#", "")),
                        explanation_translations,
                    )
                    # fmt: off
                    # Segment the Chinese explanations (joined with zero-width
                    # spaces) so they show up in the FTS index
                    explanation_translations = map(
                        lambda x: (x[:x.find(":")], x[x.find(":") + 1:]),
                        explanation_translations)
                    explanation_translations = map(
                        lambda x: ("\u200b".join(jieba.cut(x[1]))
                                   if x[0] in ("yue", "zho") else x[1]),
                        explanation_translations)
                    # fmt: on
                    explanation = "\n".join(explanation_translations)
                    definition.definition = explanation
                else:
                    # Subsequent items contain examples for this explanation
                    definition.examples.append([])
                    example_translations = item.strip().split("\n")
                    # fmt: off
                    # Strip out links
                    example_translations = map(lambda x: x.replace("#", ""),
                                               example_translations)
                    # fmt: on
                    for translation in example_translations:
                        if not translation or translation == "----":
                            # Ignore lines that are not translations
                            continue
                        # fmt: off
                        lang = translation[:translation.find(":")]
                        if lang in ("yue", "zho"):
                            # Example content ends before the first space with
                            # an opening parenthesis after it (which indicates
                            # the start of a romanization), but some examples
                            # don't have romanization, so filter for that
                            if translation.find(" (") >= 0:
                                content = translation[translation.find(":") + 1:translation.find(" (")]
                            else:
                                content = translation[translation.find(":") + 1:]
                            pron = (translation[translation.find(" (") + 1:].strip("()")
                                    if len(translation.split()) >= 2 else "")
                            if lang == "yue":
                                definition.examples[-1].insert(
                                    0,
                                    objects.Example(lang=lang,
                                                    pron=pron,
                                                    content=content))
                            else:
                                definition.examples[-1].append(
                                    objects.Example(lang=lang,
                                                    content=content))
                        else:
                            content = translation[translation.find(":") + 1:]
                            if not content:
                                content = "x"
                            definition.examples[-1].append(
                                objects.Example(lang=lang, content=content))
                        # fmt: on

            definitions.append(definition)

    # Add synonyms, antonyms to list of definitions
    if near_synonyms:
        definitions.append(
            objects.Definition(definition="、".join(near_synonyms),
                               label="近義詞",
                               examples=[]))
    if antonyms:
        definitions.append(
            objects.Definition(definition="、".join(antonyms),
                               label="反義詞",
                               examples=[]))

    # Assign definitions to each entry
    for entry in entries:
        entry.add_defs(copy.deepcopy(definitions))

    # Add variants to the definitions of an entry; these are unique for each entry
    if len(variants) > 1:
        variants = set(map(lambda x: x.split(":")[0], variants))
        for entry in entries:
            # Do not add variants whose Chinese characters match the current
            # entry's characters into the current entry's "see also" section
            filtered_variants = filter(lambda x: x != entry.traditional,
                                       variants)
            entry.append_to_defs(
                objects.Definition(
                    definition="、".join(filtered_variants),
                    label="參看",
                    examples=[],
                ))

    return entries
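# Illustrative sketch of the explanation splitting in process_entry above. The
# sample content string is made up; only the "<explanation>"/"----" split logic
# is taken from the code.
def _demo_explanation_split():
    from itertools import chain
    content = "(pos:名詞)<explanation>yue:第一個解釋----yue:第二個解釋"
    explanations = content.split("<explanation>")
    explanations = map(lambda x: x.split("----"), explanations)
    explanations = list(chain.from_iterable(explanations))
    assert explanations == ["(pos:名詞)", "yue:第一個解釋", "yue:第二個解釋"]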
def parse_file(filename_traditional, filename_simplified_jyutping, entries):
    with open(filename_traditional, "r", encoding="utf8") as f:
        reader = csv.reader(f, delimiter=" ")
        traditional = list(reader)

    # The Kaifangcidian data for Jyutping is horrible.
    # The entire data is on a single line, printed like a flat Python list.
    # The entry may be a single item in the array, or multiple items.
    # The Jyutping pronunciation is a separate item for each character in the entry.
    # The translations to Mandarin may, or may not, follow the Jyutping!
    # And there is no separator between data for different entries :)
    with open(filename_simplified_jyutping, "r", encoding="utf8") as f:
        last_line = f.readlines()[-1]
    simplified = ast.literal_eval(last_line)

    index = 0
    for row in range(len(traditional)):
        if row < 9:
            # The first nine rows are comments and headers
            continue

        trad = traditional[row][0]

        # Horrible data workaround 1:
        # In KFCD Jyutping data, when the entry has Chinese characters in it,
        # the entry is presented as a single string in the array. (This is sane.)
        # If it does not (e.g. the word 'pat pat'), each series of characters,
        # delineated by a space, is a separate entry in the array
        # ('pat pat' => ["pat", "pat"])
        trad_len = len(trad.split(" "))
        if not hanzidentifier.has_chinese(trad):
            simp = "".join(simplified[index:index + trad_len])
        else:
            simp = simplified[index]

        # Horrible data workaround 2:
        # In KFCD Jyutping data, the Jyutping for each word in an entry is
        # presented as a separate string.
        # To find the indices that correspond to the entry we just extracted,
        # use the data from the KFCD Yale edition (which is formatted as a CSV)
        # to determine how many items comprise the Jyutping pronunciation.
        # One cannot use the string length of the entry, as it may contain
        # punctuation (e.g. ',') that has no corresponding Jyutping syllable,
        # AND the entry may be split up into multiple items (as described in
        # horrible workaround #1).
        jyut_len = len(traditional[row][1].split(" "))
        jyut = " ".join(simplified[index + trad_len:index + trad_len +
                                   jyut_len])

        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace(
                            "v", "u:"))

        # Horrible data workaround 3:
        # In the KFCD Yale data, all the definitions are listed as a single
        # item, separated by the wide-character '，'. Some entries have
        # definitions, and some do not.
        # In the KFCD Jyutping edition, the definitions are also listed all as
        # a single item. However, many words do not have definitions; if there
        # are no definitions, then we do NOT need to advance the index by 1
        # more item (which would have been the definitions).
        if traditional[row][2]:
            defs_traditional = traditional[row][2].split("，")
            defs_simplified = simplified[index + trad_len +
                                         jyut_len].split("，")
            definitions = []
            for (def_traditional, def_simplified) in zip(defs_traditional,
                                                         defs_simplified):
                if def_traditional != def_simplified:
                    definitions.append(def_traditional + " – " +
                                       def_simplified)
                else:
                    definitions.append(def_traditional)
            index += trad_len + jyut_len + 1
        else:
            definitions = ["(沒有對應漢語詞彙)"]
            index += trad_len + jyut_len

        entry = objects.Entry(trad=trad,
                              simp=simp,
                              pin=pin,
                              jyut=jyut,
                              defs=definitions)

        if trad in entries:
            entries[trad].append(entry)
        else:
            entries[trad] = [entry]
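# Worked sketch (illustrative data, not real KFCD content) of the index
# arithmetic above: for the entry "pat pat" with Jyutping "pat1 pat1" and no
# definitions, the flat list holds two character items plus two Jyutping
# items, so the cursor advances by 4.
def _demo_kfcd_index_advance():
    flat = ["pat", "pat", "pat1", "pat1", "乜嘢", "mat1", "je5", "什么"]
    index = 0
    trad = "pat pat"
    trad_len = len(trad.split(" "))
    simp = "".join(flat[index:index + trad_len])
    jyut_len = 2  # would come from the Yale CSV column for this entry
    jyut = " ".join(flat[index + trad_len:index + trad_len + jyut_len])
    index += trad_len + jyut_len  # no definitions item for this entry
    assert (simp, jyut, index) == ("patpat", "pat1 pat1", 4)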