def parse_same_word_file(filename, words):
    """Parse a "same word, different meaning" CSV file and add an Entry
    for each data row to *words*.

    Rows are skipped when they do not have exactly two columns or when
    they are the header row (first column equal to "詞彙").
    """
    for row in read_csv(filename):
        # Skip malformed rows and the header row.
        if len(row) != 2 or row[0] == "詞彙":
            continue

        traditional = row[0]
        simplified = HanziConv.toSimplified(traditional)

        # Numbered-tone pinyin, lowercased, with "u:" standing in for ü.
        syllables = lazy_pinyin(
            traditional,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        pinyin = " ".join(syllables).lower().strip().replace("v", "u:")

        jyutping = pinyin_jyutping_sentence.jyutping(traditional,
                                                     tone_numbers=True,
                                                     spaces=True)
        frequency = zipf_frequency(traditional, "zh")

        # Segment the definition so it can be reverse-searched.
        definitions = [
            objects.DefinitionTuple("".join(jieba.cut(row[1])),
                                    "臺陸用法和差異", [])
        ]

        words.add(
            objects.Entry(traditional,
                          simplified,
                          pinyin,
                          jyutping,
                          freq=frequency,
                          defs=definitions))
def cantonese_transcript(inpstr):
    """Transcribe the CJK segments of *inpstr* into Jyutping.

    The input is split into alternating script segments; segments whose
    first character has a Unicode name starting with "CJK" are converted
    to Jyutping, all other segments are passed through unchanged.

    Returns the assembled string, or None if transcription of any CJK
    segment fails (an error message is written to stderr in that case).
    """
    # Collect pieces and join once at the end instead of repeated string
    # concatenation, which is quadratic in the worst case.
    pieces = []
    for segment in split_by_alphabet(inpstr):
        # unicodedata.name() yields e.g. "CJK UNIFIED IDEOGRAPH-4E00";
        # the first word of the name identifies CJK segments.
        if unicodedata.name(segment[0]).split(' ')[0] == 'CJK':
            try:
                transcript = pinyin_jyutping_sentence.jyutping(segment,
                                                               spaces=True)
            except Exception:
                # Was a bare `except:` — narrowed so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                sys.stderr.write(
                    "pinyin_jyutping_sentence error transcribing >%s<\n"
                    % segment)
                return None
            pieces.append(transcript)
        else:
            pieces.append(segment)
    return "".join(pieces)
def parse_file(filename, words):
    """Parse a MoEDict-style JSON dictionary file and append one
    objects.Entry per heteronym to *words*.

    filename -- path to a JSON file whose top level is a list of items,
                each carrying a "title" and a list of "heteronyms".
    words    -- list that receives the constructed Entry objects.
    """
    with open(filename) as f:
        data = json.load(f)
        items_parsed = 0
        # Each item in the JSON correspond to one or more entries in the dictionary
        # Most items map 1:1 to entries, e.g. "物質" is a single entry
        # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng)
        #
        # In the vocabulary of the MoEDict, each item may correspond to multiple heteronyms,
        # and each heteronym maps to a single entry.
        for item in data:
            # For now, ignore variant characters that aren't properly encoded in Unicode
            if re.match(EXCLUDE_VARIANT_REGEX_PATTERN, item["title"]):
                continue

            # These do not change no matter the heteronym
            trad = item["title"]
            simp = HanziConv.toSimplified(trad)
            jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                     tone_numbers=True,
                                                     spaces=True)
            freq = zipf_frequency(trad, "zh")

            # Build up a list of definitions for each heteronym
            defs = []

            # Distinguish between heteronyms by their pinyin – if the pinyin of the
            # current heteronym does not match the old pinyin, then a new heteronym
            # must be created
            last_heteronym_pin = ""

            # Go through each heteronym, creating Entry objects for each one
            for heteronym in item["heteronyms"]:
                if "pinyin" not in heteronym:
                    logging.debug(
                        f'Could not find pinyin for heteronym of word {trad} with definitions {heteronym["definitions"]}'
                    )
                    continue

                # Strip colloquial/literary/alternate-reading markers from the
                # raw pinyin before converting it to tone numbers.
                pin = PINYIN_COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                    "", heteronym["pinyin"])
                pin = PINYIN_LITERARY_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
                pin = PINYIN_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub("", pin)
                pin = PINYIN_SECOND_ALTERNATE_PRONUNCIATION_REGEX_PATTERN.sub(
                    "", pin)
                pin = pin.split()
                pin = [
                    pinyin_to_tone_numbers(syllable, trad).split()
                    for syllable in pin
                ]
                pin = list(itertools.chain.from_iterable(pin))
                # Keep at most one syllable per character of the headword.
                pin = pin[:len(trad)]
                pin = " ".join(pin)

                if last_heteronym_pin != "" and pin != last_heteronym_pin:
                    # Different pinyin means that we are now processing a new heteronym.
                    # We must create an Entry object for the definitions of the old heteronym
                    # and add it to the list of entries before processing the new one.
                    entry = objects.Entry(trad,
                                          simp,
                                          last_heteronym_pin,
                                          jyut,
                                          freq=freq,
                                          defs=defs)
                    words.append(entry)
                    # Reset the definitions list
                    defs = []

                for definition in heteronym["definitions"]:
                    label = definition["type"] if "type" in definition else ""

                    # Insert zero-width spaces so that we can reverse-search the definition
                    # NOTE(review): the join string below renders as empty in this
                    # copy; per the comment above it is presumably U+200B — confirm
                    # against the original file.
                    def_tuple = objects.DefinitionTuple(
                        "".join(jieba.cut(definition["def"])), label, [])

                    # Parse and add examples to this definition
                    if "example" in definition:
                        for example in definition["example"]:
                            if EXAMPLE_REGEX_PATTERN.match(example):
                                # Every example is surrounded by "如:<example>", so only keep the example
                                example = EXAMPLE_REGEX_PATTERN.match(
                                    example).group(1)
                                # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                                example_texts = (
                                    INDIVIDUAL_EXAMPLE_REGEX_PATTERN.findall(
                                        example))
                            else:
                                logging.warning(
                                    f"Found example that does not fit the normal example regex pattern: {trad}, {example}"
                                )
                                # Fall back to splitting on Chinese enumeration comma
                                example_texts = example.split("、")

                            for example_text in example_texts:
                                # Strip out weird whitespace
                                example_text = WHITESPACE_REGEX_PATTERN.sub(
                                    "", example_text)

                                # Joining and splitting separates series of full-width punctuation marks
                                # into separate items, which is necessary so that lazy_pinyin() returns
                                # separate items for each full-width punctuation mark in the list it returns
                                #
                                # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                                # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                                # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                                # (Notice how "》:「"" is now split up into three different items)
                                example_pinyin = lazy_pinyin(
                                    " ".join(example_text).split(),
                                    style=Style.TONE3,
                                    neutral_tone_with_five=True,
                                )
                                example_pinyin = " ".join(
                                    example_pinyin).lower()
                                example_pinyin = example_pinyin.strip(
                                ).replace("v", "u:")

                                # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                                # given in the heteronym, attempt to replace pinyin corresponding to the
                                # characters in this heteronym with the pinyin provided by the JSON file.
                                #
                                # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                                # trad = "重", phrase_pinyin = "chong2"
                                # means that we should convert "zhong4 xin1" to "chong2 xin1"

                                # Strip out variant pronunciations for conversion purposes
                                phrase_pinyin = pin
                                phrase_pinyin = VARIANT_PRONUNCIATION_REGEX_PATTERN.sub(
                                    "",
                                    phrase_pinyin,
                                )
                                phrase_pinyin = (
                                    COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN.sub(
                                        "",
                                        phrase_pinyin,
                                    ))

                                # Do not try to match entries formatted like "那搭(Namibia)"
                                if not STRANGE_ENTRY_REGEX_PATTERN.match(trad):
                                    try:
                                        example_pinyin = change_pinyin_to_match_phrase(
                                            example_text,
                                            example_pinyin,
                                            trad,
                                            phrase_pinyin,
                                        )
                                    except Exception as e:
                                        logging.warning(
                                            f"Couldn't change pinyin in example for word {trad}: "
                                            f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                            f"{e}")
                                        traceback.print_exc()

                                def_tuple.examples.append(
                                    objects.ExampleTuple(
                                        "cmn", example_pinyin, example_text))

                    # Parse and add quotes to this definition
                    if "quote" in definition:
                        for quote in definition["quote"]:
                            quote_text = re.sub(WHITESPACE_REGEX_PATTERN, "",
                                                quote)
                            quote_pinyin = lazy_pinyin(
                                " ".join(quote_text).split(),
                                style=Style.TONE3,
                                neutral_tone_with_five=True,
                            )
                            quote_pinyin = " ".join(quote_pinyin).lower()
                            quote_pinyin = quote_pinyin.strip().replace(
                                "v", "u:")
                            # Same pinyin-correction dance as for examples above.
                            phrase_pinyin = pin
                            phrase_pinyin = re.sub(
                                VARIANT_PRONUNCIATION_REGEX_PATTERN, "",
                                phrase_pinyin)
                            phrase_pinyin = re.sub(
                                COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                "",
                                phrase_pinyin,
                            )
                            if not re.match(STRANGE_ENTRY_REGEX_PATTERN, trad):
                                try:
                                    quote_pinyin = change_pinyin_to_match_phrase(
                                        quote_text, quote_pinyin, trad,
                                        phrase_pinyin)
                                except Exception as e:
                                    logging.warning(
                                        f"Couldn't change pinyin in quote for word {trad}: "
                                        f"{''.join(quote_text)}, {quote_pinyin}, {pin} "
                                        f"{e}")
                                    traceback.print_exc()
                            def_tuple.examples.append(
                                objects.ExampleTuple("zho", quote_pinyin,
                                                     quote_text))

                    # We currently ignore synonyms, antonyms, and "see also" links, because they are
                    # linked to definitions and we have no way to display that data...
                    defs.append(def_tuple)

                last_heteronym_pin = pin

            # Flush the final heteronym of this item.
            # NOTE(review): if every heteronym of an item lacks "pinyin", `pin`
            # here is stale from the previous item (or unbound for the first
            # item) — confirm upstream data guarantees at least one pinyin.
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
            words.append(entry)

            items_parsed += 1
            if not items_parsed % 500:
                print(f"Parsed entry #{items_parsed}")
def parse_file(filename, words):
    """Parse a Cross-Straits (CSLD) JSON dictionary file and append
    objects.Entry items to *words*.

    filename -- path to a JSON file whose top level is a list of items,
                each carrying a "title" and a list of "heteronyms".
    words    -- list that receives one Entry per heteronym, or two when
                the Taiwan and Mainland pronunciations differ.
    """
    with open(filename) as f:
        data = json.load(f)
        items_parsed = 0
        # Each item in the JSON correspond to one or more entries in the dictionary
        # Most items map 1:1 to entries, e.g. "物質" is a single entry
        # Some items are 多音字, so they map to multiple entries (e.g. 重 -> zhòng and chóng)
        #
        # In the vocabulary of the the CSLD, each item may correspond to multiple heteronyms,
        # and each heteronym maps to a single entry.
        for item in data:
            # These do not change no matter the heteronym
            trad = item["title"]
            simp = HanziConv.toSimplified(trad)
            jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                     tone_numbers=True,
                                                     spaces=True)
            freq = zipf_frequency(trad, "zh")

            # Some items have multiple pronunciations (one for Taiwan, one for Mainland China)
            taiwan_pin = mainland_pin = ""

            # Build up a list of definitions for each heteronym
            taiwan_defs = []
            mainland_defs = []

            # Distinguish between heteronyms by their pinyin – if the pinyin of the
            # current heteronym does not match the old pinyin, then a new heteronym
            # must be created
            last_heteronym_pin = ""
            last_taiwan_pin = last_mainland_pin = ""

            # Go through each heteronym, creating Entry objects for each one
            for heteronym in item["heteronyms"]:
                if "pinyin" not in heteronym:
                    logging.debug(
                        f'Could not find pinyin for heteronym of word {trad} with definitions {heteronym["definitions"]}'
                    )
                    continue

                # Filter out known bad pinyin
                if (trad in KNOWN_INVALID_SYLLABLES
                        and heteronym["pinyin"]
                        in KNOWN_INVALID_SYLLABLES[trad]):
                    pins = KNOWN_INVALID_SYLLABLES[trad][heteronym["pinyin"]]
                else:
                    # The raw pinyin carries the Taiwan reading first; a
                    # Mainland reading, if present, follows a "<br>陸⃝" marker.
                    pins = heteronym["pinyin"].split("<br>陸⃝")

                    # Some weird a's cause dragonmapper to break, so replace them with standard a's.
                    pins = list(map(lambda x: x.replace("ɑ", "a"), pins))
                    # Remove dashes in pinyin
                    pins = list(map(lambda x: x.replace("-", " "), pins))
                    # Remove commas in pinyin
                    pins = list(map(lambda x: x.replace(",", ""), pins))
                    # Remove weird characters
                    pins = list(map(lambda x: x.replace("陸⃟", ""), pins))
                    # Dragonmapper cannot handle some erhua
                    pins = list(
                        map(lambda x: x.replace("diǎr", "diǎn er"), pins))
                    pins = list(
                        map(lambda x: x.replace("biār", "biān er"), pins))

                    try:
                        # Converting from pinyin -> zhuyin inserts spaces between characters
                        # Converting from zhuyin -> pinyin conserves these spaces
                        pins = [
                            transcriptions.zhuyin_to_pinyin(
                                transcriptions.pinyin_to_zhuyin(x),
                                accented=False) for x in pins
                        ]
                        for x in pins:
                            if x.count(" ") >= len(trad):
                                # This means that there was an extra space inserted somewhere; the pinyin is not valid
                                raise ValueError(
                                    "Too many spaces in parsed Pinyin!")
                    except Exception as e:
                        # Try parsing zhuyin as a backup
                        pins = heteronym["bopomofo"].split("<br>陸⃝")
                        # Remove weird spaces in zhuyin
                        # NOTE(review): both arguments of this replace() render
                        # as an ordinary space here; the first is presumably a
                        # non-standard space character — confirm against the
                        # original file.
                        pins = list(map(lambda x: x.replace(" ", " "), pins))
                        try:
                            pins = [
                                transcriptions.zhuyin_to_pinyin(
                                    x, accented=False) for x in pins
                            ]
                        except Exception as e:
                            logging.error(
                                f"Unable to split up Pinyin for word {trad}: {e}, skipping word..."
                            )
                            continue

                if len(pins) > 1:
                    taiwan_pin = pins[0]
                    mainland_pin = pins[1]
                else:
                    taiwan_pin = mainland_pin = pins[0]

                if (last_heteronym_pin != ""
                        and heteronym["pinyin"] != last_heteronym_pin):
                    # A new different pinyin means that we are now processing a new heteronym.
                    # We must create an Entry object for the definitions of the old heteronym
                    # and add it to the list of entries before processing the new one.
                    entry = objects.Entry(trad,
                                          simp,
                                          last_taiwan_pin,
                                          jyut,
                                          freq=freq,
                                          defs=taiwan_defs)
                    words.append(entry)
                    # Only emit a separate Mainland entry when its reading
                    # differs from the Taiwan one.
                    if last_mainland_pin != last_taiwan_pin:
                        entry = objects.Entry(
                            trad,
                            simp,
                            last_mainland_pin,
                            jyut,
                            freq=freq,
                            defs=mainland_defs,
                        )
                        words.append(entry)
                    # Reset the definitions list
                    taiwan_defs = []
                    mainland_defs = []

                for definition in heteronym["definitions"]:
                    # Label definitions by region, but only when the two
                    # readings actually differ.
                    taiwan_label = "臺" if taiwan_pin != mainland_pin else ""
                    mainland_label = "陸" if mainland_pin != taiwan_pin else ""

                    definition_text = definition["def"]

                    # Take out parts of definitions that should be in labels
                    for pattern in LABEL_REGEX_PATTERNS:
                        if re.match(pattern, definition_text):
                            definition_label, definition_text = re.match(
                                pattern, definition_text).group(1, 2)
                            taiwan_label += ("、" + definition_label
                                             if taiwan_label else
                                             definition_label)
                            mainland_label += ("、" + definition_label
                                               if mainland_label else
                                               definition_label)

                    # Remove 臺⃝ and 陸⃝ from definitions, since Qt cannot display them
                    definition_text = definition_text.replace("臺⃝", "臺:")
                    definition_text = definition_text.replace("陸⃝", "陸:")

                    # Insert zero-width spaces so that we can reverse-search the definition
                    # NOTE(review): the join string below renders as empty in
                    # this copy; per the comment above it is presumably U+200B
                    # — confirm against the original file.
                    taiwan_def_tuple = objects.DefinitionTuple(
                        "".join(jieba.cut(definition_text)), taiwan_label, [])
                    mainland_def_tuple = objects.DefinitionTuple(
                        "".join(jieba.cut(definition_text)), mainland_label,
                        [])

                    # Parse and add examples to this definition
                    if "example" in definition:
                        for example in definition["example"]:
                            if re.match(EXAMPLE_REGEX_PATTERN, example):
                                # Every example is surrounded by "如:<example>", so only keep the example
                                example = re.match(EXAMPLE_REGEX_PATTERN,
                                                   example).group(1)
                                # Some examples contain multiple examples, so split them up by enclosing brackets 「」
                                example_texts = re.findall(
                                    INDIVIDUAL_EXAMPLE_REGEX_PATTERN, example)
                            else:
                                logging.warning(
                                    f"Found example that does not fit the normal example regex pattern: {trad}, {example}"
                                )
                                # Fall back to splitting on Chinese enumeration comma
                                example_texts = example.split("、")

                            for example_text in example_texts:
                                # Strip out weird whitespace
                                example_text = re.sub(WHITESPACE_REGEX_PATTERN,
                                                      "", example_text)

                                # Joining and splitting separates series of full-width punctuation marks
                                # into separate items, which is necessary so that lazy_pinyin() returns
                                # separate items for each full-width punctuation mark in the list it returns
                                #
                                # e.g. "《儒林外史.第四六回》:「成老爹道..." turns into
                                # "《 儒 林 外 史 . 第 四 六 回 》 : 「 成 老 爹 道", which turns into
                                # ['《', '儒', '林', '外', '史', '.', '第', '四', '六', '回', '》', ':', '「', '成', '老', '爹', '道']
                                # (Notice how "》:「"" is now split up into three different items)
                                example_pinyin = lazy_pinyin(
                                    " ".join(example_text).split(),
                                    style=Style.TONE3,
                                    neutral_tone_with_five=True,
                                )
                                example_pinyin = " ".join(
                                    example_pinyin).lower()
                                example_pinyin = example_pinyin.strip(
                                ).replace("v", "u:")

                                # Since the pinyin returned by lazy_pinyin doesn't always match the pinyin
                                # given in the heteronym, attempt to replace pinyin corresponding to the
                                # characters in this heteronym with the pinyin provided by the JSON file.
                                #
                                # e.g. example_text = "重新"; example_pinyin = "zhong4 xin1" (returned by lazy_pinyin)
                                # trad = "重", phrase_pinyin = "chong2"
                                # means that we should convert "zhong4 xin1" to "chong2 xin1"

                                # Strip out variant pronunciations for conversion purposes
                                # index 0 is the Taiwan reading, index 1 the
                                # Mainland reading.
                                for index, pin in enumerate(
                                    [taiwan_pin, mainland_pin]):
                                    phrase_pinyin = pin
                                    phrase_pinyin = re.sub(
                                        VARIANT_PRONUNCIATION_REGEX_PATTERN,
                                        "",
                                        phrase_pinyin,
                                    )
                                    phrase_pinyin = re.sub(
                                        COLLOQUIAL_PRONUNCIATION_REGEX_PATTERN,
                                        "",
                                        phrase_pinyin,
                                    )

                                    # Do not try to match entries formatted like "那搭(Namibia)"
                                    if not re.match(
                                            STRANGE_ENTRY_REGEX_PATTERN,
                                            trad):
                                        try:
                                            example_pinyin = (
                                                change_pinyin_to_match_phrase(
                                                    example_text,
                                                    example_pinyin,
                                                    trad,
                                                    phrase_pinyin,
                                                ))
                                        except Exception as e:
                                            logging.warning(
                                                f"Couldn't change pinyin in example for word {trad}: "
                                                f"{''.join(example_text)}, {example_pinyin}, {pin}, "
                                                f"{e}")
                                            traceback.print_exc()

                                    if index == 0:
                                        taiwan_def_tuple.examples.append(
                                            objects.ExampleTuple(
                                                "zho", example_pinyin,
                                                example_text))
                                    elif index == 1:
                                        mainland_def_tuple.examples.append(
                                            objects.ExampleTuple(
                                                "zho", example_pinyin,
                                                example_text))

                    taiwan_defs.append(taiwan_def_tuple)
                    mainland_defs.append(mainland_def_tuple)

                last_heteronym_pin = heteronym["pinyin"]
                last_taiwan_pin = taiwan_pin
                last_mainland_pin = mainland_pin

            # Flush the final heteronym of this item.
            entry = objects.Entry(trad,
                                  simp,
                                  taiwan_pin,
                                  jyut,
                                  freq=freq,
                                  defs=taiwan_defs)
            words.append(entry)
            if mainland_pin != taiwan_pin:
                entry = objects.Entry(trad,
                                      simp,
                                      mainland_pin,
                                      jyut,
                                      freq=freq,
                                      defs=mainland_defs)
                words.append(entry)

            items_parsed += 1
            if not items_parsed % 500:
                print(f"Parsed entry #{items_parsed}")
def parse_same_meaning_file(filename, words):
    """Parse a "same meaning, different word" CSV file and add an Entry
    for every regional term found on each data row to *words*.

    Rows are skipped when they do not have exactly 17 columns or when
    they are the header row (first column equal to "總分類").
    """
    # Column groups holding the terms used in each region.
    region_columns = (
        ("臺", (4, 5, 6)),
        ("陸", (7, 8, 9)),
        ("香", (10, 11, 12)),
        ("澳", (13, 14, 15)),
    )

    for row in read_csv(filename):
        # Skip malformed rows and the header row.
        if len(row) != 17 or row[0] == "總分類":
            continue

        # Group the non-empty terms by the region they belong to.
        terms = defaultdict(set)
        for region, columns in region_columns:
            for column in columns:
                if row[column]:
                    terms[region].add(row[column])

        # Column 16 optionally explains the cross-region differences.
        explanation = None
        if row[16]:
            explanation = objects.DefinitionTuple(
                "".join(jieba.cut(row[16])), "差異說明", [])

        for region in terms:
            for term in terms[region]:
                trad = term
                simp = HanziConv.toSimplified(trad)
                if term == row[4] and row[2]:
                    # Use the provided pinyin, which always corresponds at
                    # least to the first Taiwan term
                    pin = transcriptions.zhuyin_to_pinyin(
                        row[2].replace(" ", " "), accented=False)
                else:
                    pin = " ".join(
                        lazy_pinyin(
                            trad,
                            style=Style.TONE3,
                            neutral_tone_with_five=True,
                        )).lower().strip().replace("v", "u:")

                jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                         tone_numbers=True,
                                                         spaces=True)
                freq = zipf_frequency(trad, "zh")

                # One definition per region, listing that region's terms.
                defs = [
                    objects.DefinitionTuple("、".join(terms[location]),
                                            row[1] + ":" + location, [])
                    for location in terms
                ]
                if explanation:
                    defs.append(explanation)

                words.add(
                    objects.Entry(trad, simp, pin, jyut, freq=freq,
                                  defs=defs))
def parse_sentence_file(
    filename,
    source,
    target,
    sentences,
    nonchinese_sentences,
    intermediate_ids,
    enable_jyutping,
    enable_pinyin,
):
    """Parse a tab-separated sentence file (id, language, text per line).

    Lines in *source* are stored in *sentences* as ChineseSentence rows
    (with pinyin/jyutping when the corresponding flag is enabled); lines
    in *target* are stored in *nonchinese_sentences*; ids of every other
    language are collected into *intermediate_ids*.
    """
    print("Parsing sentence file...")
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            # Skip empty lines and comment lines.
            if len(line) == 0 or line[0] == "#":
                continue

            fields = line.split()
            sentence_id = fields[0]
            lang = fields[1]
            # The sentence text starts right after the second tab.
            second_tab = line.index("\t", line.index("\t") + 1)
            text = line[second_tab + 1:]

            if lang == source:
                # Note: the Chinese text keeps its trailing newline — only
                # the translation branch strips it.
                if hanzidentifier.is_simplified(text):
                    simp = text
                    trad = HanziConv.toTraditional(text)
                else:
                    trad = text
                    simp = HanziConv.toSimplified(text)

                pin = ""
                if enable_pinyin:
                    syllables = lazy_pinyin(
                        trad, style=Style.TONE3, neutral_tone_with_five=True
                    )
                    pin = " ".join(syllables).lower()
                    pin = pin.strip().replace("v", "u:")

                jyut = ""
                if enable_jyutping:
                    jyut = pinyin_jyutping_sentence.jyutping(
                        trad, tone_numbers=True, spaces=True
                    )

                sentences[sentence_id] = objects.ChineseSentence(
                    sentence_id,
                    trad,
                    simp,
                    pin,
                    jyut,
                    lang,
                )
            elif lang == target:
                nonchinese_sentences[sentence_id] = objects.NonChineseSentence(
                    sentence_id, text.strip(), lang
                )
            else:
                # Neither source nor target: remember the id for linking.
                intermediate_ids.add(sentence_id)