def test_pinyinize(self):
    """Numbered pinyin converts to tone-marked pinyin, syllable by syllable."""
    expectations = (
        ('ke3neng2', u'kěnéng'),
        ('xi3huan1', u'xǐhuān'),
        ('xing2 dong4', u'xíng dòng'),
        ('ni3 hao3', u'nǐ hǎo'),
        # tone number 5 leaves the syllable without a tone mark
        ('ni3 hao5', u'nǐ hao'),
        ('hua4', u'huà'),
        ('you3', u'yǒu'),
        # upper/lower case of the input letters must survive the conversion
        ('xi3HUan1', u'xǐHUān'),
    )
    for numbered, marked in expectations:
        self.assertEqual(pinyinize(numbered), marked)
def test_double_dot(self):
    """Check handling of "ü", spelled ``v`` or ``u:`` in numbered pinyin.

    Both spellings must convert to the same tone-marked text, and converting
    the tone-marked text back with ``depinyinize`` must give the ``v``
    spelling.  Each case is asserted once (the earlier copy-pasted
    duplicates added no coverage).
    """
    numbered_to_marked = (
        ('nve4dai4', u'nüèdài'),
        ('lu:4fa3', u'lǜfǎ'),
        ('nu:e4dai4', u'nüèdài'),
        ('lv4fa3', u'lǜfǎ'),
    )
    for numbered, marked in numbered_to_marked:
        self.assertEqual(pinyinize(numbered), marked)

    marked_to_numbered = (
        (u'lǜfǎ', 'lv4fa3'),
        (u'nüèdài', 'nve4dai4'),
    )
    for marked, numbered in marked_to_numbered:
        self.assertEqual(numbered, depinyinize(marked))
def _load_wiktionary_lines(path):
    """Map each word to the first line of ``path`` that lists it.

    The word is taken from the second comma-separated field of a line.
    Lines with fewer than two fields (blank or malformed) are skipped.
    """
    first_line_for = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            fields = raw.split(",")
            if len(fields) < 2:
                continue
            if not raw.endswith("\n"):
                # last line of the file: keep the output one entry per line
                raw += "\n"
            # setdefault keeps the first occurrence, as the linear scan did
            first_line_for.setdefault(fields[1].strip(), raw)
    return first_line_for


def _cedict_row(char):
    """Build the output line for ``char`` from the module-level ``data``.

    Raises whatever the lookup raises (typically ``KeyError``) when ``char``
    or one of the expected keys is missing.
    """
    entry = data[char]

    ch_pin = ", ".join(pinyinize(p) for p in entry["pinyin"])
    ch_pin = ch_pin.strip().rstrip(",")
    if any(digit in ch_pin for digit in "12345"):
        # pinyinize left a tone number behind; let the pinyin library
        # produce the tone-marked form instead
        ch_pin = pinyin_jyutping_sentence.pinyin(char)

    definitions = entry["definitions"]
    ch_mean = "".join(definitions[key] for key in definitions)
    ch_mean = ch_mean.replace(";", ",").strip().rstrip(",")

    # NOTE(review): ch_pin can contain ", " when a word has several readings
    # and is written unquoted; kept as-is to preserve the file format.
    return (entry["traditional"] + "," + entry["simplified"] + ","
            + ch_pin + ',"' + ch_mean + '"\n')


def _online_row(char):
    """Build the output line for ``char`` using the online translator."""
    translation = Translator().translate(char, src='zh-cn', dest="en")
    ch_pin = pinyin_jyutping_sentence.pinyin(char)
    ch_trad = HanziConv.toTraditional(char)
    return ch_trad + "," + char + "," + ch_pin + ',"' + translation.text + '"\n'


def get_meaning(level="7-9"):
    """Write pinyin and meaning for every word of an HSK word list.

    Reads ``HSK <level>.txt`` (one word per line) and writes
    ``HSK <level> with clear meaning.txt``.  For each distinct word the
    first source that knows it is used:

    1. the matching line of ``10k Mandarin.txt``, copied verbatim;
    2. the module-level ``data`` mapping, formatted as
       ``traditional,simplified,pinyin,"meaning"``;
    3. an online translation, formatted the same way.

    Blank lines are skipped and a repeated word is written only once.
    Every written line is also printed.

    Args:
        level: HSK level label used in both file names, e.g. ``"1"`` or
            ``"7-9"`` (the default, matching the previous hard-coded names).
    """
    source_path = "HSK " + level + ".txt"
    target_path = "HSK " + level + " with clear meaning.txt"

    # Read the reference list once instead of once per word.
    wiktionary = _load_wiktionary_lines("10k Mandarin.txt")
    written = set()

    with open(source_path, "r", encoding="utf-8") as src:
        with open(target_path, "w", encoding="utf-8") as out:
            for line in src:
                char = line.strip()
                if not char or char in written:
                    continue
                written.add(char)

                row = wiktionary.get(char)
                if row is None:
                    try:
                        row = _cedict_row(char)
                    except Exception:
                        # Best-effort fallback: any lookup failure means
                        # the word is fetched online instead.
                        print(char+" not found in 10k and cedict, fetching online")
                        row = _online_row(char)

                print(row)
                out.write(row)
def insert_meaning(self, word):
    """Look up ``word`` and append it as a new row of ``self.treeview``.

    Pinyin and meaning come from ``self.cedict_json_data`` when the lookup
    succeeds.  Otherwise the word is appended to ``not_found.txt`` and the
    values are fetched with ``pinyin.get`` and the online ``Translator``.

    The row always holds (traditional, pinyin, meaning).  The four sentence
    fields are added when ``self.addSentence`` is set, and the audio tag is
    added last when ``self.addAudio`` is set, in which case
    ``self.save_audio`` is also called.

    Side effects: stores the word in ``self.simp``, clears
    ``self.ch_sim_entry`` and resets the ``self.ch_sent*`` attributes to "".
    For an empty ``word`` only these resets happen; nothing is looked up,
    logged or inserted.

    Args:
        word: the simplified-Chinese word to insert.
    """
    ch_sim = word
    self.simp = ch_sim
    self.ch_sim_entry.delete(0, 'end')

    # A freshly inserted word has no example sentence yet.
    self.ch_sent = ""
    self.ch_sent_trad = ""
    self.ch_sent_pinyin = ""
    self.ch_sent_translate = ""

    if not ch_sim:
        # Avoids writing a blank line to not_found.txt for empty input.
        return

    ch_trad = HanziConv.toTraditional(ch_sim)
    ch_audio = "[sound:xiehanzi/cmn-" + ch_sim + ".mp3]"

    found = False
    try:
        # Prefer the dictionary's pinyin: the pinyin library can return an
        # incorrect reading.
        entry = self.cedict_json_data[ch_sim]
        ch_pin = ", ".join(pinyinize(p) for p in entry["pinyin"])
        ch_pin = ch_pin.strip().rstrip(",")

        ch_mean = ""
        definitions = entry["definitions"]
        for key in definitions:
            print(definitions[key])
            ch_mean += definitions[key] + " "
        found = True
    except Exception:
        # Best-effort: any lookup failure is treated as "not in dictionary".
        # The handle stays on self.not_found (closed) as before.
        with open("not_found.txt", "a", encoding="utf-8") as self.not_found:
            self.not_found.write(ch_sim + "\n")
        print("json not found in data folder, fetching online")

    if not found:
        ch_pin = pinyin.get(ch_sim)
        translation = Translator().translate(ch_sim, src='zh-cn', dest="en")
        ch_mean = translation.text

    try:
        with_audio = self.addAudio.get()
        with_sentence = self.addSentence.get()

        values = [ch_trad, ch_pin, ch_mean]
        if with_sentence:
            values += [self.ch_sent, self.ch_sent_trad,
                       self.ch_sent_pinyin, self.ch_sent_translate]
        if with_audio:
            values.append(ch_audio)

        self.treeview.insert('', 'end', text=ch_sim, values=tuple(values))
        if with_audio:
            self.save_audio(ch_sim)
    except Exception:
        print("Insert Error")
        traceback.print_exc()
def insert_meaning(self, word):
    """Look up ``word`` and append it as a new row of ``self.treeview``.

    Pinyin and meaning are read from ``json_data/<word>.json``.  If that
    file cannot be read or parsed, or lacks the expected keys, both values
    are fetched with ``pinyin.get`` and the online ``Translator`` instead.
    When the file's pinyin cannot be converted, or still contains a digit
    after conversion, only the pinyin falls back to ``pinyin.get``.

    The row always holds (traditional, pinyin, meaning).  The three
    sentence fields are added when ``self.addSentence`` is set, and the
    audio tag is added last when ``self.addAudio`` is set, in which case
    ``self.save_audio`` is also called.

    Side effects: stores the word in ``self.simp``, clears
    ``self.ch_sim_entry`` and resets the ``self.ch_sent*`` attributes to "".
    For an empty ``word`` only these resets happen; nothing is looked up or
    inserted.

    Args:
        word: the simplified-Chinese word to insert.
    """
    ch_sim = word
    self.simp = ch_sim
    self.ch_sim_entry.delete(0, 'end')

    # A freshly inserted word has no example sentence yet.
    self.ch_sent = ""
    self.ch_sent_pinyin = ""
    self.ch_sent_translate = ""

    if not ch_sim:
        return

    ch_trad = HanziConv.toTraditional(ch_sim)
    ch_audio = "[sound:xiehanzi/cmn-" + ch_sim + ".mp3]"

    found = False
    try:
        # `with` closes the file even when json.load raises.
        with open("json_data/" + ch_sim + ".json", encoding="utf-8") as f:
            entry = json.load(f)

        # Prefer the file's pinyin: the pinyin library can return an
        # incorrect reading.
        try:
            ch_pin = pinyinize(entry["pinyin"])
            use_library = self.contains_digit(ch_pin)
        except Exception:
            use_library = True
        if use_library:
            ch_pin = pinyin.get(ch_sim)

        ch_mean = ", ".join(str(m) for m in entry["definitions"])
        ch_mean = ch_mean.rstrip(', ')
        found = True
    except Exception:
        # Best-effort: any failure to load the local entry is handled by
        # fetching online.
        print("json not found in data folder, fetching online")

    if not found:
        ch_pin = pinyin.get(ch_sim)
        translation = Translator().translate(ch_sim, src='zh-cn', dest="en")
        ch_mean = translation.text

    try:
        with_audio = self.addAudio.get()
        with_sentence = self.addSentence.get()

        values = [ch_trad, ch_pin, ch_mean]
        if with_sentence:
            values += [self.ch_sent, self.ch_sent_pinyin,
                       self.ch_sent_translate]
        if with_audio:
            values.append(ch_audio)

        self.treeview.insert('', 'end', text=ch_sim, values=tuple(values))
        if with_audio:
            self.save_audio(ch_sim)
    except Exception:
        # Local import: this block did not previously rely on traceback.
        import traceback
        print("Insert Error")
        traceback.print_exc()