Example #1
0
    def test_pinyinize(self):
        """Numbered-tone input is rendered with tone marks, keeping letter case."""
        expectations = (
            ('ke3neng2', u'kěnéng'),
            ('xi3huan1', u'xǐhuān'),
            ('xing2 dong4', u'xíng dòng'),
            ('ni3 hao3', u'nǐ hǎo'),
            # A trailing 5 leaves the syllable without a tone mark.
            ('ni3 hao5', u'nǐ hao'),
            ('hua4', u'huà'),
            ('you3', u'yǒu'),
            # make sure case is preserved
            ('xi3HUan1', u'xǐHUān'),
        )
        for numbered, marked in expectations:
            self.assertEqual(pinyinize(numbered), marked)
Example #2
0
    def test_double_dot(self):
        """Check conversion of syllables containing u-umlaut in both directions.

        The forward cases cover the ``v`` and ``u:`` input spellings; the
        reverse cases expect ``depinyinize`` to emit the ``v`` spelling.
        Each distinct case is asserted once (the earlier version repeated
        several identical assertions).
        """
        # Numbered pinyin -> tone-marked pinyin.
        forward_cases = (
            ('nve4dai4', u'nüèdài'),
            ('nu:e4dai4', u'nüèdài'),
            ('lv4fa3', u'lǜfǎ'),
            ('lu:4fa3', u'lǜfǎ'),
        )
        for numbered, marked in forward_cases:
            self.assertEqual(pinyinize(numbered), marked)

        # Tone-marked pinyin -> numbered pinyin.
        reverse_cases = (
            (u'lǜfǎ', 'lv4fa3'),
            (u'nüèdài', 'nve4dai4'),
        )
        for marked, numbered in reverse_cases:
            self.assertEqual(numbered, depinyinize(marked))
Example #3
0
def get_meaning(level="7-9"):
    """Write one CSV-like row per unique word of an HSK word list.

    Reads ``HSK <level>.txt`` (one word/character per line) and writes
    ``HSK <level> with clear meaning.txt``. For every word the row is taken
    from the first available source:

    1. the matching line of ``10k Mandarin.txt`` (matched on its second
       comma-separated column), copied verbatim;
    2. the module-level ``data`` mapping (per the original comment this is
       the content of all_cedict.json -- TODO confirm against the loader);
    3. an online translation plus library-generated pinyin.

    Blank lines and repeated words in the input are skipped, so each word
    is written at most once and never triggers a needless online lookup.

    Args:
        level: HSK level label used in both file names, e.g. ``"1"`` or
            ``"7-9"``. Defaults to ``"7-9"``, the previously hard-coded value.
    """
    source_path = "HSK {} with clear meaning.txt".format(level)
    words_path = "HSK {}.txt".format(level)

    # Load the local word list once instead of re-reading it for every word.
    local_rows = _load_wiktionary_rows("10k Mandarin.txt")

    # Words already written; a set keeps the membership test O(1).
    inserted = set()

    with open(words_path, "r", encoding="utf-8") as words, \
            open(source_path, "w", encoding="utf-8") as out:
        for line in words:
            char = line.strip()
            if not char or char in inserted:
                continue

            row = local_rows.get(char)
            if row is None:
                row = _lookup_row(char)

            inserted.add(char)
            print(row)
            out.write(row)


def _load_wiktionary_rows(path):
    """Map each word (second CSV column) to the first line that mentions it."""
    rows = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            columns = raw.split(",")
            if len(columns) < 2:
                # Blank or malformed line: there is no word column to match.
                continue
            # setdefault keeps the first occurrence, like the original scan.
            rows.setdefault(columns[1].strip(), raw)
    return rows


def _lookup_row(char):
    """Build the row for ``char`` from ``data``, else from online services."""
    try:
        return _cedict_row(char)
    except Exception:
        # Deliberate best-effort: any failure here (most commonly the word
        # missing from ``data``) falls back to the online lookup.
        print(char+" not found in 10k and cedict, fetching online")
        return _online_row(char)


def _cedict_row(char):
    """Build the row for ``char`` from the module-level ``data`` mapping."""
    entry = data[char]

    ch_pin = ", ".join(pinyinize(p) for p in entry["pinyin"])
    if any(tone in ch_pin for tone in "12345"):
        # A digit survived the conversion, so the tone-marked form is
        # regenerated from the characters instead.
        ch_pin = pinyin_jyutping_sentence.pinyin(char)

    definitions = entry["definitions"]
    # NOTE(review): definitions are concatenated without a separator, as
    # before -- confirm whether each value already ends with ";".
    ch_mean = "".join(definitions[key] for key in definitions)
    ch_mean = ch_mean.replace(";", ",").strip().rstrip(",")

    return (entry["traditional"] + "," + entry["simplified"] + ","
            + ch_pin + ',"' + ch_mean + '"\n')


def _online_row(char):
    """Build the row for ``char`` from a translation and generated pinyin."""
    translator = Translator()
    ch_mean = translator.translate(char, src='zh-cn', dest="en").text

    ch_pin = pinyin_jyutping_sentence.pinyin(char)
    ch_trad = HanziConv.toTraditional(char)

    return ch_trad + "," + char + "," + ch_pin + ',"' + ch_mean + '"\n'
    def insert_meaning(self, word):
        """Look up ``word`` and append it as a new row of the tree view.

        Pinyin and meaning come from ``self.cedict_json_data`` when the word
        is present there; otherwise the word is appended to ``not_found.txt``
        and pinyin/meaning are fetched from the pinyin library and the online
        translator. The row's columns depend on the ``addSentence`` and
        ``addAudio`` options; with audio enabled ``save_audio`` is called
        after the row is inserted.

        An empty ``word`` only clears the entry field and resets the sentence
        fields: nothing is looked up, logged or inserted.

        Args:
            word: The simplified-Chinese word to add.
        """
        ch_sim = word
        self.simp = ch_sim
        self.ch_sim_entry.delete(0, 'end')

        # Sentence fields start out empty for every newly inserted word.
        self.ch_sent = ""
        self.ch_sent_trad = ""
        self.ch_sent_pinyin = ""
        self.ch_sent_translate = ""

        if not ch_sim:
            return

        ch_trad = HanziConv.toTraditional(ch_sim)
        ch_audio = "[sound:xiehanzi/cmn-" + ch_sim + ".mp3]"

        try:
            # when pinyin present in json file, fetch from there as pinyin library provide incorrect pinyin
            entry = self.cedict_json_data[ch_sim]
            ch_pin = ", ".join(pinyinize(p) for p in entry["pinyin"])

            definitions = entry["definitions"]
            meanings = []
            for key in definitions:
                print(definitions[key])
                meanings.append(definitions[key])
            ch_mean = " ".join(meanings)
            found = True
        except Exception:
            # Deliberate best-effort: any lookup failure (most commonly the
            # word missing from the dictionary) is logged and the online
            # fallback below is used instead.
            found = False
            # The handle stays bound to self.not_found as before, but is now
            # closed even if the write fails.
            with open("not_found.txt", "a", encoding="utf-8") as self.not_found:
                self.not_found.write(ch_sim + "\n")
            print("json not found in data folder, fetching online")

        if not found:
            ch_pin = pinyin.get(ch_sim)

            translator = Translator()
            t = translator.translate(ch_sim, src='zh-cn', dest="en")
            ch_mean = t.text

        try:
            with_audio = self.addAudio.get()

            # Column order: base fields, optional sentence fields, then audio.
            values = (ch_trad, ch_pin, ch_mean)
            if self.addSentence.get():
                values += (self.ch_sent, self.ch_sent_trad,
                           self.ch_sent_pinyin, self.ch_sent_translate)
            if with_audio:
                values += (ch_audio,)

            self.treeview.insert('', 'end', text=ch_sim, values=values)
            if with_audio:
                self.save_audio(ch_sim)
        except Exception:
            print("Insert Error")
            traceback.print_exc()
Example #5
0
    def insert_meaning(self, word):
        """Look up ``word`` and append it as a new row of the tree view.

        Pinyin and meaning are read from ``json_data/<word>.json`` when that
        file can be loaded; otherwise they are fetched from the pinyin
        library and the online translator. The row's columns depend on the
        ``addSentence`` and ``addAudio`` options; with audio enabled
        ``save_audio`` is called after the row is inserted.

        An empty ``word`` only clears the entry field and resets the sentence
        fields: nothing is looked up or inserted.

        Args:
            word: The simplified-Chinese word to add.
        """
        ch_sim = word
        self.simp = ch_sim
        self.ch_sim_entry.delete(0, 'end')

        # Sentence fields start out empty for every newly inserted word.
        self.ch_sent = ""
        self.ch_sent_pinyin = ""
        self.ch_sent_translate = ""

        if not ch_sim:
            return

        ch_trad = HanziConv.toTraditional(ch_sim)
        ch_audio = "[sound:xiehanzi/cmn-" + ch_sim + ".mp3]"

        found = False
        try:
            # The context manager closes the file even if parsing fails.
            with open("json_data/" + ch_sim + ".json", encoding="utf-8") as f:
                d = json.load(f)

            # when pinyin present in json file, fetch from there as pinyin library provide incorrect pinyin
            try:
                ch_pin = pinyinize(d["pinyin"])
                # A leftover digit means the conversion was incomplete.
                use_library = self.contains_digit(ch_pin)
            except Exception:
                use_library = True
            if use_library:
                ch_pin = pinyin.get(ch_sim)

            ch_mean = ", ".join(str(meaning) for meaning in d["definitions"])

            found = True
        except Exception:
            # Deliberate best-effort: a missing/unreadable file or an
            # unexpected layout falls through to the online lookup below.
            print("json not found in data folder, fetching online")

        if not found:
            ch_pin = pinyin.get(ch_sim)

            translator = Translator()
            t = translator.translate(ch_sim, src='zh-cn', dest="en")
            ch_mean = t.text

        try:
            with_audio = self.addAudio.get()

            # Column order: base fields, optional sentence fields, then audio.
            values = (ch_trad, ch_pin, ch_mean)
            if self.addSentence.get():
                values += (self.ch_sent, self.ch_sent_pinyin,
                           self.ch_sent_translate)
            if with_audio:
                values += (ch_audio,)

            self.treeview.insert('', 'end', text=ch_sim, values=values)
            if with_audio:
                self.save_audio(ch_sim)
        except Exception:
            print("Insert Error")