def test_jamdict_xml(self):
    """Import the mini XML fixtures into an in-memory DB and verify a kana lookup."""
    print("Test Jamdict search in XML files")
    jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD,
                  kd2_xml_file=MINI_KD2,
                  jmnedict_xml_file=MINI_JMNE,
                  auto_config=True)
    jam.import_data()
    result = jam.lookup('おみやげ')
    # exactly one word entry, and the two component kanji 土 + 産
    self.assertEqual(len(result.entries), 1)
    self.assertEqual(len(result.chars), 2)
    literals = {c.literal for c in result.chars}
    self.assertEqual(literals, {'土', '産'})
def test_lookup_result(self):
    """Lookup by kana text, then directly by entry ID via the 'id#...' syntax."""
    jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
                  auto_config=False, auto_expand=False)
    result = jam.lookup('おみやげ')
    self.assertTrue(result.entries)
    self.assertEqual('おみやげ', result.entries[0].kana_forms[0].text)
    # test lookup by ID
    res = jam.lookup('id#{}'.format(1002490))
    self.assertTrue(res.entries)
    self.assertEqual('おとそ', res.entries[0].kana_forms[0].text)
class PrepJam(Processor):
    """Pipeline processor that analyses Japanese text and tags each token
    with matching Jamdict entry IDs plus a human-readable gloss."""

    def __init__(self, info, name="jam"):
        super().__init__(info, name)
        self.parser = JapaneseAnalyser()  # morphological analyser (text -> TTL sentence)
        self.jam = Jamdict()              # dictionary engine used for lookups

    def process(self, sent):
        """Analyse ``sent`` and attach dictionary concepts to its tokens.

        Accepts either a Sentence object or raw input (anything else is
        wrapped into a Sentence and re-processed).  Returns the Sentence
        with ``.shallow`` set to the analysed TTL sentence and ``.text``
        rebuilt from the token surface forms.
        """
        if isinstance(sent, Sentence):
            ttl_sent = self.parser.analyse(sent.text)
            # lookup each token in the dictionary
            for idx, token in enumerate(ttl_sent):
                if not token.lemma:
                    continue  # no lemma -> nothing to look up
                result = self.jam.lookup(token.lemma, strict_lookup=True)
                if result.entries or result.chars:
                    # collect entry IDs into one ';'-joined concept tag
                    ids = []
                    for e in result.entries:
                        ids.append('jam::{}'.format(e.idseq))
                    # for c in result.chars:
                    #     ids.append('jam:char:{}'.format(c.literal))
                    nc = ttl_sent.new_concept(tag=';'.join(ids), clemma=token.text, tokens=[token])
                    # comment = TextReport.string()
                    # dump_result(result, report=comment)
                    # nc.comment = comment.content()
                    nc.comment = result.text(compact=False, no_id=True, with_chars=False)
            sent.shallow = ttl_sent
            # rebuild the surface text from the tokenised form
            sent.text = ' '.join(t.text for t in sent.shallow.tokens)
            return sent
        else:
            # raw input: wrap it in a Sentence and process again
            return self.process(Sentence(sent))
def test_search_by_ne_type(self):
    """Named-entity type listing plus lookups by NE type keyword."""
    print("Test Jamdict search in XML files")
    jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
                  jmnedict_xml_file=MINI_JMNE, auto_config=True)
    jam.import_data()
    expected_types = ['company', 'fem', 'given', 'organization',
                      'person', 'place', 'surname', 'unclass']
    self.assertEqual(expected_types, jam.all_ne_type())
    # searching by a NE type returns the matching name entries
    place_kanji = {k.text
                   for n in jam.lookup("place").names
                   for k in n.kanji_forms}
    self.assertIn("厦門", place_kanji)
    company_kanji = {k.text
                     for n in jam.lookup("company").names
                     for k in n.kanji_forms}
    self.assertTrue({'埼銀', 'IKEA'}.issubset(company_kanji))
def test_jamdict_sqlite_all(self):
    """Import the XML fixtures into an SQLite DB and verify both backends."""
    if os.path.isfile(TEST_DB):
        os.unlink(TEST_DB)
    jam = Jamdict(db_file=TEST_DB, kd2_file=TEST_DB,
                  jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2)
    # look up straight from the XML parse tree first
    result = jam.jmdict_xml.lookup('おみやげ')
    getLogger().debug("Results: {}".format(result))
    # then import into SQLite and query through the DB
    jam.import_data()
    result = jam.lookup('おみやげ')
    print(result.entries)
    self.assertEqual(1, len(result.entries))
    self.assertEqual(2, len(result.chars))
    self.assertEqual({'土', '産'}, {c.literal for c in result.chars})
def set_english(self):
    """Populate ``self.english`` (and ``self.isVerb``) from dictionary senses
    of entries whose kana form matches ``self.reading``.

    Senses are joined with '; '.  ``self.isVerb`` is set to 1 when a matching
    sense mentions a Godan/Ichidan verb class.
    """
    jmd = Jamdict()
    self.english = ''
    self.isVerb = 0
    results = jmd.lookup(self.japanese)
    for entry in results.entries:
        # NOTE: the loop deliberately binds to self.kana, so the last
        # examined kana form remains available on the instance afterwards.
        for self.kana in entry.kana_forms:
            if self.reading == str(self.kana):
                if self.english:
                    self.english += '; '
                for idx, s in enumerate(entry.senses):
                    sense_text = str(s)  # hoisted: str(s) was computed repeatedly
                    if idx > 0:
                        self.english += '; '
                    # substring membership instead of find(...) != -1
                    if 'Godan verb' in sense_text or 'Ichidan verb' in sense_text:
                        self.isVerb = 1
                    self.english += self.remGrammar(sense_text)
def set_english_from_kana(self):
    """Populate ``self.english`` (and ``self.isVerb``) from dictionary senses
    of entries whose kana form equals ``self.japanese``.

    Verb detection excludes 'verb suru' and 'adverb' so only true verb
    classes flip ``self.isVerb``.
    """
    jmd = Jamdict()
    self.english = ''
    self.isVerb = 0
    results = jmd.lookup(self.japanese)
    for entry in results.entries:
        # NOTE: binds to self.kana on purpose (kept on the instance).
        # The unused enumerate() index of the original code was dropped.
        for self.kana in entry.kana_forms:
            if self.japanese == str(self.kana):
                if self.english:
                    self.english += '; '
                for idx, s in enumerate(entry.senses):
                    sense_text = str(s)  # hoisted: str(s) was computed repeatedly
                    if idx > 0 and self.english:
                        self.english += '; '
                    # substring membership instead of find(...) != -1
                    if ('verb' in sense_text
                            and 'verb suru' not in sense_text
                            and 'adverb' not in sense_text):
                        self.isVerb = 1
                    self.english += self.remGrammar(sense_text)
def get_jam(cli, args):
    """Build a Jamdict instance from CLI arguments.

    When ``args.kd2`` is given, Kanjidic2 lives in its own database;
    otherwise JMDict and Kanjidic2 share ``args.jdb``.
    """
    if not args.jdb:
        args.jdb = None
    if args.kd2:
        cli.logger.warning("Jamdict database location: {}".format(args.jdb))
        cli.logger.warning("Kanjidic2 database location: {}".format(args.kd2))
        kd2_location = args.kd2
    else:
        cli.logger.debug("Using the same database for both JMDict and Kanjidic2")
        kd2_location = args.jdb
    jmd = Jamdict(db_file=args.jdb, kd2_file=kd2_location,
                  jmd_xml_file=args.jmdxml, kd2_xml_file=args.kd2xml)
    if jmd.kd2 is None:
        cli.logger.warning("Kanjidic2 database could not be found")
    return jmd
def __init__(self, user_dict="", user_dict_en=""):
    """Set up the Jamdict DB, the Janome tokenizer, and an optional
    user-supplied English dictionary loaded from a CSV file."""
    self.dict_en = {}
    # prefer the DB bundled next to this module; fall back to the script dir
    dbfile = os.path.dirname(__file__) + "/res/jamdict.db"
    if not os.path.isfile(dbfile):
        dbfile = os.path.dirname(sys.argv[0]) + "/res/jamdict.db"
    self.jmd = Jamdict(db_file=dbfile, kd2_file=dbfile)
    if user_dict == "":
        self.tokenizer = Tokenizer()
    else:
        self.tokenizer = Tokenizer(user_dict, udic_type="simpledic",
                                   udic_enc="utf8")
    # drop symbols/particles, then count tokens by base form
    self.token_filters = [POSStopFilter(['記号', '助詞']),
                          TokenCountFilter(att='base_form')]
    if user_dict_en != "":
        with open(user_dict_en, newline='', encoding="utf-8") as csvfile:
            for row in csv.reader(csvfile, delimiter=',', quotechar='|'):
                if len(row) >= 3:
                    self.dict_en[row[0]] = {'reading': row[1],
                                            'meaning': row[2]}
async def word_translate(request):
    """Look the requested word up in JMdict and return its first entry as JSON."""
    data = await request.json()
    jmd = Jamdict()
    result = jmd.lookup(data["word"])
    # TODO: later to setup -> result.entries[0].to_json()
    payload = {"en_word": result.entries[0].text()}
    return web.json_response(data=payload, headers=HEADERS)
def test_jamdict_sqlite_all(self):
    """Import XML fixtures into TEST_DB, then verify lookups both from the
    file-based DB and from a copy loaded into RAM (memory_mode=True).

    The original repeated the same four assertions four times; they are
    factored into a local helper (DRY), behaviour unchanged.
    """

    def _assert_omiyage(result):
        # every lookup must yield one entry and the kanji 土 + 産
        self.assertIsNotNone(result.entries)
        self.assertEqual(len(result.entries), 1)
        self.assertEqual(len(result.chars), 2)
        self.assertEqual({c.literal for c in result.chars}, {'土', '産'})

    if os.path.isfile(TEST_DB):
        os.unlink(TEST_DB)
    TEST_DB.touch()
    jam = Jamdict(db_file=TEST_DB, jmd_xml_file=MINI_JMD,
                  kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE)
    # Lookup using XML
    result = jam.jmdict_xml.lookup('おみやげ')
    getLogger().debug("Results: {}".format(result))
    # Lookup using SQLite
    jam.import_data()
    _assert_omiyage(jam.lookup('おみやげ'))
    print("Test reading DB into RAM")
    ram_jam = Jamdict(TEST_DB, memory_mode=True)
    # repeated lookups must stay stable once the DB is in RAM
    for attempt in ("1st", "2nd", "3rd"):
        print("{} lookup".format(attempt))
        _assert_omiyage(ram_jam.lookup('おみやげ'))
def test_jamdict_data(self):
    """POS-filtered lookups for かえる: verb only, noun only, then both."""
    jam = Jamdict()
    # search verb kaeru
    res = jam.lookup("かえる", pos="transitive verb")
    ids = [e.idseq for e in res.entries]
    self.assertIn(1510650, ids)
    self.assertIn(1589780, ids)
    verb_forms = {'変える', '代える', '換える', '替える'}
    self.assertTrue(verb_forms.issubset(all_kanji(res)))
    # search by noun kaeru
    res2 = jam.lookup("かえる", pos='noun (common) (futsuumeishi)')
    self.assertIn(1577460, [e.idseq for e in res2.entries])
    noun_forms = {'蛙', '蛤', '蝦'}
    self.assertTrue(noun_forms.issubset(all_kanji(res2)))
    # search both noun and verb at once -> union of results
    res3 = jam.lookup(
        "かえる", pos=['noun (common) (futsuumeishi)', "transitive verb"])
    forms3 = all_kanji(res3)
    self.assertTrue(verb_forms.issubset(forms3))
    self.assertTrue(noun_forms.issubset(forms3))
def translationButtonClicked(self, text):
    """Translate the word in the insertion box and fill every display box.

    Falls back to a katakana lookup, then a hiragana lookup, when the
    literal text yields no dictionary entries.

    Returns the romaji/katakana/hiragana/furigana renderings.
    """
    if self.insertionBox.text() == "":
        msg = QMessageBox()
        msg.setWindowTitle("Error")
        msg.setText("There is no word to translate!")
        msg.exec_()
    else:
        jmd = Jamdict()
        text = self.insertionBox.text()
        Window.Rtext = toRomaji(text)
        Window.Ktext = toKatakana(text)
        Window.Htext = toHiragana(text)
        result = jmd.lookup(text)
        text = toTokensDictionary(text)
        separater = ""
        Window.Ftext = toFurigana(text)
        Window.Ftext = separater.join(Window.Ftext)
        # BUG FIX: Jamdict.lookup() returns a LookupResult, never None, so
        # the original `result == None` checks could never trigger the
        # fallbacks; test for an empty entry list instead.
        if not result.entries:
            result = jmd.lookup(Window.Ktext)
        if not result.entries:
            result = jmd.lookup(Window.Htext)
        Window.Etext = repr(result.entries).strip("[]")
        Window.Ctext = repr(result.chars).strip("[]")
        self.romajiBox.setText(Window.Rtext)
        self.katakanaBox.setText(Window.Ktext)
        self.hiraganaBox.setText(Window.Htext)
        self.furiganaBox.setText(Window.Ftext)
        self.entriesBox.setText(Window.Etext)
        self.charsBox.setText(Window.Ctext)
        return Window.Rtext, Window.Ktext, Window.Htext, Window.Ftext
def test_search_by_pos(self):
    """all_pos() listing plus POS-filtered lookups (list vs bare string)."""
    print("Test Jamdict search in XML files")
    jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
                  jmnedict_xml_file=MINI_JMNE, auto_config=True)
    jam.import_data()
    # test get all pos
    expected = {
        'Godan verb - -aru special class',
        "Godan verb with `ku' ending",
        "Godan verb with `ru' ending",
        "Godan verb with `su' ending",
        "Godan verb with `u' ending",
        'Ichidan verb',
        'adjectival nouns or quasi-adjectives (keiyodoshi)',
        'adjective (keiyoushi)',
        'adverb (fukushi)',
        "adverb taking the `to' particle",
        'auxiliary verb',
        'conjunction',
        'expressions (phrases, clauses, etc.)',
        'interjection (kandoushi)',
        'intransitive verb',
        'noun (common) (futsuumeishi)',
        'noun or participle which takes the aux. verb suru',
        'noun or verb acting prenominally',
        "nouns which may take the genitive case particle `no'",
        'pre-noun adjectival (rentaishi)',
        'pronoun',
        'transitive verb'}
    self.assertEqual(expected, set(jam.all_pos()))
    result = jam.lookup('おみやげ', pos=['noun (common) (futsuumeishi)'])
    self.assertEqual(1, len(result.entries))
    # passing a bare string as a POS filter works but logs a warning
    with self.assertLogs('jamdict.jmdict_sqlite', level="WARNING") as cm:
        result = jam.lookup('おみやげ', pos='noun (common) (futsuumeishi)')
        self.assertEqual(1, len(result.entries))
    self.assertTrue(any(
        "POS filter should be a collection, not a string" in line
        for line in cm.output))
    result = jam.lookup('おみやげ', pos=['intransitive verb'])
    self.assertFalse(result.entries)
    result = jam.lookup(
        'おみやげ', pos=['intransitive verb', 'noun (common) (futsuumeishi)'])
    self.assertTrue(result.entries)
class LearningMaterialGetter:
    """Tokenise Japanese text with Janome and attach Jamdict dictionary info,
    optionally augmented by a user-supplied English dictionary (CSV)."""

    def __init__(self, user_dict="", user_dict_en=""):
        """
        :param user_dict: optional Janome simpledic user dictionary path
        :param user_dict_en: optional CSV (word, reading, meaning) with '|' quoting
        """
        self.dict_en = {}
        # prefer the DB bundled next to this module; fall back to the script dir
        dbfile = os.path.dirname(__file__) + "/res/jamdict.db"
        if not os.path.isfile(dbfile):
            dbfile = os.path.dirname(sys.argv[0]) + "/res/jamdict.db"
        self.jmd = Jamdict(db_file=dbfile, kd2_file=dbfile)
        if user_dict != "":
            self.tokenizer = Tokenizer(user_dict, udic_type="simpledic", udic_enc="utf8")
        else:
            self.tokenizer = Tokenizer()
        # drop symbols/particles, then count tokens by base form
        self.token_filters = [POSStopFilter(['記号', '助詞']), TokenCountFilter(att='base_form')]
        if user_dict_en != "":
            with open(user_dict_en, newline='', encoding="utf-8") as csvfile:
                dic_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
                for row in dic_reader:
                    if len(row) >= 3:
                        self.dict_en[row[0]] = {
                            'reading': row[1],
                            'meaning': row[2]
                        }

    def tokenize(self, text):
        """Run the Janome analyser over *text*; yields (token, count) pairs
        per the TokenCountFilter configured in __init__."""
        a = Analyzer(tokenizer=self.tokenizer, token_filters=self.token_filters)
        return a.analyze(text)

    def getDictionaryInfos(self, pairs):
        """Return [(token, LookupResult), ...] for tokens containing Japanese
        script (katakana/kanji ranges); injects meanings from the user CSV
        when Jamdict has no word entry for the token."""
        infos = []
        for token, v in pairs:
            # keep only tokens starting with katakana / CJK ideographs
            match = re.match("[\u30A1-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]", token)
            if match:
                dic_info = self.jmd.lookup(token)
                if len(dic_info.entries) > 0 or len(dic_info.chars) > 0:
                    # Inject custom dictionary meaning
                    if len(dic_info.entries) == 0 and token in self.dict_en:
                        meaning = self.dict_en[token]['meaning']
                        reading = self.dict_en[token]['reading']
                        # build a synthetic JMDEntry carrying the CSV data
                        d = JMDEntry()
                        d.senses = [Sense()]
                        d.kana_forms = [KanaForm()]
                        d.kana_forms[0].text = reading
                        d.senses[0].gloss.append(SenseGloss("", "", meaning))
                        dic_info.entries.append(d)
                    infos.append((token, dic_info))
        return infos
def test_warn_to_json_deprecated(self):
    """to_json() on a LookupResult must emit a DeprecationWarning."""
    print("Test Jamdict search in XML files")
    jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
                  jmnedict_xml_file=MINI_JMNE)
    jam.import_data()
    # both a word lookup and a name lookup must warn
    for query in ("おみやげ", "シェンロン"):
        with self.assertWarns(DeprecationWarning):
            res = jam.lookup(query)
            self.assertTrue(res.to_json())
def test_lookup_iter(self):
    """lookup_iter() exposes lazy streams of entries, chars, and names."""
    jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
                  jmnedict_xml_file=MINI_JMNE, auto_config=True)
    jam.import_data()
    # wildcard + POS filter over word entries
    res = jam.lookup_iter("おこ%", pos="noun (common) (futsuumeishi)")
    expected_entries = [
        'おこのみやき (お好み焼き) : okonomiyaki/savoury pancake containing meat or seafood and '
        'vegetables',
        'おこさん (お子さん) : child',
        "おこさま (お子様) : child (someone else's)"]
    self.assertEqual(expected_entries, [e.text() for e in res.entries])
    # a kanji query exposes all three result streams
    res = jam.lookup_iter("お土産")
    self.assertIsNotNone(res.entries)
    self.assertIsNotNone(res.chars)
    self.assertIsNotNone(res.names)
    expected_chars = [
        '土:3:soil,earth,ground,Turkey',
        '産:11:products,bear,give birth,yield,childbirth,native,property']
    self.assertEqual(expected_chars, [repr(c) for c in res.chars])
    # names stream
    res = jam.lookup_iter("surname")
    expected_names = [
        'しめたに (〆谷) : Shimetani (surname)',
        'しめき (〆木) : Shimeki (surname)',
        'しめの (〆野) : Shimeno (surname)']
    self.assertEqual(expected_names, [n.text() for n in res.names])
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ######################################################################## import json from jamdict import Jamdict ######################################################################## # Create an instance of Jamdict jam = Jamdict() print("Jamdict DB file: {}".format(jam.db_file)) # Lookup by kana result = jam.lookup('おかえし') for entry in result.entries: print(entry) # Lookup by kanji print("-----------------") result = jam.lookup('御土産') for entry in result.entries: print(entry) # Lookup a name # a name entity is also a jamdict.jmdict.JMDEntry object
def mainloop(file, savedump, database, cfgfile, records, orderby, compact, known, verbose, nosense, translate, destlang):
    """Get user Janpanse input then parse it and record new words into database.

    :param file: optional input file; when empty, input is taken interactively
    :param savedump: markdown dump file for the study log
    :param database: SQLite DB path for the words table
    :param cfgfile: INI file holding Baidu translation API credentials
    :param records: how many recently saved words to show before each prompt
    :param orderby: words-table column used to sort the recap listing
    :param compact: "true" to print compact dictionary entries
    :param known: CSV file of already-known words (word,POS-list)
    :param verbose: "none"/"half"/... controls skipping of known words
    :param nosense: "true" to skip dictionary meanings entirely
    :param translate: "true" to machine-translate the input via Baidu
    :param destlang: target language code for the translation
    """
    jmd = Jamdict()
    knp = KNP()
    # load the known-word list: "word,POS|POS" lines, '#' starts a comment
    knownlist = {}
    with open(known, 'r') as reader:
        lines = reader.readlines()
        for line in lines:
            if re.match("^#", line):
                continue
            entry = line.split(",")
            if len(entry) == 2:
                knownlist[entry[0].strip()] = entry[1].strip()
    appid = ""
    appkey = ""
    if translate == "true":
        # See https://fanyi-api.baidu.com/
        # See https://fanyi-api.baidu.com/api/trans/product/desktop?req=developer
        # See https://docs.python.org/3/library/configparser.html
        config = configparser.ConfigParser()
        config.read(cfgfile)
        # Set your own appid/appkey.
        appid = config['api.fanyi.baidu.com']['appid']
        appkey = config['api.fanyi.baidu.com']['appkey']
        #print("appid=" + appid)
        #print("appkey=" + appkey)
    jumandict = sqlite3.connect(database)
    dictcursor = jumandict.cursor()
    dictcursor.execute("CREATE TABLE IF NOT EXISTS words (id INTEGER PRIMARY KEY, name TEXT UNIQUE, desc TEXT, count INTEGER)")
    dumper = open(savedump, 'w')
    dumper.write("# 日语学习记录\n\n")
    while True:
        userinputs = ""
        if file == "":
            # interactive mode: confirm, optionally recap saved words, then edit
            try:
                if not click.confirm('想要进入编辑器输入日文句子或段落进行分析吗?'):
                    continue
            except EOFError:
                print("\n你选择退出了哦!")
                break
            except click.Abort:
                print("\n你选择退出了哦!")
                break
            if records > 0:
                rows = dictcursor.execute("SELECT id, name, desc, count FROM words ORDER BY {} DESC LIMIT {}".format(orderby, records)).fetchall()
                words = len(rows)
                if words > 0:
                    if orderby == "id":
                        print("最近保存过的{}个单词(最近优先排序):".format(words))
                    else:
                        print("出现频率最高的{}个单词(高频优先排序):".format(words))
                    count = 0
                    for row in rows:
                        print('{} [{} ({}次)]:\n'.format(row[0], row[1], row[3]))
                        print(row[2])
            userinputs = click.edit()
            if userinputs is None:
                print("你啥也没输入啊!")
                continue
        else:
            # batch mode: read the whole input file at once
            with open(file, 'r') as reader:
                lines = reader.readlines()
                userinputs = "".join(lines)
        if translate == "true":
            # For list of language codes, please refer to
            # `https://api.fanyi.baidu.com/doc/21`
            from_lang = 'jp'
            to_lang = destlang
            endpoint = 'http://api.fanyi.baidu.com'
            path = '/api/trans/vip/translate'
            url = endpoint + path
            salt = random.randint(32768, 65536)
            sign = make_md5(appid + userinputs + str(salt) + appkey)
            # Build request
            headers = {'Content-Type': 'application/x-www-form-urlencoded'}
            payload = {'appid': appid, 'q': userinputs, 'from': from_lang, 'to': to_lang, 'salt': salt, 'sign': sign}
            # Send request
            r = requests.post(url, params=payload, headers=headers)
            result = r.json()
            # Show response
            print("=================================")
            print(userinputs)
            dumper.write("```\n")
            dumper.write(userinputs)
            print("=================================")
            dumper.write("=================================\n")
            trans_result = result["trans_result"]
            for i in range(len(trans_result)):
                dst = trans_result[i]["dst"]
                print(dst)
                dumper.write(dst + "\n")
            dumper.write("```\n")
        # split the input into sentences on '。' (kept as suffix)
        inputsentences = [x+"。" for x in userinputs.split("。") if x.strip() != ""]
        for userinput in inputsentences:
            userinput = userinput.strip()
            userinput = userinput.encode('utf-8','surrogatepass').decode('utf-8')
            print("=================================")
            print(userinput)
            dumper.write("## "+ userinput + "\n\n")
            result = knp.parse(userinput.replace("\n", ""))
            dumper.write("```\n")
            dumper.write(userinput + "\n")
            # print the bunsetsu phrases stair-stepped for readability
            length = 0
            for bnst in result.bnst_list():  # visit each phrase (bunsetsu)
                phrase = "".join(mrph.midasi for mrph in bnst.mrph_list())
                phrase = phrase.replace("\␣", " ")
                print(" " * length + phrase)
                dumper.write(" " * length + phrase + "\n")
                length = length + len(phrase)
                if length > 80:
                    length = 0
            dumper.write("```\n")
            print("=================================")
            for mrph in result.mrph_list():  # visit each morpheme
                # skip known words (per POS) and punctuation-class morphemes
                found = False
                for known in knownlist.keys():
                    if mrph.genkei == known:
                        types = knownlist[known].split("|")
                        for type in types:
                            if mrph.hinsi == type:
                                found = True
                                break
                if ((found == True) and (verbose == "none")) or (mrph.hinsi == "特殊"):
                    continue
                # assemble a one-line description of the morpheme
                message = "ID:{}".format(mrph.mrph_id)
                if mrph.midasi:
                    message += ", 词汇:{}".format(mrph.midasi)
                if mrph.yomi:
                    message += ", 读法:{}".format(mrph.yomi)
                if mrph.genkei:
                    message += ", 原形:{}".format(mrph.genkei)
                if mrph.hinsi and mrph.hinsi != "*":
                    message += ", 词性:{}".format(mrph.hinsi)
                if mrph.bunrui and mrph.bunrui != "*":
                    message += ", 词性细分:{}".format(mrph.bunrui)
                if mrph.katuyou1 and mrph.katuyou1 != "*":
                    message += ", 活用型:{}".format(mrph.katuyou1)
                if mrph.katuyou2 and mrph.katuyou2 != "*":
                    message += ", 活用形:{}".format(mrph.katuyou2)
                if mrph.imis and mrph.imis != "NIL":
                    message += ", {}".format(mrph.imis)  # semantic info
                elif mrph.repname:
                    message += ", 代表符号:{}".format(mrph.repname)
                print("\t" + message)
                dumper.write("### " + message + "\n")
                if nosense == "true" or (found == True and verbose == "half"):
                    continue
                # use exact matching to find exact meaning:
                # try base form first, then surface form, then reading
                dictcheck = jmd.lookup(mrph.genkei)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.midasi)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.yomi)
                if len(dictcheck.entries) > 0:
                    desc = ""
                    print("\n")
                    dumper.write("\n")
                    for entry in dictcheck.entries:
                        text = ""
                        if compact == "true":
                            text = entry.text(compact=False, no_id=True)
                            text = re.sub('[`\']', '"', text)
                            print(text)
                        else:
                            # build a "kana (kanji)" header plus numbered senses
                            tmp = []
                            if entry.kana_forms:
                                tmp.append(entry.kana_forms[0].text)
                            if entry.kanji_forms:
                                tmp.append("({})".format(entry.kanji_forms[0].text))
                            header = " ".join(tmp)
                            tmp = []
                            if entry.senses:
                                for sense, idx in zip(entry.senses, range(len(entry.senses))):
                                    tmps = [str(x) for x in sense.gloss]
                                    if sense.pos:
                                        s = '{gloss} ({pos})'.format(gloss='/'.join(tmps), pos=('(%s)' % '|'.join(sense.pos)))
                                    else:
                                        s = '/'.join(tmps)
                                    s = re.sub('[`\']', '"', s)
                                    tmp.append(' {i}. {s}\n'.format(i=idx + 1, s=s))
                            senses = "".join(tmp)
                            print(header)
                            print(senses)
                            text = "**" + header + "**\n" + senses
                        desc = desc + text + "\n"
                        text = re.sub('[|]', '\|', text)
                        dumper.write("- " + text + "\n")
                    # upsert: bump the counter when the word was seen before
                    dictcursor.execute('INSERT INTO words (name, desc, count) VALUES ("{}", "{}", "{}") ON CONFLICT (name) DO UPDATE SET count = count + 1'
                                       .format(mrph.genkei.replace('"', '""'), desc.replace('"', '""'), 1))
                    jumandict.commit()
            dumper.flush()
        if file != "":
            break  # batch mode processes the file once
    jumandict.close()
    dumper.close()
import operator
import os
import pickle
import random
import re
import sys
import time
from collections import OrderedDict, defaultdict
from itertools import chain
from typing import List, Optional

import pygame
import romkan
from jamdict import Jamdict

# module-level dictionary instance shared by all helpers
JMD = Jamdict()

# make sure the data directory exists before writing the frequency file
if not os.path.exists('data'):
    os.mkdir('data')

WORDS_FREQ_FILEPATH = "data/nf_words_freq"


def generate_word_frequency_file(filepath):
    """Group JMdict word forms by their 'nfXX' frequency-of-use rank.

    (Function continues beyond this chunk.)
    """
    nf_to_kanjis = defaultdict(set)
    for entry in JMD.jmdict_xml.entries:
        # consider both kanji and kana writings of each entry
        for word in chain(entry.kanji_forms, entry.kana_forms):
            for pri in word.pri:
                # priority tags look like 'nf01'..'nf48'; take the numeric rank
                if pri.startswith('nf'):
                    nf_x = int(pri[-2:])
                    nf_to_kanjis[nf_x].add(word.text)
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ######################################################################## import os from jamdict import Jamdict ######################################################################## # Create an instance of Jamdict jam = Jamdict() print("Jamdict DB file: {}".format(jam.db_file)) # Lookup by kana result = jam.lookup('おかえし') for entry in result.entries: print(entry) print("-----------------") # Lookup by kanji result = jam.lookup('御土産') for entry in result.entries: print(entry) print("-----------------")
class Kotoba():
    """Word game: build valid Japanese words out of a random set of moras."""

    def __init__(self):
        self.jmd = Jamdict()
        # the basic hiragana moras the game draws from
        self.moras = [
            'あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す',
            'せ', 'そ', 'た', 'ち', 'つ', 'て', 'と', 'な', 'に', 'ぬ', 'ね', 'の', 'は',
            'ひ', 'ふ', 'へ', 'ほ', 'ま', 'み', 'む', 'め', 'も', 'ら', 'り', 'る', 'れ',
            'ろ', 'や', 'ゆ', 'よ', 'わ', 'ん'
        ]
        # voiced/semi-voiced/small variants unlocked together with their base mora
        self.special_cases = {
            'か': ['が'], 'き': ['ぎ'], 'け': ['げ'], 'こ': ['ご'],
            'さ': ['ざ'], 'し': ['じ'], 'す': ['ず'], 'せ': ['ぜ'], 'そ': ['ぞ'],
            'た': ['だ'], 'ち': ['ぢ'], 'つ': ['づ', 'っ'], 'て': ['で'], 'と': ['ど'],
            'は': ['ば', 'ぱ'], 'ひ': ['び', 'ぴ'], 'ふ': ['ぶ', 'ぷ'],
            'へ': ['べ', 'ぺ'], 'ほ': ['ぼ', 'ぽ'],
            'や': ['ゃ'], 'ゆ': ['ゅ'], 'よ': ['ょ'],
        }

    def find_kotoba(self, word, hidden):
        """Return the dictionary entries for *word* if every mora of it is in
        *hidden*, otherwise None.  (Dead locals of the original removed.)"""
        if not set(word).issubset(hidden):
            return None
        test = self.jmd.lookup(word)
        return test.entries

    def generate_moras(self):
        """Pick a 9-mora board.

        Returns (display string, chosen moras, grouped moras incl. variants,
        flat list of usable hidden moras).  'う' and 'つ' (plus its variants)
        are always included.
        """
        current_set = []
        all_set = []
        hidden = []
        all_set.append([self.moras[2]])          # 'う' is always on the board
        current_set.append(self.moras[2])
        current_set.append(self.moras[17])       # 'つ' too, with its variants
        tsu = [self.moras[17]]
        for mora in self.special_cases[self.moras[17]]:
            tsu.append(mora)
        all_set.append(tsu)
        # fill the rest of the board with distinct random moras
        while len(current_set) < 9:
            cur = self.moras[random.randint(0, len(self.moras) - 1)]
            if cur not in current_set:
                current_set.append(cur)
                hidden.append(cur)
                if cur in self.special_cases:
                    temp = [cur]
                    for char in self.special_cases[cur]:
                        temp.append(char)
                        hidden.append(char)
                    all_set.append(temp)
                else:
                    all_set.append([cur])
        dup = copy.deepcopy(all_set)
        str_display = ''.join(flatten(dup))
        shuffle(all_set)
        return (str_display, current_set, all_set, hidden)

    def display_result(self, word, entry, max_score):
        """Announce a correct guess along with the current score."""
        print("Correct!")
        print("Current Score: " + str(max_score))
        print(entry.entries)

    def start_session(self):
        """Run one interactive guessing session (3 lives, reset at 5 points)."""
        session_started = True
        mora_list = self.generate_moras()
        guesses = set()
        lives_left = 3
        cur_score = 0
        max_score = 0
        while session_started:
            print('The letters are: ' + mora_list[0])
            cmd = input('Create a word: ')
            if cmd not in guesses:
                # BUG FIX: remember the guess — the original never added to
                # `guesses`, so the duplicate check could never trigger.
                guesses.add(cmd)
                check = self.jmd.lookup(cmd)
                # BUG FIX: lookup() returns a LookupResult object, never the
                # string "Found nothing", so `check != "Found nothing"` was
                # always true and every guess scored.  Check for actual
                # dictionary entries instead.
                if check.entries:
                    cur_score += 1
                    max_score += 1
                    self.display_result(cmd, check, max_score)
                else:
                    lives_left -= 1
            else:
                print("You have already used " + cmd)
            # reset for a new mora list
            if cur_score >= 5:
                cmd = input('Generate a new list?')
                if cmd == 'y':
                    cur_score = 0
                    mora_list = self.generate_moras()

    def start_game(self):
        print('Hello from start')
        self.start_session()
from flask import Flask, Response
from functools import wraps
from flask import request
from chirptext.cli import setup_logging
from jamdict import Jamdict
from jamdict import __version__

# ---------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------

setup_logging('logging.json', 'logs')
app = Flask(__name__, static_url_path="")
# one shared Jamdict instance for the whole web app
jmd = Jamdict()


def getLogger():
    # NOTE(review): 'logging' is not imported in this chunk — presumably
    # imported elsewhere in the file; verify.
    return logging.getLogger(__name__)


# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------

def jsonp(func):
    """Decorator adding JSONP support to a view (continues beyond this chunk)."""
    @wraps(func)
    def decorated_function(*args, **kwargs):
        data = func(*args, **kwargs)
class OfflineTranslator(Translator):
    """ Offline 'translator' based on stanza, jamdict (dictionary) and jaconv (kana romanization)
    Results are pretty bad, but you don't need cloud API to use it.
    To use this class, make sure to:
    - pip install stanza jamdict jaconv
    - run `stanza.download('ja')` in Python console once, to download the resources for Stanza
    """
    # cache file name used by the Translator base class
    CACHE_NAME = "pyTranslateSwf-cache-OfflineTranslator.json"
    BATCH_SIZE = 100

    def __init__(self):
        super().__init__()
        # stanza.download('ja')
        self.nlp = stanza.Pipeline('ja')  # Japanese NLP pipeline (tokenise + POS)
        self.jmd = Jamdict()              # dictionary used for noun/verb lookups
        self._translate_jmd_cache = {}    # token -> translated text memo

    def _translate_all(self, input_strings: List[str]) -> List[str]:
        # translate each string independently (no batching needed offline)
        return [self._translate(s) for s in input_strings]

    def _translate(self, string: str) -> str:
        """The actual translation logic"""
        if not string or string.isspace():
            return ""
        doc = self.nlp(string)  # run annotation over a sentence
        input_tokens = []
        output_tokens = []
        for sentence in doc.to_dict():
            for d in sentence:
                token = d["text"]
                input_tokens.append(token)
                # nouns and verbs get a dictionary translation;
                # everything else is romanized and upper-cased
                if d["upos"] in ("NOUN", "VERB"):
                    x = self._translate_jmd(token)
                else:
                    x = self._transliterate(token).upper()
                output_tokens.append(x)
        return " ".join(output_tokens).replace("( ", "(").replace(" )", ")").replace(
            " .", ".")

    def _translate_jmd(self, token: str) -> str:
        """Translate *token* via Jamdict, falling back from word senses to
        kanji meanings to plain romanization; results are memoised."""
        if token in self._translate_jmd_cache:
            return self._translate_jmd_cache[token]
        result = self.jmd.lookup(token)
        # get first dictionary meaning
        try:
            meaning = result.entries[0].senses[0].text()
            meaning = meaning.split("/")[0]
            # strip trailing alternatives, leading "to ", and parentheticals
            meaning = re.sub(r",.*|to |\(.+\)", "", meaning)
            output = self._translate_jmd_cache[token] = meaning.strip()
            return output
        except Exception:
            pass
        # get first radical meaning
        try:
            meaning = result.chars[0].meanings()[0]
            meaning = meaning.split("/")[0]
            meaning = re.sub(r",.*|to |\(.+\)", "", meaning)
            output = self._translate_jmd_cache[token] = meaning.strip()
            return output
        except Exception:
            pass
        # last resort: romanize the token
        output = self._translate_jmd_cache[token] = self._transliterate(
            token).upper()
        return output

    @staticmethod
    def _transliterate(token: str) -> str:
        # katakana -> hiragana -> romaji
        s = jaconv.kata2hira(token)
        s = jaconv.kana2alphabet(s)
        return s
def __init__(self, info, name="jam"):
    """Initialise the processor with a Japanese analyser and a Jamdict instance.

    :param info: processor info object forwarded to the base class
    :param name: processor name (defaults to "jam")
    """
    super().__init__(info, name)
    self.parser = JapaneseAnalyser()  # morphological analyser
    self.jam = Jamdict()              # dictionary lookup engine
def mainloop(file, database, savedump, records, orderby, guimode):
    """Get user Janpanse input then parse it and record new words into database.

    :param file: unused in this GUI variant (kept for CLI parity)
    :param database: SQLite DB path for the words table
    :param savedump: plain-text dump file for the study log
    :param records: unused in this GUI variant
    :param orderby: unused in this GUI variant
    :param guimode: "web" / "tk" / "qt" / anything-else -> Wx backend
    """
    jmd = Jamdict()
    knp = KNP()
    jumandict = sqlite3.connect(database)
    dictcursor = jumandict.cursor()
    dictcursor.execute(
        "CREATE TABLE IF NOT EXISTS words (id INTEGER PRIMARY KEY, name TEXT UNIQUE, desc TEXT, count INTEGER)"
    )
    dumper = open(savedump, 'w')
    # Pass any command line argument for Web use
    if guimode == "web":
        # if there is use the Web Interface
        import PySimpleGUIWeb as sg
        import socket
    elif guimode == "tk":
        # default uses the tkinter GUI
        import PySimpleGUI as sg
    elif guimode == "qt":
        import PySimpleGUIQt as sg
    else:
        import PySimpleGUIWx as sg
    # All the stuff inside your window.
    header_list = [
        "ID", "词汇", "读法", "原形", "词性", "词性细分", "活用型", "活用形", "语义信息",
        "代表符号"
    ]
    uifont = "Ariel 32"
    left_column_layout = [
        [
            sg.T("输入日语"),
            sg.FolderBrowse(),
        ],
        [
            sg.Multiline("", size=(75, 10), key="nihongo"),
        ],
        [
            sg.Button("分析",
                      size=(30, 3),
                      font=uifont,
                      button_color=('white', 'green'),
                      key="submit"),
            sg.Button("退出",
                      size=(30, 3),
                      font=uifont,
                      button_color=('white', 'red'),
                      key="exit")
        ],
        [
            sg.Listbox(values=[],
                       enable_events=True,
                       size=(75, 20),
                       key="parsedwords")
        ],
    ]
    right_column_layout = [
        [sg.T("词汇意义")],
        [
            sg.Listbox(values=[],
                       enable_events=True,
                       size=(75, 33),
                       key="foundentries")
        ],
    ]
    layout = [[
        sg.VSeperator(),
        sg.Column(left_column_layout),
        sg.VSeperator(),
        sg.Column(right_column_layout),
    ]]
    # Create the Window
    if guimode == "web":
        # bind the web server to the local IP so other devices can reach it
        hostname = socket.gethostname()
        local_ip = socket.gethostbyname(hostname)
        print("local_ip is " + local_ip)
        window = sg.Window('日语学习',
                           layout,
                           web_ip=local_ip,
                           web_port=8888,
                           web_start_browser=False)
    else:
        window = sg.Window('日语学习', layout)
    resultlist = []
    # Run the Event Loop
    while True:
        event, values = window.read()
        if event == "exit" or event == sg.WIN_CLOSED:
            break
        # Folder name was filled in, make a list of files in the folder
        if event == "submit":
            userinput = values["nihongo"]
            print("=================================")
            print(userinput)
            userinput = userinput.strip()
            userinput = userinput.encode('utf-8',
                                         'surrogatepass').decode('utf-8')
            dumper.write(userinput + "\n\n")
            result = knp.parse(userinput.replace("\n", ""))
            print("=================================")
            print("词素")
            resultlist = result.mrph_list()
            parsedwords = []
            for mrph in resultlist:  # visit each morpheme
                # skip punctuation-like tokens
                if mrph.midasi in {"、", "。", "「", "」", "\␣"}:
                    continue
                message = "\tID:{}, 词汇:{}, 读法:{}, 原形:{}, 词性:{}, 词性细分:{}, 活用型:{}, 活用形:{}, 语义信息:{}, 代表符号:{}".format(
                    mrph.mrph_id, mrph.midasi, mrph.yomi, mrph.genkei,
                    mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2,
                    mrph.imis, mrph.repname)
                print(message)
                dumper.write(message + "\n")
                parsedwords += [message]
                # use exact matching to find exact meaning:
                # base form first, then surface form, then reading
                dictcheck = jmd.lookup(mrph.genkei)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.midasi)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.yomi)
                if len(dictcheck.entries) > 0:
                    desc = ""
                    for entry in dictcheck.entries:
                        desc = desc + entry.text(compact=False,
                                                 no_id=True) + "\n"
                    print("\n" + desc)
                    dumper.write("\n" + desc + "\n")
                    # upsert: bump the counter when the word was seen before
                    dictcursor.execute(
                        'INSERT INTO words (name, desc, count) VALUES ("{}", "{}", "{}") ON CONFLICT (name) DO UPDATE SET count = count + 1'
                        .format(mrph.genkei.replace('"', '""'),
                                desc.replace('"', '""'), 1))
                    jumandict.commit()
            window["parsedwords"].update(parsedwords)
        elif event == "parsedwords":
            # A file was chosen from the listbox
            selectedword = values["parsedwords"][0]
            print(selectedword)
            # recover the morpheme ID from the "ID:<n>, ..." message text
            selectedid = int(selectedword.split(',')[0].split(':')[1].strip())
            print("selectedid=" + str(selectedid) + " among " +
                  str(len(resultlist)) + " entries")
            foundentries = []
            for mrph in resultlist:  # visit each morpheme
                if selectedid != mrph.mrph_id:
                    continue
                message = "\tID:{}, 词汇:{}, 读法:{}, 原形:{}, 词性:{}, 词性细分:{}, 活用型:{}, 活用形:{}, 语义信息:{}, 代表符号:{}".format(
                    mrph.mrph_id, mrph.midasi, mrph.yomi, mrph.genkei,
                    mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2,
                    mrph.imis, mrph.repname)
                print(message)
                # use exact matching to find exact meaning
                dictcheck = jmd.lookup(mrph.genkei)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.midasi)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.yomi)
                foundentries += [message]
                foundentries += ["==================================="]
                if len(dictcheck.entries) > 0:
                    for entry in dictcheck.entries:
                        desc = entry.text(compact=False, no_id=True)
                        print("\n" + desc)
                        foundentries += [desc]
            window["foundentries"].update(foundentries)
    window.close()
    jumandict.close()
    dumper.close()
import uuid as uid
from functools import reduce

from jamdict import Jamdict
from fastapi import FastAPI, Form, HTTPException
# from starlette.responses import FileResponse

# --------------
# Game Machanics
# --------------
from de.mindscan.orangemoon.httpserver.game_directory import GameDirectory
from de.mindscan.orangemoon.httpserver.game_room import GameRoom
from de.mindscan.orangemoon.httpserver.game_player import GamePlayer

# shared dictionary instance for the whole server process
myJamDict = Jamdict()

RADICAL_STROKE_DATA = 'kanjiRadicalStrokeData.json'
KANJI_STROKE_DATA = 'kanjiStrokeData.json'

# NOTE(review): os, json and DATA_BASE_DIR are not defined in this chunk —
# presumably imported/defined earlier in the file; verify.
# Load the kanji/radical stroke data once at startup.
with open(os.path.join(DATA_BASE_DIR, RADICAL_STROKE_DATA), 'r') as jsonFile:
    global_radicalDict = json.load(jsonFile)

with open(os.path.join(DATA_BASE_DIR, KANJI_STROKE_DATA), 'r') as jsonFile:
    global_kanjiDict = json.load(jsonFile)

app = FastAPI()


@app.get("/")
def read_root():
from jamdict import Jamdict, config

# open the configured Jamdict database explicitly
jmd = Jamdict(db_file=config.get_file('JAMDICT_DB'))


def lookup_dic(word, igonre=True):
    """Build an HTML-ish dict (word / pronunciation / meaning) for *word*.

    Only entries whose first kanji form equals *word* are used; at most 2
    senses and 4 glosses per sense are kept, joined with '; ' and '<br>'.
    (NOTE(review): parameter 'igonre' looks like a typo for 'ignore' and is
    unused in this chunk; renaming would break keyword callers.
    Function continues beyond this chunk.)
    """
    word_dic = dict()
    result = jmd.lookup(word, strict_lookup=True, lookup_chars=False)
    result = result.to_json()['entries']
    for entry in result:
        # NOTE(review): an entry with no kanji forms would raise IndexError
        # here — confirm strict_lookup guarantees kanji matches only.
        if entry['kanji'][0]['text'] == word:
            word_dic['word'] = entry['kanji'][0]['text']
            word_dic['pronunciation'] = ""
            # single reading: plain text; multiple: numbered list
            for k, p in enumerate(entry['kana'], start=1):
                if len(entry['kana']) == 1:
                    word_dic['pronunciation'] += p['text']
                else:
                    word_dic['pronunciation'] += str(k) + ". " + p['text'] + "<br>"
            word_dic['meaning'] = ""
            for j, meaning in enumerate(entry['senses'], start=1):
                if j >= 3:
                    break  # keep at most two senses
                if len(entry['senses']) > 1:
                    word_dic['meaning'] += str(j) + ". "
                for i, one_sense in enumerate(meaning['SenseGloss'], start=1):
                    if i >= 5:
                        break  # keep at most four glosses per sense
                    word_dic['meaning'] += one_sense['text']
                    if i != len(meaning['SenseGloss']) and i != 4:
                        word_dic['meaning'] += "; "
                if j != len(entry['senses']) and j != 2:
                    word_dic['meaning'] += "<br>"