def test_accent(text, accent):
    # This checks for correct handling of feature fields containing commas as reported in #13
    tagger = Tagger()
    tokens = tagger.parseToNodeList(text)
    # Skip if UnidicFeatures17 is used because it doesn't have the 'aType' attribute
    if tokens and isinstance(tokens[0].feature, UnidicFeatures17):
        pytest.skip()
    accent_ = [tok.feature.aType for tok in tokens]
    assert accent_ == accent
from fugashi import Tagger  # import added; the original snippet uses Tagger without importing it


class Tokenizer():
    def __init__(self):
        self.tagger = Tagger("-Owakati")

    def tokenize(self, text):
        tokens = self.tagger.parse(text).split(" ")
        return tokens
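# A minimal usage sketch for the wrapper above (not part of the original snippet).
# It assumes a UniDic dictionary is installed so Tagger("-Owakati") can initialize;
# the sample sentence is only illustrative.
if __name__ == "__main__":
    tokenizer = Tokenizer()
    print(tokenizer.tokenize("吾輩は猫である。"))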
def load_fugashi(write_cfg=False):
    try:
        # help python find libmecab.dll, adjust this to fit your env if necessary
        dll_path = None
        for base in sys.path:
            x = os.path.join(base, "fugashi")
            if os.path.exists(os.path.join(x, "cli.py")) and not dll_path:
                dll_path = x
            x2 = os.path.join(x, "../../../lib/site-packages/fugashi")
            if os.path.exists(x2):
                dll_path = x2
                break

        if not dll_path:
            raise Exception("could not find fugashi installation path")

        if WINDOWS:
            os.add_dll_directory(dll_path)

        from fugashi import Tagger

        dicrc = os.path.join(dll_path, "dicrc")
        if write_cfg:
            with open(dicrc, "wb") as f:
                f.write("\n".join([
                    r"node-format-yomi = %f[9] ",
                    r"unk-format-yomi = %m",
                    r"eos-format-yomi = \n",
                    "",
                ]).encode("utf-8"))

        wakati = Tagger("-Owakati")
        yomi = Tagger("-Oyomi -r " + dicrc.replace("\\", "\\\\"))

        # import MeCab
        # wakati = MeCab.Tagger('-Owakati')

        info("found fugashi")
        return wakati, yomi
    except:
        import traceback

        warn("could not load fugashi:\n" + traceback.format_exc() + "-" * 72 + "\n")
def main():
    text = 'softbank'
    tagger = Tagger()
    gtagger = GenericTagger()

    print('Tagger:')
    print(tagger.parse(text))
    for word in tagger(text):
        print(word.surface)
        print(word.feature)
    print()

    print('GenericTagger:')
    print(gtagger.parse(text))
    for word in gtagger(text):
        print(word.surface)
        print(word.feature)
    print()

    print('DONE')
def main():
    tagger = Tagger()
    wakati_tagger = Tagger('-Owakati')
    text = '私はご飯を食べます。'

    result = wakati_tagger.parse(text)
    print('result1(parse + wakati):')
    print(result)
    print(type(result))
    print()

    result = tagger.parse(text)
    print('result2(parse):')
    print(result)
    print(type(result))
    print()

    result = wakati_tagger(text)
    print('result3(_call_+wakati):')
    print(result)
    print(type(result))
    print(inspect.getmembers(result[0]))
    print(type(result[0]))
    print()

    result = tagger(text)
    print('result4(_call_):')
    print(result)
    print(type(result))
    print(inspect.getmembers(result[0]))
    print(type(result[0]))
    print()

    print('DONE')
def info():
    """Print configuration info."""
    args = ' '.join(sys.argv[1:])
    try:
        tagger = GenericTagger(args, quiet=True)
    except RuntimeError:
        tagger = Tagger(args)
    #TODO get the fugashi version here too
    print("Fugashi dictionary info:")
    print("-----")
    for di in tagger.dictionary_info:
        for field in 'version size charset filename'.split():
            print((field + ':').ljust(10), di[field])
        print('-----')
def main():
    tagger = Tagger()
    neologd_tagger = Tagger('-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-unidic-neologd')
    text = '私は、渋谷ストリームでランチを食べる。'

    print('unidic:')
    print(tagger.parse(text))
    print()

    print('unidic-neologd:')
    print(neologd_tagger.parse(text))

    print('DONE')
def main():
    """
    This is a simple wrapper for fugashi so you can test it from the command line.

    Like the mecab binary, it treats each line of stdin as one sentence. You can
    pass tagger arguments here too.
    """
    args = ' '.join(sys.argv[1:])

    # This should work if you specify a different dictionary,
    # but it should also work with the pip unidic.
    # Try the GenericTagger and then try the Unidic tagger.
    try:
        tagger = GenericTagger(args, quiet=True)
    except RuntimeError:
        tagger = Tagger(args)

    for line in fileinput.input([]):
        print(tagger.parse(line.strip()))
def test_pos(text, tags):
    # There should be a pos property when using the default tagger
    tagger = Tagger()
    tags_ = [tok.pos for tok in tagger(text)]
    assert tags == tags_
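# Illustrative note (an assumption, not taken from the test suite): with the
# default UniDic tagger, `tok.pos` is the comma-joined string built from the
# first four part-of-speech feature fields, e.g. something like
# '名詞,普通名詞,一般,*' for a common noun.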
def test_invalid_args():
    # Invalid args will give a NULL pointer for the Tagger object;
    # don't try to use the null object!
    with pytest.raises(RuntimeError):
        tagger = Tagger('-fail')
def test_nbest(text, saved):
    tagger = Tagger('-Owakati')
    assert tagger.nbest(text, 2) == saved
def test_tokens(text, saved):
    # testing the token objects is tricky, so instead just check surfaces
    #TODO: maybe save serialized nodes to compare?
    tagger = Tagger()
    tokens = [str(tok) for tok in tagger(text)]
    assert tokens == saved
def test_wakati(text, wakati):
    tagger = Tagger('-Owakati')
    assert tagger.parse(text) == wakati
# Imports below that were not part of this snippet are reconstructed from usage:
# Tuple/TypeVar clearly come from typing; sudachipy and googletrans are assumed
# from the `dictionary.Dictionary().create()` and `googletrans.Translator()` calls.
from typing import Tuple, TypeVar

import googletrans
from sudachipy import dictionary
from fugashi import Tagger
from dataclasses import dataclass
from jamdict import Jamdict, jmdict
from japaneseverbconjugator.src.constants.EnumeratedTypes import VerbClass
import jconj.conj as jconj

SudachiPos = Tuple[str, str, str, str, str, str]
K = TypeVar("K")
V = TypeVar("V")

CT = jconj.read_conj_tables("./jconj/data")
JMDICT_ABBREV_MAP = {v: k for k, vs in CT["kwpos"].items() for v in vs}
JMDICT_ABBREV_MAP["expressions (phrases, clauses, etc.)"] = "exp"

tokenizer_obj = dictionary.Dictionary().create()
tagger = Tagger("-Owakati")
jmd = Jamdict()
google_translate = googletrans.Translator()

SUDACHI_POS_MAP = {
    "感動詞": "interjection",
    "記号": "symbol",
    "補助記号": "supplementary symbol",
    "名詞": "noun",
    "接尾辞": "suffix",
    "助詞": "particle",
    "形容詞": "adjective",  # "i-adjective",
    "助動詞": "auxiliary verb",
    "代名詞": "pronoun",
    "空白": "blank space",
    "動詞": "verb",
#!/usr/bin/env python
from collections import Counter

from fugashi import Tagger

tt = Tagger()
wc = Counter()

for line in open('wagahai.txt'):
    for word in tt.parseToNodeList(line.strip()):
        wc[word.surface] += 1
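# Follow-up sketch (not in the original script): once counting is done,
# Counter.most_common gives the highest-frequency surface forms. The input
# file 'wagahai.txt' above is assumed to exist in the working directory.
for surface, count in wc.most_common(10):
    print(count, surface)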
def __init__(self):
    self.tagger = Tagger("-Owakati")
def test_accent(text, accent):
    # This checks for correct handling of feature fields containing commas as reported in #13
    tagger = Tagger()
    accent_ = [tok.feature.aType for tok in tagger.parseToNodeList(text)]
    assert accent_ == accent
""" vocabulaire = [] phrase = "12345" liste_mots = [] for i in range(len(phrase)+1): for j in range(i,len(phrase)+1): mot = phrase[i:j] if mot in vocabulaire: liste_mots.append(mot) """ import numpy as np from fugashi import Tagger tagger = Tagger('-Owakati') text = "今日はパリから東京まで散歩するつもりだ" text = "この暑い焼き鳥お食べ次第すぐにビールお飲みます" text = '僕は自分中心' text = "この暑い焼き鳥お食べ次第すぐにビールお飲みます" text = 'でないと'
def recognize_image(image_file, clipboard_buffer):
    """Returns document bounds given an image."""
    client = vision.ImageAnnotatorClient()

    with io.open(image_file, "rb") as image_file:
        content = image_file.read()

    image = types.Image(content=content)
    response = client.document_text_detection(image=image)
    document = response.full_text_annotation
    texts = response.text_annotations

    s = wx.ScreenDC()
    ss_x1 = c1x + c1x_delta
    ss_x2 = c2x + c2x_delta
    ss_y1 = c1y + c1y_delta
    ss_y2 = c2y + c2y_delta

    global mode
    console_output = ""
    table = Table(show_header=True, header_style="bold magenta",
                  box=box.MINIMAL_DOUBLE_HEAD)
    table.add_column("日本語", style="dim")
    table.add_column(mode)
    if mode == "Vocab":
        table.add_column("読み方")
        table.add_column("意味")

    for page in document.pages:
        for block in track(page.blocks):
            results = []
            results.append([])
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        results[-1].append(symbol.text)

            bound = block.bounding_box
            start_x = bound.vertices[0].x
            start_y = bound.vertices[0].y
            width = bound.vertices[2].x - bound.vertices[0].x
            height = bound.vertices[2].y - bound.vertices[0].y

            s.Pen = wx.Pen("#FF0000")
            s.SetTextForeground((255, 0, 0))
            s.SetTextBackground((0, 0, 0))
            s.Brush = wx.Brush(wx.Colour(255, 255, 255))
            s.SetFont(
                wx.Font(
                    12,
                    wx.FONTFAMILY_DECORATIVE,
                    wx.FONTSTYLE_NORMAL,
                    wx.FONTWEIGHT_BOLD,
                ))

            ocr_results = "".join(results[-1])
            clipboard_buffer = clipboard_buffer + ocr_results
            clipboard_buffer = clipboard_buffer + "\n"

            if mode == "Romaji":
                katsu = cutlet.Cutlet()
                hepburn_block = katsu.romaji(ocr_results)
                table.add_row(ocr_results, hepburn_block)
                hepburn_block = "\n".join(textwrap.wrap(hepburn_block, 25))

            if mode == "Vocab":
                tagger = Tagger("-Owakati")
                nl_separated_block = []
                for word in tagger(ocr_results):
                    if word.char_type == 2:
                        results = jmd.lookup(str(word.feature.lemma))
                        meaning = " "
                        for k in range(len(results.entries)):
                            result = results.entries[k]
                            if k > 0:
                                meaning = meaning + "\n "
                            meaning = (meaning + f"[bold red]{str(k + 1)}. [/bold red]" +
                                       " \\ ".join([
                                           str(sense.gloss[0])
                                           for sense in result.senses
                                       ]))
                        console_output = console_output + "\t".join([
                            str(word),
                            "『" + str(word.feature.kana) + "』",
                            str(meaning),
                            "\n",
                        ])
                        nl_separated_block.append("\t".join([
                            str(word),
                            "『" + str(word.feature.kana) + "』",
                            str(meaning),
                        ]))
                        table.add_row(
                            str(word),
                            str(word.feature.lemma),
                            "『" + str(word.feature.kana) + "』",
                            str(meaning),
                        )
                hepburn_block = "\n".join(nl_separated_block)
                # table.add_row(ocr_results, hepburn_block)

            if mode == "Google":
                translator = Translator()
                translated = translator.translate(ocr_results).text
                table.add_row("\n".join(textwrap.wrap(ocr_results, 25)), translated)
                hepburn_block = "\n".join(textwrap.wrap(translated, 25))

            if mode == "DeepL":
                url = "https://api.deepl.com/v2/translate"
                response = requests.get(
                    url,
                    params={
                        "auth_key": deepL_auth,
                        "text": ocr_results,
                        "target_lang": "EN",
                    },
                )
                result = response.json()
                translated = result["translations"][0]["text"]
                table.add_row("\n".join(textwrap.wrap(ocr_results, 25)) + "\n", translated)
                hepburn_block = "\n".join(textwrap.wrap(translated, 40))

            nl_separated_block = hepburn_block.split("\n")
            max_x_bound = (
                max([s.GetTextExtent(line)[0] for line in nl_separated_block]) + 3)
            max_y_bound = (
                s.GetTextExtent(hepburn_block)[1] * len(nl_separated_block) + 3)
            w, h = s.GetTextExtent(hepburn_block)

            # modify this with dpi scale when screen device context is fixed
            s.DrawRectangle(ss_x1 + start_x - 3, ss_y1 + start_y - 3,
                            max_x_bound, max_y_bound)
            s.DrawText(hepburn_block, ss_x1 + start_x, ss_y1 + start_y)

    console.print(table)
    return clipboard_buffer
import pandas, regex, functools
from fugashi import Tagger
from pykakasi import kakasi
from collections import OrderedDict
import re
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation

df_word = pandas.read_csv("../dict_data/ja/jlpt/JLPT.csv")
fugger = Tagger()

pos_list = []
kana_list = []
df_word = df_word.fillna("NONE")

for index, row in df_word.iterrows():
    index += 1
    if not row['kanji'] == "NONE":
        kanji = row['kanji']
        wf = fugger(kanji)
    else:
        furigana = row['furigana']
        wf = fugger(furigana)
    if len(wf) > 1:
def JlptLevel(text):
    # Data import
    df_word = pandas.read_csv("dict_data/ja/jlpt/JLPT.csv")
    df_kanji = pandas.read_csv("dict_data/ja/jlpt/JLPT_Kanji.csv")

    kks = kakasi()
    kks.setMode("J", "H")
    conv = kks.getConverter()

    # Text to words
    text = text

    sentence_list = segmenter(text)
    fugger = Tagger()
    text_level = 5
    dict = []

    if sentence_list:
        for sentence_num, sentence in enumerate(sentence_list):
            sentence_word_level_count_dict = {5: 0, 4: 0, 3: 0, 2: 0, 1: 0}
            kanji_level_count_dict = OrderedDict({5: 0, 4: 0, 3: 0, 2: 0, 1: 0})
            sentence_num += 1
            word_list = []

            for w in fugger(sentence):
                word = w
                if word:
                    word_original = str(w)
                    word_dict_form = word.feature.lemma
                    word_kanji_hiragana = conv.do(word_original)
                    level = None
                    word_dict_form_hiragana = None

                    if word.feature.pos1 in ["動詞", "助動詞", "形容詞", "形状詞", "助動詞"]:
                        word_dict_form_hiragana = conv.do(word_dict_form)

                    p = regex.compile(r'\p{Script=Han}+')  # Kanji unicode coverage
                    if p.findall(word_original):
                        # The original word is written in Kanji
                        # word_dict[word_original]["letter_type"] = "kanji" <- delete later
                        kanji_dict = OrderedDict()
                        for kanji in p.findall(word_original):
                            kanji_single_list = []
                            for kanji_single in kanji:
                                kanji_dict[kanji_single] = {}
                                index_list = df_kanji[df_kanji.kanji == kanji_single].index
                                level_list = []
                                for index in index_list:
                                    level_list.append(df_kanji.loc[index, "jlpt"])
                                if level_list:
                                    level = max(level_list)
                                kanji_dict = OrderedDict({
                                    "kanji_single": kanji_single,
                                    "kanji_level": level,
                                })
                                kanji_single_list.append(kanji_dict)

                        # Check the Kanji word level
                        if word_dict_form_hiragana:
                            # if word can conjugate
                            index_list = df_word[df_word.furigana == word_dict_form_hiragana].index
                        else:
                            index_list = df_word[df_word.kanji == word_original].index
                        level_list = []
                        for index in index_list:
                            if df_word.loc[index].any():
                                level_list.append(df_word.loc[index, "jlpt"])
                        if level_list:
                            level = max(level_list)
                            sentence_word_level_count_dict[level] += 1
                        pos = word.feature.pos1
                        word_dict = {
                            "kanji_elements": kanji_single_list,
                            "word_level": level,
                            "word_pos": pos,
                            "word": word,
                            "word_dict_form": word_dict_form,
                            "word_index": index_list,
                        }
                    else:
                        # The original word is not written in Kanji
                        if word_dict_form_hiragana:
                            # if word can conjugate
                            index_list = df_word[df_word.furigana == word_dict_form_hiragana].index
                        else:
                            index_list = df_word[df_word.furigana == word].index
                            index_list = df_word[df_word.furigana == word_original].index
                        level_list = []
                        for index in index_list:
                            level_list.append(df_word.loc[index, "jlpt"])
                        if level_list:
                            level = max(level_list)
                            sentence_word_level_count_dict[level] += 1
                        pos = word.feature.pos1
                        word_dict = {
                            "word_level": level,
                            "word_pos": pos,
                            "word": word,
                            "word_dict_form": word_dict_form,
                            "word_index": index_list,
                        }
                    word_list.append(word_dict)

            # Define the level of each sentence
            #print(sentence_word_level_count_dict)
            freq_level = max(sentence_word_level_count_dict,
                             key=sentence_word_level_count_dict.get)
            rare_level = min(sentence_word_level_count_dict,
                             key=sentence_word_level_count_dict.get)
            highest_level = 5
            for level in sentence_word_level_count_dict:
                if sentence_word_level_count_dict[level] > 0:
                    highest_level = level
            sentence_level_dict = {
                "max_level": freq_level,
                "min_level": rare_level,
                "highest_level": highest_level,
            }
            sentence_dict = {
                "sentence": sentence,
                "sentence_num": sentence_num,
                "sentence_word_level_count_dict": sentence_word_level_count_dict,
                "sentence_word_level_dict": sentence_level_dict,
                #"sentence_kanji_level": sentence_kanji_level,
                "word_dict": word_list,
            }
            dict.append(sentence_dict)

    pprint.pprint(dict)
    return dict
#!/usr/bin/env python
from collections import Counter

from fugashi import Tagger

tt = Tagger('-Owakati')
wc = Counter()

for line in open('wagahai.txt'):
    for word in tt.parse(line.strip()).split(' '):
        wc[word] += 1