def load_tokenizer(lang):
    """Return the tokenizer object associated with the language code *lang*.

    The backing library is imported only when its language is requested, so
    only the package for the selected language has to be installed.

    What is returned differs per language: a function ("en", "th"), a
    tokenizer instance ("ko", "ja", "zh_cn"), or the imported library object
    itself ("zh_tw", "vi", "ar").  Callers need to know how to drive the
    object they get back.

    :param lang: language code, one of "en", "ko", "ja", "zh_cn", "zh_tw",
        "vi", "th", "ar".
    :return: the tokenizer object, or ``None`` for any other language code.
    """
    if lang == "en":
        from nltk.tokenize import word_tokenize as english_tokenize
        return english_tokenize

    if lang == "ko":
        from konlpy.tag import Kkma
        return Kkma()

    if lang in ("ja", "zh_cn"):
        # Japanese and simplified Chinese share KyTea and differ only in the
        # model file that is passed on the option string.
        import Mykytea
        kytea_options = (
            "-model jp-0.4.7-1.mod" if lang == "ja" else "-model ctb-0.4.0-1.mod"
        )
        return Mykytea.Mykytea(kytea_options)

    if lang == "zh_tw":
        import jieba
        return jieba

    if lang == "vi":
        from pyvi import ViTokenizer
        return ViTokenizer

    if lang == "th":
        from pythainlp.tokenize import word_tokenize as thai_tokenize
        return thai_tokenize

    if lang == "ar":
        import pyarabic.araby as araby
        return araby

    # No tokenizer is registered for this language code.
    return None
def load_tokenizer(lang):
    """Return the tokenizer object associated with the language code *lang*.

    The backing library is imported only when its language is requested.
    Any language code without a dedicated branch (including "en") falls
    through to NLTK's ``ToktokTokenizer``.

    What is returned differs per language: a tokenizer instance ("ko", "ja",
    "zh_cn", and the fallback), a function ("th"), or the imported library
    object itself ("zh_tw", "vi", "ar").

    :param lang: language code such as "ko", "ja", "zh_cn", "zh_tw", "vi",
        "th" or "ar"; anything else selects the fallback.
    :return: the tokenizer object for *lang*.
    """
    if lang == "ko":
        from konlpy.tag import Mecab
        return Mecab()

    if lang in ("ja", "zh_cn"):
        # Japanese and simplified Chinese share KyTea and differ only in the
        # model file that is passed on the option string.
        import Mykytea
        kytea_options = (
            "-model jp-0.4.7-1.mod" if lang == "ja" else "-model ctb-0.4.0-1.mod"
        )
        return Mykytea.Mykytea(kytea_options)

    if lang == "zh_tw":
        import jieba
        return jieba

    if lang == "vi":
        from pyvi import ViTokenizer
        return ViTokenizer

    if lang == "th":
        from pythainlp.tokenize import word_tokenize
        return word_tokenize

    if lang == "ar":
        import pyarabic.araby as araby
        return araby

    # Default for every other language; English has no branch of its own
    # and therefore ends up here as well.
    from nltk.tokenize import ToktokTokenizer
    return ToktokTokenizer()
def __init__(self, type, model=None):
    """Create a word splitter backed by KyTea.

    :param type: splitter backend name; only ``'Kytea'`` is handled here.
        (The name shadows the builtin, but it is part of the public
        signature and is therefore kept.)
    :param model: optional path to a KyTea model file.  When ``None`` no
        ``-model`` option is passed to KyTea.
    :raises ValueError: if *type* is not ``'Kytea'``.
    """
    # The previous check, ``assert type in ('Kytea')``, tested membership in
    # a *string* (the parentheses do not make a tuple), so substrings such as
    # 'K' or '' slipped through, and the assert vanished entirely under -O.
    # Validate explicitly with the ValueError this constructor already used.
    # NOTE(review): the message mentions 'MeCab', but only 'Kytea' is
    # handled in this block - confirm whether MeCab support lives elsewhere.
    if type != 'Kytea':
        raise ValueError(
            "Spliter type should be in ['MeCab' or 'Kytea']. ")
    self.type = type

    # '-wsconst D' is always passed; the model option is appended on demand.
    if model is None:
        kytea_options = '-wsconst D'
    else:
        kytea_options = f'-wsconst D -model {model}'
    self.splitter = Mykytea.Mykytea(kytea_options)
def main():
    """Score a trained KyTea model against a gold-annotated test file.

    Command-line options:
        --model  path to the trained KyTea model
        --test   path to the test file

    Each test line is split on whitespace into tokens of the form
    ``left/right``.  The left parts are concatenated to form the analyser
    input and the right parts form the gold sequence.  For every line the
    analyser's output is compared with the gold sequence via ``lcs`` (a
    helper defined elsewhere - presumably the longest-common-subsequence
    length), and recall, precision and F1 over the whole file are printed.

    :raises IndexError: if a token in the test file contains no ``/``.
    """
    parser = ArgumentParser()
    parser.add_argument('--model', type=str, help='path to trained kytea model')
    parser.add_argument('--test', type=str, help='path to test file')
    args = parser.parse_args()

    with open(args.test) as f:
        tests = [line.strip() for line in f]

    mk = Mykytea.Mykytea(f'-model {args.model}')

    l_cor, l_sys, l_lcs = 0, 0, 0
    for line in tqdm(tests):
        # Split every "left/right" token once and reuse the pieces.
        pairs = [w.split('/') for w in line.split()]
        s = ''.join(pair[0] for pair in pairs)
        gold = [pair[1] for pair in pairs]
        # First element of the first candidate of the first tag slot of
        # each analysed word.
        prediction = [word.tag[0][0][0] for word in mk.getTags(s)]

        l_cor += len(gold)
        l_sys += len(prediction)
        l_lcs += lcs(prediction, gold)

    # An empty test file, empty predictions, or zero overlap would otherwise
    # raise ZeroDivisionError; report 0.0 in those cases.  Floats are used on
    # purpose: the '0.3' format spec below is not valid for ints.
    recall = l_lcs / l_cor if l_cor else 0.0
    precision = l_lcs / l_sys if l_sys else 0.0
    denominator = recall + precision
    f1_score = (2 * recall * precision) / denominator if denominator else 0.0

    print(f'recall: {recall:0.3} ({l_lcs}/{l_cor})')
    print(f'precision: {precision:0.3} ({l_lcs}/{l_sys})')
    print(f'f1_score: {f1_score:0.3}')
def __init__(self):
    """Set up the Japanese-to-English tag table and the KyTea tagger."""
    # "-deftag UNK": put UNK when a word doesn't appear in the dictionary.
    kytea_options = "-deftag UNK"

    # Mapping extracted from https://gist.github.com/neubig/2555399
    self.EN_TAGS = {
        '名詞': 'N',          # Noun
        '代名詞': 'PRP',      # Pronoun
        '連体詞': 'DT',       # Adjectival determiner
        '動詞': 'V',          # Verb
        '形容詞': 'ADJ',      # Adjective
        '形状詞': 'ADJV',     # Adjectival verb
        '副詞': 'ADV',        # Adverb
        '助詞': 'PRT',        # Particle
        '助動詞': 'AUXV',     # Auxiliary verb
        '補助記号': '.',      # Punctuation
        '記号': 'SYM',        # Symbol
        '接尾辞': 'SUF',      # Suffix
        '接頭辞': 'PRE',      # Prefix
        '語尾': 'TAIL',       # Word tail (conjugation)
        '接続詞': 'CC',       # Conjunction
        'URL': 'URL',         # URL
        '英単語': 'ENG',      # English word
        '言いよどみ': 'FIL',  # Filler
        'web誤脱': 'MSP',     # Misspelling
        '感動詞': 'INT',      # Interjection
        '新規未知語': 'UNK',  # Unclassified unknown word
    }

    self.tagger = Mykytea.Mykytea(kytea_options)
def __init__(self, with_postag: bool = False, **kwargs):
    """Initialise the KyTea-backed tokenizer.

    :param with_postag: forwarded unchanged to the base-class constructor.
    :param kwargs: accepted but not used by this constructor.
    :raises ModuleNotFoundError: if the ``Mykytea`` module cannot be found.
    """
    super(KyTeaTokenizer, self).__init__(name="kytea", with_postag=with_postag)  # NOQA

    # Imported here rather than at module level so the dependency is only
    # needed when this tokenizer is actually instantiated.
    try:
        import Mykytea
    except ModuleNotFoundError:
        raise ModuleNotFoundError("kytea is not installed")

    # An empty option string is passed, i.e. no explicit KyTea options.
    self.kytea = Mykytea.Mykytea("")
def ja_tokenize(self, text):
    """Split *text* into words with KyTea and return them as a list.

    The KyTea instance is created on first use and cached on
    ``self.ja_word_tokenizer``; the model is read from
    ``~/local/share/kytea/model.bin``.

    :param text: the string to segment.
    :return: list built from the tokenizer's ``getWS(text)`` result.
    """
    if self.ja_word_tokenizer is None:
        # Import or construction failures simply propagate to the caller.
        import Mykytea

        model_option = (
            "-model " + os.path.expanduser("~") + "/local/share/kytea/model.bin"
        )
        self.ja_word_tokenizer = Mykytea.Mykytea(model_option)

    words = self.ja_word_tokenizer.getWS(text)
    return list(words)
def ja_tokenize(self, text):
    """Split *text* into words with KyTea and return them as a list.

    The KyTea instance is created on first use and cached on
    ``self.ja_word_tokenizer``; the model is read from
    ``~/local/share/kytea/model.bin``.  If KyTea or its Python wrapper
    cannot be loaded, installation instructions are logged before the
    original exception is re-raised.

    :param text: the string to segment.
    :return: list built from the tokenizer's ``getWS(text)`` result.
    :raises AttributeError, ImportError: propagated from loading Mykytea.
    """
    if self.ja_word_tokenizer is None:
        try:
            import Mykytea
            self.ja_word_tokenizer = Mykytea.Mykytea(
                '-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
        except (AttributeError, ImportError):
            logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
            # The clone URL had been mangled by e-mail obfuscation
            # ("[email protected]"); restored to the SSH remote.
            logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
            logger.error("2. autoreconf -i")
            logger.error("3. ./configure --prefix=$HOME/local")
            logger.error("4. make && make install")
            logger.error("5. pip install kytea")
            # Bare raise re-raises the active exception unchanged.
            raise
    return list(self.ja_word_tokenizer.getWS(text))
def __init__(self, with_postag: bool = False, model_path: Optional[str] = None, **kwargs):
    """Initialise the KyTea-backed tokenizer.

    :param with_postag: forwarded unchanged to the base-class constructor.
    :param model_path: optional path to a KyTea model; when given it is
        passed to KyTea as ``-model <path>``, otherwise the option string
        is empty.
    :param kwargs: accepted but not used by this constructor.
    :raises ImportError: if the ``Mykytea`` module cannot be imported.
    """
    super(KyTeaTokenizer, self).__init__(
        name="kytea",
        with_postag=with_postag,
        model_path=model_path,
    )

    # Imported here rather than at module level so the dependency is only
    # needed when this tokenizer is actually instantiated.
    try:
        import Mykytea
    except ImportError:
        raise ImportError("kytea is not installed")

    options = "" if model_path is None else "-model {}".format(model_path)
    self.kytea = Mykytea.Mykytea(options)
def __init__(
    self, with_postag: bool = False, model_path: Optional[str] = None, **kwargs
) -> None:
    """Initialise the KyTea-backed tokenizer.

    :param with_postag: forwarded unchanged to the base-class constructor.
    :param model_path: optional path to a KyTea model; when given it is
        passed to KyTea as ``-model <path>``, otherwise the option string
        is empty.
    :param kwargs: accepted but not used by this constructor.
    :raises ImportError: if the ``Mykytea`` module cannot be imported; the
        message lists the two installations to check.
    """
    super(KyTeaTokenizer, self).__init__(
        name="kytea", with_postag=with_postag, model_path=model_path
    )

    # Imported here rather than at module level so the dependency is only
    # needed when this tokenizer is actually instantiated.
    try:
        import Mykytea
    except ImportError:
        raise ImportError(
            "importing kytea failed for some reason."
            "\n  1. make sure KyTea is successfully installed."
            "\n  2. make sure Mykytea-python is successfully installed."
        )

    options = "" if model_path is None else "-model {}".format(model_path)
    self._tokenizer = Mykytea.Mykytea(options)
async def transliterate(text):
    """
    transliterate: Retrieves the Japanese transliteration via kytea

    A new KyTea instance is created from ``model.bin`` on every call.  The
    tagged output string is split on single spaces; each non-empty entry is
    split on ``/`` and handed to ``transliterate_pick_best`` (defined
    elsewhere), and the pieces it returns are concatenated and converted
    with ``romkan.to_roma``.

    :param text: The text to be transliterated
    :return: The transliterated string
    """
    tagger = Mykytea.Mykytea('-model model.bin')
    tagged_entries = tagger.getTagsToString(text).split(' ')

    # Consecutive spaces produce empty entries; those are skipped.
    kana = ''.join(
        transliterate_pick_best(entry.split('/'))
        for entry in tagged_entries
        if entry
    )

    return romkan.to_roma(kana)
def uk(opt=''):
    """Build and return a ``Mykytea.Mykytea`` instance.

    :param opt: option string handed straight to the ``Mykytea``
        constructor; defaults to the empty string.
    :return: the newly created instance.
    """
    return Mykytea.Mykytea(opt)
def get_mk(opt=''):
    """Build and return a ``Mykytea.Mykytea`` instance.

    :param opt: option string handed straight to the ``Mykytea``
        constructor; defaults to the empty string.
    :return: the newly created instance.
    """
    return Mykytea.Mykytea(opt)
def __init__(self, option_string=''):
    """Create the wrapped KyTea instance.

    :param option_string: argument string passed to Kytea; must be a
        ``str`` (checked with an assertion).
    """
    assert isinstance(option_string, str)

    kytea_instance = Mykytea.Mykytea(option_string)
    self.kytea = kytea_instance
for t3 in t2: out = out + "/" + str(t3) out += "\t" out += "\t" print out def list_tags(t): def convert(t2): return (t2[0], type(t2[1])) return [(word.surface, [[convert(t2) for t2 in t1] for t1 in word.tag]) for word in t] # You can pass arguments KyTea style like following opt = "-deftag UNKNOWN!!" # You can also set your own model #opt = "-model /usr/local/share/kytea/model.bin" mk = Mykytea.Mykytea(opt) s = "今日はいい天気です。1999" #分かち書きを取得 for word in mk.getWS(s): print word #解析結果を文字列で取得 print mk.getTagsToString(s) #1位のタグを取得 t = mk.getTags(s) showTags(t) #すべてのタグを取得
def __init__(self, option_string='-deftag UNKNOWN!!'):
    # type: (string_types)->None
    """Create the wrapped KyTea instance.

    :param option_string: argument string passed to Kytea; must be an
        instance of ``string_types`` (checked with an assertion).
    """
    assert isinstance(option_string, string_types)

    kytea_instance = Mykytea.Mykytea(option_string)
    self.kytea = kytea_instance
#!/usr/bin/env python
"""Count how often each KyTea-segmented word occurs in ``wagahai.txt``."""
from collections import Counter

import Mykytea

# "-deftag UNKNOWN!!" is passed to KyTea as its option string.
tok = Mykytea.Mykytea("-deftag UNKNOWN!!")

# word -> number of occurrences across the whole file
wc = Counter()

# Use a context manager so the file handle is closed deterministically
# (the bare open() in the loop header was never closed).
with open('wagahai.txt') as f:
    for line in f:
        wc.update(tok.getWS(line.strip()))
# Optional-backend probes: each block tries to import and instantiate one
# analyser and records whether that worked.  When a probe fails, both its
# ``*_available`` and ``*_version`` flags are set to False.
#
# ``except Exception`` (rather than a bare ``except``) keeps the best-effort
# behaviour for import/initialisation errors while letting KeyboardInterrupt
# and SystemExit propagate.

# NOTE(review): these two flags are never updated in this section; the
# spaCy-based probe below records its result under the ``_ginza_*`` names.
_spacy_available = False
_spacy_version = False

try:
    import spacy
    nlp = spacy.load('ja_ginza')
    _ginza_available = True
    _ginza_version = spacy.__version__
except Exception:
    _ginza_available = False
    _ginza_version = False

try:
    import Mykytea
    mk = Mykytea.Mykytea("")
    _kytea_available = True
    # Hard-coded version string, not read from the installed package.
    _kytea_version = "0.1.5"
except Exception:
    _kytea_available = False
    _kytea_version = False

try:
    from pyknp import Juman
    jumanpp = Juman()
    _jumanpp_available = True
    # Hard-coded version string, not read from the installed package.
    _jumanpp_version = "0.4.1"
except Exception:
    _jumanpp_available = False
    _jumanpp_version = False
def __init__(self):
    """Create the KyTea analyser used by this object."""
    # KyTea is started with an empty option string.
    kytea_instance = Mykytea.Mykytea('')
    self.kytea = kytea_instance