def load_tokenizer(lang):
    """Return the tokenizer object associated with a language code.

    The backing library is imported lazily, so only the package for the
    requested language has to be installed.

    Note that the returned objects do not share one interface: depending on
    ``lang`` the result is a function (``en``, ``th``), an instance
    (``ko``, ``ja``, ``zh_cn``), or whatever the library exposes under the
    imported name (``zh_tw``, ``vi``, ``ar``).

    Args:
        lang: Language code, one of ``en``, ``ko``, ``ja``, ``zh_cn``,
            ``zh_tw``, ``vi``, ``th`` or ``ar``.

    Returns:
        The tokenizer object for ``lang``, or ``None`` when the code is not
        recognised.
    """
    if lang == "en":
        from nltk.tokenize import word_tokenize as english_tokenize
        return english_tokenize

    if lang == "ko":
        from konlpy.tag import Kkma
        return Kkma()

    if lang == "ja":
        import Mykytea
        return Mykytea.Mykytea("-model jp-0.4.7-1.mod")

    if lang == "zh_cn":
        import Mykytea
        return Mykytea.Mykytea("-model ctb-0.4.0-1.mod")

    if lang == "zh_tw":
        import jieba
        return jieba

    if lang == "vi":
        from pyvi import ViTokenizer
        return ViTokenizer

    if lang == "th":
        from pythainlp.tokenize import word_tokenize as thai_tokenize
        return thai_tokenize

    if lang == "ar":
        import pyarabic.araby as araby
        return araby

    # Unknown language code: there is no tokenizer to offer.
    return None
Esempio n. 2
0
def load_tokenizer(lang):
    """Return the tokenizer object associated with a language code.

    The backing library is imported lazily, so only the package for the
    requested language has to be installed. Any code without a dedicated
    branch (including ``en``) falls back to NLTK's ``ToktokTokenizer``.

    Note that the returned objects do not share one interface: depending on
    ``lang`` the result is an instance (``ko``, ``ja``, ``zh_cn``, fallback),
    a function (``th``), or whatever the library exposes under the imported
    name (``zh_tw``, ``vi``, ``ar``).

    Args:
        lang: Language code such as ``ko``, ``ja``, ``zh_cn``, ``zh_tw``,
            ``vi``, ``th`` or ``ar``.

    Returns:
        The tokenizer object selected for ``lang``.
    """
    if lang == "ko":
        from konlpy.tag import Mecab
        return Mecab()

    if lang == "ja":
        import Mykytea
        return Mykytea.Mykytea("-model jp-0.4.7-1.mod")

    if lang == "zh_cn":
        import Mykytea
        return Mykytea.Mykytea("-model ctb-0.4.0-1.mod")

    if lang == "zh_tw":
        import jieba
        return jieba

    if lang == "vi":
        from pyvi import ViTokenizer
        return ViTokenizer

    if lang == "th":
        from pythainlp.tokenize import word_tokenize
        return word_tokenize

    if lang == "ar":
        import pyarabic.araby as araby
        return araby

    # Everything else, English included, uses the Toktok tokenizer
    # (a dedicated nltk.word_tokenize branch for "en" is intentionally
    # not active here).
    from nltk.tokenize import ToktokTokenizer
    return ToktokTokenizer()
Esempio n. 3
0
 def __init__(self, type, model=None):
     """Create a splitter backed by the requested segmentation engine.

     Args:
         type: Name of the engine. Only ``'Kytea'`` is handled here.
         model: Optional path to a KyTea model file. When omitted, KyTea is
             started with ``-wsconst D`` only and no ``-model`` option.

     Raises:
         ValueError: If ``type`` is anything other than ``'Kytea'``.
     """
     # The previous guard was ``assert type in ('Kytea')``. Without a
     # trailing comma ``('Kytea')`` is just a string, so ``in`` performed a
     # substring test ('' or 'K' passed), and the assert vanished under
     # ``python -O``. An explicit comparison behaves the same in all modes.
     if type != 'Kytea':
         raise ValueError("Splitter type should be 'Kytea'. ")

     self.type = type
     options = '-wsconst D'
     if model is not None:
         options = f'-wsconst D -model {model}'
     self.splitter = Mykytea.Mykytea(options)
Esempio n. 4
0
def main():
    """Score a trained KyTea model against a tagged test file.

    Command-line arguments:
        --model: path to the trained KyTea model.
        --test: path to the test file. Each line holds whitespace-separated
            ``surface/tag`` tokens.

    For every line the surfaces are concatenated and re-analysed with KyTea;
    the first tag of each predicted word is compared with the gold tags via
    ``lcs`` (defined elsewhere; presumably the longest-common-subsequence
    length). Recall, precision and F1 over the whole file are printed.
    """
    parser = ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        help='path to trained kytea model')
    parser.add_argument('--test', type=str, help='path to test file')
    args = parser.parse_args()

    with open(args.test) as f:
        tests = [line.strip() for line in f]

    mk = Mykytea.Mykytea(f'-model {args.model}')

    l_cor, l_sys, l_lcs = 0, 0, 0
    for line in tqdm(tests):
        # Split every "surface/tag" token once and reuse both halves.
        pairs = [w.split('/') for w in line.split()]
        s = ''.join(pair[0] for pair in pairs)
        gold = [pair[1] for pair in pairs]
        prediction = [word.tag[0][0][0] for word in mk.getTags(s)]

        l_cor += len(gold)
        l_sys += len(prediction)
        l_lcs += lcs(prediction, gold)

    # Guard every division: an empty test file, a model that predicts
    # nothing, or zero overlap would otherwise raise ZeroDivisionError.
    # Floats are used so the ``:0.3`` format spec stays valid.
    recall = l_lcs / l_cor if l_cor else 0.0
    precision = l_lcs / l_sys if l_sys else 0.0
    total = recall + precision
    f1_score = (2 * recall * precision) / total if total else 0.0
    print(f'recall: {recall:0.3} ({l_lcs}/{l_cor})')
    print(f'precision: {precision:0.3} ({l_lcs}/{l_sys})')
    print(f'f1_score: {f1_score:0.3}')
Esempio n. 5
0
  def __init__(self):
    """Prepare the Japanese-to-English POS tag table and the KyTea tagger."""
    # "-deftag UNK": per the original note, a word that does not appear in
    # the dictionary is given the tag UNK.
    kytea_options = "-deftag UNK"

    # Japanese POS name -> short English tag.
    # Mapping extracted from https://gist.github.com/neubig/2555399
    self.EN_TAGS = {
      '名詞': 'N',          # Noun
      '代名詞': 'PRP',      # Pronoun
      '連体詞': 'DT',       # Adjectival determiner
      '動詞': 'V',          # Verb
      '形容詞': 'ADJ',      # Adjective
      '形状詞': 'ADJV',     # Adjectival verb
      '副詞': 'ADV',        # Adverb
      '助詞': 'PRT',        # Particle
      '助動詞': 'AUXV',     # Auxiliary verb
      '補助記号': '.',      # Punctuation
      '記号': 'SYM',        # Symbol
      '接尾辞': 'SUF',      # Suffix
      '接頭辞': 'PRE',      # Prefix
      '語尾': 'TAIL',       # Word tail (conjugation)
      '接続詞': 'CC',       # Conjunction
      'URL': 'URL',         # URL
      '英単語': 'ENG',      # English word
      '言いよどみ': 'FIL',  # Filler
      'web誤脱': 'MSP',     # Misspelling
      '感動詞': 'INT',      # Interjection
      '新規未知語': 'UNK',  # Unclassified unknown word
    }

    self.tagger = Mykytea.Mykytea(kytea_options)
Esempio n. 6
0
    def __init__(self, with_postag: bool = False, **kwargs):
        """Initialise the KyTea-backed tokenizer.

        Args:
            with_postag: Forwarded to the base class initialiser.
            **kwargs: Accepted but not used by this initialiser.

        Raises:
            ModuleNotFoundError: If the ``Mykytea`` module cannot be found.
        """
        super(KyTeaTokenizer, self).__init__(
            name="kytea", with_postag=with_postag)

        # Import lazily so the dependency is only needed when this
        # tokenizer is actually constructed.
        try:
            import Mykytea
        except ModuleNotFoundError:
            raise ModuleNotFoundError("kytea is not installed")

        # An empty option string: no extra flags are handed to KyTea.
        self.kytea = Mykytea.Mykytea("")
Esempio n. 7
0
    def ja_tokenize(self, text):
        """Split ``text`` into words with KyTea and return them as a list.

        The KyTea wrapper is created on first use, with the model at
        ``~/local/share/kytea/model.bin``, and then cached on
        ``self.ja_word_tokenizer``. Import or construction errors propagate
        to the caller unchanged.
        """
        tokenizer = self.ja_word_tokenizer
        if tokenizer is None:
            import Mykytea

            home_dir = os.path.expanduser('~')
            tokenizer = Mykytea.Mykytea(
                f"-model {home_dir}/local/share/kytea/model.bin"
            )
            self.ja_word_tokenizer = tokenizer
        return list(tokenizer.getWS(text))
Esempio n. 8
0
 def ja_tokenize(self, text):
     """Split ``text`` into words with KyTea and return them as a list.

     The KyTea wrapper is created on first use, with the model at
     ``~/local/share/kytea/model.bin``, and then cached on
     ``self.ja_word_tokenizer``.

     Raises:
         AttributeError, ImportError: Re-raised after installation hints
             have been logged when the wrapper cannot be imported or
             constructed.
     """
     if self.ja_word_tokenizer is None:
         try:
             import Mykytea
             self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
         except (AttributeError, ImportError):
             # The clone command previously read "[email protected]:..."
             # (an e-mail obfuscation artifact) and could not be run as
             # written; it now points at the public repository URL.
             install_hints = (
                 "Make sure you install KyTea (https://github.com/neubig/kytea) and its python wrapper (https://github.com/chezou/Mykytea-python) with the following steps",
                 "1. git clone https://github.com/neubig/kytea.git && cd kytea",
                 "2. autoreconf -i",
                 "3. ./configure --prefix=$HOME/local",
                 "4. make && make install",
                 "5. pip install kytea",
             )
             for hint in install_hints:
                 logger.error(hint)
             # Bare raise keeps the original exception and traceback.
             raise
     return list(self.ja_word_tokenizer.getWS(text))
Esempio n. 9
0
    def __init__(self,
                 with_postag: bool = False,
                 model_path: Optional[str] = None,
                 **kwargs):
        """Initialise the KyTea-backed tokenizer.

        Args:
            with_postag: Forwarded to the base class initialiser.
            model_path: Optional path to a KyTea model. When given it is
                passed to KyTea as ``-model <path>``; otherwise KyTea gets
                an empty option string.
            **kwargs: Accepted but not used by this initialiser.

        Raises:
            ImportError: If the ``Mykytea`` module cannot be imported.
        """
        super(KyTeaTokenizer, self).__init__(
            name="kytea", with_postag=with_postag, model_path=model_path)

        # Import lazily so the dependency is only needed when this
        # tokenizer is actually constructed.
        try:
            import Mykytea
        except ImportError:
            raise ImportError("kytea is not installed")

        if model_path is None:
            kytea_option = ""
        else:
            kytea_option = "-model {}".format(model_path)
        self.kytea = Mykytea.Mykytea(kytea_option)
Esempio n. 10
0
    def __init__(
        self, with_postag: bool = False, model_path: Optional[str] = None, **kwargs
    ) -> None:
        """Initialise the KyTea-backed tokenizer.

        Args:
            with_postag: Forwarded to the base class initialiser.
            model_path: Optional path to a KyTea model. When given it is
                passed to KyTea as ``-model <path>``; otherwise KyTea gets
                an empty option string.
            **kwargs: Accepted but not used by this initialiser.

        Raises:
            ImportError: If the ``Mykytea`` module cannot be imported; the
                message lists the two installations to check.
        """
        super(KyTeaTokenizer, self).__init__(
            name="kytea", with_postag=with_postag, model_path=model_path
        )

        # Import lazily so the dependency is only needed when this
        # tokenizer is actually constructed.
        try:
            import Mykytea
        except ImportError:
            hint_lines = [
                "importing kytea failed for some reason.",
                "  1. make sure KyTea is successfully installed.",
                "  2. make sure Mykytea-python is successfully installed.",
            ]
            raise ImportError("\n".join(hint_lines))

        if model_path is None:
            kytea_option = ""
        else:
            kytea_option = "-model {}".format(model_path)
        self._tokenizer = Mykytea.Mykytea(kytea_option)
Esempio n. 11
0
async def transliterate(text):
    """
    transliterate: Produce a romaji transliteration of Japanese text via kytea
    :param text: The text to be transliterated
    :return: The transliterated string
    """
    # A KyTea instance is built on every call, loading 'model.bin' from the
    # current working directory.
    tagger = Mykytea.Mykytea('-model model.bin')

    # KyTea's tagged output is one space-separated entry per word; each
    # entry is a '/'-separated group of fields.
    tagged_entries = tagger.getTagsToString(text).split(' ')

    # For each non-empty entry, transliterate_pick_best (defined elsewhere;
    # per the original comments it yields the hiragana reading) picks the
    # piece to keep. The pieces are concatenated in order.
    hiragana = ''.join(
        transliterate_pick_best(entry.split('/'))
        for entry in tagged_entries
        if len(entry) > 0
    )

    # Convert the collected string to romaji.
    return romkan.to_roma(hiragana)
Esempio n. 12
0
def uk(opt=''):
    """Build and return a Mykytea instance from the KyTea option string ``opt``."""
    return Mykytea.Mykytea(opt)
def get_mk(opt=''):
    """Build and return a Mykytea instance from the KyTea option string ``opt``."""
    return Mykytea.Mykytea(opt)
Esempio n. 14
0
 def __init__(self, option_string=''):
     """Wrap a Mykytea instance configured by ``option_string``.

     ``option_string`` is handed to KyTea unchanged and must be a ``str``
     (checked with an assertion).
     """
     assert isinstance(option_string, str)
     kytea_instance = Mykytea.Mykytea(option_string)
     self.kytea = kytea_instance
Esempio n. 15
0
                for t3 in t2:
                    out = out + "/" + str(t3)
                out += "\t"
            out += "\t"
        print out

def list_tags(t):
    """Summarise tagged words as ``(surface, tags)`` pairs.

    For every word in ``t`` the nested ``word.tag`` structure is kept, but
    each innermost item is reduced to ``(item[0], type(item[1]))``: the
    first element paired with the *type* of the second.
    """
    summary = []
    for word in t:
        surface = word.surface
        tag_groups = []
        for candidates in word.tag:
            tag_groups.append(
                [(candidate[0], type(candidate[1])) for candidate in candidates]
            )
        summary.append((surface, tag_groups))
    return summary

# Demo script (Python 2 syntax): build a Mykytea instance and run the sample
# sentence through getWS, getTagsToString and getTags.

# You can pass arguments in KyTea's command-line style, like the following
opt = "-deftag UNKNOWN!!"
# You can also set your own model
#opt = "-model /usr/local/share/kytea/model.bin"
mk = Mykytea.Mykytea(opt)

s = "今日はいい天気です。1999"

# Get the word segmentation and print one word per line
for word in mk.getWS(s):
    print word

# Get the analysis result as a single string
print mk.getTagsToString(s)

# Get the top-ranked tags (showTags is defined earlier in the script)
t = mk.getTags(s)
showTags(t)

# Get all tags
 def __init__(self, option_string='-deftag UNKNOWN!!'):
     # type: (string_types)->None
     """Wrap a Mykytea instance configured by ``option_string``.

     ``option_string`` is handed to KyTea unchanged and must be one of
     ``string_types`` (checked with an assertion).
     """
     assert isinstance(option_string, string_types)
     kytea_instance = Mykytea.Mykytea(option_string)
     self.kytea = kytea_instance
Esempio n. 17
0
#!/usr/bin/env python
import Mykytea
from collections import Counter

tok = Mykytea.Mykytea("-deftag UNKNOWN!!")
wc = Counter()

# Count every word KyTea segments out of the corpus, line by line.
# The ``with`` block closes the file deterministically; the original bare
# ``open(...)`` in the ``for`` header never closed its handle.
with open('wagahai.txt') as corpus:
    for line in corpus:
        wc.update(tok.getWS(line.strip()))
Esempio n. 18
0
    _spacy_available = False
    _spacy_version = False

# Probe optional tokenizer backends at import time. Each probe imports the
# library and builds its analyser; any failure marks the backend unavailable.
# ``except Exception`` (rather than a bare ``except``) keeps the best-effort
# behaviour but lets KeyboardInterrupt and SystemExit propagate.

# GiNZA: loaded through spaCy as the 'ja_ginza' model.
try:
    import spacy
    nlp = spacy.load('ja_ginza')
    _ginza_available = True

    # NOTE(review): this records spaCy's version, not GiNZA's own - confirm
    # that is what consumers of _ginza_version expect.
    _ginza_version = spacy.__version__
except Exception:
    _ginza_available = False
    _ginza_version = False

# KyTea: via the Mykytea wrapper with an empty option string.
try:
    import Mykytea
    mk = Mykytea.Mykytea("")
    _kytea_available = True
    # Hard-coded version string; it is not read from the installed package.
    _kytea_version = "0.1.5"
except Exception:
    _kytea_available = False
    _kytea_version = False

# Juman++: via pyknp's Juman class.
try:
    from pyknp import Juman
    jumanpp = Juman()
    _jumanpp_available = True
    # Hard-coded version string; it is not read from the installed package.
    _jumanpp_version = "0.4.1"
except Exception:
    _jumanpp_available = False
    _jumanpp_version = False
Esempio n. 19
0
 def __init__(self):
     """Create the KyTea analyser used by this object."""
     # An empty option string: no extra flags are handed to KyTea.
     default_options = ''
     self.kytea = Mykytea.Mykytea(default_options)