class ExplainJapaneseSentences(BaseFilter):
    def __init__(self):
        super().__init__()
        # Initialize a RakutenMA instance with a pre-trained model;
        # phi and c are hyperparameters for SCW (shown for demonstration purposes).
        # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
        self.rma = RakutenMA(phi=1024, c=0.007812)
        self.rma.load(abspath(r'..\resource\model_ja.min.json'))

    def __call__(self, chunk):
        chunk = self._duplicate_chunk(chunk)
        chunk.final = True
        result = [chunk]
        text = self.tokenize(chunk.text)
        result.append(
            TextChunk(text=text, language='japanese', audible=False,
                      printable=True, final=True))
        return result

    def tokenize(self, text):
        # rma.tokenize() returns [surface, PoS] pairs; render them as "surface (PoS)".
        tokens = self.rma.tokenize(text)
        return ' '.join(f'{surface} ({pos})' for surface, pos in tokens)
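# --- Usage sketch (assumption, not part of the original code): shows what
# RakutenMA.tokenize() returns and how the tokenize() helper above formats it.
# The model path and sample sentence are placeholders.
from rakutenma import RakutenMA

rma = RakutenMA(phi=1024, c=0.007812)
rma.load('model_ja.min.json')
tokens = rma.tokenize('彼は新しい仕事できっと成功するだろう。')
# tokens is a list of [surface, PoS] pairs, e.g. [['彼', 'N-pn'], ['は', 'P-k'], ...]
print(' '.join(f'{surface} ({pos})' for surface, pos in tokens))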
def tagWordsInSentences(self, studying, entry):
    '''Tags the part of speech for each word.'''
    jar_path = 'stanford-postagger-full/stanford-postagger.jar'
    if studying in self.english:
        words = parseWordsFromEntry(entry)
        tagged_words = tagWords(words)
        return tagged_words
    elif studying in self.japanese or studying in self.korean or studying in self.mandarin:
        # segmenter = TinySegmenter()
        # words = segmenter.tokenize(entry)
        rm = RakutenMA()
        tagged_words = rm.tokenize(entry)
        # mecab = Mecab()
        # tagged_words = mecab.pos(entry)
        return tagged_words
    else:
        if studying in self.spanish:
            model_path = 'stanford-postagger-full/models/spanish.tagger'
            words = parseWordsFromEntry(entry)
        elif studying in self.french:
            model_path = 'stanford-postagger-full/models/french.tagger'
            words = parseWordsFromEntry(entry)
        postagger = StanfordPOSTagger(model_path, jar_path, encoding='utf8')
        tagged_words = postagger.tag(words)
        return tagged_words
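# --- Usage sketch (assumption): the Stanford tagger branch above, standalone.
# Requires nltk plus the Stanford POS Tagger jar/model files at the paths the
# original code assumes, and a working Java runtime; the tokens are illustrative.
from nltk.tag import StanfordPOSTagger

postagger = StanfordPOSTagger(
    'stanford-postagger-full/models/spanish.tagger',
    'stanford-postagger-full/stanford-postagger.jar',
    encoding='utf8')
# tag() takes a list of tokens and returns (word, tag) tuples.
print(postagger.tag(['Me', 'gusta', 'leer']))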
def tokenize(x, t):
    if t in TOKC:
        from jieba import posseg
        toks = posseg.cut(x)
        if t == POSC:
            return u'\u3000'.join([('%s [%s]' % (f.word, f.flag)) for f in toks])
        elif t == SPACEC:
            return u'\u3000'.join([('%s' % (f.word)) for f in toks])
        else:
            return lexDens(toks, t)
    elif t in TOKJ:
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        toks = rma.tokenize(x)
        if t == SPACEJ:
            return u'\u3000'.join([i[0] for i in toks])
        elif t == POSJ:
            return u'\u3000'.join([('%s [%s]' % (i[0], i[1])) for i in toks])
        else:
            return lexDens(toks, t)
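# --- Usage sketch (assumption): the jieba branch above in isolation. The sample
# sentence comes from the jieba README; word/flag pairs are joined with an
# ideographic space (U+3000) as the function does.
from jieba import posseg

toks = posseg.cut('我来到北京清华大学')
print(u'\u3000'.join('%s [%s]' % (f.word, f.flag) for f in toks))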
def test_train_one(self):
    rma = RakutenMA()
    rma.featset = ["w0"]
    res = rma.train_one([["foo", "N-nc"], ["bar", "N-nc"]])
    assert_true(res["updated"])
    assert_true(Trie.find(rma.model["mu"], ["w0", "f", "B-N"]) > 0)
    assert_true(Trie.find(rma.model["mu"], ["w0", "o", "I-N"]) > 0)
    assert_true(Trie.find(rma.model["mu"], ["w0", "o", "E-N"]) > 0)
    assert_equals(rma.tokenize("foobar"), [["foo", "N-nc"], ["bar", "N-nc"]])
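# --- Sketch (assumption): the same training call outside the test harness.
# train_one() takes a gold-standard sentence as [surface, PoS] pairs and
# reports whether the SCW model weights were updated.
from rakutenma import RakutenMA

rma = RakutenMA()
rma.featset = ["w0"]
res = rma.train_one([["foo", "N-nc"], ["bar", "N-nc"]])
print(res["updated"])           # True when the model changed
print(rma.tokenize("foobar"))   # [["foo", "N-nc"], ["bar", "N-nc"]]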
class PairScorer():
    def __init__(self, model):
        print model
        self.rma = RakutenMA(json.loads(open(model).read()))
        self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)
        return

    def extract_ja_content_lemmas(self, s):
        """
        Extracts content words from a Japanese sentence
        (nouns, verb roots, adjectives, no okurigana).
        """
        s = unicode(s, 'utf-8')
        out = []
        for [x, y] in self.rma.tokenize(s):
            if y in RAKUTEN_POS_TAGS:
                if y.startswith('V'):
                    out += [(guess, y) for guess in guess_stem(x)]
                else:
                    out.append((x, y))
        return out

    def extract_en_content_lemmas(self, s):
        def penn_to_wordnet(pos):
            # Map a Penn Treebank tag to the WordNet PoS class
            # expected by WordNetLemmatizer.
            p = pos[0].lower()
            if p == 'j':
                return 'a'
            elif p == 'r':
                return 'r'
            elif p == 'v':
                return 'v'
            else:
                return 'n'

        lemmatizer = nltk.stem.WordNetLemmatizer()
        s = unicode(s, 'utf-8')
        out = []
        for w, pos in nltk.pos_tag(nltk.word_tokenize(s)):
            if pos in PENN_POS_TAGS:
                out.append((lemmatizer.lemmatize(w, pos=penn_to_wordnet(pos)), pos))
        return out
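# --- Usage sketch (assumption): the English branch above in isolation.
# Requires nltk with the 'punkt', 'averaged_perceptron_tagger' and 'wordnet'
# data packages downloaded; the sentence is illustrative.
import nltk

lemmatizer = nltk.stem.WordNetLemmatizer()
penn_to_wordnet = lambda pos: {'j': 'a', 'r': 'r', 'v': 'v'}.get(pos[0].lower(), 'n')
for w, pos in nltk.pos_tag(nltk.word_tokenize("The cats were running home")):
    print(w, pos, lemmatizer.lemmatize(w, pos=penn_to_wordnet(pos)))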
def splitShuffle(expr, t):
    expr = stripHTML(expr).strip()
    if t == SGJ:
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        resultl = rma.tokenize(expr)
        result = [r for r, s in resultl]
    elif t in SGC:
        import jieba
        result = jieba.cut(expr, cut_all=False)
    elif t == JSS:
        result = expr.split(' ')
    elif t in WRAPL:
        result = list(expr)
    newResult, glosses = getResult(result, t)
    jn = u''
    full = jn.join(newResult)
    random.shuffle(newResult)
    strResult = u''.join(newResult)
    return strResult, full, glosses
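# --- Sketch (assumption): the shuffle-and-join step above in isolation. The
# joined string is kept as the answer before the token list is shuffled in
# place to build the scrambled prompt; the tokens are illustrative.
import random

tokens = ['彼', 'は', '成功', 'する', 'だろう', '。']
full = u''.join(tokens)
random.shuffle(tokens)
print(u''.join(tokens), '<-', full)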
import re
import json

import tinysegmenter
from rakutenma import RakutenMA

# segmenter = tinysegmenter.TinySegmenter()
result = tinysegmenter.tokenize(
    "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。")
print('Segmenter: ')
print(result)

# Initialize a RakutenMA instance with an empty model
# (the default ja feature set is set already)
rma = RakutenMA()

# Let's analyze a sample sentence (from http://tatoeba.org/jpn/sentences/show/103809)
# With a disastrous result, since the model is empty!
print('Result')
print(rma.tokenize(result))
print('Original')
print(rma.tokenize("米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。"))
print('------------------')
# print(rma.tokenize("子どものみなさん、ゆるしてください。ぼくはこの本をひとりのおとなのひとにささげます。でもちゃんとしたわけがあるのです。"))
# print(rma.tokenizetwo("彼は新しい仕事できっと成功するだろう。"))
# print(rma.tokenize("彼は新しい仕事できっと成功するだろう。"))

# Feed the model with ten sample sentences from tatoeba.com
# "tatoeba.json" is available at https://github.com/rakuten-nlp/rakutenma
tatoeba = json.load(open("tatoeba.json", encoding='utf-8'))
j = 0
while (j < 10):