class ExplainJapaneseSentences(BaseFilter):
    def __init__(self):
        super().__init__()
        # Initialize a RakutenMA instance with a pre-trained model.
        # phi and c are the SCW hyperparameters recommended for this model; see
        # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
        self.rma = RakutenMA(phi=1024, c=0.007812)
        self.rma.load(abspath(r'..\resource\model_ja.min.json'))

    def __call__(self, chunk):
        chunk = self._duplicate_chunk(chunk)
        chunk.final = True
        result = [chunk]
        text = self.tokenize(chunk.text)
        result.append(
            TextChunk(text=text, language='japanese', audible=False,
                      printable=True, final=True))
        return result

    def tokenize(self, text):
        # rma.tokenize() returns (surface, POS-tag) pairs; render them
        # as "surface (tag)" separated by spaces.
        tokens = self.rma.tokenize(text)
        return ' '.join(f'{surface} ({tag})' for surface, tag in tokens)
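
A minimal usage sketch of the same RakutenMA calls outside the filter class, assuming model_ja.min.json sits in the working directory (the snippet above uses a project-specific Windows path instead):

from os.path import abspath

from rakutenma import RakutenMA

rma = RakutenMA(phi=1024, c=0.007812)  # SCW hyperparameters from the snippet above
rma.load(abspath('model_ja.min.json'))  # assumed model location

# tokenize() returns (surface, POS-tag) pairs,
# e.g. [('彼', 'N-nc'), ('は', 'P-k'), ...] (exact tags depend on the model)
for surface, tag in rma.tokenize('彼は本を読む'):
    print(f'{surface} ({tag})')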
def tokenize(x, t):
    if t in TOKC:
        # Chinese: segment and POS-tag with jieba.
        from jieba import posseg
        toks = posseg.cut(x)
        # U+3000 is the full-width (ideographic) space used as the separator.
        if t == POSC:
            return u'\u3000'.join(['%s [%s]' % (f.word, f.flag) for f in toks])
        elif t == SPACEC:
            return u'\u3000'.join([f.word for f in toks])
        else:
            return lexDens(toks, t)
    elif t in TOKJ:
        # Japanese: segment and POS-tag with RakutenMA, loading the
        # pre-trained model shipped next to this file.
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        toks = rma.tokenize(x)
        if t == SPACEJ:
            return u'\u3000'.join([i[0] for i in toks])
        elif t == POSJ:
            return u'\u3000'.join(['%s [%s]' % (i[0], i[1]) for i in toks])
        else:
            return lexDens(toks, t)
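
This function rebuilds the tagger and re-loads the JSON model on every Japanese call. A sketch of caching it at module level instead (the helper name and global are illustrative, not from the original):

import os

_RMA = None

def _get_rma():
    # Hypothetical memoization: construct and load RakutenMA once,
    # then reuse the same instance across tokenize() calls.
    global _RMA
    if _RMA is None:
        from rakutenma import RakutenMA
        _RMA = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        _RMA.load(os.path.join(tD, 'model_ja.min.json'))
    return _RMA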
def _get_tokenizer(lang):
    rma = None
    if lang == 'ja':
        rma = RakutenMA()
        rma.load('model_ja.json')
        # 15-bit feature hashing, matching how the bundled model was trained.
        rma.hash_func = rma.create_hash_func(15)
        tokenizer = _jap_tokenizer
        # tokenizer = _jap_character_tokenizer
    else:
        tokenizer = _eng_tokenizer
    return tokenizer, rma
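
_jap_tokenizer is not shown in this excerpt; one plausible shape, assuming the caller passes the loaded tagger back in (both the signature and the call convention are guesses):

def _jap_tokenizer(rma, text):
    # Hypothetical: keep only the surface form of each (surface, POS) pair.
    return [surface for surface, pos in rma.tokenize(text)]

tokenizer, rma = _get_tokenizer('ja')
print(tokenizer(rma, '猫が好きです'))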
def splitShuffle(expr, t):
    expr = stripHTML(expr).strip()
    if t == SGJ:
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        # Keep only the surface forms; drop the POS tags.
        resultl = rma.tokenize(expr)
        result = [r for r, s in resultl]
    elif t in SGC:
        import jieba
        result = jieba.cut(expr, cut_all=False)
    elif t == JSS:
        result = expr.split(' ')
    elif t in WRAPL:
        result = list(expr)
    newResult, glosses = getResult(result, t)
    # Join before shuffling so `full` preserves the original token order;
    # random.shuffle() mutates newResult in place.
    full = u''.join(newResult)
    random.shuffle(newResult)
    strResult = u''.join(newResult)
    return strResult, full, glosses
    del counter['DETWH']
    # `+= 0` forces the key to exist in the Counter even when it was absent.
    counter['ET'] += 0
    counter['I'] += 0
    counter['NC'] += counter['N'] + counter['VINF']
    del counter['N']
    del counter['VINF']
    counter['NP'] += counter['NPP']
    del counter['NPP']
    # The Japanese tag set doesn't account for prepositions. We could manually
    # look for them using a table like this:
    # http://mylanguages.org/japanese_prepositions.php
    del counter['P']
    counter['PREF'] += 0
    counter['PRO'] += counter['PROREL'] + counter['PROWH']
    del counter['PROREL']
    del counter['PROWH']
    counter['V'] += counter['VIMP'] + counter['VPR'] + counter['VS']
    del counter['VIMP']
    del counter['VPR']
    del counter['VS']
    counter['PUNC'] += counter['.$$.']
    del counter['.$$.']
    return dict(counter)


rma = RakutenMA()
rma.load("model_ja.json")


def _analyze_ja(text):
    tags = rma.tokenize(text)
    counter = collections.Counter([x[1] for x in tags])
    # See the same premise above in the French section.
    subordinating_conjunctions = list(
        filter(lambda tup: tup[1] == 'C' and tup[0] in jsc, tags))
    return {
        # We need to map the Japanese tagset to a subset of the French
        # tagset, so that we can compare the two.
        'ADJ': counter['A-c'] + counter['A-dp'] + counter['J-c'] + counter['J-tari'] + counter['J-xs'] + counter['R'],
        'ADV': counter['F'],
        'CC': counter['C'] - len(subordinating_conjunctions),
        'CS': len(subordinating_conjunctions),
        'ET': counter['E'],
        'I': counter['I-c'],
        'NC': counter['N-n'] + counter['N-nc'],
        'NP': counter['N-pn'],
        'PREF': counter['P'],
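
jsc is used above but not defined in this excerpt. Since RakutenMA tags every conjunction as 'C', it is presumably a collection of Japanese subordinating conjunctions matched by surface form; a hypothetical stand-in:

# Assumed definition, not from the original source: common Japanese
# subordinating conjunctions, matched against each token's surface form.
jsc = {'から', 'ので', 'けれども', 'のに', 'ば', 'ながら', 'たら', 'なら'}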
def __init__(self):
    rma = RakutenMA()
    rma.load("model_ja.json")
    # 15-bit feature hashing, matching how the bundled model was trained.
    rma.hash_func = rma.create_hash_func(15)
    self.rma = rma
def create_tokenizer():
    rma = RakutenMA()
    rma.load('model_ja.json')
    rma.hash_func = rma.create_hash_func(15)
    return rma.tokenize
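
Usage sketch for the returned bound method, assuming model_ja.json is resolvable from the working directory:

tokenize = create_tokenizer()
for surface, pos in tokenize('今日はいい天気です'):
    print(surface, pos)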