def parse_with_cabocha(self, text):
    from cabocha.analyzer import CaboChaAnalyzer

    analyzer = CaboChaAnalyzer()
    tree = analyzer.parse(text)
    words = []
    for chunk in tree:
        for token in chunk:
            # print(token, token.pos)
            words.append(token.surface)
    return words
def Tokenizer(self, request, context):
    metadata = dict(context.invocation_metadata())
    print(metadata)
    text = request.text
    print(".. analyse ", text)
    analyzer = CaboChaAnalyzer()
    tree = analyzer.parse(text)
    msg_chunks = nlp_messages.NlCabochaChunks()
    chunks = []
    for chunk in tree:
        msg_chunk = nlp_messages.NlCabochaChunk()
        msg_chunk.id = chunk.id
        if chunk.additional_info is not None:
            msg_chunk.additional_info = chunk.additional_info
        msg_chunk.feature_list.extend(chunk.feature_list)
        msg_chunk.func_pos = chunk.func_pos
        msg_chunk.head_pos = chunk.head_pos
        msg_chunk.link = chunk.link
        msg_chunk.score = chunk.score
        msg_chunk.token_pos = chunk.token_pos
        msg_chunk.next_link_id = chunk.next_link_id
        msg_chunk.prev_link_ids.extend(chunk.prev_link_ids)
        words = []
        for token in chunk:
            # print(token, token.pos)
            word = nlp_messages.NlCabochaToken(
                surface=token.surface,
                id=token.id,
                additional_info=token.additional_info,
                feature_list=token.feature_list,
                ne=token.ne,
                normalized_surface=token.normalized_surface,
                pos=token.pos,
                pos1=token.pos1,
                pos2=token.pos2,
                pos3=token.pos3,
                ctype=token.ctype,
                cform=token.cform,
                genkei=token.genkei,
                yomi=token.yomi)
            words.append(word)
        msg_chunk.tokens.extend(words)
        chunks.append(msg_chunk)
    msg_chunks.chunks.extend(chunks)
    return msg_chunks
def create_parts(self, sentence, romas):
    func = "_noname_"
    analyzer = CaboChaAnalyzer()
    tree = analyzer.parse(sentence)
    l = []
    mainPart = 0
    for chunk in tree:
        for token in chunk:
            kan = token.feature.split(',')[-2]
            if kan == '*':
                kan = token.surface
            romas.append(romkan.to_roma(kan))
        if chunk.link == -1:
            mainPart = chunk.id
            func = self.get_first_token(chunk)
    for chunk in tree:
        curword = chunk.tokens[0].surface
        curfeature = chunk.tokens[0].feature
        feat = self.analyse_feature(curfeature)
        if feat == '@num' or feat == '@n':
            curword = self.join_tokens(chunk)
        elif feat == '@nc':
            curword = self.join_nc_tokens(chunk)
        elif feat == '@v':
            parts = curfeature.split(',')
            raw = parts[-3]
            if raw != '*':
                curword = raw
        ## main part
        if chunk.link == -1:
            prefix = ""
            if feat == '@v':
                prefix = "act:"
            elif feat == '@adj':
                prefix = "desc:"
            elif feat == '@n':
                prefix = "prop:"
            l.append(prefix + "*" + curword + feat)
        elif chunk.link == mainPart:
            l.append(self.get_prefix(chunk) + "+" + curword + feat)
        else:
            l.append("." + curword + feat)
    result = func + '(' + ", ".join(l) + ')'
    return result
class CaboChaBasicTokenizer:
    """Base-form (genkei) tokenizer backed by CaboCha.

    If pos is set, only tokens whose part of speech is in pos are returned."""

    def __init__(self, pos=None):
        self._analyzer = CaboChaAnalyzer()
        self._pos = pos

    def tokenize(self, text):
        if self._pos:
            return [
                token.surface if token.genkei == "*" else token.genkei
                for token in self._analyzer.parse(text).tokens
                if token.pos in self._pos
            ]
        else:
            return [
                token.surface if token.genkei == "*" else token.genkei
                for token in self._analyzer.parse(text).tokens
            ]
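# A minimal usage sketch for the tokenizer above (hypothetical example, not from the
# original source): assuming a working CaboCha/MeCab install, restricting pos to nouns
# and verbs should yield base forms (genkei), falling back to the surface form.
tokenizer = CaboChaBasicTokenizer(pos=["名詞", "動詞"])
print(tokenizer.tokenize("美味しいケーキを食べた"))  # e.g. ['ケーキ', '食べる']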
class POSLocatorTests(unittest.TestCase):
    analyzer = CaboChaAnalyzer()

    def test_can_find_a_particle(self):
        tree = self.analyzer.parse("今日は石山さんと一緒に語るのは結構時間が掛かりました")
        jikan_ga = tree.chunks[5]
        particles = POSLocator.locate_particle(jikan_ga)
        self.assertEqual(particles[0].surface, "が")

    def test_can_find_a_particle_combo(self):
        tree = self.analyzer.parse("僕には無理です")
        boku_wa = tree.chunks[0]
        particles = POSLocator.locate_particle(boku_wa)
        self.assertEqual(particles[0].surface, "に")
        self.assertEqual(particles[1].surface, "は")

    def test_can_find_a_noun(self):
        tree = self.analyzer.parse("今日は果物を買います")
        kudamono_wo = tree.chunks[1]
        nouns = POSLocator.locate_noun(kudamono_wo)
        self.assertEqual(nouns[0].surface, "果物")

    def test_can_find_an_adv(self):
        tree = self.analyzer.parse("ゆっくり歩く")
        yukkuri = tree.chunks[0]
        adverbs = POSLocator.locate_adverb(yukkuri)
        self.assertEqual(adverbs[0].surface, "ゆっくり")

    def test_can_find_an_adj(self):
        tree = self.analyzer.parse("美味しいケーキを食べる")
        oishii = tree.chunks[0]
        adjectives = POSLocator.locate_adjective(oishii)
        self.assertEqual(adjectives[0].surface, "美味しい")

    def test_can_find_suru_verb(self):
        tree = self.analyzer.parse("ファイルを添付しました")
        tempu_shimashita = tree.chunks[1]
        verbs = POSLocator.locate_verb(tempu_shimashita)
        self.assertEqual(verbs[0].feature_list[6], "する")
        self.assertEqual(verbs[1].surface, "添付")
def tweet2insta(content, entities, hashtags, translate):
    # TODO: attach hashtags split at bunsetsu (phrase) boundaries
    # Filtering: keep proper nouns, and keep hashtags that mention celebrities
    # Also add generic "safe bet" hashtags such as 写真好きとつながりたい
    toriaezu_hash = [
        '#写真好きとつながりたい', '#love', '#instagood', '#happy', '#new',
        '#photo', '#instalike', '#photooftheday', '#like4like', '#l4l'
    ]
    hashtag = []
    tree = CaboChaAnalyzer().parse(content)

    # Build hashtag candidates from each chunk
    for chunk in tree:
        _chunk = ''
        for token in chunk.tokens:
            if token.pos not in ['助詞', '助動詞']:
                _chunk += token.genkei if token.genkei != '*' else token.surface
            else:
                break
        hashtag.append(_chunk)

    # Drop candidates that do not look like proper nouns
    _hashtags1 = []
    for _hashtag in hashtag:
        if _hashtag in entities:
            _hashtags1.append(_hashtag)
    _hashtags2 = []
    for _hashtag in hashtag:
        if _hashtag in hashtags:
            _hashtags2.append(_hashtag)
    eng_tag = []
    for _h in hashtag:
        if _h in translate:
            eng_tag += translate[_h]

    hashtag = set(_hashtags1) | set(_hashtags2) | set(eng_tag)
    _toriaezu = random.sample(toriaezu_hash, k=4)
    hashtag = ['#%s' % s for s in hashtag] + _toriaezu
    return ' '.join(hashtag)
def __init__(self):
    self._model = None
    self.size = 50
    self.analyzer = CaboChaAnalyzer()
class WordEmbeddings(object):
    def __init__(self):
        self._model = None
        self.size = 50
        self.analyzer = CaboChaAnalyzer()

    def train_word_embeddings(self, sentences, save_path, **params):
        # Training seems to fail when params is empty
        model = gensim.models.Word2Vec(sentences, **params)
        model.save(save_path)
        self._model = model

    def load_word_embeddings(self, path):
        model = gensim.models.word2vec.Word2Vec.load(path)
        self._model = model

    def get_word_vector(self, term):
        try:
            vector = self._model.wv[term]
        except KeyError:
            raise KeyError("Term doesn't exist.")
        return vector

    def get_vec(self, text):
        mt = MeCab.Tagger('')
        mt.parse('')
        sum_vec = np.zeros(self.size)
        word_count = 0
        node = mt.parseToNode(text)
        while node:
            fields = node.feature.split(",")
            # Restrict to nouns, verbs and adjectives
            if fields[0] == '名詞' or fields[0] == '動詞' or fields[0] == '形容詞':
                try:
                    sum_vec += self._model.wv[node.surface]
                except:
                    pass
                word_count += 1
            node = node.next
        return sum_vec / word_count

    def get_vector(self, text):
        if text == '' or text is None:
            return np.zeros(self.size)
        tree = self.analyzer.parse(text)
        sum_vec = np.zeros(self.size)
        word_count = 0
        for chunk in tree:
            for token in chunk:
                if token.pos == '名詞' or token.pos == '動詞' or token.pos == '形容詞':
                    try:
                        sum_vec += self._model.wv[token.surface]
                    except:
                        pass
                    word_count += 1
        return sum_vec / word_count

    def get_vectors(self, text_array):
        return np.sum(self.get_vector(text) for text in text_array)

    def cos_sim(self, v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    def get_most_similar_word(self, word):
        try:
            results = self._model.wv.most_similar(positive=[word])
            return [word for word, result in results]
        except:
            return []
import cabocha
from cabocha.analyzer import CaboChaAnalyzer

analyzer = CaboChaAnalyzer()
tree = analyzer.parse("日本語の形態素解析はすごいです。")
for chunk in tree:
    for token in chunk:
        print(token)
from cabocha.analyzer import CaboChaAnalyzer

analyzer = CaboChaAnalyzer()
tree = analyzer.parse(
    "僕は短文の付箋を作ることとか、長文で書いてしまったものを短く刻むことに慣れてるのだけど、世の中の人は慣れてないから長文のまま入れてしまって「字が小さすぎて読めない付箋」を作っちゃうよね"
)

# for chunk in tree:
#     for token in chunk:
#         print(token)

start = 0
while start < tree.chunk_size:
    i = start
    result = [tree[i].surface]
    while True:
        if tree[i].next_link_id == i + 1:
            result.append(tree[i + 1].surface)
            i += 1
        else:
            break
    print(start, result, tree[i].next_link_id)
    start = i + 1
def __init__(self, pos=None):
    self._analyzer = CaboChaAnalyzer()
    self._pos = pos
class CollocationGeneratorSpec(unittest.TestCase):
    analyzer = CaboChaAnalyzer()

    def test_can_build_noun_particle_verb_collocation_with_a_verb(self):
        tree = self.analyzer.parse("朝から日本語を勉強する")
        benkyo_suru = tree[2]
        collocations = CollocationGenerator.build_noun_particle_verb_collocations(benkyo_suru)
        self.assertEqual(collocations[0].np[0].surface, "朝")
        self.assertEqual(collocations[0].pp[0].surface, "から")
        self.assertEqual(collocations[0].vp[1].surface, "勉強")
        self.assertEqual(collocations[0].vp[0].surface, "する")
        self.assertEqual(collocations[1].np[0].surface, "日本語")
        self.assertEqual(collocations[1].pp[0].surface, "を")
        self.assertEqual(collocations[1].vp[1].surface, "勉強")
        self.assertEqual(collocations[1].vp[0].surface, "する")

    def test_can_build_adverb_verb_collocation_with_verb(self):
        tree = self.analyzer.parse("徐々に進んでいる")
        susundeiru = tree[1]
        collocations = CollocationGenerator.build_adverb_verb_collocations(susundeiru)
        self.assertEqual(collocations[0].vp[0].feature_list[6], "進む")
        self.assertEqual(collocations[0].ap[0].surface, "徐々に")

    def test_can_build_adjective_noun_collocation_with_noun(self):
        tree = self.analyzer.parse("美味しいケーキを食べる")
        keekiwo = tree[1]
        collocations = CollocationGenerator.build_adjective_noun_collocations(keekiwo)
        self.assertEqual(collocations[0].adjp[0].surface, "美味しい")
        self.assertEqual(collocations[0].np[0].surface, "ケーキ")

    def test_only_builds_complete_npv_collocation(self):
        tree = self.analyzer.parse("めっちゃ食べる")
        taberu = tree[1]
        incomplete_npv = CollocationGenerator.build_noun_particle_verb_collocations(taberu)
        self.assertEqual(len(incomplete_npv), 0)

    def test_only_builds_complete_advv_collocation(self):
        tree = self.analyzer.parse("ケーキを食べる")
        taberu = tree[1]
        incomplete_advv = CollocationGenerator.build_adverb_verb_collocations(taberu)
        self.assertEqual(len(incomplete_advv), 0)

    def test_only_builds_complete_adjn_collocation(self):
        tree = self.analyzer.parse("昨日食べた美味しいたこ焼きは最高でした")
        saikou = tree[4]
        incomplete_adjn = CollocationGenerator.build_adjective_noun_collocations(saikou)
        print(incomplete_adjn)
        self.assertEqual(len(incomplete_adjn), 0)
import random
import sys

from cabocha.analyzer import CaboChaAnalyzer

analyzer = CaboChaAnalyzer()

B = {'ば', 'び', 'ぶ', 'べ', 'ぼ', 'バ', 'ビ', 'ブ', 'ベ', 'ボ'}
K = {'か', 'き', 'く', 'け', 'こ', 'カ', 'キ', 'ク', 'ケ', 'コ'}


class BKB:
    def __init__(self, path):
        self.path = path
        with open(path) as f:
            text = f.read()
        self.text = text

    def extract_bkb(self):
        bs = []
        ks = []
        tree = analyzer.parse(self.text)
        for chunk in tree:
            word = chunk.surface[0] if chunk.surface[0] != ' ' else chunk.surface[1]
            if word in B:
                bs.append(chunk.surface)
            if word in K:
                ks.append(chunk.surface)
        return [bs, ks]

    def hiiya_sususu(self, b, k):
        if len(b) == 0 or len(k) == 0:
            print('\nヒイァ...\n')
            return None
        print(f'\n{random.choice(b)}', end=' ' * 4)