Example #1
    def parse_with_cabocha(self, text):
        from cabocha.analyzer import CaboChaAnalyzer

        analyzer = CaboChaAnalyzer()
        tree = analyzer.parse(text)
        words = []
        for chunk in tree:
            for token in chunk:
                # print(token, token.pos)
                words.append(token.surface)
        return words
Example #2
    def Tokenizer(self, request, context):
        metadata = dict(context.invocation_metadata())
        print(metadata)

        text = request.text
        print(".. analyse ", text)

        analyzer = CaboChaAnalyzer()
        tree = analyzer.parse(text)
        msg_chunks = nlp_messages.NlCabochaChunks()
        chunks = []
        for chunk in tree:
            msg_chunk = nlp_messages.NlCabochaChunk()
            msg_chunk.id = chunk.id
            if chunk.additional_info is not None:
                msg_chunk.additional_info = chunk.additional_info
            msg_chunk.feature_list.extend(chunk.feature_list)
            msg_chunk.func_pos = chunk.func_pos
            msg_chunk.head_pos = chunk.head_pos
            msg_chunk.link = chunk.link
            msg_chunk.score = chunk.score
            msg_chunk.token_pos = chunk.token_pos
            msg_chunk.next_link_id = chunk.next_link_id
            msg_chunk.prev_link_ids.extend(chunk.prev_link_ids)

            words = []
            for token in chunk:
                # print(token, token.pos)
                word = nlp_messages.NlCabochaToken(
                    surface=token.surface,
                    id=token.id,
                    additional_info=token.additional_info,
                    feature_list=token.feature_list,
                    ne=token.ne,
                    normalized_surface=token.normalized_surface,
                    pos=token.pos,
                    pos1=token.pos1,
                    pos2=token.pos2,
                    pos3=token.pos3,
                    ctype=token.ctype,
                    cform=token.cform,
                    genkei=token.genkei,
                    yomi=token.yomi)
                words.append(word)
            msg_chunk.tokens.extend(words)
            chunks.append(msg_chunk)

        msg_chunks.chunks.extend(chunks)
        return msg_chunks
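For context, a call against a servicer like this could look as follows. This is only a sketch: the generated stub class NlpServiceStub, the module nlp_service_pb2_grpc, the request message NlText, and the channel address are hypothetical stand-ins, since the .proto definitions are not part of the example.

import grpc

channel = grpc.insecure_channel('localhost:50051')           # hypothetical address
stub = nlp_service_pb2_grpc.NlpServiceStub(channel)          # hypothetical generated stub
request = nlp_messages.NlText(text='日本語の形態素解析はすごいです。')  # hypothetical message type
response = stub.Tokenizer(request, metadata=[('client', 'example')])
for chunk in response.chunks:
    print([token.surface for token in chunk.tokens])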
Example #3
    def create_parts(self, sentence, romas):
        func = "_noname_"
        analyzer = CaboChaAnalyzer()
        tree = analyzer.parse(sentence)
        items = []
        main_part = 0
        for chunk in tree:
            for token in chunk:
                # the second-to-last feature field is the reading (yomi)
                kan = token.feature.split(',')[-2]
                if kan == '*':
                    kan = token.surface
                romas.append(romkan.to_roma(kan))
            if chunk.link == -1:
                # the chunk with no outgoing link is the sentence head
                main_part = chunk.id
                func = self.get_first_token(chunk)
        for chunk in tree:
            curword = chunk.tokens[0].surface
            curfeature = chunk.tokens[0].feature
            feat = self.analyse_feature(curfeature)
            if feat == '@num' or feat == '@n':
                curword = self.join_tokens(chunk)
            elif feat == '@nc':
                curword = self.join_nc_tokens(chunk)
            elif feat == '@v':
                parts = curfeature.split(',')
                raw = parts[-3]
                if raw != '*':
                    curword = raw

            ## main part
            if chunk.link == -1:
                prefix = ""
                if feat == '@v':
                    prefix = "act:"
                elif feat == '@adj':
                    prefix = "desc:"
                elif feat == '@n':
                    prefix = "prop:"
                items.append(prefix + "*" + curword + feat)
            elif chunk.link == main_part:
                items.append(self.get_prefix(chunk) + "+" + curword + feat)
            else:
                items.append("." + curword + feat)
        result = func + '(' + ", ".join(items) + ')'
        return result
Example #4
class CaboChaBasicTokenizer:
    """CaboCha による原型トークナイザ。
    pos がセットされた場合は、pos であるトークンに制限する"""
    def __init__(self, pos=None):
        self._analyzer = CaboChaAnalyzer()
        self._pos = pos

    def tokenize(self, text):
        if self._pos:
            return [
                token.surface if token.genkei == "*" else token.genkei
                for token in self._analyzer.parse(text).tokens
                if token.pos in self._pos
            ]
        else:
            return [
                token.surface if token.genkei == "*" else token.genkei
                for token in self._analyzer.parse(text).tokens
            ]
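A minimal usage sketch for the tokenizer above. The POS label '名詞' assumes an IPADIC-style dictionary, and the exact tokens returned depend on the installed CaboCha/MeCab models.

# Restrict output to nouns, returned in base form where available.
tokenizer = CaboChaBasicTokenizer(pos=['名詞'])
print(tokenizer.tokenize('美味しいケーキを食べる'))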
Example #5
class POSLocatorTests(unittest.TestCase):
    analyzer = CaboChaAnalyzer()

    def test_can_find_a_particle(self):
        tree = self.analyzer.parse("今日は石山さんと一緒に語るのは結構時間が掛かりました")
        jikan_ga = tree.chunks[5]
        particles = POSLocator.locate_particle(jikan_ga)
        self.assertEqual(particles[0].surface, "が")

    def test_can_find_a_particle_combo(self):
        tree = self.analyzer.parse("僕には無理です")
        boku_wa = tree.chunks[0]
        particles = POSLocator.locate_particle(boku_wa)
        self.assertEqual(particles[0].surface, "に")
        self.assertEqual(particles[1].surface, "は")

    def test_can_find_a_noun(self):
        tree = self.analyzer.parse("今日は果物を買います")
        kudamono_wo = tree.chunks[1]
        nouns = POSLocator.locate_noun(kudamono_wo)
        self.assertEqual(nouns[0].surface, "果物")

    def test_can_find_an_adv(self):
        tree = self.analyzer.parse("ゆっくり歩く")
        yukkuri = tree.chunks[0]
        adverbs = POSLocator.locate_adverb(yukkuri)
        self.assertEqual(adverbs[0].surface, "ゆっくり")

    def test_can_find_an_adj(self):
        tree = self.analyzer.parse("美味しいケーキを食べる")
        oishii = tree.chunks[0]
        adjectives = POSLocator.locate_adjective(oishii)
        self.assertEqual(adjectives[0].surface, "美味しい")

    def test_can_find_suru_verb(self):
        tree = self.analyzer.parse("ファイルを添付しました")
        tempu_shimashita = tree.chunks[1]
        verbs = POSLocator.locate_verb(tempu_shimashita)
        self.assertEqual(verbs[0].feature_list[6], "する")
        self.assertEqual(verbs[1].surface, "添付")
Example #6
File: main.py Project: KAWAYu/Tweet2Insta
def tweet2insta(content, entities, hashtags, translate):
    # TODO: attach hashtags at bunsetsu (chunk) boundaries
    # Filtering: keep proper nouns, and keep a tag if it matches a known celebrity hashtag
    # Also attach generic catch-all hashtags such as '写真好きとつながりたい'
    toriaezu_hash = [
        '#写真好きとつながりたい', '#love', '#instagood', '#happy', '#new', '#photo',
        '#instalike', '#photooftheday', '#like4like', '#l4l'
    ]
    hashtag = []
    tree = CaboChaAnalyzer().parse(content)
    # build tag candidates from the chunks
    for chunk in tree:
        _chunk = ''
        for token in chunk.tokens:
            if token.pos not in ['助詞', '助動詞']:
                _chunk += token.genkei if token.genkei != '*' else token.surface
            else:
                break
        hashtag.append(_chunk)
    # drop candidates that do not look like proper nouns
    _hashtags1 = []
    for _hashtag in hashtag:
        if _hashtag in entities:
            _hashtags1.append(_hashtag)
    _hashtags2 = []
    for _hashtag in hashtag:
        if _hashtag in hashtags:
            _hashtags2.append(_hashtag)
    eng_tag = []
    for _h in hashtag:
        if _h in translate:
            eng_tag += translate[_h]

    hashtag = set(_hashtags1) | set(_hashtags2) | set(eng_tag)

    _toriaezu = random.sample(toriaezu_hash, k=4)

    hashtag = ['#%s' % s for s in hashtag] + _toriaezu
    return ' '.join(hashtag)
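A hypothetical invocation of tweet2insta; the entities, hashtags, and translate arguments are toy stand-ins for data that would normally come from tweet metadata and a Japanese-to-English tag table.

tags = tweet2insta(
    '美味しいケーキを食べた',        # tweet text
    entities=['ケーキ'],             # hypothetical entity list
    hashtags=[],                     # known celebrity hashtags (none here)
    translate={'ケーキ': ['cake']},  # hypothetical JP -> EN tag table
)
print(tags)  # e.g. '#ケーキ #cake' plus four random generic hashtags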
Example #7
    def __init__(self):
        self._model = None
        self.size = 50
        self.analyzer = CaboChaAnalyzer()
Example #8
class WordEmbeddings(object):
    def __init__(self):
        self._model = None
        self.size = 50
        self.analyzer = CaboChaAnalyzer()

    def train_word_embeddings(self, sentences, save_path,
                              **params):  # seems to raise an error when params is empty
        model = gensim.models.Word2Vec(sentences, **params)
        model.save(save_path)
        self._model = model

    def load_word_embeddings(self, path):
        model = gensim.models.word2vec.Word2Vec.load(path)
        self._model = model

    def get_word_vector(self, term):
        try:
            vector = self._model.wv[term]
        except KeyError:
            raise KeyError("Term doesn't exists.")
        return vector

    def get_vec(self, text):
        mt = MeCab.Tagger('')
        mt.parse('')
        sum_vec = np.zeros(self.size)
        word_count = 0
        node = mt.parseToNode(text)
        while node:
            fields = node.feature.split(",")
            # restrict to content words: nouns, verbs, adjectives
            if fields[0] == '名詞' or fields[0] == '動詞' or fields[0] == '形容詞':
                try:
                    sum_vec += self._model.wv[node.surface]
                except KeyError:
                    pass
                word_count += 1
            node = node.next
        # guard against division by zero when no content word was found
        if word_count == 0:
            return sum_vec
        return sum_vec / word_count

    def get_vector(self, text):
        if text == '' or text is None:
            return np.zeros(self.size)
        tree = self.analyzer.parse(text)
        sum_vec = np.zeros(self.size)
        word_count = 0
        for chunk in tree:
            for token in chunk:
                if token.pos == '名詞' or token.pos == '動詞' or token.pos == '形容詞':
                    try:
                        sum_vec += self._model.wv[token.surface]
                    except KeyError:
                        pass
                    word_count += 1
        # guard against division by zero when no content word was found
        if word_count == 0:
            return sum_vec
        return sum_vec / word_count

    def get_vectors(self, text_array):
        # np.sum over a bare generator is deprecated; sum the vectors element-wise
        return np.sum([self.get_vector(text) for text in text_array], axis=0)

    def cos_sim(self, v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    def get_most_similar_word(self, word):
        try:
            results = self._model.wv.most_similar(positive=[word])
            return [word for word, result in results]
        except KeyError:
            return []
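A usage sketch for the WordEmbeddings class above, on a toy corpus. The keyword vector_size is an assumption for gensim 4.x (older releases called it size), and the save path is arbitrary.

we = WordEmbeddings()
corpus = [['ケーキ', 'を', '食べる'], ['日本語', 'を', '勉強', 'する']]
we.train_word_embeddings(corpus, 'w2v.model', vector_size=we.size, min_count=1)
vec = we.get_vector('ケーキを食べる')  # averaged over content words
print(vec.shape)  # (50,)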
Example #9
from cabocha.analyzer import CaboChaAnalyzer
analyzer = CaboChaAnalyzer()
tree = analyzer.parse("日本語の形態素解析はすごいです。")
for chunk in tree:
    for token in chunk:
        print(token)
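The wrapped tokens carry more than their printable form; a short sketch using the attribute names seen elsewhere in these examples (surface, pos, genkei):

for chunk in tree:
    for token in chunk:
        # surface form, part of speech, and base form ('*' when unavailable)
        print(token.surface, token.pos, token.genkei)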
Example #10
from cabocha.analyzer import CaboChaAnalyzer
analyzer = CaboChaAnalyzer()
tree = analyzer.parse(
    "僕は短文の付箋を作ることとか、長文で書いてしまったものを短く刻むことに慣れてるのだけど、世の中の人は慣れてないから長文のまま入れてしまって「字が小さすぎて読めない付箋」を作っちゃうよね"
)
# for chunk in tree:
#     for token in chunk:
#         print(token)

# Group runs of consecutive chunks where each chunk links to the next one,
# then print each run together with the id its last chunk links to.
start = 0
while start < tree.chunk_size:
    i = start
    result = [tree[i].surface]
    while True:
        if tree[i].next_link_id == i + 1:
            result.append(tree[i + 1].surface)
            i += 1
        else:
            break
    print(start, result, tree[i].next_link_id)
    start = i + 1
Example #11
    def __init__(self, pos=None):
        self._analyzer = CaboChaAnalyzer()
        self._pos = pos
Example #12
class CollocationGeneratorSpec(unittest.TestCase):
    analyzer = CaboChaAnalyzer()

    def test_can_build_noun_particle_verb_collocation_with_a_verb(self):
        tree = self.analyzer.parse("朝から日本語を勉強する")
        benkyo_suru = tree[2]

        collocations = CollocationGenerator.build_noun_particle_verb_collocations(benkyo_suru)

        self.assertEqual(collocations[0].np[0].surface, "朝")
        self.assertEqual(collocations[0].pp[0].surface, "から")
        self.assertEqual(collocations[0].vp[1].surface, "勉強")
        self.assertEqual(collocations[0].vp[0].surface, "する")

        self.assertEqual(collocations[1].np[0].surface, "日本語")
        self.assertEqual(collocations[1].pp[0].surface, "を")
        self.assertEqual(collocations[1].vp[1].surface, "勉強")
        self.assertEqual(collocations[1].vp[0].surface, "する")

    def test_can_build_adverb_verb_collocation_with_verb(self):
        tree = self.analyzer.parse("徐々に進んでいる")
        susundeiru = tree[1]

        collocations = CollocationGenerator.build_adverb_verb_collocations(susundeiru)

        self.assertEqual(collocations[0].vp[0].feature_list[6], "進む")
        self.assertEqual(collocations[0].ap[0].surface, "徐々に")

    def test_can_build_adjective_noun_collocation_with_noun(self):
        tree = self.analyzer.parse("美味しいケーキを食べる")
        keekiwo = tree[1]

        collocations = CollocationGenerator.build_adjective_noun_collocations(keekiwo)

        self.assertEqual(collocations[0].adjp[0].surface, "美味しい")
        self.assertEqual(collocations[0].np[0].surface, "ケーキ")

    def test_only_builds_complete_npv_collocation(self):
        tree = self.analyzer.parse("めっちゃ食べる")
        taberu = tree[1]

        incomplete_npv = CollocationGenerator.build_noun_particle_verb_collocations(taberu)

        self.assertEqual(len(incomplete_npv), 0)

    def test_only_builds_complete_advv_collocation(self):
        tree = self.analyzer.parse("ケーキを食べる")
        taberu = tree[1]

        incomplete_advv = CollocationGenerator.build_adverb_verb_collocations(taberu)

        self.assertEqual(len(incomplete_advv), 0)

    def test_only_builds_complete_adjn_collocation(self):
        tree = self.analyzer.parse("昨日食べた美味しいたこ焼きは最高でした")
        saikou = tree[4]

        incomplete_adjn = CollocationGenerator.build_adjective_noun_collocations(saikou)
        print(incomplete_adjn)

        self.assertEqual(len(incomplete_adjn), 0)
Example #13
import random
import sys

from cabocha.analyzer import CaboChaAnalyzer

analyzer = CaboChaAnalyzer()
B = {'ば', 'び', 'ぶ', 'べ', 'ぼ', 'バ', 'ビ', 'ブ', 'ベ', 'ボ'}
K = {'か', 'き', 'く', 'け', 'こ', 'カ', 'キ', 'ク', 'ケ', 'コ'}

class BKB:
    def __init__(self, path):
        self.path = path
        with open(path) as f:
            text = f.read()
        self.text = text
    
    def extract_bkb(self):
        # collect chunks whose first non-space character is a B- or K-row kana
        bs = []
        ks = []
        tree = analyzer.parse(self.text)
        for chunk in tree:
            word = chunk.surface[0] if chunk.surface[0] != ' ' else chunk.surface[1]
            if word in B: bs.append(chunk.surface)
            if word in K: ks.append(chunk.surface)
        return [bs, ks]
    
    def hiiya_sususu(self, b, k):
        if len(b) == 0 or len(k) == 0:
            print('\nヒイァ...\n')
            return None
        print(f'\n{random.choice(b)}', end=' '*4)