def text_to_word(text, conv):
    """Build a Word whose spelling is *text* and whose reading is normalised.

    Args:
        text: The surface form; stored unchanged in ``word.kaki``.
        conv: Converter object exposing ``do(str) -> str``; its output is
            used as the raw reading before normalisation.

    Returns:
        A new ``Word`` with ``kaki`` and ``yomi`` set. The pair is also
        printed to stdout.
    """
    word = Word()
    word.kaki = text
    word.yomi = _normalize_yomi(conv.do(text))
    print(f"kaki: {word.kaki}, yomi: {word.yomi}")
    return word


def _normalize_yomi(yomi):
    """Reduce a raw reading to hiragana plus the marks 、 。 ー."""
    # Collapse any character repeated three or more times in a row into a
    # single occurrence.
    # ex. おはよ!!! → おはよ!
    yomi = re.sub(r'(.)\1{2,}', r'\1', yomi)
    # Ignore everything from the first opening bracket onwards.
    # Full-width parentheses (U+FF08 / U+FF09) are written as escapes so they
    # are handled alongside the ASCII ones and cannot be silently folded into
    # ASCII by text normalisation of this file.
    # ex. ちょん↑ぱぁ!(しょうり) → ちょん↑ぱぁ!
    yomi = re.sub(r'^([^()\uFF08\uFF09「」]+)[(\uFF08「].*$', r'\1', yomi)
    # Keep only hiragana and a few punctuation marks.
    # ex. ちょん↑ぱぁ! → ちょんぱぁ
    return "".join(re.findall(r'[ぁ-ん、。ー]+', yomi))
def main():
    """Fetch ACCOUNT's recent tweets and store their words and the tweets.

    For every status returned by ``api.user_timeline`` the text is passed
    through ``filter_text`` together with the first hashtag (or ``""`` when
    there is none). Each line accepted by ``find_word`` is converted with the
    kakasi converter and stored via ``find_or_add_word``; the filtered tweet
    itself is stored via ``find_or_add_tweet``.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    api = tweepy.API(auth)

    # NOTE(review): the 'K'->'H' and 'J'->'H' modes presumably convert
    # katakana and kanji to hiragana -- confirm against the kakasi library
    # version in use.
    kakasi = kks()
    kakasi.setMode('K', 'H')
    kakasi.setMode('J', 'H')
    conv = kakasi.getConverter()

    results = api.user_timeline(screen_name=ACCOUNT, count=COUNT)
    for status in results:
        hashtag_tags = status.entities['hashtags']
        hashtag = hashtag_tags[0]['text'] if hashtag_tags else ""

        text = filter_text(status.text, hashtag)
        if not text:
            # Nothing left after filtering: skip this tweet. The original
            # guard was a no-op `pass`, so an empty/None text fell through
            # to splitlines() and the tweet was stored regardless.
            continue

        for line in text.splitlines():
            if not find_word(line):
                continue
            word = Word()
            word.kaki = line
            word.yomi = conv.do(line)
            print(f"kaki: {word.kaki}, yomi: {word.yomi}")
            find_or_add_word(session, word)

        tweet = Tweet()
        tweet.twitterId = status.id
        tweet.text = text
        find_or_add_tweet(session, tweet)
import unittest

from analysis import save_word_from_tweet
from models import find_or_add_tweet, Word, Tweet, session

# NOTE: Running this file directly fails with an ImportError, so the only
# way to use it is to paste it into an interpreter session.

# Each entry: (tweet text, hashtags, expected yomi, expected kaki,
# twitterId offset).
sample = [
    ("チョン↑パァ!(勝利)", [], "ちょんぱぁ", "チョン↑パァ!(勝利)", 10),
    ("ばにしぇだよ〜wwwww", [], "ばにしぇだよ", "ばにしぇだよ〜wwwww", 20),
    ("任せてほ↑しい", [], "まかせてほしい", "任せてほ↑しい", 30),
]

for text, hashtags, yomi, kaki, n in sample:
    # Store five tweets carrying the same text under distinct twitterIds
    # (200 + offset + 1..5).
    for i in range(1, 6):
        tweet = Tweet()
        tweet.text = text
        tweet.twitterId = 200 + n + i
        find_or_add_tweet(session, tweet)

    # Hand-written expectation taken from the sample table.
    expected = Word()
    expected.yomi = yomi
    expected.kaki = kaki

    generated = save_word_from_tweet(text, hashtags)

    # The labels now match their values: "generated" is what
    # save_word_from_tweet returned, "expected" is the sample entry.
    print("generated: ", generated.yomi, " => ", generated.kaki)
    print("expected: ", expected.yomi, " => ", expected.kaki)
    print(expected.yomi == generated.yomi and expected.kaki == generated.kaki)