def quickAnalyze():
    # Load the user data from the configuration and open a DB session
    u = LoadUserData(conf_path)
    dbSession = model.startSession(u)
    table, footer = readReplyTable.read(exec_path + "/common/replyTable.json")
    regexes = makeRegexes(table)
    # Fetch the tweets accumulated since the last update (not yet analyzed)
    q = dbSession.query(model.Tweet)
    tq = q.filter(model.Tweet.isAnalyze == 0)[:10000]
    for t in tq:  # one tweet at a time
        t.text = RemoveCharacter(t.text)
        analyzeReply2(t, dbSession, table, regexes)
        t.isAnalyze = 1
        t_enc = t.text.encode(g_mecabencode, 'ignore')
        sarray = mecab.sparse_all(t_enc, mecabPath).split("\n")
        sarray2 = connectUnderScore(sarray)  # currently unused below
        markovWordList, topNWordList = TakeWordList(sarray)
        # Accumulate the nouns seen recently
        for tn in topNWordList:
            hot = model.Hot()
            hot.word = unicode(tn, g_systemencode)
            dbSession.add(hot)
        dbSession.add(t)
    dbSession.commit()
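# The Hot rows written above can later be ranked to find trending nouns.
# A minimal sketch, assuming model.Hot is the SQLAlchemy model used in
# quickAnalyze() and dbSession an open session (names from above); the
# function itself is hypothetical, not part of the original module:
def topHotWords(dbSession, n=10):
    from sqlalchemy import func
    # Count occurrences per word and return the n most frequent ones
    return dbSession.query(model.Hot.word, func.count(model.Hot.word)) \
                    .group_by(model.Hot.word) \
                    .order_by(func.count(model.Hot.word).desc())[:n]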
def sparse_sentence(self, s):
    s_sparse = \
        mecab.sparse_all(s.encode("utf-8"), mecab_path).split("\n")[:-2]
    candidate = set()
    for s2 in s_sparse:
        # At this point s2 should be a single morpheme line,
        # i.e. "word<TAB>part-of-speech,..." (probably)
        s3 = s2.decode("utf-8").split("\t")
        s4 = s3[1].split(",")
        # Skip symbols (記号), auxiliary verbs (助動詞) and particles (助詞);
        # once enough data has accumulated this should keep nouns only
        if s4[0] != u"記号" and s4[0] != u"助動詞" and s4[0] != u"助詞":
            candidate.add(s3[0])
    return candidate
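# Why the split output above is sliced with [:-2]: MeCab's default output
# is one "surface<TAB>feature,feature,..." line per morpheme, followed by
# an "EOS" line and a trailing newline (split("\n") turns the latter into
# an empty string). A minimal demo, assuming the standard MeCab Python
# binding (MeCab.Tagger) is installed; this is not the mecab.sparse_all
# wrapper used in this module:
def _mecab_format_demo():
    import MeCab
    tagger = MeCab.Tagger()
    lines = tagger.parse(u"すもももももももものうち".encode("utf-8")).split("\n")
    # lines[-2] == "EOS" and lines[-1] == "", hence the [:-2] slice
    for line in lines[:-2]:
        surface, feature = line.decode("utf-8").split("\t")
        print surface, feature.split(",")[0]  # word and its part of speech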
def sparse_sentence(s):
    s_sparse = \
        mecab.sparse_all(s.encode("utf-8"), "/usr/lib/libmecab.so.1").split("\n")[:-2]
    candidate = set()
    for s2 in s_sparse:
        # s2 is one morpheme line: "word<TAB>part-of-speech,..."
        s3 = s2.decode("utf-8").split("\t")
        s4 = s3[1].split(",")
        if s4[0] == u"名詞":  # keep nouns (名詞) only
            if not stopwords(s3[0]):
                candidate.add(s3[0])
    return candidate
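# The stopwords() predicate called above is defined elsewhere in the
# project; a minimal sketch of one plausible shape, with a purely
# illustrative word set (not the project's actual list):
_STOPWORD_SET = set([u"こと", u"もの", u"よう", u"さん"])

def stopwords(word):
    # True if the noun should be discarded as uninformative
    return word in _STOPWORD_SET or len(word) <= 1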