def main():
    # Use TF-IDF to pick words that can serve as each document's topic
    text_list, paper_list = TFIDF.load_text_list()
    top7_list = TFIDF.create_top7(text_list)
    ft_model = FastText.load('./fasttext/eng_ft')  # load the FastText model

    # User input
    user_input = input('▶Enter keywords: ').lower().split()
    user_num = int(input('▶Enter the number of papers to recommend: '))
    doc_link = extract_simdoc_list(ft_model, top7_list, paper_list, user_input, user_num)
    user_inter = Userinter.craw(doc_link)

    while True:
        print('\n[Recommended papers]')
        for idx, paper in enumerate(user_inter, start=1):
            print(idx, paper[0])
        user_pick = int(input('\n▶Enter the number of the paper you want: '))
        doc_show(user_inter[user_pick - 1])
        choice = int(input('▶Enter 0 to see another recommended paper, 1 to enter new keywords: '))
        if choice == 0:
            continue
        # Re-prompt for keywords and rebuild the recommendation list
        user_input = input('▶Enter keywords: ').lower().split()
        user_num = int(input('▶Enter the number of papers to recommend: '))
        doc_link = extract_simdoc_list(ft_model, top7_list, paper_list, user_input, user_num)
        user_inter = Userinter.craw(doc_link)
def Edmundson(result, answer):
    # result: machine-generated summary; answer: reference summary.
    # Co-selection score: the fraction of reference sentences that also
    # appear in the machine summary.
    r_s = set(TFIDF.cut_by_sentence(result))
    r_a = set(TFIDF.cut_by_sentence(answer))
    share = r_s & r_a
    return len(share) / len(r_a)
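# A minimal stand-in for the sentence splitter this co-selection metric
# depends on. The real TFIDF.cut_by_sentence is not shown here; this sketch
# assumes it splits on common Chinese/English end-of-sentence punctuation.
import re

def cut_by_sentence(text):
    """Split text into sentences on 。!?!? and newlines (a sketch)."""
    parts = re.split(r'[。!?!?\n]+', text)
    return [p.strip() for p in parts if p.strip()]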
def TFIDF(df, word_list):
    '''
    Build a TF-IDF matrix with the TF/IDF helpers defined earlier.
    Note: the `df` argument is ignored; the data is re-read from
    "classification_simple_test.csv".
    :return: the TF-IDF matrix (documents x vocabulary)
    '''
    df = read_file("classification_simple_test.csv", [0])
    data = [a[0].split() for a in np.array(df).tolist()]
    IDF_list = T.IDF(data, word_list)  # per-word IDF values
    TFIDF_met = np.empty(shape=(len(data), len(word_list)))
    for i, record in enumerate(data):
        TF_dic = T.TF(record)
        for index, word in enumerate(word_list):
            TFIDF_met[i][index] = TF_dic.get(word, 0) * IDF_list[index]
    return TFIDF_met
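# Hedged sketches of the T.TF / T.IDF helpers the function above assumes
# (the real module T is not shown and may smooth differently): raw-count TF
# over one tokenized record, and IDF(w) = log(N / (1 + df(w))) over the corpus.
import math
from collections import Counter

def TF(record):
    """Term -> raw count for a single tokenized document."""
    return dict(Counter(record))

def IDF(data, word_list):
    """Smoothed IDF for each word in word_list over the corpus `data`."""
    N = len(data)
    doc_sets = [set(doc) for doc in data]
    return [math.log(N / (1 + sum(w in d for d in doc_sets))) for w in word_list]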
def dialog(s, isRandMetaS=True, isPrint=True, isLearn=False, n=5, tryCnt=10,
           needs=set(['名詞', '固有名詞', '動詞', '形容詞'])):
    keys = TFIDF.getKWs(s, threshold=50, n=n, length=1, isPrint=isPrint,
                        needs=needs, RandNum=5)
    isAssociate = False
    if keys[0] == '':
        return '...そうなんですね'
    if isAssociate:
        BA = associateAns(keys[0])
    else:
        # wordset = getSimilarWords(w=keys[0], cnt=tryCnt)
        BA = formTrigram(word=keys[0], isRandMetaS=isRandMetaS)
        # ansSims = {ans: TFIDF.cosSimilarity(ans, s) for ans in ANSs}
        # BA = sorted(ansSims.items(), reverse=True, key=lambda x: x[1])[0]
    if isLearn:
        TrigramCore(s, isLearn=True, isDebug=False)
    if isPrint:
        print('=> The automatically generated reply is:')
        print(BA)
    randnum = np.random.randint(10)
    BA = (BA.replace('<接尾>', 'さん')
            .replace('<地域>', 'アキバ')
            .replace('<数>', str(randnum)))
    return BA
def filterBazaakParshaReadTFIDF(parshaName, lang='heb', min_count=MIN_WORD_COUNT,
                                splitParshiot=None, min_distance=MIN_DISTANCE):
    if not splitParshiot:
        splitParshiot = Parshiot.createSplitParshiot(lang)
    topTFIDF = TFIDF.parshaIDF(parshaName, splitParshiot)
    totalWords = len(topTFIDF)
    # Convert PERCENT to a fraction and keep the top slice of words
    percent = PERCENT / 100
    topTFIDF = topTFIDF.most_common(int(totalWords * percent))
    # Just get the keys, i.e. the words
    topTFIDF = [i[0] for i in topTFIDF]
    parsha = splitParshiot[parshaName]
    read = BazaakRead(parsha, min_count, min_distance)
    # Keep only results whose key is in the top PERCENT% of TF-IDF scores
    newRead = {k: v for k, v in read.items() if k in topTFIDF}
    return newRead
def __init__(self, span, d, iterations, index):
    self.kspan = span  # length of the co-occurrence window
    self.d = d
    self.iteration = iterations
    # ---- TF-IDF initialization ---- #
    docs = dataset.read_sogou()
    for i, doc in enumerate(docs):
        docs[i] = TFIDF.cut_by_words(doc)
    data, self.VSM = TFIDF.create_VSM(docs)
    print("Create VSM Over!")
    # Compute the IDF, then the TF-IDF vector of the selected document
    self.idf = TFIDF.IDF(data, self.VSM)
    self.tfidf = TFIDF.TF_IDF(data[index], self.VSM, self.idf)
    self.keywords_doc = TFIDF.extract_keywords_tfidf(self.tfidf, self.VSM)
    print("Got the keywords from the doc!")
def process_documents():
    # Read documents
    documents = Utilities.read_from_time_all()
    #documents = read_lines()
    # Tokenize and stem documents
    documents = Utilities.tokenize_stem_docs(documents)
    # Calculate document lengths
    doc_len = Utilities.calculate_doc_len(documents)
    # Term frequency
    tf = TFIDF.term_frequency(documents)
    # TF-IDF
    tfidf = TFIDF.TFIDF(len(documents), tf)
    # Read queries
    queries = Utilities.read_from_time_que()
    #queries = ['pop love song', 'chinese american', 'city']
    queries = Utilities.tokenize_stem_docs(queries)
    #print Search.search_by_cosine(tfidf, len(documents), ['CARTOONISTS'.lower()])

    cosine_result = []
    rsv_result = []
    BM25_1_5 = []  # b=1, k=0.5
    BM25_1_1 = []  # b=1, k=1
    BM25_2_5 = []  # b=2, k=0.5
    BM25_2_1 = []  # b=2, k=1
    for query in queries:
        cosine_result.append(Search.search_by_cosine(tfidf, len(documents), query))
        rsv_result.append(Search.search_by_rsv(tf, len(documents), query))
        BM25_1_5.append(Search.search_by_BM25(tf, doc_len, query, 1.0, 0.5))
        BM25_1_1.append(Search.search_by_BM25(tf, doc_len, query, 1.0, 1.0))
        BM25_2_5.append(Search.search_by_BM25(tf, doc_len, query, 2.0, 0.5))
        BM25_2_1.append(Search.search_by_BM25(tf, doc_len, query, 2.0, 1.0))
    #print cosine_result[1]

    # Read relevance judgments from time.rel
    rel_dict = Utilities.read_from_time_rel()
    # Print the result table (limited to the top 20 hits per query)
    result = [('System', 'Precision', 'Recall', 'F1', 'MAP')]
    result.append(('cosine ',) + Metrics.getMetrics(cosine_result, rel_dict, 20))
    result.append(('RSV ',) + Metrics.getMetrics(rsv_result, rel_dict, 20))
    result.append(('BM25 (1, .5) ',) + Metrics.getMetrics(BM25_1_5, rel_dict, 20))
    result.append(('BM25 (1, 1) ',) + Metrics.getMetrics(BM25_1_1, rel_dict, 20))
    result.append(('BM25 (2, .5) ',) + Metrics.getMetrics(BM25_2_5, rel_dict, 20))
    result.append(('BM25 (2, 1) ',) + Metrics.getMetrics(BM25_2_1, rel_dict, 20))
    Utilities.tabulate(result)
    Utilities.plot_graph(result)
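# The (b, k) sweep above implies an Okapi-style scorer. This is a hedged
# sketch of such a BM25 function, not the actual Search.search_by_BM25
# (whose internals are not shown); doc_tf, df, and avg_len are assumed inputs.
import math

def bm25_score(query, doc_tf, doc_len, avg_len, N, df, b=0.75, k=1.2):
    """Okapi BM25 score of one document for a tokenized query.
    doc_tf: term -> count in the document; df: term -> document frequency."""
    score = 0.0
    for term in query:
        f = doc_tf.get(term, 0)
        if f == 0 or term not in df:
            continue
        idf = math.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1)
        score += idf * f * (k + 1) / (f + k * (1 - b + b * doc_len / avg_len))
    return score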
def textrank_words(self, sentence, size):
    '''Extract up to `size` keywords from a sentence with the TextRank
    algorithm.'''
    graph = Graph(self.d, self.iteration)
    words = TFIDF.cut_by_words(sentence)
    # Build the co-occurrence graph, skipping stopwords and symbols
    stopwords_set = TFIDF.get_stopwords()
    edge_weight = defaultdict(int)  # edge weights = co-occurrence counts
    for index_i, word in enumerate(words):
        if word in stopwords_set:
            continue
        for index in range(index_i + 1, index_i + self.kspan):
            if index >= len(words):  # window ran off the end of the sentence
                break
            if words[index] in stopwords_set:  # ignore stopwords
                continue
            # Add the edge; defaultdict starts each weight at 0
            edge_weight[(word, words[index])] += 1
    for key, value in edge_weight.items():
        graph.add_edge(key[0], key[1], value)
    res = graph.rank()
    if res == False:
        return False
    result = sorted(graph.WS.items(), key=itemgetter(1), reverse=True)
    result = join_result(result, sentence)
    if size > len(result):
        print("Requested more keywords than available; truncating.")
        size = len(result)
    return result[:size]
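# The Graph class used above is not shown. This is a hedged sketch of the
# weighted-PageRank iteration TextRank needs, with the usual update
# WS(u) = (1 - d) + d * sum_v [w(v,u) / sum_w w(v,w)] * WS(v).
from collections import defaultdict

class Graph:
    """Minimal undirected, weighted TextRank graph (a sketch)."""
    def __init__(self, d=0.85, iterations=200):
        self.d, self.iterations = d, iterations
        self.edges = defaultdict(dict)  # node -> {neighbor: weight}
        self.WS = {}                    # node -> TextRank score

    def add_edge(self, u, v, w):
        self.edges[u][v] = w  # store both directions: the graph is undirected
        self.edges[v][u] = w

    def rank(self):
        if not self.edges:
            return False
        self.WS = {n: 1.0 for n in self.edges}
        for _ in range(self.iterations):
            for u in self.edges:
                self.WS[u] = (1 - self.d) + self.d * sum(
                    w / sum(self.edges[v].values()) * self.WS[v]
                    for v, w in self.edges[u].items())
        return True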
def learnLang(sList):
    for i, s in enumerate(sList, start=1):
        print('++++++++++++++++++++++++++++++++++++++++++++++++++')
        print(i, s)
        try:
            trigram = TrigramCore(s, 1, 0)
            tfidf = TFIDF.TFIDF(s, i, True, 0)
        except Exception as e:
            print(e)
def calculate_idf_for_query(tfidf_dict, total_documents, query):
    """Calculate the IDF value of each query term: look up its document
    frequency in tfidf_dict and compute the IDF from it."""
    query_idf = defaultdict(lambda: -1)
    for word in query:
        if Utilities.is_key_present(tfidf_dict, word):
            # Query term is present in our dictionary
            word_idf = TFIDF.cal_idf(float(total_documents), len(tfidf_dict[word]))
            Utilities.add_to_dictinoary(query_idf, word, word_idf)
    return query_idf
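# Hedged sketch of the IDF helper assumed above; the real TFIDF.cal_idf may
# apply different smoothing. Here simply idf = log(N / df).
import math

def cal_idf(total_documents, doc_frequency):
    return math.log(total_documents / doc_frequency)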
def query(query, subSet, useFeatures, usePageRank, useTFIDF, useClustering,
          useRecommend, topN=100):
    results = {}
    if not (useFeatures or usePageRank or useTFIDF or useClustering or useRecommend):
        return None
    query = normalizeString(query, datasets.STOPWORDS, lemmatization).split()
    print "Querying with: %r" % query
    if useFeatures:
        results['IF'] = features.queryFeatures(query, subSet)
    if usePageRank:
        results['PR'] = pagerank.queryPageRank(query, subSet)
    if useTFIDF:
        results['TI'] = TFIDF.queryTFIDF(query, subSet)
    if useClustering:
        results['CL'] = clustering.queryClustering(query, subSet)
    if useRecommend:
        results['RE'] = recommend.queryRecommend(query, subSet)
    endresults = collections.defaultdict(float)
    # Loop through all pmids after slicing the ID set as subSet prescribes
    for pmid in [pmid for pmid in datasets.IDS if str(pmid).startswith(subSet)]:
        i = 0
        for method, use in (('IF', useFeatures), ('PR', usePageRank),
                            ('TI', useTFIDF), ('CL', useClustering),
                            ('RE', useRecommend)):
            if use and pmid in results[method]:
                i += 1
                endresults[pmid] += results[method][pmid] * 1.0  # weighted modifier, if required
        if i > 0:
            endresults[pmid] /= i  # average over the successful techniques
    normalizeScore(endresults)
    # Return the topN results, sorted descending by score
    return sorted(endresults.items(), key=operator.itemgetter(1), reverse=True)[:topN]
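# normalizeScore is called above but not shown. A plausible sketch, assuming
# simple in-place min-max normalization of the pmid -> score dict.
def normalizeScore(scores):
    if not scores:
        return
    lo, hi = min(scores.values()), max(scores.values())
    span = (hi - lo) or 1.0
    for pmid in scores:
        scores[pmid] = (scores[pmid] - lo) / span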
def dialog(intext, isRandMetaS=True, isPrint=True, isLearn=False, n=5,
           needs=set(['名詞', '固有名詞', '動詞', '形容詞'])):
    keys = TFIDF.getKWs(intext, threshold=50, n=n, length=1, isPrint=isPrint,
                        needs=needs, RandNum=5)
    if isRandMetaS:
        MetaFrame = getMetaSentence()
        MFs = [''.join([f, '助詞']) if not f[-1] == '>' else f
               for f in MetaFrame.split('助詞,')]
        cnt = len(MFs)
        try:
            # TODO: a topic-association database is needed to fix the key part here
            ansList = [getTrigram(keys[i], MFs[i]) for i in range(cnt)]
        except Exception as e:
            # print(e)
            if keys[0] is None:
                keys = ['']
            ansList = [getTrigram(keys[0])]
        ans = ''.join(ansList).replace('<BOS>', '').replace('<EOS>', 'です。')
    else:
        ans = getTrigram(keys[0]).replace('<BOS>', '').replace('<EOS>', '')
    if isLearn:
        TrigramCore(intext, isLearn=True, isDebug=False)
    if isPrint:
        print('=> The automatically generated reply is:')
        print(ans)
    return ans
def searchInsideWhoosh(terms, rule):
    queryResultBM25 = BM25.bm25(terms, rule)
    queryResultTFIDF = TFIDF.tfidf(terms, rule)
    # title -> [BM25, TFIDF, SUM]
    evalValues = {}
    for docBM25 in queryResultBM25:
        evalValues[docBM25] = [queryResultBM25[docBM25], 0, 0]
    for docTFIDF in queryResultTFIDF:
        if docTFIDF in evalValues:
            evalValues[docTFIDF][1] = queryResultTFIDF[docTFIDF]
        else:
            evalValues[docTFIDF] = [0, queryResultTFIDF[docTFIDF], 0]
    # Mix the scores (simple sum)
    for doc in evalValues:
        evalValues[doc][2] = evalValues[doc][0] + evalValues[doc][1]
    return evalValues
def Main(status, tmp, mode='dm'):
    ans = ''
    IMGfile = ''
    tweetStatus = ''
    filename = ''
    text = status['cleanText']
    status_id = status['id_str']
    screen_name = status['user']['screen_name']
    userinfo, isNewUser = dealSQL.getUserInfo(screen_name)
    now = status['now']
    BOT_ID = tmp['BOT_ID']
    # Measure the time since the user's last contact (seconds)
    try:
        try:
            delta = now - userinfo['time']
        except:
            # userinfo['time'] may be stored as a string
            print('convert str into datetime')
            delta = now - datetime.strptime(userinfo['time'], '%Y-%m-%d %H:%M:%S.%f')
        deltasec = delta.total_seconds()
    except Exception as e:
        print(e)
        deltasec = 50
    # Reply timeout: reset the session, filing any pending image under "undefined"
    if deltasec > 1000:
        userinfo['cnt'] = 0
        if userinfo['mode'] == 'confirm.tag.img':
            src = userinfo['tmpFile']
            drc = DIRIMGundefined
            if not os.path.exists(drc):
                os.mkdir(drc)
            shutil.copy(src, drc)
        userinfo['mode'] = 'dialog'
    # Dispatch on the message text and the user's mode
    if 'ping' in text:
        ans = ''.join(['Δsec : ', str(deltasec)])
    elif userinfo['mode'] == 'ignore':
        userinfo['cnt'] = 0
        userinfo['mode'] = 'dialog'
    elif deltasec < 3:
        ans = dealSQL.getPhrase(s_type='tooFreq', n=20)
        userinfo['mode'] = 'ignore'
    elif 'userinfo' in text:
        ans = str(userinfo)
    elif userinfo['mode'] == 'learn.text':
        if status['in_reply_to_screen_name'] in {BOT_ID}:
            text = status['text'].replace('@' + BOT_ID, '')
            text = re.sub(r'(@[^\s ]+)', '{ID}', text)
            if 'end' in text:
                userinfo['mode'] = 'dialog'
                userinfo['tmp'] = ''
                ans = 'learningモードをクローズしました。この結果は開発にフィードバックされます。ご協力感謝します。'
            else:
                labelstatus = userinfo['tmp']
                userinfo['cnt'] = 0
                dealSQL.savePhrase(phrase=text, author=screen_name,
                                   status=labelstatus, s_type='UserLearn')
                ans = '[learning]saved!!... 続けて覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
        else:
            ans = 'learningモードの途中です。覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
    elif userinfo['mode'] == 'sleeping' and deltasec > 3600:
        ans = dealSQL.getPhrase(s_type='goodmorning', n=1)
        ans += '\n' + dealSQL.getPhrase(s_type='sleep.span', n=1).format(
            utiltools.sec2HMSstr(deltasec))
        userinfo['mode'] = 'dialog'
    elif 'media' in status['entities'] and status['in_reply_to_screen_name'] in {BOT_ID}:
        userinfo['cnt'] = 0
        fileID = now.strftime("%Y%m%d%H%M%S")
        if status['entities']['hashtags'] != []:
            imgtag = status['entities']['hashtags'][0]['text']
            try:
                filenames = utiltools.saveMedias(status, ID=fileID,
                                                 DIR='/'.join([DIRIMGfeedback, imgtag]))
                ans = dealSQL.getPhrase(s_type='appreciate.giveme.img', n=1).format(imgtag)
            except Exception as e:
                print(e)
                ans = dealSQL.getPhrase(s_type='err.get.img', n=1)
        else:
            try:
                filenames = utiltools.saveMedias(status, ID=fileID, DIR=DIRIMGtmp)
                filename = filenames[0]
                label, FACEflag, IMGfile = NNimg.predictAns(filename=filename, isShow=False,
                                                            model=modelNNimg, workDIR='')
                if FACEflag == False:
                    ans = dealSQL.getPhrase(s_type='confirm.detect.img.noface', n=1).format(label)
                else:
                    ans = dealSQL.getPhrase(s_type='confirm.detect.img', n=1).format(label)
                drc = '/'.join([DIRIMGfeedback, label])
                if not os.path.exists(drc):
                    os.mkdir(drc)
                shutil.copy(filename, drc)
                userinfo['mode'] = 'confirm.tag.img'
                print('/'.join([drc, filename.split('/')[-1]]))
                userinfo['tmpFile'] = '/'.join([drc, filename.split('/')[-1]])
                filename = IMGfile
            except Exception as e:
                print(e)
                ans = dealSQL.getPhrase(s_type='err.get.img', n=1)
    elif userinfo['mode'] == 'confirm.tag.img':
        userinfo['cnt'] = 0
        if status['entities']['hashtags'] != []:
            imgtag = status['entities']['hashtags'][0]['text']
            isMoveDIR = True
        elif 'ない' not in text and ('正解' in text or '正し' in text):
            ans = dealSQL.getPhrase(s_type='success.detect.img', n=1)
            userinfo['mode'] = 'dialog'
            isMoveDIR = False
        else:
            try:
                imgtag = TFIDF.calcKWs(text, length=1, needs={'固有名詞', '名詞'})[0][0]
            except Exception as e:
                print(e)
                imgtag = 'undefined'
            isMoveDIR = True
        if isMoveDIR:
            src = userinfo['tmpFile']
            drc = '/'.join([DIRIMGfeedback, imgtag])
            if not os.path.exists(drc):
                os.mkdir(drc)
            shutil.copy(src, drc)
            if imgtag != 'undefined':
                ans = dealSQL.getPhrase(s_type='appreciate.feedback.img', n=1).format(imgtag)
                userinfo['mode'] = 'dialog'
            else:
                ans = dealSQL.getPhrase(s_type='ask.feedback.img', n=1)
    elif userinfo['cnt'] > 6:
        ans = dealSQL.getPhrase(s_type='cntOver', n=20)
        userinfo['mode'] = 'ignore'
    elif '海未face' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        umipicDIR = '/Users/masaMikam/Dropbox/Project/IAs/Data/imgs/海未'
        filename = utiltools.getRandIMG(umipicDIR)
        ans = '...'
    elif 'timer' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        cmds = text.split(' ')
        try:
            timersec = cmds[1]
        except:
            timersec = 300
        try:
            tmptext = cmds[2]
        except:
            tmptext = ''
        setTime = datetime.utcnow() + timedelta(hours=0, minutes=0, seconds=int(timersec))
        dealSQL.saveTask(taskdict={'who': screen_name, 'what': 'timer',
                                   'to_whom': screen_name, 'when': setTime,
                                   'tmptext': tmptext})
        setTimeJ = setTime + timedelta(hours=9)  # convert to JST for display
        ans = datetime.strftime(setTimeJ, '%m月%d日 %H時%M分%S秒') + 'にタイマーをセットしました。'
    elif 'learn' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        userinfo['mode'] = 'learn.text'
        cmds = text.split(' ')
        tmplabel = cmds[1]
        userinfo['tmp'] = tmplabel
        userinfo['cnt'] = 0
        ans = '[Learningモード]\n' + tmplabel + 'として覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
    elif 'respon' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        if 'clear' in text:
            try:
                tmp['responseWord'] = {}
                ans = ('全てのTL監視を停止しました。by @' + screen_name +
                       '\n 監視ワードを追加するには半角スペース区切りで、\n response [監視ワード] [応答文]')
                screen_name = ''
            except:
                ans = '設定失敗。半角スペースで区切ってオーダーしてください。'
        else:
            try:
                cmds = text.split(' ')
                tgword = cmds[1]
                response = cmds[2]
                if len(tgword) > 3:
                    tmp['responseWord'][tgword] = response
                    ans = ('「' + tgword + '」を監視して\n「' + response + '」と5分間反応します。by @' +
                           screen_name + '\n 監視ワードを追加するには半角スペース区切りで、\n response [監視ワード] [応答文]')
                    setTime = datetime.utcnow() + timedelta(hours=0, minutes=5)
                    dealSQL.saveTask(taskdict={'who': screen_name,
                                               'what': 'erase.tmp.responseWord',
                                               'to_whom': screen_name,
                                               'when': setTime, 'tmptext': tgword})
                    screen_name = ''
                else:
                    ans = '監視ワードは4文字以上である必要があります。'
            except:
                ans = '設定失敗。半角スペースで区切ってオーダーしてください。'
    elif 'kusoripu' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        try:
            cmds = text.split(' ')
            tgname = cmds[1]
            user = twtr.get_user(screen_name=tgname)._json
            isFollowing = user['following']
            if isFollowing:
                screen_name = ''
                status_id = ''
                ans = getKusoripu(tg1=tgname)
            else:
                ans = 'そのユーザーはFF外です。クソリプは制限されます。'
        except:
            ans = 'クソリプ失敗。半角スペースで区切ってオーダーしてください。送信先はアットマークなしで記述してください。'
    elif 'su modsys' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        cmds = text.split(' ')
        tmp[cmds[2]] = cmds[3]
        ans = 'mod ' + cmds[2] + ' into ' + cmds[3]
    elif tmp['imitating'] != '' and 'default' in text:
        if twf.defaultProfile():
            ans = 'デフォルトに戻りました'
            tmp['imitating'] = ''
        else:
            ans = 'デフォルトに戻るのに失敗 @_apkX'
    elif 'imitat' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        try:
            cmds = text.split(' ')
            tgname = cmds[1].replace('@', '').replace('.', '')
            ans = 'imitateErr'
            print(cmds, tgname)
            # Currently imitating:
            print(tmp['imitating'])
            ## TODO: check whether the user is a mutual follower
            if imitate(tgname):
                ans = tgname + 'さんのまねっこ5分間開始 defaultリプで元に戻ります。'
                mode = 'open'
                tmp['imitating'] = tgname
                # tmp['clocks']['imitationLimit'] = now + timedelta(hours=0, minutes=30)
                # tmp['clocks']['imitationTimer'] = now + timedelta(hours=0, minutes=5)
                setTime = now + timedelta(hours=0, minutes=5)
                dealSQL.saveTask(taskdict={'who': BOT_ID, 'what': 'imitate.default',
                                           'to_whom': screen_name, 'when': setTime,
                                           'tmptext': ''})
            else:
                ans = tgname + 'さんのまねっこ失敗 FF外の場合はまねっこできません。'
        except Exception as e:
            print('[ERR][Main.imitation]')
            print(e)
            ans = 'まねっこがどこか失敗です...'
    elif 'しりとり' in text or userinfo['mode'] == 'srtr':
        userinfo['mode'] = 'srtr'
        ans = myGame.SRTR(text, screen_name)
        if '\END' in ans:
            ans = ans.replace('\END', '')
            userinfo['mode'] = 'dialog'
        if '\MISS' in ans:
            ans = ans.replace('\MISS', '')
            if userinfo['cnt'] > 3:
                ans = dealSQL.getPhrase(s_type='shiritori.end', n=1)
                userinfo['mode'] = 'dialog'
                userinfo['cnt'] = 0
        else:
            userinfo['cnt'] = 0
    elif 'おてもん' in text or userinfo['mode'] == 'mon':
        userinfo['mode'] = 'mon'
        userinfo['cnt'] = 0
        try:
            ans = GAME_MON.Main(text, screen_name, 'アルパカさん')
            if '\END' in ans:
                ans = ans.replace('\END', '')
                userinfo['mode'] = 'dialog'
            if '\MISS' in ans:
                ans = ans.replace('\MISS', '')
        except:
            ans = '工事中...'
            userinfo['mode'] = 'dialog'
    elif 'おみくじ' in text or '占い' in text:
        ans = dealSQL.getPhrase(s_type='おみくじ', n=20)
    elif 'おはよ' in text and status['in_reply_to_screen_name'] in set([None, BOT_ID]):
        ans = dealSQL.getPhrase(s_type='goodmorning', n=1)
    elif 'おやすみ' in text and status['in_reply_to_screen_name'] in set([None, BOT_ID]):
        ans = dealSQL.getPhrase(s_type='goodnight', n=1)
        userinfo['mode'] = 'sleeping'
    elif 'トレンドワード' in text:
        ans = '\n- '.join(['[現在のトレンドワード]'] + tmp['trendwordsList'][:10])
    elif deltasec > 600000:  # long absence (600000 s is roughly a week)
        ans = dealSQL.getPhrase(s_type='longtimenosee', n=1)
    else:
        ans = trigramMC.dialog(text, isRandMetaS=True, isPrint=True, isLearn=False,
                               n=5, tryCnt=10,
                               needs=set(['名詞', '固有名詞'])).replace('<人名>', status['user']['name'])
        ans = charconv(ans, BOT_ID)
    # if isNewUser:
    #     ans = dealSQL.getPhrase(s_type='welcomeNewUser', n=20)
    if ans != '':
        tweetStatus, tmp = twf.send(ans, screen_name=screen_name, imgfile=filename,
                                    status_id=status_id, mode=mode, tmp=tmp)
    userinfo['time'] = now
    userinfo['cnt'] += 1
    dealSQL.saveUserInfo(userinfo)
    return tweetStatus, tmp
import time as t

import DataHandler as d
import TFIDF as tf
import UserProfiler as u

# Load in score data
start = t.time()
dataHandler = d.DataHandler(False)
print('Data loaded and preprocessed in ' + str(t.time() - start) + ' seconds.')
print()

# Use the TF-IDF algorithm
start = t.time()
similarities = tf.TFIDF(dataHandler.anime)
print('Genre importance calculated in ' + str(t.time() - start) + ' seconds.')
print()

# Get recommendations
start = t.time()
recommender = u.UserProfiler(similarities, dataHandler)
print('Recommendations generated in ' + str(t.time() - start) + ' seconds.')
print()

watched_anime = recommender.get_user_ratings(
    'user in animelist_cleaned.csv').sort_values(by='my_score', ascending=False)
recommendations = recommender.get_user_recommendations(
    'user in animelist_cleaned.csv')
print('User has watched ' + str(watched_anime.size) + ' anime.')
        except Exception as e:
            print(e)
            ans = '画像を読み取れませんでした。'
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif userinfo['mode'] == 'confirm.tag.img':
        userinfo['cnt'] = 0
        if status['entities']['hashtags'] != []:
            imgtag = status['entities']['hashtags'][0]['text']
            isMoveDIR = True
        elif 'ない' not in text and ('正解' in text or '正し' in text):
            ans = 'やりました!正解ですね。'
            userinfo['mode'] = 'dialog'
            isMoveDIR = False
        else:
            try:
                imgtag = TFIDF.calcKWs(text, length=1, needs=set(['固有名詞', '名詞']))[0][0]
            except Exception as e:
                print(e)
                imgtag = 'undefined'
            isMoveDIR = True
        if isMoveDIR:
            src = userinfo['tmpFile']
            drc = ''.join(['/Users/xxxx', imgtag])
            if not os.path.exists(drc):
                os.mkdir(drc)
            shutil.copy(src, drc)
            if imgtag != 'undefined':
                ans = ''.join(['...成る程...「', imgtag,
                               '」なのですね。ありがとうございます。\n(フィードバックしました。学習反映にまでは時間がかかります。)'])
                userinfo['mode'] = 'dialog'
            else:
                ans = '...一体、これは何なのですか?(好奇心)'
def buscarLinksEnGoogle(datasetCSV):
    posts = datasetCSV.dataset
    for i in range(datasetCSV.inicio, datasetCSV.fin):
        try:
            print(i)
            # post_link = posts[i][1]
            link_url = posts[i][2]
            print(link_url)
            link = Link(link_url)
            print(link.linkDomain)
            if link.esLinkAOmitir():
                print('Skipping')
                posts[i].append("LINK NULL")
                posts[i].append("LINK NULL")
                posts[i].append("LINK NULL")
                continue
            # postFacebook = PostFacebook(post_link)
            # datosPost = postFacebook.getInfoPostFacebook()
            # titulo_post = datosPost[0]  # this now comes from the CSV
            # posts[i].append(titulo_post)
            # posts[i].append(datosPost[1])
            post_fecha = convertirTextoAFecha(posts[i][3])
            titulo_post = posts[i][4]
            print(titulo_post)
            # Search Google for the link using the data we have (the post
            # text), then also filter the results by date
            linkMismoDominio = []
            # If the link still resolves, add it directly
            if link.linkReal is not None:
                linkMismoDominio.append(link.linkReal)
            texto_a_buscar = titulo_post.replace('"', '') + " " + link.linkDomain
            for url in search(texto_a_buscar, tld='com.ar', lang='es', stop=5):
                print(url)
                linkNuevo = Link(url)
                if 'clarin' in url:
                    postPortal = ClarinPost(linkNuevo)
                elif 'nacion' in url:
                    postPortal = NacionPost(linkNuevo)
                else:
                    continue
                fecha_portal = postPortal.getFecha()
                if fecha_portal == "FECHA NO ENCONTRADA":
                    continue
                fecha_portal = datetime.datetime.strptime(fecha_portal, '%Y-%m-%d').date()
                if fecha_portal <= post_fecha:
                    linkMismoDominio.append(url)
            # Google's own ranking always gets priority: its similarity
            # measure is likely better than anything we can compute ourselves
            cantidadLinksMismoDominio = len(linkMismoDominio)
            if cantidadLinksMismoDominio == 1:
                posts[i].append(linkMismoDominio[0])
            elif cantidadLinksMismoDominio == 0:
                print("No link found")
                posts[i].append("No link found")
            else:
                print("Need text-distance tie-breaking")
                tfidf = TFIDF.TfIdf()
                linkMasProximo = tfidf.getNearestLinkToTerm(linkMismoDominio, titulo_post)
                if linkMasProximo is None:
                    print("No link found")
                    posts[i].append("No link found")
                else:
                    print(linkMasProximo)
                    posts[i].append(linkMasProximo)
                    posts[i].append(linkMasProximo)
            # Wait a few seconds so Google doesn't ban us
            time.sleep(10)
        except Exception as ex:
            columnas = len(posts[i]) + 1
            for _ in range(columnas, datasetCSV.cantidadColumnas):
                posts[i].append("TIME OUT" + str(ex))
            print("TIME OUT")
            print(ex)
            time.sleep(30)
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = round(eval_loss / nb_eval_steps, 10)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        # Write the evaluation results
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        print("------------------------- eval_loss = ", eval_loss)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # -------------------------------------
        # Make the output file and image graph
        import make_output_file_graph as ifg
        Make_out_graph = ifg.make_output_file_graph(preds)
        one, zero = Make_out_graph.make_output_labels()
        Make_out_graph.make_output_labels_num(one, zero)
        #Make_out_graph.make_graph(one, zero)
        #Make_out_graph.make_graph2(num_, loss_graph)
        Make_out_graph.make_bert_pred_bad_des()
        Make_out_graph.make_bad_long_des_file()
        Make_out_graph.make_bad_shot_des_file()
        count = 0
        bzl.bring_output_label()
        filename = "sd1_allp.tsv"
        bzl.bring_test_file(filename, count)
        bzl.print_bad_des(filename)
        TFIDF.main__run()
        # -------------------------------------
    return results
inverted_index = indexing.invert_index(direct_index)
print len(inverted_index), 'terms indexed.\n'

# For convenience, remember the number of terms and reviews
n_reviews = len(direct_index)
n_terms = max(t['termid'] for t in inverted_index.values()) + 1

# Compute the (optional) IDF threshold and print it if it isn't null
idf_threshold = None if not term_fraction else indexing.IDF_threshold(inverted_index, term_fraction)
if idf_threshold:
    print "IDF threshold set at", idf_threshold, ".\n"

###############################################################################

print 'Computing TFIDF representations...'
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)
#print "TFIDF " + str(TFIDFs)

###############################################################################

target_index = None
if search_text:
    # Given a search_text, compute its TFIDF representation
    print 'Computing search text\'s TFIDF representation...'
    search_terms = parser.extract_terms(search_text)
    searched_TFIDF = TFIDF.compute_new_TFIDF(search_terms, inverted_index, idf_threshold)
    # Warn if the search is empty due to a high IDF threshold
    if len(searched_TFIDF) == 0:
        print '*** WARNING *** Empty search, IDF threshold is too high!'
def Main(status, bot_status):
    text = status["cleanText"]
    status_id = status["id_str"]
    screen_name = status["user"]["screen_name"]
    userinfo, isNewUser = dealSQL.getUserInfo(screen_name)
    now = datetime.utcnow()
    if isNewUser:
        welcomeTweet = "はじめまして。よろしくお願いしますね。\n[新規ユーザー名検出...ユーザー情報登録完了]"
        tweetStatus, Altbot_status = tweet(welcomeTweet, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    # Seconds since the user's last contact
    try:
        delta = now - datetime.strptime(userinfo["time"], "%Y-%m-%d %H:%M:%S.%f")
        deltasec = delta.total_seconds()
    except:
        deltasec = 50
    # Reply timeout: reset the session, filing any pending image under "undefined"
    if deltasec > 1000:
        userinfo["cnt"] = 0
        if userinfo["mode"] == "confirm.tag.img":
            src = userinfo["tmpFile"]
            drc = "/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/undefined"
            if not os.path.exists(drc):
                os.mkdir(drc)
            shutil.copy(src, drc)
        userinfo["mode"] = "dialog"
    if userinfo["mode"] == "ignore":
        userinfo["cnt"] = 0
        userinfo["mode"] = "dialog"
        Altbot_status = bot_status
        tweetStatus = False
    elif deltasec < 3:
        ans = dealSQL.getPhrase(s_type="tooFreq", n=20)
        userinfo["mode"] = "ignore"
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif "userinfo" in text:
        ans = str(userinfo)
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif userinfo["cnt"] > 5:
        ans = dealSQL.getPhrase(s_type="cntOver", n=20)
        userinfo["mode"] = "ignore"
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif "media" in status["entities"]:
        userinfo["cnt"] = 0
        fileID = now.strftime("%Y%m%d%H%M%S")
        if status["entities"]["hashtags"] != []:
            imgtag = status["entities"]["hashtags"][0]["text"]
            try:
                filenames = saveMedias(status, ID=fileID,
                                       DIR="/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/" + imgtag)
                ans = "".join(["画像を「", imgtag, "」として学習対象に登録しました。ご協力ありがとうございます。"])
            except Exception as e:
                print(e)
                ans = "画像を読み取れませんでした。"
            tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                               status_id=status_id, bot_status=bot_status)
        else:
            try:
                filenames = saveMedias(status, ID=fileID,
                                       DIR="/Users/masaMikam/Dropbox/Project/umiA/Data/twimgs")
                filename = filenames[0]
                label, FACEflag, altfilename = NNimg.predictAns(
                    filename=filename, isShow=False,
                    model="/Users/masaMikam/Dropbox/Project/umiA/Data/ML_Brain/DNN_skf",
                    workDIR="")
                if FACEflag == False:
                    ans = "顔認識に失敗しています。 精度は下がりますが...\n" + label + "ですか?正しかったら、「正解」と言ってください。"
                else:
                    ans = label + "ですか?正しかったら、「正解」と言ってください。"
                tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                                   status_id=status_id, imgfile=altfilename,
                                                   bot_status=bot_status)
                drc = "".join(["/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/", label])
                if not os.path.exists(drc):
                    os.mkdir(drc)
                shutil.copy(filename, drc)
                userinfo["mode"] = "confirm.tag.img"
                print("/".join([drc, filename.split("/")[-1]]))
                userinfo["tmpFile"] = "/".join([drc, filename.split("/")[-1]])
            except Exception as e:
                print(e)
                ans = "画像を読み取れませんでした。"
                tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                                   status_id=status_id, bot_status=bot_status)
    elif userinfo["mode"] == "confirm.tag.img":
        userinfo["cnt"] = 0
        if status["entities"]["hashtags"] != []:
            imgtag = status["entities"]["hashtags"][0]["text"]
            isMoveDIR = True
        elif "ない" not in text and ("正解" in text or "正し" in text):
            ans = "やりました!正解ですね。"
            userinfo["mode"] = "dialog"
            isMoveDIR = False
        else:
            try:
                imgtag = TFIDF.calcKWs(text, length=1, needs=set(["固有名詞", "名詞"]))[0][0]
            except Exception as e:
                print(e)
                imgtag = "undefined"
            isMoveDIR = True
        if isMoveDIR:
            src = userinfo["tmpFile"]
            drc = "".join(["/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/", imgtag])
            if not os.path.exists(drc):
                os.mkdir(drc)
            shutil.copy(src, drc)
            if imgtag != "undefined":
                ans = "".join(["...成る程...「", imgtag,
                               "」なのですね。ありがとうございます。\n(フィードバックしました。学習反映にまでは時間がかかります。)"])
                userinfo["mode"] = "dialog"
            else:
                ans = "...一体、これは何なのですか?(好奇心)"
        # Send the reply
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif "しりとり" in text or userinfo["mode"] == "srtr":
        userinfo["mode"] = "srtr"
        ans = myGame.SRTR(text, screen_name)
        if "\END" in ans:
            ans = ans.replace("\END", "")
            userinfo["mode"] = "dialog"
        if "\MISS" in ans:
            ans = ans.replace("\MISS", "")
            if userinfo["cnt"] > 3:
                ans = "しりとりは終わりにしましょう"
                userinfo["mode"] = "dialog"
                userinfo["cnt"] = 0
        else:
            userinfo["cnt"] = 0
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif "おみくじ" in text or "占い" in text:
        ans = dealSQL.getPhrase(s_type="おみくじ", n=20)
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    elif deltasec > 259200:  # three days of silence
        ans = "ご無沙汰しております...おかえりなさい。"
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    else:
        ans = trigramMC2.dialog(text, isRandMetaS=True, isPrint=True, isLearn=False,
                                n=5, tryCnt=10,
                                needs=set(["名詞", "固有名詞", "動詞", "形容詞"]))
        ans = ans.replace("<人名>", status["user"]["name"])
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name,
                                           status_id=status_id, bot_status=bot_status)
    userinfo["time"] = now
    userinfo["cnt"] += 1
    dealSQL.saveUserInfo(userinfo)
    return tweetStatus, Altbot_status
def textrank_sentence(self, doc, threshold):
    '''Extract key sentences with the TextRank algorithm, using the
    sentence-similarity formula from the paper (see the report for details).
    Extracts `size` sentences to feed to MMR in the next step.'''
    # Create the undirected graph for TextRank over sentences
    graphh = Graph(self.d, self.iteration)
    sentences = TFIDF.cut_by_sentence(doc)
    # Adjust the per-sentence keyword count to the document length
    if len(sentences) < 9:
        size_word = 2
    else:  # maybe 6
        size_word = 5
    if len(sentences) <= 4:
        print("The content is too short; no need to summarize!")
        return sentences
    # size may need to be as large as 0.6
    size = round(len(sentences) * 0.3)
    if size == 0:
        size = 1
    # Use TextRank to get the keywords of each sentence
    sentences_array = []
    for sent in sentences:
        result = self.textrank_words(sent, size_word)
        if result == False:
            # TextRank found no keywords for this sentence
            result = []
        # Supplement keywords via TF-IDF and merge adjacent keyword groups
        result = append_keywords(sent, result, self.keywords_doc)
        # result = join_result(result, sent)
        if result == False:
            words = []
        else:
            words = list(map(lambda x: x[0], result))
        sentences_array.append(words)
    # Initialize the TextRank graph over sentences
    for i, sent1 in enumerate(sentences_array):
        for j, sent2 in enumerate(sentences_array):
            if i == j or len(sent1) == 0 or len(sent2) == 0:
                continue
            weights = textrank_similiar(np.array(sent1), np.array(sent2))
            if weights > threshold:  # similar enough to connect
                graphh.add_edge(i, j, weights)
    # Run the graph ranking
    graphh.rank()
    result = graphh.WS
    if result is None:
        print("Error: the threshold is too high!")
        return False
    result = sorted(result.items(), key=itemgetter(1), reverse=True)
    if size > len(result):
        print("Requested more sentences than available; truncating.")
        size = len(result)
    result = result[:size]
    # Extract the selected sentences
    result_sent = []
    for i, j in result:
        result_sent.append(sentences[i])
    return result_sent
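# textrank_similiar above is not shown; the "formula from the paper" is
# presumably the sentence similarity of Mihalcea & Tarau (2004). A hedged
# sketch: keyword overlap normalized by the log-lengths of both sentences.
import numpy as np

def textrank_similiar(sent1, sent2):
    """similarity(Si, Sj) = |Si ∩ Sj| / (log|Si| + log|Sj|)."""
    overlap = len(set(sent1.tolist()) & set(sent2.tolist()))
    denom = np.log(len(sent1)) + np.log(len(sent2))
    if denom == 0:  # both keyword lists have length 1
        return 0.0
    return overlap / denom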
'''
for t, l in inverted_index.items():
    # Write the term entry
    sys.stdout.write('Term: %s, ID: %d, IDF: %.5f\n' % (t, l['termid'], l['idf']))
    # For all documents appearing in the inverted index
    for d, c, tf in l['occurrences']:
        # Write the document triplet
        sys.stdout.write('\tDoc ID: %03d, # occurrences: %5d, TF: %.8f\n' % (d, c, tf))
exit()
'''

##############################################################

print 'Computing TFIDF representations of documents in the corpus'
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)

# Print all cosine similarities between documents
'''
similarities = [[TFIDF.cosine_similarity(d1, d2) for d2 in TFIDFs] for d1 in TFIDFs]
print similarities
exit()
'''

# Given a query, compute its TFIDF representation
print 'Computing query\'s TFIDF representation'
query = 'business became meaningful'
query_terms = parser.extract_terms(query)
q_TFIDF = TFIDF.compute_new_TFIDF(query_terms, inverted_index, idf_threshold)
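# A sketch of the cosine similarity referenced in the commented block,
# assuming each TFIDF vector is a sparse term -> weight dict (the real
# TFIDF.cosine_similarity may use another representation).
import math

def cosine_similarity(v1, v2):
    dot = sum(w * v2.get(t, 0.0) for t, w in v1.items())
    n1 = math.sqrt(sum(w * w for w in v1.values()))
    n2 = math.sqrt(sum(w * w for w in v2.values()))
    return dot / (n1 * n2) if n1 and n2 else 0.0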
__author__ = 'sankalp'

import TextCleanup
import TFIDF


def store_tfidf_to_db(feature_names, tfidf_matrix):
    # Store each nonzero TF-IDF entry in the MySQL database
    from MySQLConnecter import MySQL
    db_obj = MySQL()
    for col in tfidf_matrix.nonzero()[1]:
        word = feature_names[col]
        score = tfidf_matrix[0, col]
        db_obj.insert(word, score)


tf_idf_params = TFIDF.calculate_tf_idf()
tfidf = tf_idf_params[0]
tfidf_matrix = tf_idf_params[1]
feature_names = tf_idf_params[2]

# Store the TF-IDF result in the MySQL database
store_tfidf_to_db(feature_names, tfidf_matrix)

# TF-IDF is a sparse matrix; convert it into a dense matrix if needed.
dense = tfidf_matrix.todense()

print('Enter a search term: ')
# query = 'Dispersion and migration of uranium (U) and other toxic metals and radionuclides from'
query = input()
response = tfidf.transform([query])
all_headlines = NewsHeadlinesScraper.get_results("ufc")
print(all_headlines)
data = {}
noun_phrases = []
translator = str.maketrans('', '', string.punctuation)
for headline in range(len(all_headlines) - 1):
    clean_string = all_headlines[headline].translate(translator)
    print(clean_string + " clean string")
    words = TextBlob(clean_string)
    noun_phrases += words.noun_phrases
    tokenised_string = clean_string.split(" ")
    #print(tokenised_string)
    for word in range(len(tokenised_string) - 1):
        tfidf_tf_val = TFIDF.term_frequency(tokenised_string[word], all_headlines[headline])
        print(all_headlines[headline])
        tfidf_idf_val = TFIDF.inverse_document_frequency(tokenised_string[word], all_headlines)
        final_val = tfidf_tf_val * tfidf_idf_val
        data[tokenised_string[word]] = final_val

sorted_data = sorted(data.items(), key=operator.itemgetter(1))
for items in sorted_data:
    print(items)
print(noun_phrases)
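# Hedged sketches of the two helpers used above (the real
# TFIDF.term_frequency / inverse_document_frequency are not shown):
# raw-count TF within one headline, and IDF(t) = log(N / df(t)).
import math

def term_frequency(term, document):
    return document.split().count(term)

def inverse_document_frequency(term, documents):
    df = sum(1 for doc in documents if term in doc.split())
    return math.log(len(documents) / df) if df else 0.0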
factFile = sys.argv[1]
rumorFile = sys.argv[2]

rumorStemmedCorpus = defaultdict(list)
preprocessing(rumorStemmedCorpus, rumorFile)
#print str(rumorStemmedCorpus)
factStemmedCorpus = defaultdict(list)
preprocessing(factStemmedCorpus, factFile)
#print str(factStemmedCorpus)

rumour = tfidf(rumorStemmedCorpus)
tfr = rumour.getTF()
fact = tfidf(factStemmedCorpus)
tff = fact.getTF()
idf = TFIDF.getIDF(tfr, tff)

totalRumor = getTotalTFIDF(tfr, idf)
totalFact = getTotalTFIDF(tff, idf)
#print totalRumor
#print totalFact

docRumourScore = defaultdict(float)
docFactScore = defaultdict(float)
rumorBigger = 0
factBigger = 0
allStemmedCorpus = rumorStemmedCorpus.copy()
allStemmedCorpus.update(factStemmedCorpus)
for doc, words in allStemmedCorpus.iteritems():
    if words is not None:
        elements_per_topic[element] = elements_per_topic[element] + 1
    if element not in elements_per_topic:
        elements_per_topic[element] = 1

# Dictionary that contains, for each topic, the ids of its documents
document_list_for_each_topic = {}
for i in range(1000):
    topic = documentTopics[i][0]
    if topic in document_list_for_each_topic:
        document_list_for_each_topic[topic].append(i)
    if topic not in document_list_for_each_topic:
        document_list_for_each_topic[topic] = []
        document_list_for_each_topic[topic].append(i)

# Prepare documents with the correct vocabulary
plainDocs = TFIDF.preparePlainDocs(list_of_stemmed_docs)
documentConceptDocs = TFIDF.preparedocConcDocs(list_of_lemmatized_docs)
documentCatDocs = TFIDF.preparedocCatDocs(
    TFIDF.preparedocConcDocs(list_of_lemmatized_docs))

# Cosine similarity computation for the three matrices
plainDocsCosSim = TFIDF.cos_sim(plainDocs)
documentConceptCosSim = TFIDF.cos_sim(documentConceptDocs)
documentCategoryCosSim = TFIDF.cos_sim(documentCatDocs)

# TF-IDF matrices
plainDocsTFIDF = TFIDF.tfIDF(plainDocs)
documentConceptTFIDF = TFIDF.tfIDF(documentConceptDocs)
documentCategoryTFIDF = TFIDF.tfIDF(documentCatDocs)

# Cosine distance for agglomerative clustering
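# A plausible sketch of the TFIDF.tfIDF / TFIDF.cos_sim helpers used above,
# built on scikit-learn (an assumption; the real module may be hand-rolled).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfIDF(docs):
    """TF-IDF matrix (n_docs x n_terms) for a list of raw text documents."""
    return TfidfVectorizer().fit_transform(docs)

def cos_sim(docs):
    """Pairwise cosine-similarity matrix between the documents."""
    return cosine_similarity(tfIDF(docs))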
def getTF(self):
    return TFIDF.getTF(self.stemmedCorpus)