Example #1
def main():
    # use TF-IDF to pick words that can represent each document's topic
    text_list, paper_list = TFIDF.load_text_list()
    top7_list = TFIDF.create_top7(text_list)
    ft_model = FastText.load('./fasttext/eng_ft')  # load the FastText model
    user_input = str(input('▶키워드를 입력하세요:'))
    user_num = int(input('▶추천받을 논문 수를 입력하세요:'))
    user_input = user_input.split()  # split the user input into keywords
    user_input = [w.lower() for w in user_input]
    doc_link = extract_simdoc_list(ft_model, top7_list, paper_list, user_input,
                                   user_num)
    user_inter = Userinter.craw(doc_link)
    while True:
        print('\n[추천 논문 리스트]')
        for idx, item in enumerate(user_inter, start=1):
            print(idx, item[0])
        user_pick = int(input('\n▶원하는 논문번호를 입력하세요:'))
        user_pick = user_inter[user_pick - 1]
        doc_show(user_pick)
        choice = int(input('▶다른 추천 논문을 보시려면 0, 키워드를 다시 입력하시려면 1 입력:'))
        if choice == 0:
            continue
        else:
            user_input = str(input('▶키워드를 입력하세요:'))
            user_num = int(input('▶추천받을 논문 수를 입력하세요:'))
            user_input = user_input.split()  # split the user input into keywords
            user_input = [w.lower() for w in user_input]
            doc_link = extract_simdoc_list(ft_model, top7_list, paper_list,
                                           user_input, user_num)
            user_inter = Userinter.craw(doc_link)
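
The `extract_simdoc_list` helper called above is not shown. As a rough sketch of how FastText vectors can rank papers against user keywords (averaged word vectors compared by cosine similarity; the function name, arguments, and data shapes below are assumptions, not the repository's actual implementation):

import numpy as np

def rank_papers_by_keywords(ft_model, top7_list, paper_list, keywords, top_n):
    # hypothetical helper: average the FastText vectors of a word list
    def avg_vec(words):
        vecs = [ft_model.wv[w] for w in words if w in ft_model.wv]
        return np.mean(vecs, axis=0) if vecs else None

    query = avg_vec(keywords)
    scored = []
    for paper, top_words in zip(paper_list, top7_list):
        doc = avg_vec(top_words)
        if query is None or doc is None:
            continue
        # cosine similarity between the query and the paper's top TF-IDF words
        cos = float(np.dot(query, doc) /
                    (np.linalg.norm(query) * np.linalg.norm(doc)))
        scored.append((cos, paper))
    scored.sort(key=lambda t: t[0], reverse=True)
    return [paper for _, paper in scored[:top_n]]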
Example #2
def Edmundson(result, answer):
    # result: machine-generated summary
    # answer: reference summary
    r_s = set(TFIDF.cut_by_sentence(result))
    r_a = set(TFIDF.cut_by_sentence(answer))
    share = r_s & r_a
    return len(share) / len(r_a)
def TFIDF(df, word_list):
    '''
    Build the matrix using the TF-IDF algorithm written earlier.
    :return: the TF-IDF matrix
    '''
    df = read_file("classification_simple_test.csv", [0])  # note: overwrites the df argument
    data = [a[0].split() for a in np.array(df).tolist()]
    IDF_list = T.IDF(data, word_list)  # compute IDF values
    TFIDF_met = np.empty(shape=(len(data), len(word_list)))
    for i, record in enumerate(data):
        TF_dic = T.TF(record)
        for index, word in enumerate(word_list):
            TFIDF_met[i][index] = TF_dic.get(word, 0) * IDF_list[index]
    return TFIDF_met
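
The builder above depends on `T.TF` and `T.IDF`, which are not shown here. A minimal sketch of what they conventionally compute, assuming TF is a per-document relative frequency and IDF the usual smoothed log ratio (illustrative bodies, not the repository's):

import math
from collections import Counter

def TF(record):
    # relative term frequency within one tokenized document
    counts = Counter(record)
    total = len(record)
    return {word: count / total for word, count in counts.items()}

def IDF(data, word_list):
    # one IDF value per vocabulary word, aligned with word_list
    N = len(data)
    return [math.log(N / (1 + sum(1 for record in data if word in record)))
            for word in word_list]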
Example #4
def dialog(s,
           isRandMetaS=True,
           isPrint=True,
           isLearn=False,
           n=5,
           tryCnt=10,
           needs=set(['名詞', '固有名詞', '動詞', '形容詞'])):
    keys = TFIDF.getKWs(s,
                        threshold=50,
                        n=n,
                        length=1,
                        isPrint=isPrint,
                        needs=needs,
                        RandNum=5)
    isAssociate = False
    if keys[0] == '':
        return '...そうなんですね'
    if isAssociate:
        BA = associateAns(keys[0])
    else:
        # wordset = getSimilarWords(w = keys[0], cnt = tryCnt)
        BA = formTrigram(word=keys[0], isRandMetaS=isRandMetaS)
        # ansSims = {ans: TFIDF.cosSimilarity(ans, s) for ans in ANSs}
        # BA = sorted(ansSims.items(), reverse = True, key=lambda x:x[1])[0]

    if isLearn:
        TrigramCore(s, isLearn=True, isDebug=False)
    if isPrint:
        print('=> 自動生成した応答文は以下のとおりです。')
        print(BA)
    randnum = np.random.randint(10)
    BA = BA.replace('<接尾>', 'さん').replace('<地域>',
                                          'アキバ').replace('<数>', str(randnum))
    return BA
Example #5
def filterBazaakParshaReadTFIDF(parshaName,
                                lang='heb',
                                min_count=MIN_WORD_COUNT,
                                splitParshiot=None,
                                min_distance=MIN_DISTANCE):
    if not splitParshiot:
        splitParshiot = Parshiot.createSplitParshiot(lang)

    topTFIDF = TFIDF.parshaIDF(parshaName, splitParshiot)
    totalWords = len(topTFIDF)

    # find the percentage needed
    percent = PERCENT / 100

    topTFIDF = topTFIDF.most_common(int(totalWords * percent))

    # just get the keys, the words
    topTFIDF = [i[0] for i in topTFIDF]

    parsha = splitParshiot[parshaName]
    read = BazaakRead(parsha, min_count, min_distance)

    # create a new dictionary, only containing results where the key was in the top PERCENT% of TF-IDF scores
    newRead = {k: v for k, v in read.items() if k in topTFIDF}

    return newRead
Example #6
    def __init__(self, span, d, iterations, index):
        self.kspan = span  # length of the co-occurrence window
        self.d = d
        self.iteration = iterations

        # ---- TF-IDF initialization ---- #
        docs = dataset.read_sogou()
        for i, doc in enumerate(docs):
            docs[i] = TFIDF.cut_by_words(doc)
        data, self.VSM = TFIDF.create_VSM(docs)
        print("Create VSM Over !")
        # build the IDF values
        self.idf = TFIDF.IDF(data, self.VSM)
        self.tfidf = TFIDF.TF_IDF(data[index], self.VSM, self.idf)
        self.keywords_doc = TFIDF.extract_keywords_tfidf(self.tfidf, self.VSM)

        print("Got the keywords from the docs !")
Example #7
def process_documents():
    '''Read From Document'''
    documents = Utilities.read_from_time_all()
    #documents = read_lines()
    '''Tokens and Stem Documents'''
    documents = Utilities.tokenize_stem_docs(documents)
    '''calculate doc lengths'''
    doc_len = Utilities.calculate_doc_len(documents)
    ''' term frequency'''
    tf = TFIDF.term_frequency(documents)
    '''calculates tf-idf'''
    tfidf = TFIDF.TFIDF(len(documents), tf)
    '''Read From Document'''
    queries = Utilities.read_from_time_que()
    #queries = ['pop love song', 'chinese american', 'city']
    '''Tokens and Stem Documents'''
    queries = Utilities.tokenize_stem_docs(queries)
    
    
    #print Search.search_by_cosine(tfidf,len(documents),['CARTOONISTS'.lower()])
    
    
    cosine_result = []
    rsv_result = []
    BM25_1_5 = []  # b=1, k=0.5
    BM25_1_1 = []  # b=1, k=1
    BM25_2_5 = []  # b=2, k=0.5
    BM25_2_1 = []  # b=2, k=1
    
    
    for query in queries:
        cosine_result.append(Search.search_by_cosine(tfidf,len(documents),query))
        rsv_result.append(Search.search_by_rsv(tf,len(documents),query))
        BM25_1_5.append(Search.search_by_BM25(tf,doc_len,query,1.0,0.5))
        BM25_1_1.append(Search.search_by_BM25(tf,doc_len,query,1.0,1.0))
        BM25_2_5.append(Search.search_by_BM25(tf,doc_len,query,2.0,0.5))
        BM25_2_1.append(Search.search_by_BM25(tf,doc_len,query,2.0,1.0))
    
    #print cosine_result[1]
    '''
    read from time.rel
    '''    
    rel_dict = Utilities.read_from_time_rel()
    '''
    print result
    '''
    result = []

    result.append(('System','Precision','Recall','F1','MAP')) 
    result.append( ('cosine  ',) + Metrics.getMetrics(cosine_result,rel_dict,20)) #limit to top 20 search
    result.append( ('RSV  ',) + Metrics.getMetrics(rsv_result,rel_dict,20))
    result.append(('BM25 (1, .5) ',)+ Metrics.getMetrics(BM25_1_5,rel_dict,20))
    result.append(('BM25 (1, 1) ',)+Metrics.getMetrics(BM25_1_1,rel_dict,20))
    result.append(('BM25 (2, .5) ',)+Metrics.getMetrics(BM25_2_5,rel_dict,20)) 
    result.append(('BM25 (2, 1) ',)+Metrics.getMetrics(BM25_2_1,rel_dict,20))
    
    Utilities.tabulate(result)
    Utilities.plot_graph(result)
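
`Search.search_by_BM25(tf, doc_len, query, b, k)` is not shown; it is called above with four (b, k) settings. For reference, a hedged sketch of a standard BM25 scorer, assuming `tf` maps each term to a {doc_id: count} posting dict and `doc_len` maps doc_id to document length (the real module's shapes may differ):

import math

def bm25_rank(tf, doc_len, query, b, k):
    N = len(doc_len)
    avg_len = sum(doc_len.values()) / N
    scores = {}
    for term in query:
        postings = tf.get(term, {})
        if not postings:
            continue
        idf = math.log(N / len(postings))
        for doc_id, freq in postings.items():
            # length-normalized term frequency, the core of BM25
            norm = freq + k * (1 - b + b * doc_len[doc_id] / avg_len)
            scores[doc_id] = scores.get(doc_id, 0.0) + idf * freq * (k + 1) / norm
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)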
Example #8
    def textrank_words(self, sentence, size):
        '''
            Extract keywords using the TextRank algorithm
        '''
        graph = Graph(self.d, self.iteration)

        # the TextRank Algorithm and return `size` keywords of the sentence
        words = TFIDF.cut_by_words(sentence)
        # Create the graph for the TextRank

        # clean with the stopwords and symbol in the sentence
        stopwords_set = TFIDF.get_stopwords()
        edge_weight = defaultdict(int)  # count the weight of the edge
        for index_i, word in enumerate(words):
            if word not in stopwords_set:
                for index in range(index_i + 1, index_i + self.kspan):
                    if index >= len(words):
                        # end this loop
                        break
                    if words[index] in stopwords_set:
                        # do not care about the stopwords
                        continue
                    # add the edge, defaultdict default that is 0
                    edge_weight[(word, words[index])] += 1

        # create the graph of the word
        for key, value in edge_weight.items():
            graph.add_edge(key[0], key[1], value)

        res = graph.rank()
        if res == False:
            return False

        result = sorted(graph.WS.items(), key=itemgetter(1), reverse=True)
        result = join_result(result, sentence)

        if size > len(result):
            print("Overload !")
            size = len(result)

        return result[:size]
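
The `Graph` class is external to this snippet. A compact sketch of the ranking it presumably performs, using the standard TextRank update WS(Vi) = (1 - d) + d * sum over incoming Vj of (w_ji / sum of Vj's outgoing weights) * WS(Vj); the class layout below is an assumption:

from collections import defaultdict

class Graph:
    def __init__(self, d, iterations):
        self.d = d
        self.iterations = iterations
        self.edges = defaultdict(dict)  # node -> {neighbor: weight}
        self.WS = {}

    def add_edge(self, a, b, weight):
        # undirected weighted edge between two words
        self.edges[a][b] = weight
        self.edges[b][a] = weight

    def rank(self):
        if not self.edges:
            return False
        self.WS = {node: 1.0 for node in self.edges}
        out_sum = {node: sum(nbrs.values()) for node, nbrs in self.edges.items()}
        for _ in range(self.iterations):
            for node in self.edges:
                incoming = sum(w / out_sum[nbr] * self.WS[nbr]
                               for nbr, w in self.edges[node].items())
                self.WS[node] = (1 - self.d) + self.d * incoming
        return True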
Example #9
def learnLang(sList):
    i = 1
    for s in sList:
        print('++++++++++++++++++++++++++++++++++++++++++++++++++')
        print(i, s)
        try:
            trigram = TrigramCore(s, 1, 0)
            tfidf = TFIDF.TFIDF(s, i, True, 0)
        except Exception as e:
            print(e)  # log the failure and move on
        i += 1
Example #10
def calculate_idf_for_query(tfidf_dict, total_documents, query):
    """
    used to calculate idf values 
    of the query - just a look up in tfidf_dict/tf dict
    for doc-frequency and calculate idf
    """
    query_idf = defaultdict(lambda: -1)
    for word in query:
        if Utilities.is_key_present(tfidf_dict, word):  # if query term present in our dictionary
            word_idf = TFIDF.cal_idf(float(total_documents), len(tfidf_dict[word]))
            Utilities.add_to_dictinoary(query_idf, word, word_idf)

    return query_idf
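
`TFIDF.cal_idf(total_documents, doc_frequency)` is the only piece not shown. Its call signature suggests the textbook formula; a sketch under that assumption:

import math

def cal_idf(total_documents, doc_frequency):
    # inverse document frequency; real implementations often add smoothing
    return math.log(total_documents / doc_frequency)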
Example #11
def query(query, subSet, useFeatures, usePageRank, useTFIDF, useClustering, useRecommend, topN=100):
    results = {}

    if not useFeatures and not usePageRank and not useTFIDF and not useClustering and not useRecommend:
        return None

    query = normalizeString(query, datasets.STOPWORDS, lemmatization).split()
    print "Querying with: %r" % query

    if useFeatures:
        results['IF'] = features.queryFeatures(query, subSet)
        prevMethod = 'IF'
    if usePageRank:
        results['PR'] = pagerank.queryPageRank(query, subSet)
    if useTFIDF:
        results['TI'] = TFIDF.queryTFIDF(query, subSet)
    if useClustering:
        results['CL'] = clustering.queryClustering(query, subSet)
        prevMethod = 'CL'
    if useRecommend:
        results['RE'] = recommend.queryRecommend(query, subSet)
        prevMethod = 'RE'


    endresults = collections.defaultdict(float)
    # Loop through all pmid's after having ID set sliced like subSet prescribes
    for pmid in [pmid for pmid in datasets.IDS if str(pmid).startswith(subSet)]:
        i = 0
        if useFeatures and results['IF'].has_key(pmid):
            i += 1
            endresults[pmid] += results['IF'][pmid] * 1.0 # Weighted modifier, if required 
        if usePageRank and results['PR'].has_key(pmid):
            i += 1
            endresults[pmid] += results['PR'][pmid] * 1.0 # Weighted modifier, if required
        if useTFIDF and results['TI'].has_key(pmid):
            i += 1 
            endresults[pmid] += results['TI'][pmid] * 1.0 # Weighted modifier, if required
        if useClustering and results['CL'].has_key(pmid):
            i += 1 
            endresults[pmid] += results['CL'][pmid] * 1.0 # Weighted modifier, if required
        if useRecommend and results['RE'].has_key(pmid):
            i += 1 
            endresults[pmid] += results['RE'][pmid] * 1.0 # Weighted modifier, if required
        if i > 0:
            endresults[pmid] /= i # Divide by the number of successful techniques run on this pmid

    normalizeScore(endresults)
    # Return the topN results, sorted descending by score
    return sorted(endresults.items(), key=operator.itemgetter(1), reverse=True)[:topN]
Example #12
def calculate_idf_for_query(tfidf_dict, total_documents, query):
    """
    used to calculate idf values 
    of the query - just a look up in tfidf_dict/tf dict
    for doc-frequency and calculate idf
    """
    query_idf = defaultdict(lambda: -1)
    for word in query:
        if Utilities.is_key_present(
                tfidf_dict, word):  #if query term present in our dictionary
            word_idf = TFIDF.cal_idf(float(total_documents),
                                     len(tfidf_dict[word]))
            Utilities.add_to_dictinoary(query_idf, word, word_idf)

    return query_idf
Example #13
def dialog(s, isRandMetaS = True, isPrint = True, isLearn = False, n =5, tryCnt = 10, needs = set(['名詞', '固有名詞', '動詞', '形容詞'])):
  keys = TFIDF.getKWs(s, threshold = 50, n = n, length = 1, isPrint = isPrint, needs = needs, RandNum = 5)
  isAssociate = False
  if keys[0] == '':
    return '...そうなんですね'
  if isAssociate:
    BA = associateAns(keys[0])
  else:
    # wordset = getSimilarWords(w = keys[0], cnt = tryCnt)
    BA = formTrigram(word = keys[0], isRandMetaS = isRandMetaS)
    # ansSims = {ans: TFIDF.cosSimilarity(ans, s) for ans in ANSs}
    # BA = sorted(ansSims.items(), reverse = True, key=lambda x:x[1])[0]

  if isLearn:
    TrigramCore(s, isLearn = True, isDebug = False)
  if isPrint:
    print('=> 自動生成した応答文は以下のとおりです。')
    print(BA)
  randnum = np.random.randint(10)
  BA = BA.replace('<接尾>', 'さん').replace('<地域>', 'アキバ').replace('<数>', str(randnum))
  return BA
Example #14
def dialog(intext, isRandMetaS = True, isPrint = True, isLearn = False, n =5, needs = set(['名詞', '固有名詞', '動詞', '形容詞'])):
	keys = TFIDF.getKWs(intext, threshold = 50, n = n, length = 1, isPrint = isPrint, needs = needs, RandNum = 5)
	if isRandMetaS:
		MetaFrame = getMetaSentence()
		MFs = [''.join([f, '助詞']) if not f[-1] == '>' else f for f in MetaFrame.split('助詞,')]
		cnt = len(MFs)
		try:
			ansList = [getTrigram(keys[i], MFs[i]) for i in range(cnt)]  ### a topic-association database is needed to fix this key selection
		except Exception as e:
			# print(e)
			if keys[0] == None:
				keys = ['']
			ansList = [getTrigram(keys[0])]
		ans = ''.join(ansList).replace('<BOS>', '').replace('<EOS>', 'です。')
	else:
		ans = getTrigram(keys[0]).replace('<BOS>', '').replace('<EOS>', '')
	if isLearn:
		TrigramCore(intext, isLearn = True, isDebug = False)
	if isPrint:
		print('=> 自動生成した応答文は以下のとおりです。')
		print(ans)
	return ans
Example #15
def searchInsideWhoosh(terms, rule):
  queryResultBM25 = BM25.bm25(terms, rule)
  queryResultTFIDF = TFIDF.tfidf(terms, rule)

  # title : [BM25, TFIDF, SUM]
  evalValues = {}
  #templateList = [0,0] #BM25 ; TFIDF ; PageRank; Y

  for docBM25 in queryResultBM25:
    templateList = [queryResultBM25[docBM25],0,0]
    evalValues[docBM25] = templateList

  for docTFIDF in queryResultTFIDF:
    if docTFIDF in evalValues:
      evalValues[docTFIDF][1] = queryResultTFIDF[docTFIDF]
    else:
      templateList = [0, queryResultTFIDF[docTFIDF], 0]
      evalValues[docTFIDF] = templateList

  #Values Mix (Simple sum)
  for doc in evalValues:
    evalValues[doc][2] = evalValues[doc][0] + evalValues[doc][1]

  return evalValues
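
Note that the SUM column adds raw BM25 and TF-IDF scores, which live on different scales. If comparable contributions matter, one simple option (a sketch, not part of the original code) is to min-max normalize each column before summing:

def normalize_column(evalValues, col):
    # rescale one column of the {title: [BM25, TFIDF, SUM]} table into [0, 1]
    values = [entry[col] for entry in evalValues.values()]
    lo, hi = min(values), max(values)
    if hi == lo:
        return
    for entry in evalValues.values():
        entry[col] = (entry[col] - lo) / (hi - lo)

Calling normalize_column(evalValues, 0) and normalize_column(evalValues, 1) before the mixing loop would make the simple sum a balanced combination.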
Example #16
def Main(status, tmp, mode = 'dm'):
	ans = ''
	IMGfile = ''
	tweetStatus = ''
	filename = ''
	text = status['cleanText']
	status_id = status['id_str']
	screen_name = status['user']['screen_name']
	userinfo, isNewUser = dealSQL.getUserInfo(screen_name)
	now = status['now']

	BOT_ID = tmp['BOT_ID']

	# elapsed time in seconds
	try:
		try:
			delta = now - userinfo['time']
		except: # fallback when the timestamp is stored as a string
			print('convert str into datetime')
			delta = now - datetime.strptime(userinfo['time'], '%Y-%m-%d %H:%M:%S.%f')
		deltasec = delta.total_seconds()
	except Exception as e:
		print(e)
		deltasec = 50

	# reply-timeout handling
	if deltasec > 1000:
		if userinfo['mode'] == 'confirm.tag.img':
			src = userinfo['tmpFile']
			drc = DIRIMGundefined
			if os.path.exists(drc) == False:
				os.mkdir(drc)
			shutil.copy(src, drc)
		userinfo['cnt'] = 0
		userinfo['mode'] = 'dialog'

	# responses
	if 'ping' in text:
		ans = ''.join(['Δsec : ', str(deltasec)])
	elif userinfo['mode'] == 'ignore':
		userinfo['cnt'] = 0
		userinfo['mode'] = 'dialog'
	elif deltasec < 3:
		ans = dealSQL.getPhrase(s_type = 'tooFreq', n = 20)
		userinfo['mode'] = 'ignore'
	elif 'userinfo' in text:
		ans = str(userinfo)
	elif userinfo['mode'] == 'learn.text':
		if status['in_reply_to_screen_name'] in {BOT_ID}:
			text = status['text'].replace('@'+BOT_ID, '')
			text = re.sub(r'(@[^\s ]+)', '{ID}', text)
			if 'end' in text:
				userinfo['mode'] = 'dialog'
				userinfo['tmp'] = ''
				ans = 'learningモードをクローズしました。この結果は開発にフィードバックされます。ご協力感謝します。'
			else:
				labelstatus = userinfo['tmp']
				userinfo['cnt'] = 0
				dealSQL.savePhrase(phrase = text, author = screen_name, status = labelstatus, s_type = 'UserLearn')
				ans = '[learning]saved!!... 続けて覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
		else:
			ans = 'learningモードの途中です。覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
	elif userinfo['mode'] == 'sleeping' and deltasec > 3600:
		ans = dealSQL.getPhrase(s_type = 'goodmorning', n = 1)
		ans += '\n' + dealSQL.getPhrase(s_type = 'sleep.span', n = 1).format(utiltools.sec2HMSstr(deltasec))
		userinfo['mode'] = 'dialog'

	elif 'media' in status['entities'] and status['in_reply_to_screen_name'] in {BOT_ID}:
		userinfo['cnt'] = 0
		fileID = now.strftime("%Y%m%d%H%M%S")
		if status['entities']['hashtags'] != []:
			imgtag = status['entities']['hashtags'][0]['text']
			try:
				filenames = utiltools.saveMedias(status, ID = fileID, DIR = '/'.join([DIRIMGfeedback, imgtag]))
				ans = dealSQL.getPhrase(s_type = 'appreciate.giveme.img', n = 1).format(imgtag)
			except Exception as e:
				print(e)
				ans = dealSQL.getPhrase(s_type = 'err.get.img', n = 1)
		else:
			try:
				filenames = utiltools.saveMedias(status, ID = fileID, DIR = DIRIMGtmp)
				filename = filenames[0]
				label, FACEflag, IMGfile = NNimg.predictAns(filename  = filename, isShow = False, model = modelNNimg, workDIR = '')
				if FACEflag == False:
					ans = dealSQL.getPhrase(s_type = 'confirm.detect.img.noface', n = 1).format(label)
				else:
					ans = dealSQL.getPhrase(s_type = 'confirm.detect.img', n = 1).format(label)

				drc = '/'.join([DIRIMGfeedback, label])
				if os.path.exists(drc) == False:
					os.mkdir(drc)
				shutil.copy(filename, drc)

				userinfo['mode'] = 'confirm.tag.img'
				print('/'.join([drc, filename.split('/')[-1]]))
				userinfo['tmpFile'] = '/'.join([drc, filename.split('/')[-1]])
				filename = IMGfile
			except Exception as e:
				print(e)
				ans = dealSQL.getPhrase(s_type = 'err.get.img', n = 1)

	elif userinfo['mode'] == 'confirm.tag.img':
		userinfo['cnt'] = 0
		if status['entities']['hashtags'] != []:
			imgtag = status['entities']['hashtags'][0]['text']
			isMoveDIR = True
		elif not 'ない' in text and ('正解' in text or '正し' in text):
			ans = dealSQL.getPhrase(s_type = 'success.detect.img', n = 1)
			userinfo['mode'] = 'dialog'
			isMoveDIR = False
		else:
			try:
				imgtag = TFIDF.calcKWs(text, length = 1, needs = {'固有名詞', '名詞'})[0][0]
			except Exception as e:
				print(e)
				imgtag = 'undefined'
			isMoveDIR = True
		if isMoveDIR:
			src = userinfo['tmpFile']
			drc = '/'.join([DIRIMGfeedback, imgtag])
			if os.path.exists(drc) == False:
				os.mkdir(drc)
			shutil.copy(src, drc)
			if imgtag != 'undefined':
				ans = dealSQL.getPhrase(s_type = 'appreciate.feedback.img', n = 1).format(imgtag)
				userinfo['mode'] = 'dialog'
			else:
				ans = dealSQL.getPhrase(s_type = 'ask.feedback.img', n = 1)
	elif userinfo['cnt'] > 6:
		ans = dealSQL.getPhrase(s_type = 'cntOver', n = 20)
		userinfo['mode'] = 'ignore'
	elif '海未face' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		umipicDIR = '/Users/masaMikam/Dropbox/Project/IAs/Data/imgs/海未'
		filename = utiltools.getRandIMG(umipicDIR)
		ans = '...'
	elif 'timer' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		cmds = text.split(' ')
		try:
			timersec = cmds[1]
		except:
			timersec = 300
		try:
			tmptext = cmds[2]
		except:
			tmptext = ''
		setTime = datetime.utcnow() + timedelta(hours=0, minutes=0, seconds = int(timersec))
		dealSQL.saveTask(taskdict = {'who':screen_name, 'what': 'timer', 'to_whom': screen_name, 'when':setTime, 'tmptext': tmptext})
		setTimeJ = setTime + timedelta(hours=9)
		ans = datetime.strftime(setTimeJ, '%m月%d日 %H時%M分%S秒') + 'にタイマーをセットしました。'
	elif 'learn' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		userinfo['mode'] = 'learn.text'
		cmds = text.split(' ')
		tmplabel = cmds[1]
		userinfo['tmp'] = tmplabel
		userinfo['cnt'] = 0
		ans = '[Learningモード]\n' + tmplabel+ 'として覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'

	elif 'respon' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		if 'clear' in text:
			try:
				tmp['responseWord'] = {}
				ans = '全てのTL監視を停止しました。by @' + screen_name + '\n 監視ワードを追加するには半角スペース区切りで、\n response [監視ワード] [応答文]'
				screen_name = ''
			except:
				ans = '設定失敗。半角スペースで区切ってオーダーしてください。'
		else:
			try:
				cmds = text.split(' ')
				tgword = cmds[1]
				response = cmds[2]
				if len(tgword) > 3:
					tmp['responseWord'][tgword] = response
					ans = '「' + tgword + '」を監視して\n「' + response + '」と5分間反応します。by @' + screen_name + '\n 監視ワードを追加するには半角スペース区切りで、\n response [監視ワード] [応答文]'
					setTime = datetime.utcnow() + timedelta(hours=0, minutes=5)
					dealSQL.saveTask(taskdict = {'who':screen_name, 'what': 'erase.tmp.responseWord', 'to_whom': screen_name, 'when':setTime, 'tmptext': tgword})
					screen_name = ''
				else:
					ans = '監視ワードは4文字以上である必要があります。'
			except:
				ans = '設定失敗。半角スペースで区切ってオーダーしてください。'
	elif 'kusoripu' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		try:
			cmds = text.split(' ')
			tgname = cmds[1]
			user = twtr.get_user(screen_name = tgname)._json
			isFollowing = user['following']
			if isFollowing:
				screen_name = ''
				status_id = ''
				ans = getKusoripu(tg1 = tgname)
			else:
				ans = 'そのユーザーはFF外です。クソリプは制限されます。'
		except:
			ans = 'クソリプ失敗。半角スペースで区切ってオーダーしてください。送信先はアットマークなしで記述してください。'

	elif 'su modsys'  in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		cmds = text.split(' ')
		tmp[cmds[2]] = cmds[3]
		ans = 'mod '+ cmds[2] + ' into ' + cmds[3]

	elif tmp['imitating'] != '' and 'default' in text:
		if twf.defaultProfile():
			ans = 'デフォルトに戻りました'
			tmp['imitating'] = ''
		else:
			ans = 'デフォルトに戻るのに失敗 @_apkX'

	elif 'imitat' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
		try:
			cmds = text.split(' ')
			tgname = cmds[1].replace('@', '').replace('.', '')
			ans = 'imitateErr'
			print(cmds, tgname)
			# currently imitating
			print(tmp['imitating'])
			##TODO check whether ff or not
			if imitate(tgname):
				ans = tgname + 'さんのまねっこ5分間開始 defaultリプで元に戻ります。'
				mode = 'open'
				tmp['imitating'] = tgname
				# tmp['clocks']['imitationLimit'] = now + timedelta(hours=0, minutes=30)
				# tmp['clocks']['imitationTimer'] = now + timedelta(hours=0, minutes=5)
				setTime = now + timedelta(hours=0, minutes=5)
				dealSQL.saveTask(taskdict = {'who':BOT_ID, 'what': 'imitate.default', 'to_whom':screen_name, 'when':setTime, 'tmptext': ''})
			else:
				ans = tgname + 'さんのまねっこ失敗 FF外の場合はまねっこできません。'
		except Exception as e:
			print('[ERR][Main.imitation]')
			print(e)
			ans = 'まねっこがどこか失敗です...'

	elif 'しりとり' in text or userinfo['mode'] == 'srtr':
		userinfo['mode'] = 'srtr'
		ans = myGame.SRTR(text, screen_name)
		if '\END' in ans:
			ans = ans.replace('\END', '')
			userinfo['mode'] = 'dialog'
		if '\MISS' in ans:
			ans = ans.replace('\MISS', '')
			if userinfo['cnt'] > 3:
				ans = dealSQL.getPhrase(s_type = 'shiritori.end', n = 1)
				userinfo['mode'] = 'dialog'
				userinfo['cnt'] = 0
		else:
			userinfo['cnt'] = 0

	elif 'おてもん' in text or userinfo['mode'] == 'mon':
		userinfo['mode'] = 'mon'
		userinfo['cnt'] = 0
		try:
			ans = GAME_MON.Main(text, screen_name, 'アルパカさん')
			if '\END' in ans:
				ans = ans.replace('\END', '')
				userinfo['mode'] = 'dialog'
			if '\MISS' in ans:
				ans = ans.replace('\MISS', '')
		except:
			ans = '工事中...'
			userinfo['mode'] = 'dialog'
	elif 'おみくじ' in text or '占い' in text:
		ans = dealSQL.getPhrase(s_type = 'おみくじ', n = 20)
	elif 'おはよ' in text and status['in_reply_to_screen_name'] in set([None, BOT_ID]):
		ans = dealSQL.getPhrase(s_type = 'goodmorning', n = 1)
	elif 'おやすみ' in text and status['in_reply_to_screen_name'] in set([None, BOT_ID]):
		ans = dealSQL.getPhrase(s_type = 'goodnight', n = 1)
		userinfo['mode'] = 'sleeping'
	elif 'トレンドワード' in text:
		ans = '\n- '.join(['[現在のトレンドワード]']+tmp['trendwordsList'][:10])
	elif deltasec > 600000: # long absence (600000 s is about 7 days)
		ans = dealSQL.getPhrase(s_type = 'longtimenosee', n = 1)
	else:
		ans = trigramMC.dialog(text, isRandMetaS = True, isPrint = True, isLearn = False, n =5, tryCnt = 10, needs = set(['名詞', '固有名詞'])).replace('<人名>', status['user']['name'])
		ans = charconv(ans, BOT_ID)

	# if isNewUser:
	# 	ans = dealSQL.getPhrase(s_type = 'welcomeNewUser', n = 20)

	if ans != '':
		tweetStatus, tmp = twf.send(ans, screen_name = screen_name, imgfile = filename, status_id = status_id, mode = mode, tmp = tmp)
	userinfo['time'] = now
	userinfo['cnt'] += 1
	dealSQL.saveUserInfo(userinfo)
	return tweetStatus, tmp
Example #17
import time as t

import DataHandler as d
import TFIDF as tf
import UserProfiler as u

# Load in score data
start = t.time()
dataHandler = d.DataHandler(False)
print('Data loaded and preprocessed in ' + str(t.time() - start) + ' seconds.')
print()

# Use TFIDF algo
start = t.time()
similarities = tf.TFIDF(dataHandler.anime)
print('Genre importance calculated in ' + str(t.time() - start) + ' seconds.')
print()

# Get recommendations
start = t.time()
recommender = u.UserProfiler(similarities, dataHandler)
print('Recommendations generated in ' + str(t.time() - start) + ' seconds.')
print()

watched_anime = recommender.get_user_ratings(
    'user in animelist_cleaned.csv').sort_values(by='my_score',
                                                 ascending=False)
recommendations = recommender.get_user_recommendations(
    'user in animelist_cleaned.csv')

print('User has watched ' + str(watched_anime.size) + ' anime.')
Example #18
			except Exception as e:
				print(e)
				ans = '画像を読み取れませんでした。'
				tweetStatus, Altbot_status = tweet(ans, screen_name = screen_name, status_id = status_id, bot_status = bot_status)
	elif userinfo['mode'] == 'confirm.tag.img':
		userinfo['cnt'] = 0
		if status['entities']['hashtags'] != []:
			imgtag = status['entities']['hashtags'][0]['text']
			isMoveDIR = True
		elif not 'ない' in text and ('正解' in text or '正し' in text):
			ans = 'やりました!正解ですね。'
			userinfo['mode'] = 'dialog'
			isMoveDIR = False
		else:
			try:
				imgtag = TFIDF.calcKWs(text, length = 1, needs = set(['固有名詞', '名詞']))[0][0]
			except Exception as e:
				print(e)
				imgtag = 'undefined'
			isMoveDIR = True
		if isMoveDIR:
			src = userinfo['tmpFile']
			drc = ''.join(['/Users/xxxx', imgtag])
			if os.path.exists(drc) == False:
				os.mkdir(drc)
			shutil.copy(src, drc)
			if imgtag != 'undefined':
				ans = ''.join(['...成る程...「', imgtag, '」なのですね。ありがとうございます。\n(フィードバックしました。学習反映にまでは時間がかかります。)'])
				userinfo['mode'] = 'dialog'
			else:
				ans = '...一体、これは何なのですか?(好奇心)'
Example #19
def buscarLinksEnGoogle(datasetCSV):
    posts = datasetCSV.dataset

    for i in range(datasetCSV.inicio, datasetCSV.fin):
        try:
            print(i)
            # post_link = posts[i][1]
            link_url = posts[i][2]
            print(link_url)

            link = Link(link_url)
            print(link.linkDomain)
            if link.esLinkAOmitir():
                print('Omitiendo')
                posts[i].append("LINK NULL")
                posts[i].append("LINK NULL")
                posts[i].append("LINK NULL")
                continue

            # postFacebook = PostFacebook(post_link)
            # datosPost = postFacebook.getInfoPostFacebook()
            # titulo_post = datosPost[0] # this comes from the csv
            # posts[i].append(titulo_post)
            # posts[i].append(datosPost[1])

            post_fecha = convertirTextoAFecha(posts[i][3])
            titulo_post = posts[i][4]
            print(titulo_post)

            # Search for the link based on the data we have:
            # the text of the post,
            # and pick from the results according to the date as well

            linkMismoDominio = []

            # if the link works, add it directly
            if (link.linkReal is not None):
                linkMismoDominio.append(link.linkReal)

            texto_a_buscar = titulo_post.replace('"',
                                                 '') + " " + link.linkDomain
            for url in search(texto_a_buscar, tld='com.ar', lang='es', stop=5):
                print(url)

                linkNuevo = Link(url)

                if ('clarin' in url):
                    postPortal = ClarinPost(linkNuevo)
                else:
                    if ('nacion' in url):
                        postPortal = NacionPost(linkNuevo)
                    else:
                        continue

                fecha_portal = postPortal.getFecha()
                if (fecha_portal == "FECHA NO ENCONTRADA"):
                    continue

                fecha_portal = datetime.datetime.strptime(
                    fecha_portal, '%Y-%m-%d').date()
                if (fecha_portal <= post_fecha):
                    linkMismoDominio.append(url)

            # Always give priority to Google's ordering, since its similarity
            # measure is more likely to be better than anything we could
            # compute by our own means
            cantidadLinksMismoDominio = len(linkMismoDominio)

            if cantidadLinksMismoDominio == 1:
                posts[i].append(linkMismoDominio[0])
            else:
                if cantidadLinksMismoDominio == 0:
                    print("No encontre link")
                    posts[i].append("No encontre link")
                else:
                    print("Necesito Distancia de Texto")
                    tfidf = TFIDF.TfIdf()
                    linkMasProximo = tfidf.getNearestLinkToTerm(
                        linkMismoDominio, titulo_post)
                    if linkMasProximo is None:
                        print("No encontre link")
                        posts[i].append("No encontre link")
                    else:
                        print(linkMasProximo)
                        posts[i].append(linkMasProximo)

            # wait a few seconds so that Google doesn't ban us
            time.sleep(10)
        except Exception as ex:
            columnas = len(posts[i]) + 1
            for _ in range(columnas, datasetCSV.cantidadColumnas):
                posts[i].append("TIME OUT" + str(ex))
            print("TIME OUT")
            print(ex)
            time.sleep(30)
Example #20
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         batch[3]}
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = round(eval_loss / nb_eval_steps, 10)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")   # here is output
        print("-------------------------  eval_loss = ", eval_loss)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            ## -------------------------------------

            # Make output file and image graph. import make_output_file_graph.py
            Make_out_graph = ifg.make_output_file_graph(preds)
            one, zero = Make_out_graph.make_output_labels()
            Make_out_graph.make_output_labels_num(one, zero)
            #Make_out_graph.make_graph(one, zero)
            #Make_out_graph.make_graph2(num_, loss_graph)
            Make_out_graph.make_bert_pred_bad_des()
            Make_out_graph.make_bad_long_des_file()
            Make_out_graph.make_bad_shot_des_file()

            count = 0
            bzl.bring_output_label()
            filename = "sd1_allp.tsv"
            bzl.bring_test_file(filename, count)
            bzl.print_bad_des(filename)
            TFIDF.main__run()
            # -------------------------------------
    return results
Example #21
inverted_index = indexing.invert_index(direct_index)
print len(inverted_index), 'terms indexed.\n'

# For convenience, remember the number of terms and reviews
n_reviews = len(direct_index)
n_terms = max(t['termid'] for t in inverted_index.values()) + 1

# Compute the (optional) IDF threshold and print it if it isn't null
idf_threshold = None if not term_fraction else indexing.IDF_threshold(inverted_index, term_fraction)
if idf_threshold:
    print "IDF threshold set at", idf_threshold, ".\n"

###############################################################################

print 'Computing TFIDF representations...'
TFIDFs = TFIDF.compute_all_TFIDFs(inverted_index, idf_threshold)
#print "TFIDF  " + str(TFIDFs)

###############################################################################

target_index = None

if search_text:
    # Given a search_text, compute its TFIDF representation
    print 'Computing search text\'s TFIDF representation...'
    search_terms = parser.extract_terms(search_text)
    searched_TFIDF = TFIDF.compute_new_TFIDF(search_terms, inverted_index, idf_threshold)

    # Warn if the search_text is empty due to an high IDF threshold
    if len(searched_TFIDF) == 0:
        print '*** WARNING *** Empty search, IDF threshold is too high!'
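
`TFIDF.compute_new_TFIDF` builds a vector for text outside the corpus. A sketch consistent with the index fields these snippets reveal (`termid`, `idf`), though the module's actual weighting may differ:

from collections import Counter

def compute_new_TFIDF(terms, inverted_index, idf_threshold=None):
    # TF-IDF vector for an out-of-corpus text, reusing the corpus IDF values
    counts = Counter(terms)
    total = len(terms)
    vector = {}
    for term, count in counts.items():
        entry = inverted_index.get(term)
        if entry is None:
            continue  # unseen term: no IDF available
        if idf_threshold is not None and entry['idf'] < idf_threshold:
            continue  # filtered out, which is what triggers the warning above
        vector[entry['termid']] = (float(count) / total) * entry['idf']
    return vector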
Example #22
def process_documents():
    '''Read From Document'''
    documents = Utilities.read_from_time_all()
    #documents = read_lines()
    '''Tokens and Stem Documents'''
    documents = Utilities.tokenize_stem_docs(documents)
    '''calculate doc lengths'''
    doc_len = Utilities.calculate_doc_len(documents)
    ''' term frequency'''
    tf = TFIDF.term_frequency(documents)
    '''calculates tf-idf'''
    tfidf = TFIDF.TFIDF(len(documents), tf)
    '''Read From Document'''
    queries = Utilities.read_from_time_que()
    #queries = ['pop love song', 'chinese american', 'city']
    '''Tokens and Stem Documents'''
    queries = Utilities.tokenize_stem_docs(queries)

    #print Search.search_by_cosine(tfidf,len(documents),['CARTOONISTS'.lower()])

    cosine_result = []
    rsv_result = []
    BM25_1_5 = []  # b=1, k=0.5
    BM25_1_1 = []  # b=1, k=1
    BM25_2_5 = []  # b=2, k=0.5
    BM25_2_1 = []  # b=2, k=1

    for query in queries:
        cosine_result.append(
            Search.search_by_cosine(tfidf, len(documents), query))
        rsv_result.append(Search.search_by_rsv(tf, len(documents), query))
        BM25_1_5.append(Search.search_by_BM25(tf, doc_len, query, 1.0, 0.5))
        BM25_1_1.append(Search.search_by_BM25(tf, doc_len, query, 1.0, 1.0))
        BM25_2_5.append(Search.search_by_BM25(tf, doc_len, query, 2.0, 0.5))
        BM25_2_1.append(Search.search_by_BM25(tf, doc_len, query, 2.0, 1.0))

    #print cosine_result[1]
    '''
    read from time.rel
    '''
    rel_dict = Utilities.read_from_time_rel()
    '''
    print result
    '''
    result = []

    result.append(('System', 'Precision', 'Recall', 'F1', 'MAP'))
    result.append(('cosine  ', ) + Metrics.getMetrics(
        cosine_result, rel_dict, 20))  #limit to top 20 search
    result.append(('RSV  ', ) + Metrics.getMetrics(rsv_result, rel_dict, 20))
    result.append(('BM25 (1, .5) ', ) +
                  Metrics.getMetrics(BM25_1_5, rel_dict, 20))
    result.append(('BM25 (1, 1) ', ) +
                  Metrics.getMetrics(BM25_1_1, rel_dict, 20))
    result.append(('BM25 (2, .5) ', ) +
                  Metrics.getMetrics(BM25_2_5, rel_dict, 20))
    result.append(('BM25 (2, 1) ', ) +
                  Metrics.getMetrics(BM25_2_1, rel_dict, 20))

    Utilities.tabulate(result)
    Utilities.plot_graph(result)
Example #23
def Main(status, bot_status):
    text = status["cleanText"]
    status_id = status["id_str"]
    screen_name = status["user"]["screen_name"]
    userinfo, isNewUser = dealSQL.getUserInfo(screen_name)
    now = datetime.utcnow()
    if isNewUser:
        welcomeTweet = "はじめまして。よろしくお願いしますね。\n[新規ユーザー名検出...ユーザー情報登録完了]"
        tweetStatus, Altbot_status = tweet(
            welcomeTweet, screen_name=screen_name, status_id=status_id, bot_status=bot_status
        )
    try:
        delta = now - datetime.strptime(userinfo["time"], "%Y-%m-%d %H:%M:%S.%f")
        deltasec = delta.total_seconds()
    except:
        deltasec = 50

    if deltasec > 1000:
        if userinfo["mode"] == "confirm.tag.img":
            src = userinfo["tmpFile"]
            drc = "".join(["/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/undefined"])
            if os.path.exists(drc) == False:
                os.mkdir(drc)
            shutil.copy(src, drc)
        userinfo["cnt"] = 0
        userinfo["mode"] = "dialog"

    if userinfo["mode"] == "ignore":
        userinfo["cnt"] = 0
        userinfo["mode"] = "dialog"
        Altbot_status = bot_status
        tweetStatus = False
    elif deltasec < 3:
        ans = dealSQL.getPhrase(s_type="tooFreq", n=20)
        userinfo["mode"] = "ignore"
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    elif "userinfo" in text:
        ans = str(userinfo)
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    elif userinfo["cnt"] > 5:
        ans = dealSQL.getPhrase(s_type="cntOver", n=20)
        userinfo["mode"] = "ignore"
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    elif "media" in status["entities"]:
        userinfo["cnt"] = 0
        fileID = now.strftime("%Y%m%d%H%M%S")
        if status["entities"]["hashtags"] != []:
            imgtag = status["entities"]["hashtags"][0]["text"]
            try:
                filenames = saveMedias(
                    status, ID=fileID, DIR="/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/" + imgtag
                )
                ans = "".join(["画像を「", imgtag, "」として学習対象に登録しました。ご協力ありがとうございます。"])
            except Exception as e:
                print(e)
                ans = "画像を読み取れませんでした。"
            tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
        else:
            try:
                filenames = saveMedias(status, ID=fileID, DIR="/Users/masaMikam/Dropbox/Project/umiA/Data/twimgs")
                filename = filenames[0]
                label, FACEflag, altfilename = NNimg.predictAns(
                    filename=filename,
                    isShow=False,
                    model="/Users/masaMikam/Dropbox/Project/umiA/Data/ML_Brain/DNN_skf",
                    workDIR="",
                )
                if FACEflag == False:
                    ans = "顔認識に失敗しています。 精度は下がりますが...\n" + label + "ですか?正しかったら、「正解」と言ってください。"
                else:
                    ans = label + "ですか?正しかったら、「正解」と言ってください。"
                tweetStatus, Altbot_status = tweet(
                    ans, screen_name=screen_name, status_id=status_id, imgfile=altfilename, bot_status=bot_status
                )
                drc = "".join(["/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/", label])
                if os.path.exists(drc) == False:
                    os.mkdir(drc)
                shutil.copy(filename, drc)

                userinfo["mode"] = "confirm.tag.img"
                print("/".join([drc, filename.split("/")[-1]]))
                userinfo["tmpFile"] = "/".join([drc, filename.split("/")[-1]])
            except Exception as e:
                print(e)
                ans = "画像を読み取れませんでした。"
                tweetStatus, Altbot_status = tweet(
                    ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status
                )
    elif userinfo["mode"] == "confirm.tag.img":
        userinfo["cnt"] = 0
        if status["entities"]["hashtags"] != []:
            imgtag = status["entities"]["hashtags"][0]["text"]
            isMoveDIR = True
        elif not "ない" in text and ("正解" in text or "正し" in text):
            ans = "やりました!正解ですね。"
            userinfo["mode"] = "dialog"
            isMoveDIR = False
        else:
            try:
                imgtag = TFIDF.calcKWs(text, length=1, needs=set(["固有名詞", "名詞"]))[0][0]
            except Exception as e:
                print(e)
                imgtag = "undefined"
            isMoveDIR = True
        if isMoveDIR:
            src = userinfo["tmpFile"]
            drc = "".join(["/Users/masaMikam/Dropbox/Project/umiA/Data/imgsfeedback/", imgtag])
            if os.path.exists(drc) == False:
                os.mkdir(drc)
            shutil.copy(src, drc)
            if imgtag != "undefined":
                ans = "".join(["...成る程...「", imgtag, "」なのですね。ありがとうございます。\n(フィードバックしました。学習反映にまでは時間がかかります。)"])
                userinfo["mode"] = "dialog"
            else:
                ans = "...一体、これは何なのですか?(好奇心)"
                ## send the reply
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    elif "しりとり" in text or userinfo["mode"] == "srtr":
        userinfo["mode"] = "srtr"
        ans = myGame.SRTR(text, screen_name)
        if "\END" in ans:
            ans = ans.replace("\END", "")
            userinfo["mode"] = "dialog"
        if "\MISS" in ans:
            ans = ans.replace("\MISS", "")
            if userinfo["cnt"] > 3:
                ans = "しりとりは終わりにしましょう"
                userinfo["mode"] = "dialog"
                userinfo["cnt"] = 0
        else:
            userinfo["cnt"] = 0
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    elif "おみくじ" in text or "占い" in text:
        ans = dealSQL.getPhrase(s_type="おみくじ", n=20)
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    elif deltasec > 259200:  # 3 days
        ans = "ご無沙汰しております...おかえりなさい。"
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)
    else:
        ans = trigramMC2.dialog(
            text, isRandMetaS=True, isPrint=True, isLearn=False, n=5, tryCnt=10, needs=set(["名詞", "固有名詞", "動詞", "形容詞"])
        )
        ans = ans.replace("<人名>", status["user"]["name"])
        tweetStatus, Altbot_status = tweet(ans, screen_name=screen_name, status_id=status_id, bot_status=bot_status)

    userinfo["time"] = now
    userinfo["cnt"] += 1
    dealSQL.saveUserInfo(userinfo)
    return tweetStatus, Altbot_status
Example #24
    def textrank_sentence(self, doc, threshold):
        '''
        Extract key sentences with the TextRank algorithm, using the similarity
        formula from the paper (see the experiment report for details).
        Extract `size` sentences as the input for the next MMR pass.
        '''

        # create the undirect graph for the textrank sentences
        graphh = Graph(self.d, self.iteration)

        sentences = TFIDF.cut_by_sentence(doc)

        # change size_word dynamically
        if len(sentences) < 9:
            size_word = 2
        else:
            # maybe 6
            size_word = 5

        if len(sentences) <= 4:
            print("The content is too short, do not need to summary !")
            return sentences

        # size may need to large as 0.6
        size = round(len(sentences) * 0.3)
        if size == 0:
            size = min(1, len(sentences))
        # use TextRank to get the keywords from the sentence

        # Get Keywords, need to fix
        # get the data
        sentences_array = []
        for sent in sentences:
            result = self.textrank_words(sent, size_word)

            if result == False:
                # do not find the keywords with the TextRank
                result = []

            # supplement keywords via TF-IDF, considering merged keyword phrases
            result = append_keywords(sent, result, self.keywords_doc)
            # result = join_result(result, sent)
            # add the keywords with the TFIDF

            if result == False:
                words = []
            else:
                words = list(map(lambda x: x[0], result))
            sentences_array.append(words)

        # Get Keywords Over, need to finish fix operator

        # initialize the TextRank graph
        for i, sent1 in enumerate(sentences_array):
            for j, sent2 in enumerate(sentences_array):
                if i == j or len(sent1) == 0 or len(sent2) == 0:
                    continue
                weights = textrank_similiar(np.array(sent1), np.array(sent2))
                if weights > threshold:
                    # similiar
                    graphh.add_edge(i, j, weights)

        # start ranking the graph
        graphh.rank()
        result = graphh.WS

        if result is None:
            print("Error, because of the high threshold !")
            return False

        result = sorted(result.items(), key=itemgetter(1), reverse=True)

        if size > len(result):
            print("Overload !")
            size = len(result)
        result = result[:size]

        # sentence extraction
        result_sent = []
        for i, j in result:
            result_sent.append(sentences[i])

        return result_sent
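
`textrank_similiar(sent1, sent2)` is not shown. The sentence-similarity formula from the TextRank paper, which the docstring points at, is sim(Si, Sj) = |Si ∩ Sj| / (log|Si| + log|Sj|); a sketch under the assumption the helper follows it:

import math
import numpy as np

def textrank_similiar(sent1, sent2):
    # sent1, sent2: arrays of keywords, as built in textrank_sentence above
    if len(sent1) == 0 or len(sent2) == 0:
        return 0.0
    denom = math.log(len(sent1)) + math.log(len(sent2))
    if denom == 0:
        return 0.0  # both sentences contain a single word
    shared = len(np.intersect1d(sent1, sent2))
    return shared / denom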
Example #25
'''
for t, l in inverted_index.items():
	# Write it
	sys.stdout.write ('Term: %s, ID: %d, IDF: %.5f\n'
				% (t, l['termid'], l['idf']))
	# For all documents appearing in the inverted index
	for d, c, tf in l['occurrences']:
		# write the document triplet
		sys.stdout.write ('\tDoc ID: %03d, # occurrences: %5d, TF: %.8f\n'
				% (d, c, tf))
exit()
'''
##############################################################

print 'Computing TFIDF representations of documents in the corpus'
TFIDFs = TFIDF.compute_all_TFIDFs (inverted_index, idf_threshold)

# Print all cosine similarities between documents
'''
similarities = [[TFIDF.cosine_similarity(d1,d2) for d2 in TFIDFs] for d1 in TFIDFs]
print similarities
exit()
'''

# Given a query, compute its TFIDF representation
print 'Computing query\'s TFIDF representation'

query = 'business became meaningful'
query_terms = parser.extract_terms (query)
q_TFIDF = TFIDF.compute_new_TFIDF (query_terms, inverted_index, idf_threshold)
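
The commented-out block above references `TFIDF.cosine_similarity(d1, d2)`. Assuming the TFIDF representations are sparse {termid: weight} dicts, as the `compute_new_TFIDF` usage suggests, a minimal sketch:

import math

def cosine_similarity(d1, d2):
    # d1, d2: sparse {termid: weight} TF-IDF vectors
    dot = sum(weight * d2.get(termid, 0.0) for termid, weight in d1.items())
    norm1 = math.sqrt(sum(w * w for w in d1.values()))
    norm2 = math.sqrt(sum(w * w for w in d2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)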
Example #26
__author__ = 'sankalp'
import TextCleanup
import TFIDF

def store_tfidf_to_db(feature_names, tfidf_matrix):
    from MySQLConnecter import MySQL
    db_obj = MySQL()
    for col in tfidf_matrix.nonzero()[1]:
        word = feature_names[col]
        score = tfidf_matrix[0, col]
        db_obj.insert(word, score)  # store one (word, score) row

tf_idf_params = TFIDF.calculate_tf_idf()
tfidf = tf_idf_params[0]
tfidf_matrix = tf_idf_params[1]
feature_names = tf_idf_params[2]


# To store the TFIDF result in to the MySQL database
store_tfidf_to_db(feature_names, tfidf_matrix)

# TF-IDF is a sparse matrix; this line converts it into a dense one.
dense = tfidf_matrix.todense()


print('Enter a search term: ')
# search_term = 'Dispersion and migration of uranium (U) and other toxic metals and radionuclides from'
search_term = input()
response = tfidf.transform([search_term])
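
The script stops after transforming the query. Assuming `tfidf` is a fitted scikit-learn `TfidfVectorizer` (which the `transform` call suggests) and that the rows of `tfidf_matrix` are the corpus documents, ranking them against the query could look like:

from sklearn.metrics.pairwise import cosine_similarity

# score every corpus document against the transformed query
scores = cosine_similarity(response, tfidf_matrix).ravel()
for doc_index in scores.argsort()[::-1][:10]:
    print(doc_index, scores[doc_index])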
Example #27
all_headlines = NewsHeadlinesScraper.get_results("ufc")
print(all_headlines)

data = {}
noun_phrases = []

translator = str.maketrans('', '', string.punctuation)

for headline in range(len(all_headlines)):
    clean_string = all_headlines[headline].translate(translator)
    print(clean_string + " clean string")
    words = TextBlob(clean_string)
    noun_phrases += words.noun_phrases
    tokenised_string = clean_string.split(" ")
    #print(tokenised_string)
    for word in range(len(tokenised_string)):
        tfidf_tf_val = TFIDF.term_frequency(tokenised_string[word],
                                            all_headlines[headline])
        print(all_headlines[headline])
        tfidf_idf_val = TFIDF.inverse_document_frequency(
            tokenised_string[word], all_headlines)
        final_val = tfidf_tf_val * tfidf_idf_val  # TF multiplied by IDF
        data[tokenised_string[word]] = final_val
sorted_data = sorted(data.items(), key=operator.itemgetter(1))

for items in sorted_data:
    print(items)

print(noun_phrases)
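
`TFIDF.term_frequency(word, document)` and `TFIDF.inverse_document_frequency(word, documents)` are not shown; their call sites fix the signatures. A sketch of conventional bodies (assumptions, not the module's actual code):

import math

def term_frequency(word, document):
    # fraction of the document's tokens equal to `word`
    tokens = document.split()
    return tokens.count(word) / len(tokens) if tokens else 0.0

def inverse_document_frequency(word, documents):
    # log of total documents over those containing the word, smoothed by +1
    containing = sum(1 for doc in documents if word in doc.split())
    return math.log(len(documents) / (1 + containing))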
Example #28
	factFile = sys.argv[1]
	rumorFile = sys.argv[2]

	rumorStemmedCorpus = defaultdict(list)
	preprocessing(rumorStemmedCorpus,rumorFile)
	#print str(rumorStemmedCorpus)

	factStemmedCorpus = defaultdict(list)
	preprocessing(factStemmedCorpus,factFile)
	#print str(factStemmedCorpus)
	rumour = tfidf(rumorStemmedCorpus)
	tfr  = rumour.getTF()
	fact = tfidf(factStemmedCorpus)
	tff = fact.getTF()

	idf = TFIDF.getIDF(tfr, tff)
	
	totalRumor = getTotalTFIDF(tfr,idf)
	totalFact = getTotalTFIDF(tff,idf)

	#print totalRumor
	#print totalFact
	docRumourScore = defaultdict(float)
	docFactScore = defaultdict(float)
        
	rumorBigger = 0
	factBigger = 0
	allStemmedCorpus = rumorStemmedCorpus.copy()
	allStemmedCorpus.update(factStemmedCorpus)
	for doc,words in allStemmedCorpus.iteritems():
	    if words is not None:	
Example #29
        elements_per_topic[element] = elements_per_topic[element] + 1
    if element not in elements_per_topic:
        elements_per_topic[element] = 1

#dictionary that contains, for each topic, the ids of its documents
document_list_for_each_topic = {}
for i in range(1000):
    topic = documentTopics[i][0]
    if topic in document_list_for_each_topic:
        document_list_for_each_topic[topic].append(i)
    if topic not in document_list_for_each_topic:
        document_list_for_each_topic[topic] = []
        document_list_for_each_topic[topic].append(i)

#prepare documents with the correct vocabulary
plainDocs = TFIDF.preparePlainDocs(list_of_stemmed_docs)
documentConceptDocs = TFIDF.preparedocConcDocs(list_of_lemmatized_docs)
documentCatDocs = TFIDF.preparedocCatDocs(
    TFIDF.preparedocConcDocs(list_of_lemmatized_docs))

#cosine similarity computation for the three matrices
plainDocsCosSim = TFIDF.cos_sim(plainDocs)
documentConceptCosSim = TFIDF.cos_sim(documentConceptDocs)
documentCategoryCosSim = TFIDF.cos_sim(documentCatDocs)

#tfidf matrices
plainDocsTFIDF = TFIDF.tfIDF(plainDocs)
documentConceptTFIDF = TFIDF.tfIDF(documentConceptDocs)
documentCategoryTFIDF = TFIDF.tfIDF(documentCatDocs)

#cosine distance for agglomerative clustering
Example #30
def Main(status, tmp, mode='dm'):
    ans = ''
    IMGfile = ''
    tweetStatus = ''
    filename = ''
    text = status['cleanText']
    status_id = status['id_str']
    screen_name = status['user']['screen_name']
    userinfo, isNewUser = dealSQL.getUserInfo(screen_name)
    now = status['now']

    BOT_ID = tmp['BOT_ID']

    # elapsed time in seconds
    try:
        try:
            delta = now - userinfo['time']
        except:  # fallback when the timestamp is stored as a string
            print('convert str into datetime')
            delta = now - datetime.strptime(userinfo['time'],
                                            '%Y-%m-%d %H:%M:%S.%f')
        deltasec = delta.total_seconds()
    except Exception as e:
        print(e)
        deltasec = 50

    # reply-timeout handling
    if deltasec > 1000:
        if userinfo['mode'] == 'confirm.tag.img':
            src = userinfo['tmpFile']
            drc = DIRIMGundefined
            if os.path.exists(drc) == False:
                os.mkdir(drc)
            shutil.copy(src, drc)
        userinfo['cnt'] = 0
        userinfo['mode'] = 'dialog'

    # responses
    if 'ping' in text:
        ans = ''.join(['Δsec : ', str(deltasec)])
    elif userinfo['mode'] == 'ignore':
        userinfo['cnt'] = 0
        userinfo['mode'] = 'dialog'
    elif deltasec < 3:
        ans = dealSQL.getPhrase(s_type='tooFreq', n=20)
        userinfo['mode'] = 'ignore'
    elif 'userinfo' in text:
        ans = str(userinfo)
    elif userinfo['mode'] == 'learn.text':
        if status['in_reply_to_screen_name'] in {BOT_ID}:
            text = status['text'].replace('@' + BOT_ID, '')
            text = re.sub(r'(@[^\s ]+)', '{ID}', text)
            if 'end' in text:
                userinfo['mode'] = 'dialog'
                userinfo['tmp'] = ''
                ans = 'learningモードをクローズしました。この結果は開発にフィードバックされます。ご協力感謝します。'
            else:
                labelstatus = userinfo['tmp']
                userinfo['cnt'] = 0
                dealSQL.savePhrase(phrase=text,
                                   author=screen_name,
                                   status=labelstatus,
                                   s_type='UserLearn')
                ans = '[learning]saved!!... 続けて覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
        else:
            ans = 'learningモードの途中です。覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'
    elif userinfo['mode'] == 'sleeping' and deltasec > 3600:
        ans = dealSQL.getPhrase(s_type='goodmorning', n=1)
        ans += '\n' + dealSQL.getPhrase(s_type='sleep.span', n=1).format(
            utiltools.sec2HMSstr(deltasec))
        userinfo['mode'] = 'dialog'

    elif 'media' in status['entities'] and status[
            'in_reply_to_screen_name'] in {BOT_ID}:
        userinfo['cnt'] = 0
        fileID = now.strftime("%Y%m%d%H%M%S")
        if status['entities']['hashtags'] != []:
            imgtag = status['entities']['hashtags'][0]['text']
            try:
                filenames = utiltools.saveMedias(status,
                                                 ID=fileID,
                                                 DIR='/'.join(
                                                     [DIRIMGfeedback, imgtag]))
                ans = dealSQL.getPhrase(s_type='appreciate.giveme.img',
                                        n=1).format(imgtag)
            except Exception as e:
                print(e)
                ans = dealSQL.getPhrase(s_type='err.get.img', n=1)
        else:
            try:
                filenames = utiltools.saveMedias(status,
                                                 ID=fileID,
                                                 DIR=DIRIMGtmp)
                filename = filenames[0]
                label, FACEflag, IMGfile = NNimg.predictAns(filename=filename,
                                                            isShow=False,
                                                            model=modelNNimg,
                                                            workDIR='')
                if FACEflag == False:
                    ans = dealSQL.getPhrase(s_type='confirm.detect.img.noface',
                                            n=1).format(label)
                else:
                    ans = dealSQL.getPhrase(s_type='confirm.detect.img',
                                            n=1).format(label)

                drc = '/'.join([DIRIMGfeedback, label])
                if not os.path.exists(drc):
                    os.mkdir(drc)
                shutil.copy(filename, drc)

                userinfo['mode'] = 'confirm.tag.img'
                print('/'.join([drc, filename.split('/')[-1]]))
                userinfo['tmpFile'] = '/'.join([drc, filename.split('/')[-1]])
                filename = IMGfile
            except Exception as e:
                print(e)
                ans = dealSQL.getPhrase(s_type='err.get.img', n=1)

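    # Follow-up to an image prediction: a hashtag, or a keyword extracted
    # with TFIDF.calcKWs, becomes the corrected tag; an affirmative reply
    # with no negation ('ない') keeps the predicted tag as-is.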
    elif userinfo['mode'] == 'confirm.tag.img':
        userinfo['cnt'] = 0
        if status['entities']['hashtags']:
            imgtag = status['entities']['hashtags'][0]['text']
            isMoveDIR = True
        elif 'ない' not in text and ('正解' in text or '正し' in text):
            ans = dealSQL.getPhrase(s_type='success.detect.img', n=1)
            userinfo['mode'] = 'dialog'
            isMoveDIR = False
        else:
            try:
                imgtag = TFIDF.calcKWs(text, length=1, needs={'固有名詞',
                                                              '名詞'})[0][0]
            except Exception as e:
                print(e)
                imgtag = 'undefined'
            isMoveDIR = True
        if isMoveDIR:
            src = userinfo['tmpFile']
            drc = '/'.join([DIRIMGfeedback, imgtag])
            if not os.path.exists(drc):
                os.mkdir(drc)
            shutil.copy(src, drc)
            if imgtag != 'undefined':
                ans = dealSQL.getPhrase(s_type='appreciate.feedback.img',
                                        n=1).format(imgtag)
                userinfo['mode'] = 'dialog'
            else:
                ans = dealSQL.getPhrase(s_type='ask.feedback.img', n=1)
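    # The exchange has gone on too long without a state reset; cool down.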
    elif userinfo['cnt'] > 6:
        ans = dealSQL.getPhrase(s_type='cntOver', n=20)
        userinfo['mode'] = 'ignore'
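    # Easter egg: attach a random image from a local picture directory.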
    elif '海未face' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        umipicDIR = '/Users/xxxx'
        filename = utiltools.getRandIMG(umipicDIR)
        ans = '...'
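    # 'timer [seconds] [memo]': persist a timer task via dealSQL.saveTask;
    # firing it later is presumably handled by a separate task loop.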
    elif 'timer' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        cmds = text.split(' ')
        try:
            timersec = int(cmds[1])
        except (IndexError, ValueError):
            timersec = 300
        try:
            tmptext = cmds[2]
        except IndexError:
            tmptext = ''
        setTime = datetime.utcnow() + timedelta(
            hours=0, minutes=0, seconds=timersec)
        dealSQL.saveTask(
            taskdict={
                'who': screen_name,
                'what': 'timer',
                'to_whom': screen_name,
                'when': setTime,
                'tmptext': tmptext
            })
        setTimeJ = setTime + timedelta(hours=9)  # UTC -> JST (UTC+9)
        ans = datetime.strftime(setTimeJ,
                                '%m月%d日 %H時%M分%S秒') + 'にタイマーをセットしました。'
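    # 'learn [label]': switch this user into learn.text mode; subsequent
    # replies are saved under that label until they send 'end'.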
    elif 'learn' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        userinfo['mode'] = 'learn.text'
        cmds = text.split(' ')
        tmplabel = cmds[1]
        userinfo['tmp'] = tmplabel
        userinfo['cnt'] = 0
        ans = '[Learningモード]\n' + tmplabel + 'として覚えさせるテキストをリプライしてください。\nendと入力するまでモードは続きます。'

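    # 'respon [word] [reply]': watch the timeline for a word and auto-reply
    # for five minutes (an erase task is scheduled); 'respon clear' drops all
    # watched words.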
    elif 'respon' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        if 'clear' in text:
            try:
                tmp['responseWord'] = {}
                ans = '全てのTL監視を停止しました。by @' + screen_name + '\n 監視ワードを追加するには半角スペース区切りで、\n response [監視ワード] [応答文]'
                screen_name = ''
            except Exception:
                ans = '設定失敗。半角スペースで区切ってオーダーしてください。'
        else:
            try:
                cmds = text.split(' ')
                tgword = cmds[1]
                response = cmds[2]
                if len(tgword) > 3:
                    tmp['responseWord'][tgword] = response
                    ans = '「' + tgword + '」を監視して\n「' + response + '」と5分間反応します。by @' + screen_name + '\n 監視ワードを追加するには半角スペース区切りで、\n response [監視ワード] [応答文]'
                    setTime = datetime.utcnow() + timedelta(hours=0, minutes=5)
                    dealSQL.saveTask(
                        taskdict={
                            'who': screen_name,
                            'what': 'erase.tmp.responseWord',
                            'to_whom': screen_name,
                            'when': setTime,
                            'tmptext': tgword
                        })
                    screen_name = ''
                else:
                    ans = '監視ワードは4文字以上である必要があります。'
            except Exception:
                ans = '設定失敗。半角スペースで区切ってオーダーしてください。'
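    # 'kusoripu [user]': send a joke reply, but only to users the bot follows.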
    elif 'kusoripu' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        try:
            cmds = text.split(' ')
            tgname = cmds[1]
            user = twtr.get_user(screen_name=tgname)._json
            isFollowing = user['following']
            if isFollowing:
                screen_name = ''
                status_id = ''
                ans = getKusoripu(tg1=tgname)
            else:
                ans = 'そのユーザーはFF外です。クソリプは制限されます。'
        except Exception:
            ans = 'クソリプ失敗。半角スペースで区切ってオーダーしてください。送信先はアットマークなしで記述してください。'

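    # 'su modsys [key] [value]': direct override of a runtime setting in the
    # tmp dict.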
    elif 'su modsys' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        cmds = text.split(' ')
        tmp[cmds[2]] = cmds[3]
        ans = 'mod ' + cmds[2] + ' into ' + cmds[3]

    elif tmp['imitating'] != '' and 'default' in text:
        if twf.defaultProfile():
            ans = 'デフォルトに戻りました'
            tmp['imitating'] = ''
        else:
            ans = 'デフォルトに戻るのに失敗 @_apkX'

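    # 'imitat...': mimic the target user's profile for five minutes; a saved
    # task (imitate.default) restores the default profile, and a 'default'
    # reply reverts it immediately.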
    elif 'imitat' in text and status['in_reply_to_screen_name'] in {BOT_ID}:
        try:
            cmds = text.split(' ')
            tgname = cmds[1].replace('@', '').replace('.', '')
            ans = 'imitateErr'
            print(cmds, tgname)
            # Currently imitating this user ('' means no one)
            print(tmp['imitating'])
            # TODO: check whether the user is a mutual follow (FF) beforehand
            if imitate(tgname):
                ans = tgname + 'さんのまねっこ5分間開始 defaultリプで元に戻ります。'
                mode = 'open'
                tmp['imitating'] = tgname
                # tmp['clocks']['imitationLimit'] = now + timedelta(hours=0, minutes=30)
                # tmp['clocks']['imitationTimer'] = now + timedelta(hours=0, minutes=5)
                setTime = now + timedelta(hours=0, minutes=5)
                dealSQL.saveTask(
                    taskdict={
                        'who': BOT_ID,
                        'what': 'imitate.default',
                        'to_whom': screen_name,
                        'when': setTime,
                        'tmptext': ''
                    })
            else:
                ans = tgname + 'さんのまねっこ失敗 FF外の場合はまねっこできません。'
        except Exception as e:
            print('[ERR][Main.imitation]')
            print(e)
            ans = 'まねっこがどこか失敗です...'

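    # Shiritori (word-chain game): myGame.SRTR tags its reply with \END when
    # the game ends and \MISS on an invalid move.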
    elif 'しりとり' in text or userinfo['mode'] == 'srtr':
        userinfo['mode'] = 'srtr'
        ans = myGame.SRTR(text, screen_name)
        if r'\END' in ans:
            ans = ans.replace(r'\END', '')
            userinfo['mode'] = 'dialog'
        if r'\MISS' in ans:
            ans = ans.replace(r'\MISS', '')
            if userinfo['cnt'] > 3:
                ans = dealSQL.getPhrase(s_type='shiritori.end', n=1)
                userinfo['mode'] = 'dialog'
                userinfo['cnt'] = 0
        else:
            userinfo['cnt'] = 0

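    # 'おてもん' mini-game handled by GAME_MON, using the same \END/\MISS markers.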
    elif 'おてもん' in text or userinfo['mode'] == 'mon':
        userinfo['mode'] = 'mon'
        userinfo['cnt'] = 0
        try:
            ans = GAME_MON.Main(text, screen_name, 'アルパカさん')
            if r'\END' in ans:
                ans = ans.replace(r'\END', '')
                userinfo['mode'] = 'dialog'
            if r'\MISS' in ans:
                ans = ans.replace(r'\MISS', '')
        except Exception:
            ans = '工事中...'
            userinfo['mode'] = 'dialog'
    elif 'おみくじ' in text or '占い' in text:
        ans = dealSQL.getPhrase(s_type='おみくじ', n=20)
    elif 'おはよ' in text and status['in_reply_to_screen_name'] in {None, BOT_ID}:
        ans = dealSQL.getPhrase(s_type='goodmorning', n=1)
    elif 'おやすみ' in text and status['in_reply_to_screen_name'] in {None, BOT_ID}:
        ans = dealSQL.getPhrase(s_type='goodnight', n=1)
        userinfo['mode'] = 'sleeping'
        userinfo['mode'] = 'sleeping'
    elif 'トレンドワード' in text:
        ans = '\n- '.join(['[現在のトレンドワード]'] + tmp['trendwordsList'][:10])
    elif deltasec > 600000:  # roughly 7 days since the last message
        ans = dealSQL.getPhrase(s_type='longtimenosee', n=1)
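    # Free-chat fallback: generate a reply with the trigram model, then fill
    # the <人名> placeholder with the user's display name.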
    else:
        ans = trigramMC.dialog(text,
                               isRandMetaS=True,
                               isPrint=True,
                               isLearn=False,
                               n=5,
                               tryCnt=10,
                               needs={'名詞', '固有名詞'})
        ans = ans.replace('<人名>', status['user']['name'])
        ans = charconv(ans, BOT_ID)

    # if isNewUser:
    #     ans = dealSQL.getPhrase(s_type='welcomeNewUser', n=20)

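    # Send the reply (if any was generated), then persist per-user state.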
    if ans != '':
        tweetStatus, tmp = twf.send(ans,
                                    screen_name=screen_name,
                                    imgfile=filename,
                                    status_id=status_id,
                                    mode=mode,
                                    tmp=tmp)
    userinfo['time'] = now
    userinfo['cnt'] += 1
    dealSQL.saveUserInfo(userinfo)
    return tweetStatus, tmp
Ejemplo n.º 31
0
def getTF(self):
    return TFIDF.getTF(self.stemmedCorpus)