Code example #1 (score: 0)
File: Phase 1.py — Project: hellking4u/RecoSys
def phase1_update(source_prob, wiki_list, keywords, n_iter, verbosity, log, mode_of_operation = 4):
        '''
            Compute and return the updated source priorities based on the initial priorities (source_prob) and the keywords.
            Can use text rank (default) as well as tf-idf to do so.
            'log' is the name of file in which log should be saved.
        '''
        j=0
        logFile = open(log+'_intermediate_probs',"a")
        for kw in keywords:
                j = j+1
                wiki_content = sm.get_wiki_article(kw, verbose=verbosity, search_list=[w[1] for w in wiki_list])  #Get the content of the web pages respected to each website for the keyword 'kw'
                logwiki = open(log+"_"+kw+"_wiki_contents","a")
                for n in range(len(wiki_content)):
                        logwiki.write(wiki_list[n][0]+' :\n')
                        logwiki.write(wiki_content[n].encode("utf8")+'\n\n\n')
                logwiki.close()
                if verbosity : print "\n\n Done with Content Extraction. Begin keyword extraction algorithm..."
                if mode_of_operation == 4:
                        source_prob = utilities.textrank(source_prob, wiki_content, log+'_'+str(j)+'_'+kw)
                else:
                        source_prob = utilities.tfidf(source_prob, wiki_content, log+'_'+str(j)+'_'+kw, mode_of_operation=mode_of_operation, return_term=0)
                if verbosity : print "\n\n---------\n"
                for i in range(len(source_prob)):
                        if verbosity : print wiki_list[i][0], source_prob[i]/source_prob[0]
                        logFile.write(wiki_list[i][0]+" : "+str(source_prob[i]/source_prob[0])+'\n')
        logFile.close()
        logoutput = open(log,"a")
        logoutput.write("Source Probs : "+ str(source_prob)+'\n')
        logoutput.close()
        #tr_list = TextRank.text_rank(wiki_content[0])
        return source_prob
Code example #2 (score: 0)
File: test3.py — Project: hellking4u/RecoSys
def test3(source_prob, wiki_list, keywords, n_iter, verbosity, mode_of_operation, log, x):
        j=0
        logFile = open(log+'_intermediate_probs',"a")
        for kw in keywords:
                j = j+1
                wiki_content = sm.get_wiki_article(kw, verbose=verbosity, search_list=[w[1] for w in wiki_list])
                if x == 1:
                    wiki_content[2] = wiki_content[2]+wiki_content[1]
                logwiki = open(log+"_"+kw+"_wiki_contents","a")
                for n in range(len(wiki_content)):
                        logwiki.write(wiki_list[n][0]+' :\n')
                        logwiki.write(wiki_content[n].encode("utf8")+'\n\n\n')
                logwiki.close()
                print "\n\n Done with Content Extraction. Begin keyword extraction algorithm..."
                if mode_of_operation == 4:
                        source_prob = utilities.textrank(source_prob, wiki_content, log+'_'+str(j)+'_'+kw)
                else:
                        source_prob = utilities.tfidf(source_prob, wiki_content, log+'_'+str(j)+'_'+kw, mode_of_operation=mode_of_operation, return_term=0)
                print "\n\n---------\n"
                for i in range(len(source_prob)):
                        logFile.write(wiki_list[i][0]+" : "+str(source_prob[i]/source_prob[0])+'\n')
        logFile.close()
        logoutput = open(log,"a")
        logoutput.write("Source Probs : "+ str(source_prob)+'\n')
        logoutput.close()
        #tr_list = TextRank.text_rank(wiki_content[0])
        return source_prob