def phase1_update(source_prob, wiki_list, keywords, n_iter, verbosity, log, mode_of_operation = 4):
    '''
    Compute and return the updated source priorities based on the initial
    priorities (source_prob) and the keywords. Can use text rank (default)
    as well as tf-idf to do so.

    Parameters:
        source_prob       -- initial source priorities; after each keyword it
                             is replaced by the value returned from
                             utilities.textrank / utilities.tfidf.
        wiki_list         -- sequence of entries; item [0] of each entry is
                             written to the logs as the source label and
                             item [1] is passed to sm.get_wiki_article in
                             its search_list.
        keywords          -- iterable of keyword strings, processed in order.
        n_iter            -- not used by this function.
        verbosity         -- forwarded to sm.get_wiki_article; when true,
                             progress messages and the normalised priorities
                             are also printed.
        log               -- base name of the files in which the log is saved.
        mode_of_operation -- 4 selects utilities.textrank; any other value is
                             forwarded to utilities.tfidf.

    Files appended to:
        log + '_intermediate_probs'       -- per keyword, one line per source
                                             with source_prob[i]/source_prob[0].
        log + '_' + kw + '_wiki_contents' -- the content fetched for keyword kw.
        log                               -- the final source_prob.

    Returns the source priorities obtained after the last keyword.
    '''
    # 'with' guarantees the log files are closed (and flushed) even when
    # content extraction or ranking raises part-way through the keywords.
    with open(log + '_intermediate_probs', "a") as prob_log:
        for j, kw in enumerate(keywords, 1):
            # Get the content of the web pages of each website for the keyword 'kw'.
            wiki_content = sm.get_wiki_article(kw, verbose=verbosity,
                                               search_list=[w[1] for w in wiki_list])

            with open(log + "_" + kw + "_wiki_contents", "a") as wiki_log:
                for n, content in enumerate(wiki_content):
                    wiki_log.write(wiki_list[n][0] + ' :\n')
                    wiki_log.write(content.encode("utf8") + '\n\n\n')

            if verbosity:
                print("\n\n Done with Content Extraction. Begin keyword extraction algorithm...")

            # Per-keyword name handed to the ranking routine.
            run_log = log + '_' + str(j) + '_' + kw
            if mode_of_operation == 4:
                source_prob = utilities.textrank(source_prob, wiki_content, run_log)
            else:
                source_prob = utilities.tfidf(source_prob, wiki_content, run_log,
                                              mode_of_operation=mode_of_operation,
                                              return_term=0)

            if verbosity:
                print("\n\n---------\n")

            for i, prob in enumerate(source_prob):
                # Priorities are reported relative to the first source.
                # NOTE(review): raises ZeroDivisionError if source_prob[0] is 0,
                # and floors under Python 2 if the priorities are ints -- confirm
                # they are always non-zero floats.
                relative = prob / source_prob[0]
                if verbosity:
                    # Formatted as one string so the output matches the
                    # Python 2 statement "print name, value".
                    print("%s %s" % (wiki_list[i][0], relative))
                prob_log.write(wiki_list[i][0] + " : " + str(relative) + '\n')

    with open(log, "a") as summary_log:
        summary_log.write("Source Probs : " + str(source_prob) + '\n')

    return source_prob
def test3(source_prob, wiki_list, keywords, n_iter, verbosity, mode_of_operation, log, x):
    '''
    Update the source priorities keyword by keyword, optionally merging two
    of the fetched contents first, and return the final priorities.

    Parameters:
        source_prob       -- initial source priorities; after each keyword it
                             is replaced by the value returned from
                             utilities.textrank / utilities.tfidf.
        wiki_list         -- sequence of entries; item [0] of each entry is
                             written to the logs as the source label and
                             item [1] is passed to sm.get_wiki_article in
                             its search_list.
        keywords          -- iterable of keyword strings, processed in order.
        n_iter            -- not used by this function.
        verbosity         -- forwarded to sm.get_wiki_article only; the
                             progress messages here are always printed.
        mode_of_operation -- 4 selects utilities.textrank; any other value is
                             forwarded to utilities.tfidf.
        log               -- base name of the files in which the log is saved.
        x                 -- when equal to 1, the content at index 1 is
                             appended to the content at index 2 before
                             logging and ranking.

    Files appended to:
        log + '_intermediate_probs'       -- per keyword, one line per source
                                             with source_prob[i]/source_prob[0].
        log + '_' + kw + '_wiki_contents' -- the content fetched for keyword kw.
        log                               -- the final source_prob.

    Returns the source priorities obtained after the last keyword.
    '''
    # 'with' guarantees the log files are closed (and flushed) even when
    # content extraction or ranking raises part-way through the keywords.
    with open(log + '_intermediate_probs', "a") as prob_log:
        for j, kw in enumerate(keywords, 1):
            wiki_content = sm.get_wiki_article(kw, verbose=verbosity,
                                               search_list=[w[1] for w in wiki_list])
            if x == 1:
                # Fold the second content into the third one.
                wiki_content[2] = wiki_content[2] + wiki_content[1]

            with open(log + "_" + kw + "_wiki_contents", "a") as wiki_log:
                for n, content in enumerate(wiki_content):
                    wiki_log.write(wiki_list[n][0] + ' :\n')
                    wiki_log.write(content.encode("utf8") + '\n\n\n')

            print("\n\n Done with Content Extraction. Begin keyword extraction algorithm...")

            # Per-keyword name handed to the ranking routine.
            run_log = log + '_' + str(j) + '_' + kw
            if mode_of_operation == 4:
                source_prob = utilities.textrank(source_prob, wiki_content, run_log)
            else:
                source_prob = utilities.tfidf(source_prob, wiki_content, run_log,
                                              mode_of_operation=mode_of_operation,
                                              return_term=0)

            print("\n\n---------\n")

            for i, prob in enumerate(source_prob):
                # Priorities are logged relative to the first source.
                # NOTE(review): raises ZeroDivisionError if source_prob[0] is 0,
                # and floors under Python 2 if the priorities are ints -- confirm
                # they are always non-zero floats.
                prob_log.write(wiki_list[i][0] + " : " + str(prob / source_prob[0]) + '\n')

    with open(log, "a") as summary_log:
        summary_log.write("Source Probs : " + str(source_prob) + '\n')

    return source_prob
# Stand-alone driver: fetch the wiki content for a single keyword, run
# gurmeet.tfidf over the first three fetched contents, and print each pair
# of values it returns.
__author__ = 'nikhil'

import search_module as sm
import TextRank
import gurmeet

# Run configuration.
verbosity = True
keyword = "Britain"
mode_of_operation = 1

wiki_content = sm.get_wiki_article(keyword, verbose=verbosity)
print("\n\n Done with Content Extraction. Begin keyword extraction algorithm...")

tr_list = gurmeet.tfidf(
    wiki_content[0],
    wiki_content[1],
    wiki_content[2],
    mode_of_operation=mode_of_operation,
    return_term=0,
)
#tr_list = TextRank.text_rank(wiki_content[0])

# Each item of tr_list unpacks into two values, printed space-separated.
for g, v in tr_list:
    print("%s %s" % (g, v))