# Note: these snippets assume module-level imports of the search module 'sm' and of 'utilities'.
def phase1_update(source_prob, wiki_list, keywords, n_iter, verbosity, log, mode_of_operation=4):
    '''
    Compute and return the updated source priorities based on the initial
    priorities (source_prob) and the keywords. Can use TextRank (the default)
    as well as tf-idf to do so. 'log' is the name of the file in which the
    log should be saved.
    '''
    j = 0
    logFile = open(log + '_intermediate_probs', "a")
    for kw in keywords:
        j = j + 1
        # Get the content of the web page corresponding to each source for the keyword 'kw'.
        wiki_content = sm.get_wiki_article(kw, verbose=verbosity,
                                           search_list=[w[1] for w in wiki_list])
        logwiki = open(log + "_" + kw + "_wiki_contents", "a")
        for n in range(len(wiki_content)):
            logwiki.write(wiki_list[n][0] + ' :\n')
            logwiki.write(wiki_content[n].encode("utf8") + '\n\n\n')
        logwiki.close()
        if verbosity:
            print "\n\n Done with content extraction. Beginning keyword extraction algorithm..."
        if mode_of_operation == 4:
            source_prob = utilities.textrank(source_prob, wiki_content,
                                             log + '_' + str(j) + '_' + kw)
        else:
            source_prob = utilities.tfidf(source_prob, wiki_content,
                                          log + '_' + str(j) + '_' + kw,
                                          mode_of_operation=mode_of_operation, return_term=0)
        if verbosity:
            print "\n\n---------\n"
        # Log each source's priority normalised by the first source's priority.
        for i in range(len(source_prob)):
            if verbosity:
                print wiki_list[i][0], source_prob[i] / source_prob[0]
            logFile.write(wiki_list[i][0] + " : " + str(source_prob[i] / source_prob[0]) + '\n')
    logFile.close()
    logoutput = open(log, "a")
    logoutput.write("Source Probs : " + str(source_prob) + '\n')
    logoutput.close()
    return source_prob
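# A minimal usage sketch for phase1_update (not part of the original module).
# The source (name, domain) pairs and the keyword below are illustrative
# assumptions; the initial priorities start as a uniform prior.
example_sources = [('Wikipedia', 'wikipedia.org'),
                   ('Scholarpedia', 'scholarpedia.org'),
                   ('Citizendium', 'citizendium.org')]
example_prob = [1.0 / len(example_sources)] * len(example_sources)
example_prob = phase1_update(example_prob, example_sources, ['machine learning'],
                             n_iter=1, verbosity=1, log='logs/example')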
def test3(source_prob, wiki_list, keywords, n_iter, verbosity, mode_of_operation, log, x):
    '''
    Test variant of phase1_update: when x == 1, the content at index 1 is
    appended to the content at index 2 before ranking.
    '''
    j = 0
    logFile = open(log + '_intermediate_probs', "a")
    for kw in keywords:
        j = j + 1
        wiki_content = sm.get_wiki_article(kw, verbose=verbosity,
                                           search_list=[w[1] for w in wiki_list])
        if x == 1:
            # Duplicate the text of source 1 into source 2.
            wiki_content[2] = wiki_content[2] + wiki_content[1]
        logwiki = open(log + "_" + kw + "_wiki_contents", "a")
        for n in range(len(wiki_content)):
            logwiki.write(wiki_list[n][0] + ' :\n')
            logwiki.write(wiki_content[n].encode("utf8") + '\n\n\n')
        logwiki.close()
        print "\n\n Done with content extraction. Beginning keyword extraction algorithm..."
        if mode_of_operation == 4:
            source_prob = utilities.textrank(source_prob, wiki_content,
                                             log + '_' + str(j) + '_' + kw)
        else:
            source_prob = utilities.tfidf(source_prob, wiki_content,
                                          log + '_' + str(j) + '_' + kw,
                                          mode_of_operation=mode_of_operation, return_term=0)
        print "\n\n---------\n"
        for i in range(len(source_prob)):
            logFile.write(wiki_list[i][0] + " : " + str(source_prob[i] / source_prob[0]) + '\n')
    logFile.close()
    logoutput = open(log, "a")
    logoutput.write("Source Probs : " + str(source_prob) + '\n')
    logoutput.close()
    return source_prob
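# Sketch of driving test3 with and without the content merge (x=0 vs x=1),
# reusing the illustrative 'example_sources' above, to compare how the
# duplicated text shifts the relative priorities. Paths are placeholders.
uniform = [1.0 / len(example_sources)] * len(example_sources)
base = test3(list(uniform), example_sources, ['machine learning'],
             1, 0, 4, 'logs/test3_base', x=0)
merged = test3(list(uniform), example_sources, ['machine learning'],
               1, 0, 4, 'logs/test3_merged', x=1)
for (name, _), b, m in zip(example_sources, base, merged):
    print name, b / base[0], m / merged[0]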
def __init__(self, model_addr, tfidf_addr, phrases_addr, knowledge_dir='knowledge',
             threshold=0.01, tfidf_factor=1):
    self.knowledge_dir = knowledge_dir
    self.threshold = threshold
    self.tfidf_factor = tfidf_factor
    self.knowledge = dict()

    # Load the trained word2vec model.
    start = datetime.datetime.now()
    self.model = gensim.models.Word2Vec.load(model_addr)
    print "time to load model: %s" % (datetime.datetime.now() - start)

    # Load the precomputed tf-idf statistics.
    start = datetime.datetime.now()
    self.tfidf = utilities.tfidf()
    self.tfidf.load()
    print "time to load tfidf: %s" % (datetime.datetime.now() - start)

    # Load the trained bigram (phrase) transformer.
    start = datetime.datetime.now()
    self.bigram_transformer = gensim.models.phrases.Phraser.load(phrases_addr)
    print "time to load bigram transformer: %s" % (datetime.datetime.now() - start)

    # Encode the knowledge base from knowledge_dir.
    start = datetime.datetime.now()
    self.code_knowledge()
    print "time to load knowledge: %s" % (datetime.datetime.now() - start)
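# Hypothetical instantiation sketch: the enclosing class name is not shown
# in this snippet, so 'KnowledgeBase' is an assumed stand-in, and the file
# paths are illustrative.
kb = KnowledgeBase(model_addr='word2vec/400',
                   tfidf_addr='tfidf',
                   phrases_addr='bigram_transformer',
                   knowledge_dir='knowledge',
                   threshold=0.01,
                   tfidf_factor=1)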
import utilities
import use
from os import mkdir
from os.path import exists, isdir
from gensim.models import Word2Vec

# Keep only the English documents in the corpus directory.
utilities.keep_english_dir('Files/')

# Train the bigram (phrase) transformer on the corpus.
use.train_bigram_transformer(addr='Files/')

# Load a cached 400-dimensional word2vec model if one exists;
# otherwise train one on the corpus and cache it.
if not isdir('word2vec'):
    mkdir('word2vec')
if exists('word2vec/400'):
    model = Word2Vec.load('word2vec/400')
else:
    model = use.train_word2vec('Files/', vector_size=400, use_bigram_transform=True)
    model.save('word2vec/400')

# Generate and save the tf-idf statistics for the corpus.
tfidf = utilities.tfidf()
tfidf.generate(model, 'Files/')
tfidf.save()
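# Quick sanity check of the trained model via gensim's KeyedVectors API.
# 'neural_network' is an arbitrary example token that may or may not be in
# the vocabulary of this corpus.
print model.wv.most_similar('neural_network', topn=5)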