Example #1
def phase1_update(source_prob, wiki_list, keywords, n_iter, verbosity, log, mode_of_operation=4):
    '''
    Compute and return the updated source priorities, starting from the
    initial priorities (source_prob) and folding in evidence for each
    keyword in turn. Uses TextRank by default (mode_of_operation == 4),
    or tf-idf otherwise. 'log' is the base name of the file the log is
    written to.
    '''
    j = 0
    logFile = open(log + '_intermediate_probs', "a")
    for kw in keywords:
        j += 1
        # Fetch each source's page content for the keyword 'kw'.
        wiki_content = sm.get_wiki_article(kw, verbose=verbosity, search_list=[w[1] for w in wiki_list])
        logwiki = open(log + "_" + kw + "_wiki_contents", "a", encoding="utf-8")
        for n in range(len(wiki_content)):
            logwiki.write(wiki_list[n][0] + ' :\n')
            logwiki.write(wiki_content[n] + '\n\n\n')
        logwiki.close()
        if verbosity:
            print("\n\nDone with content extraction. Beginning keyword extraction algorithm...")
        if mode_of_operation == 4:
            source_prob = utilities.textrank(source_prob, wiki_content, log + '_' + str(j) + '_' + kw)
        else:
            source_prob = utilities.tfidf(source_prob, wiki_content, log + '_' + str(j) + '_' + kw, mode_of_operation=mode_of_operation, return_term=0)
        if verbosity:
            print("\n\n---------\n")
        # Log each source's priority relative to the first source.
        for i in range(len(source_prob)):
            if verbosity:
                print(wiki_list[i][0], source_prob[i] / source_prob[0])
            logFile.write(wiki_list[i][0] + " : " + str(source_prob[i] / source_prob[0]) + '\n')
    logFile.close()
    logoutput = open(log, "a")
    logoutput.write("Source Probs : " + str(source_prob) + '\n')
    logoutput.close()
    return source_prob
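
A minimal usage sketch, assuming the project's sm and utilities modules are importable; the (name, domain) pairs and keywords below are hypothetical placeholders, not values from the original code:

# Hypothetical sources and keywords, purely for illustration.
wiki_list = [("Wikipedia", "wikipedia.org"),
             ("Britannica", "britannica.com"),
             ("Scholarpedia", "scholarpedia.org")]
keywords = ["gradient descent", "backpropagation"]
source_prob = [1.0] * len(wiki_list)  # start from uniform priorities

source_prob = phase1_update(source_prob, wiki_list, keywords,
                            n_iter=1, verbosity=1, log='run1')  # default mode 4 = TextRank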
Example #2
def test3(source_prob, wiki_list, keywords, n_iter, verbosity, mode_of_operation, log, x):
    '''
    Test variant of phase1_update: when x == 1, source 1's content is
    appended to source 2 before the update, simulating duplicated text
    across sources.
    '''
    j = 0
    logFile = open(log + '_intermediate_probs', "a")
    for kw in keywords:
        j += 1
        wiki_content = sm.get_wiki_article(kw, verbose=verbosity, search_list=[w[1] for w in wiki_list])
        if x == 1:
            # Append source 1's text to source 2 for this test run.
            wiki_content[2] = wiki_content[2] + wiki_content[1]
        logwiki = open(log + "_" + kw + "_wiki_contents", "a", encoding="utf-8")
        for n in range(len(wiki_content)):
            logwiki.write(wiki_list[n][0] + ' :\n')
            logwiki.write(wiki_content[n] + '\n\n\n')
        logwiki.close()
        print("\n\nDone with content extraction. Beginning keyword extraction algorithm...")
        if mode_of_operation == 4:
            source_prob = utilities.textrank(source_prob, wiki_content, log + '_' + str(j) + '_' + kw)
        else:
            source_prob = utilities.tfidf(source_prob, wiki_content, log + '_' + str(j) + '_' + kw, mode_of_operation=mode_of_operation, return_term=0)
        print("\n\n---------\n")
        for i in range(len(source_prob)):
            logFile.write(wiki_list[i][0] + " : " + str(source_prob[i] / source_prob[0]) + '\n')
    logFile.close()
    logoutput = open(log, "a")
    logoutput.write("Source Probs : " + str(source_prob) + '\n')
    logoutput.close()
    return source_prob
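
A sketch of how test3 might be driven, reusing the same hypothetical inputs as the sketch under Example #1; x=0 gives a clean baseline, x=1 the duplicated-content variant:

# Copy the priors so both runs start from the same state.
baseline  = test3(list(source_prob), wiki_list, keywords, n_iter=1,
                  verbosity=0, mode_of_operation=4, log='test3_clean', x=0)
perturbed = test3(list(source_prob), wiki_list, keywords, n_iter=1,
                  verbosity=0, mode_of_operation=4, log='test3_dup', x=1)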
Example #3
    def __init__(self, model_addr, tfidf_addr, phrases_addr, knowledge_dir='knowledge', threshold=0.01, tfidf_factor=1):
        self.knowledge_dir = knowledge_dir
        self.threshold = threshold
        self.tfidf_factor = tfidf_factor
        self.knowledge = dict()

        # Load each component and report how long the load took.
        start = datetime.datetime.now()
        self.model = gensim.models.Word2Vec.load(model_addr)
        print("time to load model: %s" % (datetime.datetime.now() - start))
        start = datetime.datetime.now()
        # NOTE: tfidf_addr is accepted but unused here; tfidf.load() reads
        # from its default location.
        self.tfidf = utilities.tfidf()
        self.tfidf.load()
        print("time to load tfidf: %s" % (datetime.datetime.now() - start))
        start = datetime.datetime.now()
        self.bigram_transformer = gensim.models.phrases.Phraser.load(phrases_addr)
        print("time to load bigram transformer: %s" % (datetime.datetime.now() - start))
        start = datetime.datetime.now()
        self.code_knowledge()
        print("time to load knowledge: %s" % (datetime.datetime.now() - start))
Example #4
import utilities
import use
from os import mkdir
from os.path import exists, isdir
from gensim.models import Word2Vec

# Restrict the corpus to English files, then train the bigram (phrase)
# transformer on it.
utilities.keep_english_dir('Files/')
use.train_bigram_transformer(addr='Files/')

# Train a 400-dimensional word2vec model, or reuse a previously saved one.
if not isdir('word2vec'):
    mkdir('word2vec')
if exists('word2vec/400'):
    model = Word2Vec.load('word2vec/400')
else:
    model = use.train_word2vec('Files/',
                               vector_size=400,
                               use_bigram_transform=True)
    model.save('word2vec/400')

# Build and persist the tf-idf statistics over the same corpus.
tfidf = utilities.tfidf()
tfidf.generate(model, 'Files/')
tfidf.save()
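
The artifacts this script persists are the ones Example #3's constructor loads back. A minimal reload sketch, assuming tfidf.load() reads from the same default location tfidf.save() wrote to:

model = Word2Vec.load('word2vec/400')  # the model saved above
tfidf = utilities.tfidf()
tfidf.load()                           # restore the saved tf-idf statistics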