def relevant_full_corpus(kwLimit):
    """Rebuild the relevant-keyword and keyword-network collections for the full corpus.

    Reads the document ids and the keyword occurrence dictionaries through
    ``utils``, extracts the relevant keywords with
    ``kwFunctions.extract_relevant_keywords`` and stores the result in the
    MongoDB database ``relevant`` on ``localhost:27017``:

    * collection ``relevant_full_<kwLimit>``: entries with ``cumtermhood > 0``
      are deleted, an index on ``keyword`` is created, then one
      ``butils.update_kw_tm`` call is made per extracted keyword;
    * collection ``network_full_<kwLimit>_eth10``: entries with ``weight > 0``
      are deleted, then ``edge_list`` is inserted.

    Args:
        kwLimit: passed through to ``extract_relevant_keywords`` and used to
            build both collection names (presumably the number of keywords to
            keep -- confirm against ``kwFunctions``).

    Returns:
        None. The function only writes to MongoDB and prints progress.
    """
    # NOTE(review): this file contains a second definition of
    # relevant_full_corpus further down; the later one is the definition that
    # is bound at import time.

    # Document ids were previously read from the sqlite dump
    # ('SELECT id FROM refdesc WHERE abstract_keywords IS NOT NULL;' on
    # '../../Data/dumps/20160224_cybergeo.sqlite3'); they now come from
    # utils.get_ids.
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']
        relevant = 'relevant_full_' + str(kwLimit)
        network = 'network_full_' + str(kwLimit) + '_eth10'

        database[relevant].delete_many({"cumtermhood": {"$gt": 0}})
        database[relevant].create_index('keyword')

        # The second returned value (the dictionary) is not used here.
        keywords, _dico, frequencies, edge_list = kwFunctions.extract_relevant_keywords(
            corpus, kwLimit, occurence_dicos)

        print('insert relevant...')
        # float() keeps the ratio below a true division even on an interpreter
        # where int / int truncates; the value is loop-invariant, so hoist it.
        corpus_size = float(len(corpus))
        for kw, score in keywords.items():
            frequency = frequencies[kw]
            # log(score) * log(N / frequency), computed exactly as before.
            weighted = math.log(score) * math.log(corpus_size / frequency)
            butils.update_kw_tm(kw, score, frequency, weighted, database, relevant)

        print('insert edges...')
        database[network].delete_many({"weight": {"$gt": 0}})
        # insert_many rejects an empty document list, so skip it when the
        # extraction produced no edge (assumes edge_list is a list).
        if edge_list:
            database[network].insert_many(edge_list)
    finally:
        # Release the connection pool even when extraction or a write fails.
        mongo.close()
def relevant_full_corpus(kwLimit):
    """Rebuild the relevant-keyword and keyword-network collections for the full corpus.

    Reads the document ids and the keyword occurrence dictionaries through
    ``utils``, extracts the relevant keywords with
    ``kwFunctions.extract_relevant_keywords`` and stores the result in the
    MongoDB database ``relevant`` on ``localhost:27017``:

    * collection ``relevant_full_<kwLimit>``: entries with ``cumtermhood > 0``
      are deleted, an index on ``keyword`` is created, then one
      ``butils.update_kw_tm`` call is made per extracted keyword;
    * collection ``network_full_<kwLimit>_eth10``: entries with ``weight > 0``
      are deleted, then ``edge_list`` is inserted.

    Args:
        kwLimit: passed through to ``extract_relevant_keywords`` and used to
            build both collection names (presumably the number of keywords to
            keep -- confirm against ``kwFunctions``).

    Returns:
        None. The function only writes to MongoDB and prints progress.
    """
    # NOTE(review): relevant_full_corpus is also defined earlier in this file;
    # this later definition is the one that is bound at import time.

    # Document ids were previously read from the sqlite dump
    # ('SELECT id FROM refdesc WHERE abstract_keywords IS NOT NULL;' on
    # '../../Data/dumps/20160224_cybergeo.sqlite3'); they now come from
    # utils.get_ids.
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']
        relevant = 'relevant_full_' + str(kwLimit)
        network = 'network_full_' + str(kwLimit) + '_eth10'

        database[relevant].delete_many({"cumtermhood": {"$gt": 0}})
        database[relevant].create_index('keyword')

        # The second returned value (the dictionary) is not used here.
        keywords, _dico, frequencies, edge_list = kwFunctions.extract_relevant_keywords(
            corpus, kwLimit, occurence_dicos)

        print('insert relevant...')
        # float() keeps the ratio below a true division even on an interpreter
        # where int / int truncates; the value is loop-invariant, so hoist it.
        corpus_size = float(len(corpus))
        for kw, score in keywords.items():
            frequency = frequencies[kw]
            # log(score) * log(N / frequency), computed exactly as before.
            weighted = math.log(score) * math.log(corpus_size / frequency)
            butils.update_kw_tm(kw, score, frequency, weighted, database, relevant)

        print('insert edges...')
        database[network].delete_many({"weight": {"$gt": 0}})
        # insert_many rejects an empty document list, so skip it when the
        # extraction produced no edge (assumes edge_list is a list).
        if edge_list:
            database[network].insert_many(edge_list)
    finally:
        # Release the connection pool even when extraction or a write fails.
        mongo.close()
def run_bootstrap(res_folder, kwLimit, subCorpusSize, bootstrapSize, nruns):
    """Run ``nruns`` bootstrap rounds and accumulate the results in MongoDB.

    The document ids come from the sqlite dump and the keyword occurrence
    dictionaries from ``utils.import_kw_dico``. Each round calls
    ``bootstrap_subcorpuses`` and pushes every relevant keyword it returns to
    the database through ``butils.update_kw_tm``, then records the round with
    ``butils.update_count``.

    Args:
        res_folder: name of the MongoDB database that receives the results
            (it used to be the folder of a ``bootstrap.sqlite3`` file).
        kwLimit: forwarded to ``bootstrap_subcorpuses``.
        subCorpusSize: forwarded to ``bootstrap_subcorpuses``.
        bootstrapSize: forwarded to ``bootstrap_subcorpuses`` and to
            ``butils.update_count``.
        nruns: number of bootstrap rounds to execute.

    Returns:
        None. The function only writes to MongoDB and prints progress.
    """
    corpus = utils.get_data(
        'SELECT id FROM refdesc WHERE abstract_keywords IS NOT NULL;',
        '../../Data/dumps/20160224_cybergeo.sqlite3')
    occurence_dicos = utils.import_kw_dico('../../Data/dumps/20160224_cybergeo.sqlite3')

    mongo = pymongo.MongoClient()
    try:
        database = mongo[res_folder]  # mongodb database

        for i in range(nruns):
            print("run " + str(i))
            # Only the relevant keywords are persisted; per the original
            # author's remark the dictionary is not needed here because the
            # full dictionary is used afterwards.
            relevantkw, _relevant_dico, _allkw = bootstrap_subcorpuses(
                corpus, occurence_dicos, kwLimit, subCorpusSize, bootstrapSize)

            # update bases iteratively (ok for concurrency ?)
            total = len(relevantkw)
            # Report roughly every 1% of the keywords. The step is an integer
            # of at least 1, so the modulo below can neither divide by zero
            # (fewer than 100 keywords) nor be taken against a float.
            step = max(1, total // 100)
            for k, kw in enumerate(relevantkw):
                if k % step == 0:
                    print('kwinsertion : ' + str(100 * k // total) + '%')
                # NOTE(review): update_kw_tm is called with six arguments
                # elsewhere in this file and with three here -- confirm that
                # butils accepts this shorter form.
                butils.update_kw_tm(kw, relevantkw[kw], database)

            # NOTE(review): the original indentation was lost; this call is
            # assumed to run once per round -- confirm.
            butils.update_count(bootstrapSize, database)
    finally:
        # Release the connection pool even when a round fails.
        mongo.close()