Ejemplo n.º 1
0
def relevant_full_corpus(kwLimit):
    """Rebuild the relevant-keyword and edge collections for the full corpus.

    Loads the 'cybergeo' keyword ids and occurrence dictionaries through
    ``utils``, extracts the relevant keywords with
    ``kwFunctions.extract_relevant_keywords`` and writes the result into the
    ``relevant`` database of the MongoDB server at localhost:27017:

    * ``relevant_full_<kwLimit>``: documents with ``cumtermhood > 0`` are
      removed, an index on ``keyword`` is ensured, then one
      ``butils.update_kw_tm`` call is made per extracted keyword.
    * ``network_full_<kwLimit>_eth10``: documents with ``weight > 0`` are
      removed, then the extracted edge list is inserted.

    Args:
        kwLimit: forwarded to ``extract_relevant_keywords`` and embedded in
            both collection names (presumably the number of keywords to
            keep -- confirm against ``kwFunctions``).
    """
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']
        relevant = 'relevant_full_' + str(kwLimit)
        network = 'network_full_' + str(kwLimit) + '_eth10'

        database[relevant].delete_many({"cumtermhood": {"$gt": 0}})
        database[relevant].create_index('keyword')

        # The second returned element (a dico) is not needed here.
        keywords, _dico, frequencies, edge_list = \
            kwFunctions.extract_relevant_keywords(corpus, kwLimit, occurence_dicos)

        # Hoisted out of the loop; the float keeps the ratio below a true
        # division even where `/` between two ints would truncate
        # (frequencies presumably hold integer counts -- TODO confirm).
        corpus_size = float(len(corpus))

        print('insert relevant...')
        for kw, termhood in keywords.items():
            frequency = frequencies[kw]
            score = math.log(termhood) * math.log(corpus_size / frequency)
            butils.update_kw_tm(kw, termhood, frequency, score, database, relevant)

        print('insert edges...')
        database[network].delete_many({"weight": {"$gt": 0}})
        # PyMongo's insert_many rejects an empty document list with a
        # TypeError, so an empty network is simply left empty.
        if edge_list:
            database[network].insert_many(edge_list)
        else:
            print('no edges to insert')
    finally:
        # Release the connection pool even if extraction or a write fails.
        mongo.close()
Ejemplo n.º 2
0
def relevant_full_corpus(kwLimit):
    """Recompute relevant keywords and their network for the whole corpus.

    Steps, all against the ``relevant`` database of the MongoDB server at
    localhost:27017:

    1. Fetch the 'cybergeo' keyword ids and occurrence dictionaries via
       ``utils``.
    2. Clear documents with ``cumtermhood > 0`` from
       ``relevant_full_<kwLimit>`` and ensure an index on ``keyword``.
    3. Run ``kwFunctions.extract_relevant_keywords`` and call
       ``butils.update_kw_tm`` once per extracted keyword.
    4. Clear documents with ``weight > 0`` from
       ``network_full_<kwLimit>_eth10`` and insert the extracted edges.

    Args:
        kwLimit: passed through to ``extract_relevant_keywords`` and used in
            both collection names (presumably the keyword count limit --
            confirm against ``kwFunctions``).
    """
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']
        relevant = 'relevant_full_' + str(kwLimit)
        network = 'network_full_' + str(kwLimit) + '_eth10'

        relevant_collection = database[relevant]
        relevant_collection.delete_many({"cumtermhood": {"$gt": 0}})
        relevant_collection.create_index('keyword')

        # The dico returned in second position is unused by this function.
        keywords, _dico, frequencies, edge_list = \
            kwFunctions.extract_relevant_keywords(corpus, kwLimit, occurence_dicos)

        # Computed once; stored as a float so the ratio in the score is a
        # true division regardless of the interpreter's `/` semantics for
        # ints (frequencies presumably hold integer counts -- TODO confirm).
        corpus_size = float(len(corpus))

        print('insert relevant...')
        for kw, termhood in keywords.items():
            frequency = frequencies[kw]
            butils.update_kw_tm(
                kw, termhood, frequency,
                math.log(termhood) * math.log(corpus_size / frequency),
                database, relevant)

        print('insert edges...')
        network_collection = database[network]
        network_collection.delete_many({"weight": {"$gt": 0}})
        # insert_many raises TypeError on an empty document list in PyMongo,
        # so skip the call when no edge was produced.
        if edge_list:
            network_collection.insert_many(edge_list)
        else:
            print('no edges to insert')
    finally:
        # Always give the connection back, including on failure.
        mongo.close()
Ejemplo n.º 3
0
def run_bootstrap(res_folder, kwLimit, subCorpusSize, bootstrapSize, nruns):
    """Run ``nruns`` bootstrap rounds and accumulate the results in MongoDB.

    Each round calls ``bootstrap_subcorpuses`` on the corpus read from the
    20160224 cybergeo SQLite dump, pushes every relevant keyword it returns
    through ``butils.update_kw_tm`` and then calls ``butils.update_count``
    with ``bootstrapSize``.

    Args:
        res_folder: name of the MongoDB database (on the default local
            server) that receives the results.
        kwLimit: forwarded to ``bootstrap_subcorpuses``.
        subCorpusSize: forwarded to ``bootstrap_subcorpuses``.
        bootstrapSize: forwarded to ``bootstrap_subcorpuses`` and to
            ``butils.update_count`` after each round.
        nruns: number of bootstrap rounds to execute.
    """
    corpus = utils.get_data(
        'SELECT id FROM refdesc WHERE abstract_keywords IS NOT NULL;',
        '../../Data/dumps/20160224_cybergeo.sqlite3')
    occurence_dicos = utils.import_kw_dico('../../Data/dumps/20160224_cybergeo.sqlite3')

    mongo = pymongo.MongoClient()
    try:
        database = mongo[res_folder]  # mongodb database
        for i in range(nruns):
            print("run " + str(i))
            # Only the relevant keywords are persisted; the per-run dico and
            # the full keyword list are not needed (the full dico is used
            # afterwards instead).
            relevantkw, _relevant_dico, _allkw = bootstrap_subcorpuses(
                corpus, occurence_dicos, kwLimit, subCorpusSize, bootstrapSize)

            # update bases iteratively (ok for concurrency ?)
            total = len(relevantkw)
            # Report progress roughly every 1%; the step is clamped to 1 so
            # that fewer than 100 keywords cannot yield a zero modulus.
            step = max(1, total // 100)
            for k, kw in enumerate(relevantkw):
                if k % step == 0:
                    print('kwinsertion : ' + str(100 * k // total) + '%')
                butils.update_kw_tm(kw, relevantkw[kw], database)

            butils.update_count(bootstrapSize, database)
    finally:
        # Release the connection pool even if a round fails.
        mongo.close()