def extract_relevant_cybergeo_fulltext(kwLimit):
    """Extract relevant keywords from Cybergeo full texts and export them as CSV.

    Reads ids and full-text keywords from the ``cybergeo`` table (rows whose
    ``fulltext_keywords`` column is neither NULL nor empty), passes them to
    ``kwFunctions.extract_relevant_keywords`` and writes both results under
    ``res/cybergeo_full/`` with ``kwLimit`` appended to the file names.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            also used as suffix of the exported file names.
    """
    resdir = "res/cybergeo_full/"

    # NOTE(review): both queries carry ``LIMIT 10``, which looks like a
    # leftover from testing -- confirm before running on the full table.
    corpus = utils.get_data(
        "SELECT id FROM cybergeo WHERE fulltext_keywords IS NOT NULL AND fulltext_keywords!='' LIMIT 10;",
        "mysql",
    )
    occurence_dicos = utils.import_kw_dico_req(
        "SELECT id,fulltext_keywords FROM cybergeo WHERE fulltext_keywords IS NOT NULL AND fulltext_keywords!='' LIMIT 10;",
        "mysql",
    )

    relevantkw, relevant_dico = kwFunctions.extract_relevant_keywords(corpus, kwLimit, occurence_dicos)

    # Export as csv. The second export previously called a bare
    # ``export_dico_num_csv``; it is qualified with ``utils.`` like every
    # other helper used here so it does not depend on a separate import.
    utils.export_dico_csv(relevant_dico, resdir + "relevantDico_kw" + str(kwLimit), False)
    utils.export_dico_num_csv(relevantkw, resdir + "termhoods_kw" + str(kwLimit), False)
def extract_relevant_cybergeo(kwLimit, database):
    """Extract relevant keywords from Cybergeo abstracts and export them as CSV.

    Queries ``database`` for the ids and abstract keywords of the entries
    present in both ``refdesc`` and ``cybergeo`` with a non-empty
    ``abstract_keywords`` column, prints both query results, runs
    ``kwFunctions.extract_relevant_keywords`` and writes the two outputs
    under ``res/cybergeo/``.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            also appended to the output file names.
        database: Database identifier handed unchanged to the ``utils``
            query helpers.
    """
    ids_query = (
        "SELECT cybergeo.id "
        "FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id "
        "WHERE abstract_keywords IS NOT NULL AND abstract_keywords!='';"
    )
    keywords_query = (
        "SELECT cybergeo.id,abstract_keywords "
        "FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id "
        "WHERE abstract_keywords IS NOT NULL AND abstract_keywords!='';"
    )

    article_ids = utils.get_data(ids_query, database)
    print(article_ids)

    keyword_occurrences = utils.import_kw_dico_req(keywords_query, database)
    print(keyword_occurrences)

    selected_kw, ref_to_kw = kwFunctions.extract_relevant_keywords(
        article_ids, kwLimit, keyword_occurrences
    )

    suffix = str(kwLimit)
    utils.export_dico_csv(ref_to_kw, "res/cybergeo/relevantDico_kwLimit" + suffix, False)
    utils.export_dico_num_csv(selected_kw, "res/cybergeo/kw_" + suffix, False)
def relevant_full_corpus(kwLimit):
    """Compute relevant keywords for the full corpus and store them in MongoDB.

    Loads ids and keyword occurrences through ``utils`` (``'cybergeo'`` /
    ``'keywords'``), runs ``kwFunctions.extract_relevant_keywords`` and writes
    the results into the ``relevant`` database of the MongoDB server on
    ``localhost:27017``:

    * collection ``relevant_full_<kwLimit>``: one ``butils.update_kw_tm`` call
      per keyword;
    * collection ``network_full_<kwLimit>_eth10``: the returned edge list.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            also part of both collection names.

    An earlier variant (previously kept as commented-out code) read the ids
    from a sqlite dump through ``utils.get_data`` instead of ``utils.get_ids``.
    """
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')

    relevant = 'relevant_full_' + str(kwLimit)
    network = 'network_full_' + str(kwLimit) + '_eth10'

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']

        # Clear previous results (documents with a positive cumtermhood).
        database[relevant].delete_many({"cumtermhood": {"$gt": 0}})
        database[relevant].create_index('keyword')

        keywords, _dico, frequencies, edge_list = kwFunctions.extract_relevant_keywords(
            corpus, kwLimit, occurence_dicos
        )

        print('insert relevant...')
        corpus_size = len(corpus)
        for kw, termhood in keywords.items():
            frequency = frequencies[kw]
            # NOTE(review): under Python 2 this division is an integer
            # division when both operands are ints -- confirm the target
            # interpreter.
            weight = math.log(termhood) * math.log(corpus_size / frequency)
            butils.update_kw_tm(kw, termhood, frequency, weight, database, relevant)

        print('insert edges...')
        database[network].delete_many({"weight": {"$gt": 0}})
        # pymongo's insert_many rejects an empty document list with a
        # TypeError, so an empty edge list is skipped instead of crashing.
        if edge_list:
            database[network].insert_many(edge_list)
    finally:
        # Release the connection even when extraction or an insert fails.
        mongo.close()
def extract_relevant_cybergeo(kwLimit, database):
    """Export the relevant abstract keywords of the Cybergeo entries.

    Both queries select from ``refdesc`` joined with ``cybergeo`` on ``id``,
    restricted to rows with a non-empty ``abstract_keywords`` column. The ids
    and the keyword occurrences are printed, passed to
    ``kwFunctions.extract_relevant_keywords``, and the two returned values are
    exported under ``res/cybergeo/``.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``
            and appended to the exported file names.
        database: Database identifier passed as-is to the ``utils`` helpers.
    """
    # Shared FROM/WHERE part of the two queries.
    source_clause = (
        ' FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id'
        ' WHERE abstract_keywords IS NOT NULL AND abstract_keywords!=\'\';'
    )

    refs = utils.get_data('SELECT cybergeo.id' + source_clause, database)
    print(refs)

    occurrences = utils.import_kw_dico_req(
        'SELECT cybergeo.id,abstract_keywords' + source_clause, database)
    print(occurrences)

    extraction = kwFunctions.extract_relevant_keywords(refs, kwLimit, occurrences)
    # Exactly two values are expected, as in a two-target unpacking.
    kw_values, kw_by_ref = extraction

    limit_tag = str(kwLimit)
    utils.export_dico_csv(
        kw_by_ref, 'res/cybergeo/relevantDico_kwLimit' + limit_tag, False)
    utils.export_dico_num_csv(
        kw_values, 'res/cybergeo/kw_' + limit_tag, False)
def extract_relevant_cybergeo_fulltext(kwLimit):
    """Export the relevant full-text keywords of the Cybergeo articles.

    Ids and full-text keywords are read from the ``cybergeo`` table (only rows
    whose ``fulltext_keywords`` is neither NULL nor empty), handed to
    ``kwFunctions.extract_relevant_keywords``, and the two returned values are
    written under ``res/cybergeo_full/``.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            also appended to the names of the exported files.
    """
    resdir = 'res/cybergeo_full/'
    suffix = str(kwLimit)

    # NOTE(review): ``LIMIT 10`` in both queries restricts the run to ten
    # rows; presumably a debugging leftover -- confirm.
    corpus = utils.get_data(
        'SELECT id FROM cybergeo WHERE fulltext_keywords IS NOT NULL AND fulltext_keywords!=\'\' LIMIT 10;',
        'mysql')
    occurence_dicos = utils.import_kw_dico_req(
        'SELECT id,fulltext_keywords FROM cybergeo WHERE fulltext_keywords IS NOT NULL AND fulltext_keywords!=\'\' LIMIT 10;',
        'mysql')

    relevantkw, relevant_dico = kwFunctions.extract_relevant_keywords(
        corpus, kwLimit, occurence_dicos)

    # Export as csv. ``export_dico_num_csv`` is reached through ``utils`` like
    # the other helpers; the previous unqualified call relied on a name that
    # this function does not otherwise use.
    utils.export_dico_csv(relevant_dico, resdir + 'relevantDico_kw' + suffix, False)
    utils.export_dico_num_csv(relevantkw, resdir + 'termhoods_kw' + suffix, False)
def bootstrap_subcorpuses(corpus,occurence_dicos,kwLimit,subCorpusSize,bootstrapSize):
    """Run the relevant-keyword extraction on random sub-corpora and aggregate.

    ``bootstrapSize`` index samples of size ``subCorpusSize`` are drawn without
    replacement, all before the first extraction. For each sample,
    ``kwFunctions.extract_relevant_keywords`` is run on the selected corpus
    entries; the per-keyword values are summed over runs and the
    reference -> keywords mapping is accumulated as sets. The aggregate is
    passed to ``kwFunctions.extract_from_termhood``.

    Args:
        corpus: Indexable collection of references.
        occurence_dicos: Occurrence data forwarded to the extraction.
        kwLimit: Limit forwarded to both ``kwFunctions`` calls.
        subCorpusSize: Number of corpus entries per sample.
        bootstrapSize: Number of samples / extraction runs.

    Returns:
        The value returned by ``kwFunctions.extract_from_termhood`` with the
        list of per-run keyword results appended as its last element.
    """
    corpus_size = len(corpus)
    print('Bootstrapping on corpus of size '+str(corpus_size))

    # NOTE(review): numpy.random.choice(n, ...) already yields indices in
    # 0..n-1; the shift by one maps 0 to -1, which Python resolves to the last
    # element, so each draw still selects distinct entries.
    index_draws = [
        [drawn - 1 for drawn in numpy.random.choice(corpus_size, subCorpusSize, replace=False)]
        for _ in range(bootstrapSize)
    ]

    summed_termhoods = {}   # keyword -> sum over runs (absent in a run counts as 0)
    ref_kw_dico = {}        # reference -> set of keywords seen in any run
    per_run_keywords = []

    for run, indices in enumerate(index_draws):
        print("bootstrap : run "+str(run))
        sample = [corpus[i] for i in indices]
        run_keywords, run_ref_kw = kwFunctions.extract_relevant_keywords(sample, kwLimit, occurence_dicos)
        per_run_keywords.append(run_keywords)

        # accumulate termhoods
        for kw in run_keywords.keys():
            summed_termhoods[kw] = summed_termhoods.get(kw, 0) + run_keywords[kw]

        # merge the reference -> keywords mapping
        for ref in run_ref_kw.keys():
            ref_kw_dico.setdefault(ref, set()).update(run_ref_kw[ref])

    # The sums are passed on without normalisation.
    result = kwFunctions.extract_from_termhood(summed_termhoods, ref_kw_dico, kwLimit)
    result.append(per_run_keywords)
    return result
def relevant_full_corpus(kwLimit):
    """Extract relevant keywords on the whole corpus and persist them in MongoDB.

    Ids and keyword occurrences come from ``utils.get_ids`` and
    ``utils.import_kw_dico`` (``'cybergeo'`` / ``'keywords'``). The output of
    ``kwFunctions.extract_relevant_keywords`` is written to the ``relevant``
    database of the MongoDB instance at ``localhost:27017``: keywords go to
    ``relevant_full_<kwLimit>`` via ``butils.update_kw_tm`` and edges to
    ``network_full_<kwLimit>_eth10``.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            also embedded in both collection names.

    A previous, commented-out alternative fetched the ids from a sqlite dump
    with ``utils.get_data``.
    """
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']
        relevant = 'relevant_full_' + str(kwLimit)
        network = 'network_full_' + str(kwLimit) + '_eth10'

        # Remove earlier keyword documents before recomputing.
        database[relevant].delete_many({"cumtermhood": {"$gt": 0}})
        database[relevant].create_index('keyword')

        keywords, _dico, frequencies, edge_list = kwFunctions.extract_relevant_keywords(
            corpus, kwLimit, occurence_dicos)

        print('insert relevant...')
        n_docs = len(corpus)
        for kw, termhood in keywords.items():
            # NOTE(review): true division is assumed here; with Python 2 and
            # integer operands this would truncate -- confirm.
            butils.update_kw_tm(
                kw, termhood, frequencies[kw],
                math.log(termhood) * math.log(n_docs / frequencies[kw]),
                database, relevant)

        print('insert edges...')
        database[network].delete_many({"weight": {"$gt": 0}})
        # insert_many raises TypeError on an empty document list; nothing to
        # insert is not an error for this pipeline.
        if edge_list:
            database[network].insert_many(edge_list)
    finally:
        # Always close the client so the connection is not leaked.
        mongo.close()