def edges_list_reverse(edges):
    """Flatten an adjacency mapping into a dict keyed by ``(ori, dest, t)``.

    ``edges`` maps keys whose first two items are ``(ori, t)`` to an iterable
    of entries whose first two items are ``(dest, poid)``.  The result maps
    each ``(ori, dest, t)`` triple to its ``poid``; when a triple appears more
    than once, the last value encountered wins.

    NOTE(review): ``t`` looks like a time period and ``poid`` like an edge
    weight -- confirm against the callers.
    """
    distance_champ = {}
    # items() yields the same pairs as iteritems() on Python 2 and also
    # exists on Python 3.
    for source, targets in edges.items():
        ori = source[0]
        t = source[1]
        for target in targets:
            distance_champ[(ori, target[0], t)] = target[1]
    return distance_champ


# A single parenthesised argument prints identically with the Python 2 print
# statement and the Python 3 print function.
print('on recupere le lexique')
dico_termes = fonctions.lexique(termsandblogs)  # build the dictionary of terms
# print dico_termes

# Kept at this position (not hoisted to the top of the file) so that the
# import still runs after the statements above, as in the original script.
import context_process

dist_mat = context_process.dist_mat  # distance matrix between terms
# co-occurrence matrix between terms (currently disabled):
# p_cooccurrences = context_process.p_cooccurrences

fini = 1
niveau = 0

CF_weight_v = [0.5, 0.2, 0.5, 0.5, 0.5]
# CF_weight_v = [0.4, 0.2, 0.5, 0.5, 0.5]
seuil_net_champ_v = [0., 0., 0., 0., 0.]
taillemin_v = [3, 3, 3, 3, 3, 3]
taillemax_v = [40, 40, 40, 40, 40, 40]
# An earlier assignment kmin_v = [5, 5, 5, 5, 5, 5] was immediately
# overwritten by the line below, so only the effective value is kept.
kmin_v = [3, 3, 3, 3, 3, 3]
def _index_term_distances(reseau_termes):
    """Index term distances as ``result[periode][source][target] = distance``.

    Each row is ``(concept1, concept2, periode, distance0, distance1)``.
    ``distance0`` is stored under ``concept1 -> concept2`` and ``distance1``
    under ``concept2 -> concept1``; only strictly positive values are kept.
    """
    res_termes = {}
    for concept1, concept2, periode, distance0, distance1 in reseau_termes:
        if distance0 > 0:
            res_termes.setdefault(periode, {}).setdefault(concept1, {})[concept2] = distance0
        if distance1 > 0:
            # Careful: distances are recorded in both directions.
            res_termes.setdefault(periode, {}).setdefault(concept2, {})[concept1] = distance1
    return res_termes


def _group_clusters(res_cluster, years_bins_first, orphan_number):
    """Group ``cluster`` table rows by ``id_cluster_univ``.

    Rows whose ``nb_fathers + nb_sons`` is below ``orphan_number`` are
    skipped.  The first kept row of a cluster fixes its ``periode``,
    ``label``, ``nb_fathers`` and ``nb_sons``; every kept row appends its
    ``concept`` to the cluster's ``concepts`` list.
    """
    clusters = {}
    for row in res_cluster:
        (id_cluster, periode, id_cluster_univ, label_1, label_2, level,
         concept, nb_fathers, nb_sons, label) = row
        # Convert the stored period (its text starts with the first year) to
        # the position of that year in years_bins_first.  Done for every row,
        # so an unknown year raises ValueError even for rows filtered below.
        periode = years_bins_first.index(int(str(periode).split(' ')[0]))
        if nb_fathers + nb_sons >= orphan_number:
            if id_cluster_univ in clusters:
                clusters[id_cluster_univ]['concepts'].append(concept)
            else:
                clusters[id_cluster_univ] = {
                    'periode': periode,
                    'label': label,
                    'nb_fathers': nb_fathers,
                    'nb_sons': nb_sons,
                    'concepts': [concept],
                }
    return clusters


def load_data(orphan_number):
    """Load clusters, their links and the term distances from the database.

    Reads the module-level ``name_bdd`` and ``years_bins`` and relies on
    ``fonctions_bdd``, ``fonctions`` and ``add_link`` being in scope.

    Args:
        orphan_number: minimum value of ``nb_fathers + nb_sons`` a cluster
            row needs in order to be kept.

    Returns:
        A tuple ``(dico_termes, clusters, dist_mat, res_termes)``:

        * ``dico_termes`` -- result of ``fonctions.lexique()``.
        * ``clusters`` -- dict keyed by ``id_cluster_univ``; each value holds
          ``'periode'``, ``'label'``, ``'nb_fathers'``, ``'nb_sons'``,
          ``'concepts'``, whatever ``add_link`` stores for the ``'dia'`` and
          ``'syn'`` links, and always a ``'syn'`` entry.
        * ``dist_mat`` -- object loaded by ``fonctions.dumpingout`` from the
          ``'dist_mat_10'`` dump for the current ``years_bins``.
        * ``res_termes`` -- ``res_termes[periode][source][target]`` distances.

    Raises:
        ValueError: if a cluster row's period does not start with the first
            year of one of the ``years_bins`` entries.
    """
    def select(table, champs):
        return fonctions_bdd.select_bdd_table_champ_complet(
            name_bdd, table, ','.join(champs))

    link_fields = ['id_cluster_1', 'periode_1', 'id_cluster_1_univ',
                   'id_cluster_2', 'periode_2', 'id_cluster_2_univ', 'strength']
    res_maps = select('maps', link_fields)
    res_phylo = select('phylo', link_fields)
    res_cluster = select('cluster', [
        'id_cluster', 'periode', 'id_cluster_univ', 'label_1', 'label_2',
        'level', 'concept', 'nb_fathers', 'nb_sons', 'label'])
    # The 'billets' occurrence table is intentionally not loaded: nothing
    # returned by this function depends on it.
    reseau_termes = select('sem_weighted', [
        'concept1', 'concept2', 'periode', 'distance0', 'distance1'])
    dico_termes = fonctions.lexique()  # build the dictionary of terms

    # Restructure the raw rows into nested dicts for easier use.
    years_bins_first = [years[0] for years in years_bins]
    res_termes = _index_term_distances(reseau_termes)
    clusters = _group_clusters(res_cluster, years_bins_first, orphan_number)

    add_link(clusters, res_phylo, 'dia')
    add_link(clusters, res_maps, 'syn')
    # Make sure every cluster exposes a 'syn' entry, even without such links.
    for cluster in clusters.values():
        cluster.setdefault('syn', {})

    # Load dist_mat, the network of distances between terms.
    name_date = (str(years_bins[0][0]) + '_' + str(years_bins[0][-1]) + '_'
                 + str(years_bins[1][0]) + '_' + str(years_bins[-1][-1]))
    # Long, exact version:
    # dist_mat = fonctions.dumpingout('dist_mat' + name_date)
    # Fast, approximate version:
    dist_mat = fonctions.dumpingout('dist_mat_10' + name_date)
    return dico_termes, clusters, dist_mat, res_termes