Example #1
0
def nettoyer_site(chaine,chainel,site):
	"""Locate a marker string in every post of `site` and pass the offsets to the DB layer.

	Reads the `id`, `content` and `content_lemmatise` columns of the `billets`
	table for the rows whose `site` column matches `site`. For each row, the
	position of `chaine` in the raw content and of `chainel` in the lemmatised
	content is looked up with `str.find`.

	A row is kept only when `chaine` is found at an index strictly greater
	than 0 (so both "not found" and "found at the very start" are skipped).
	The collected `(id, offset)` pairs are then handed to
	`fonctions_bdd.insert_select_substring`, once for the `content` column and
	once for the `content_lemmatise` column.

	Args:
		chaine: marker searched in the raw content.
		chainel: marker searched in the lemmatised content.
		site: value of the `site` column selecting the posts to process.
	"""
	rows = fonctions_bdd.select_bdd_table_champ_complet(name_bdd,'billets','id,content,content_lemmatise','site',site)
	raw_offsets = []
	lemma_offsets = []
	for row in rows:
		post_id = row[0]
		raw_pos = row[1].find(chaine)
		lemma_pos = row[2].find(chainel)
		if raw_pos > 0:
			raw_offsets.append((post_id, raw_pos))
			# NOTE(review): the lemmatised offset is stored without being
			# checked, so it can be -1 when `chainel` is absent -- confirm that
			# insert_select_substring copes with that value.
			lemma_offsets.append((post_id, lemma_pos))
		# Progress/debug trace: the whole list collected so far, once per row.
		print(raw_offsets)
	fonctions_bdd.insert_select_substring(name_bdd,'billets','billets',raw_offsets,'content')
	fonctions_bdd.insert_select_substring(name_bdd,'billets','billets',lemma_offsets,'content_lemmatise')
Example #2
0
def load_data(orphan_number):
	"""Load links, clusters and term distances from the database as dictionaries.

	Reads the `maps`, `phylo`, `cluster` and `sem_weighted` tables of the
	database `name_bdd`, the lexicon returned by `fonctions.lexique()` and the
	dumped distance matrix, and reshapes them for easier use.

	Args:
		orphan_number: minimum value of `nb_fathers + nb_sons` a cluster row
			must reach to be kept.

	Returns:
		A tuple `(dico_termes, clusters, dist_mat, res_termes)` where

		* `dico_termes` is the value returned by `fonctions.lexique()`;
		* `clusters` maps `id_cluster_univ` to a dict with the keys
		  `'periode'` (index of the period's first year among the first
		  years of `years_bins`), `'label'`, `'nb_fathers'`, `'nb_sons'`,
		  `'concepts'` (list of concepts of the cluster) and `'syn'`, plus
		  whatever `add_link` attaches for the `'dia'` and `'syn'` links;
		* `dist_mat` is the object loaded by `fonctions.dumpingout` from the
		  `'dist_mat_10' + name_date` dump (the fast, approximate version);
		* `res_termes` is `{periode: {concept_a: {concept_b: distance}}}`,
		  holding only the strictly positive distances.

	Raises:
		ValueError: if the first year of a cluster's period is not the first
			year of any bin in `years_bins`.
	"""
	# The 'maps' and 'phylo' tables are read with the same column list.
	link_fields = ','.join(['id_cluster_1','periode_1','id_cluster_1_univ','id_cluster_2','periode_2','id_cluster_2_univ','strength'])
	res_maps = fonctions_bdd.select_bdd_table_champ_complet(name_bdd,'maps',link_fields)
	res_phylo = fonctions_bdd.select_bdd_table_champ_complet(name_bdd,'phylo',link_fields)
	cluster_fields = ['id_cluster','periode','id_cluster_univ','label_1','label_2','level','concept','nb_fathers','nb_sons','label']
	res_cluster = fonctions_bdd.select_bdd_table_champ_complet(name_bdd,'cluster',','.join(cluster_fields))
	# NOTE: the per-year concept occurrence table that used to be built from
	# the 'billets' table was never read nor returned, so it is not loaded.
	term_fields = ['concept1','concept2','periode','distance0','distance1']
	reseau_termes = fonctions_bdd.select_bdd_table_champ_complet(name_bdd,'sem_weighted',','.join(term_fields))
	dico_termes = fonctions.lexique()  # dictionary of the terms

	# First year of every time bin; a cluster's period is stored as its index here.
	years_bins_first = [years[0] for years in years_bins]

	# Nested dictionary of term distances: res_termes[periode][source][target].
	res_termes = {}
	for concept1, concept2, periode, distance0, distance1 in reseau_termes:
		if distance0 > 0:
			res_termes.setdefault(periode, {}).setdefault(concept1, {})[concept2] = distance0
		if distance1 > 0:
			# Beware: distances are reported in both directions.
			res_termes.setdefault(periode, {}).setdefault(concept2, {})[concept1] = distance1

	# One entry per universal cluster id; each row of the table contributes one concept.
	clusters = {}
	for id_cluster, periode, id_cluster_univ, label_1, label_2, level, concept, nb_fathers, nb_sons, label in res_cluster:
		# The period is identified by its first space-separated token, read as a year.
		periode = years_bins_first.index(int(str(periode).split(' ')[0]))
		if nb_fathers + nb_sons >= orphan_number:
			if id_cluster_univ in clusters:
				clusters[id_cluster_univ]['concepts'].append(concept)
			else:
				clusters[id_cluster_univ] = {
					'periode': periode,
					'nb_fathers': nb_fathers,
					'nb_sons': nb_sons,
					'concepts': [concept],
					'label': label,
				}

	# add_link is expected to fill `clusters` in place (its return value is not used).
	add_link(clusters,res_phylo,'dia')
	add_link(clusters,res_maps,'syn')
	# Make sure every cluster has a 'syn' entry, even when empty.
	for cluster in clusters.values():
		cluster.setdefault('syn', {})

	# Load dist_mat, the network of distances between terms.
	name_date = '_'.join([str(years_bins[0][0]), str(years_bins[0][-1]), str(years_bins[1][0]), str(years_bins[-1][-1])])
	# Long and exact version would be: fonctions.dumpingout('dist_mat'+name_date)
	# Fast and approximate version:
	dist_mat = fonctions.dumpingout('dist_mat_10'+name_date)
	return dico_termes,clusters,dist_mat,res_termes