Example #1
def query_exander(query,N):
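	# Interactive query-expansion loop: compare n-gram document frequencies between the
	# query-matched database (name_bdd_new) and the full database (name_bdd), rank the
	# candidate terms by their in/out ratio and let the user add them to the query.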
	encore=1
	while encore==1:
		dico_new=0
		dico=0
		#build the name_bdd_new database from the query
		id_new_list = fonctions_bdd.select_bdd_table_champ_simple(name_bdd_new,'billets','id')
		N_new = len(id_new_list)
		id_new=[]
		for x in  id_new_list:
			id_new.append(x[0])
		dico_new = fast_ngram_counter(name_bdd_new,'')
		print len(dico_new.keys())
		dico= fast_ngram_counter(name_bdd,dico_new.keys())
		print len(dico.keys())
		ratio= out_doc(dico_new,dico)
		ratio_l = trier_dictionnaire(ratio)
		steps=100000
		nb_question=0
		champs_name = "(id,title,date,permalink,site,categorie1,categorie2,categorie3,content,content_lemmatise,href,requete,identifiant_unique)"#on n'enregistre pas le html brut
		mode_dynamique=0
		 
		if mode_dynamique==1:
			for x in ratio_l:
				if nb_question<steps:
					val = x[1]
					if dico_new[x[0]]!=dico[x[0]]:
						info ='\n'
						#display the examples:
						exemple=0
						if exemple >0:
							nouveaux_billets = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'billets',champs_name[1:-1],"where content_lemmatise like '% " + x[0]  +" %'")
					
							for billets in nouveaux_billets[:9]:
								if not billets[0] in id_new:
									info=info +  '*** '+ billets[1] + '(' + billets[4]  + ')' + '\n'
							print info
						print str(dico_new[x[0]]) +' doc. in ( '+str(float(dico_new[x[0]])/float(N_new)*100.) +'% )' + ' vs ' + str(dico[x[0]])+' doc. out ( '+str(float(dico[x[0]])/float(N)*100.) +'% )' + ' => ratio: ' + str(float(dico_new[x[0]])/float(N_new)/(float(dico[x[0]])/float(N)))

						var = raw_input('Do you wish to add "' + x[0] + '" to the query ?')
						if var=='y':
							query =  add_query(query,x[0])
							fonctions_bdd.remplir_table(name_bdd_new,'billets',nouveaux_billets,champs_name)
							nb_question=nb_question+1
						if var=='s':
							steps=0
					else:
						query =  add_query(query,x[0])
				else:
					pass
		else:
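			# non-interactive mode: dump the full ranked list of candidate terms to a tab-separated file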
			fileout=open(path_req + 'query_extension.csv','w')
			print path_req + 'query_extension.csv'
			fileout.write('nlemme' + '\t' + ' nb doc. in  ' + '\t' + "pourcentage doc in" +'\t' +  ' nb doc out ' + '\t'+'pourcentage doc out' +'\t' +' ratio: ' +'\n')
			for x in ratio_l:
				fileout.write(str(x[0]) +'\t' + str(dico_new[x[0]]) +'\t'+str(float(dico_new[x[0]])/float(N_new)*100.) + '\t' + str(dico[x[0]])+' \t'+str(float(dico[x[0]])/float(N)*100.) +'\t ' + str(float(dico_new[x[0]])/float(N_new)/(float(dico[x[0]])/float(N))) + '\n')
				print str(x[0]) +'\t' + str(dico_new[x[0]]) +'\t'+str(float(dico_new[x[0]])/float(N_new)*100.) + '\t' + str(dico[x[0]])+' \t'+str(float(dico[x[0]])/float(N)*100.) +'\t ' + str(float(dico_new[x[0]])/float(N_new)/(float(dico[x[0]])/float(N))) + '\n'
		print 'query finale'
		print query
		var = raw_input('Do you wish to perform a new indexation of the database based on the new query ?')
		if var == 'n':
			encore=0
	return query
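# Minimal usage sketch (hypothetical driver code; query, name_bdd, name_bdd_new and the
# fonctions_bdd helpers are assumed to be defined at module level, as in the other examples):
#	N = fonctions_bdd.count_rows(name_bdd,'billets')
#	query = query_exander(query,N)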
Example #2
print "+++ processing raw database \""+name_data+"\" into SQL database file \""+name_bdd+"\""

sep  = ' *** '
##############################################################
#######1. import posts from an external database########
##############################################################


#to adjust depending on where we are running:
name_bdd_new = '.'.join(name_bdd.split('.')[:-2]) + '_new.' + '.'.join(name_bdd.split('.')[-2:])
print name_bdd
print name_bdd_new
champs_name = "(title,date,permalink,site,categorie1,categorie2,categorie3,content_lemmatise,content,href,jours,concepts,identifiant_unique,requete,concepts_id)"#on n'enregistre pas le html brut
billet_new = fonctions_bdd.select_bdd_table_champ_simple(name_bdd_new,'billets',champs_name[1:-1])
champs=billet_new
fonctions_bdd.remplir_table(name_bdd,'billets',champs,champs_name)

###################################
#######2. compute auxiliary info##
###################################



#create the auteurs table
try:
	fonctions_bdd.detruire_table(name_bdd,'auteurs')
except:
	pass
print "    + creation de la table auteurs..."
fonctions_bdd.creer_table_auteurs(name_bdd,'auteurs')
## populate the concepts table with the sorted list of concepts and their main form
file_concepts=codecs.open(dictionnaire_treetagged__formemajoritaire_name,'r','utf-8')
liste_concepts=[]
correspondance_lemme_forme={}
for ligne in file_concepts.readlines():
	lignev = ligne.split('\t')
	liste_concepts.append((lignev[0].encode('utf-8','replace'),lignev[1].encode('utf-8','replace')))
	correspondance_lemme_forme[lignev[0].encode('utf-8','replace')]=lignev[1].encode('utf-8','replace')
print "&&&",len(liste_concepts),"concepts now."
##if necessary, recreate the concepts table


#fill the concepts table
#print liste_concepts
fonctions_bdd.remplir_table(name_bdd,'concepts',liste_concepts,"(concepts,forme_principale)")



contenu = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'concepts','id,concepts')
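#build a lookup from each concept string to its SQL id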
liste_concepts_dico={}
for con in contenu:
	liste_concepts_dico[con[1]]=con[0]

def freq_comp(year):
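	# (truncated example) presumably computes per-concept frequencies for the given year;
	# y is the position of 'year' in the global 'years' list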
	y=years.index(year)
#for y,year in enumerate(years):
	#build the list of concept indexes in order
	ngrammes_fit_index=[]
	Nb_auteurs = fonctions_bdd.count_rows(name_bdd,'auteurs')
Example #4
def ecrire_tables_cluster_phylo(nodes,edges,sortie,level,time,attribut,sonsbis,fathersbis,dico_termes,indexs_inv,map_dessous,transition,sep_label):
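	# Write the 'cluster', 'phylo' and 'maps' tables describing the cluster phylogeny and
	# export a Graphviz file (site/ExportPhyloDetails.dot) with one box per cluster,
	# grouped by period through rank=same statements.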
	dico_termes_inv = invert_dict(dico_termes)
	transition_inv = invert_dict(transition)
	variables_cluster=[]
	variables_phylo=[]
	variables_maps=[]
	try:
		print path_req + 'site'
		os.mkdir(path_req + 'site')
	except:
		pass
	
	fichier_dot=open(path_req + 'site'  +'/' + 'ExportPhyloDetails.dot','w')	
	fichier_dot.write('digraph arbre_phylogenetique {\n')
	labels_annees=[]
	ranks = {}	
	for idx,lab in nodes.iteritems():
		#print idx
		id_cluster_univ=transition_inv[idx]
		tim = time[idx]
		lev = level[idx]
		att = attribut[idx]
		son = sonsbis[idx]
		fat = fathersbis[idx]
		labv = lab.split(sep_label)
		labv.sort()
		lab1 = labv[0]
		lab2 = labv[1]
		
		lab_1 = dico_termes_inv[lab1]
		lab_2 = dico_termes_inv[lab2]
		label_annee = get_label_annees(tim)
		index_local = indexs_inv[id_cluster_univ]
		[niv,numero,inter] = index_local.split('_')
		if label_annee not in labels_annees:
			labels_annees.append(label_annee)
			
			ranks[str(tim)]=['P' + str(tim)]
			fichier_dot.write('P' + str(tim) +' [shape = rect ,fontsize=22,label="'+label_annee.replace(' ','-')+ '"]\n')
			if len(labels_annees)>1:
				fichier_dot.write('P' + str(tim) + ' -> ' + mem + '\n')
			mem = 'P' + str(tim)
		temp =  ranks[str(tim)]
		temp.append('C' + str(tim) + '_' + str(numero))
		ranks[str(tim)]=temp
		fichier_dot.write('C' + str(tim) + '_' + str(numero)  +'  [style=filled,fontname=Arial,fontsize=12,peripheries=2,color=lightblue2    ,shape=rect, label="')
		for con in map_dessous[id_cluster_univ]:
			con = indexs_inv[con].split('_')[1]
			variables_cluster.append([idx,numero,lab_1,lab_2,lab,att,lev,label_annee,fat,son,idx,con])
			fichier_dot.write('\\n'+dico_termes[int(con)])
		fichier_dot.write('",fontsize=12]\n')
			
	#	ex('CREATE TABLE '+ name_table +' (id INTEGER ,id_cluster INTEGER,label_1 INTEGER,label_2 INTEGER,label VARCHAR(300),attribut VARCHAR(300),level INTEGER,periode VARCHAR(50),concept INT, pseudo VARCHAR(10), cluster_size INT, density VARCHAR(10), nb_fathers INT, nb_sons INT, lettre VARCHAR(3), identifiant_unique VARCHAR(20))')
	variables_cluster_names = "(id_cluster_univ,id_cluster,label_1,label_2,label,attribut,level,periode,nb_fathers , nb_sons,identifiant_unique,concept)"#concept,pseudo , cluster_size , density ,  lettre , ))"
	fonctions_bdd.remplir_table(name_bdd,'cluster',variables_cluster,variables_cluster_names)
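	# edges within the same period feed the 'maps' table (intra-temporal links); edges
	# across periods feed the 'phylo' table and are drawn as red arrows in the .dot file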
	
	for source,streng in edges.iteritems():
		id_cluster_1_univ=transition_inv[source[0]]
		id_cluster_2_univ=transition_inv[source[1]]
		[niv_1,id_cluster_1,inter_1] = indexs_inv[id_cluster_1_univ].split('_')
		[niv_2,id_cluster_2,inter_2] = indexs_inv[id_cluster_2_univ].split('_')
		label_annee_1 = get_label_annees(int(inter_1))
		label_annee_2 = get_label_annees(int(inter_2))
		if niv_1 == niv_2:
			if inter_1 == inter_2:
				variables_maps.append([id_cluster_1,label_annee_1,source[0],id_cluster_2,label_annee_2,source[1],str(streng)])
				#print streng
				#activate intra-temporal links
				#if float(streng)>0.:			
					#fichier_dot.write('C'+str(inter_1) + '_' + str(id_cluster_1) + '->' + 'C'+str(inter_2) + '_' + str(id_cluster_2) + ' '  + '[style="setlinewidth(2)",color=green]\n')
			else:
				variables_phylo.append([id_cluster_1,label_annee_1,source[0],id_cluster_2,label_annee_2,source[1],str(streng)])
				fichier_dot.write('C'+str(inter_1) + '_' + str(id_cluster_1) + '->' + 'C'+str(inter_2) + '_' + str(id_cluster_2) + ' '  + '[style="setlinewidth(2)",color=red]\n')
	for inter,ra in ranks.iteritems():
		fichier_dot.write('{rank=same;')
		for x in ra:
			fichier_dot.write(str(x)+' ')
		fichier_dot.write('}\n')
	fichier_dot.write('}\n')
	variables_phylo_names = "(id_cluster_1,periode_1,id_cluster_1_univ,id_cluster_2,periode_2,id_cluster_2_univ,strength)"
	variables_maps_names = "(id_cluster_1,periode_1,id_cluster_1_univ,id_cluster_2,periode_2,id_cluster_2_univ,strength)"
	fonctions_bdd.remplir_table(name_bdd,'phylo',variables_phylo,variables_phylo_names)
	fonctions_bdd.remplir_table(name_bdd,'maps',variables_maps,variables_maps_names)
Example #5
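#read the list of forbidden sites ('file' and 'sites_interdits' come from the truncated
#part of this example), then fill the 'auteurs' table with the cleaned site names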
for lignes in file.readlines():
	sites_interdits.append(lignes.split('\t')[0][:-1])
print "    - on charge la liste des sources inerdites..."

print "    - remplissage de la table auteurs..."
sortie = fonctions_bdd.select_bdd_table(name_bdd,'billets','site',requete)
sites = set()
for sor in sortie:
	names =sor[0].split(" *%* ")
	for nom in names:
		site_name=text_processing.nettoyer_url(nom)
		if site_name not in sites_interdits:
			sites.add(site_name)
			
sites=list(sites)
fonctions_bdd.remplir_table(name_bdd,'auteurs',sites,"(auteurs)")

print "    - recuperation des ids des auteurs dans la table \"auteurs\" (index SQL) pour reinjecter dans la table \"billets\"..."
auteurs = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'auteurs','id,auteurs')
dic_auteurs ={}
for aut in auteurs:
	dic_auteurs[aut[1]] = aut[0]
site_billets = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'billets','id,site')

auteur_index=[]
for sit in site_billets:
	id_b= sit[0]
	names =sit[1].split(" *%* ")
	for nom in names:
		sit_name = text_processing.nettoyer_url(nom)
		if sit_name in dic_auteurs:
Example #6
try:
	fonctions_bdd.drop_table(name_bdd,'concept2billets')
except:
	pass
fonctions_bdd.creer_table_concept2billets(name_bdd,'concept2billets')

con2bill = []
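#one row per (concept, post) pair: concept id, post id, day of the post, the query and a unique identifier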
for couple in concepts_index:
	id_b = couple[0]
	for con in couple[1]:
		con2bill.append([con,id_b,billet_jour[id_b],requete,str(con)+'_'+str(id_b)])



fonctions_bdd.remplir_table(name_bdd,'concept2billets',con2bill,"(concept,id_b,jours,requete,identifiant_unique)")






#next, populate the socsem table linking actor indexes to concept indexes and to the day of the link

def aggreger_periode(liensem):
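	# (truncated example) aggregate the semantic links over the time bins defined in
	# parameters.years_bins into weighted co-occurrence rows (cf. the commented-out
	# 'sem_weighted' schema below)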
	#lienssem.append([con1,con2,jours,b_id,requete,str(b_id)+'_' + str(con1) + '_' + str(con2)])
	#fonctions_bdd.remplir_table(name_bdd,'sem_weighted',lienssem_weighted,"(concept1,concept2,periode,cooccurrences,requete,identifiant_unique)")
	years_bins=parameters.years_bins
	lienssem_weighted_dict={}
	lienssem_weighted=[]
	requete=''
Example #7
name_data = parameters.name_data 
requete = parameters.requete
user_interface=parameters.user_interface
name_data_real = parameters.name_data_real
#lemmadictionnary = parameters.lemmadictionary# if null, compute or recompute the lemma dictionary for the considered query, otherwise skip
path_req = parameters.path_req
sep = parameters.sep
build_link_tables=parameters.build_link_tables
language = parameters.language
###################################
#######3. Index the posts#######
###################################


###################################
####build the links#########
###################################

#finally, populate the soc table linking actor indexes to each other and to the day of the link
try: 
	fonctions_bdd.detruire_table(name_bdd,'soc')
except: 
	pass
fonctions_bdd.creer_table_soc(name_bdd,'soc')
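#build the author-author links (one row per pair of authors, with the day and the post id of the link)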
lienssoc = misc.build_social_net(requete,name_bdd,sep,name_data)
fonctions_bdd.remplir_table(name_bdd,'soc',lienssoc,"(auteur1,auteur2,jours,id_b,requete,identifiant_unique)")
print "\n--- finished inserting data in table soc."



Example #8
		y=couple[1]
		if x in dict_mono_dict:
			temp = dict_mono_dict[x]
		else:
			temp = {}
		temp[y] = float(sum(valeurs.values())) / n  #mean link strength over the n time bins
		dict_mono_dict[x]=temp
	return dict_mono_dict

n=len(years_bins)
dist_1d = turn1d_moy(dist_2d)
dist_1d_trans = turn1d_moy(dist_2d_trans)

def export_nfirst(dist_2d,dist_1d,direction,nfirst):
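	# For each term, keep its nfirst neighbours with the highest average force and return
	# one row per (term, neighbour) pair: distances per period, mean force and direction.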
	dist_2d_vector = []
	for x,vois_forcemoy in dist_1d.iteritems():
		vois_forcemoy_sorted = sorted(vois_forcemoy.iteritems(), key=operator.itemgetter(1),reverse=True)
		for voisin_force in vois_forcemoy_sorted[:nfirst]:
			voisin = voisin_force[0]
			force =voisin_force[1]
			dist_2d_vector.append((x,voisin,','.join(map(str,add_zeros(dist_2d[(x,voisin)],years_bins))),str("%.3f" %(force)),direction))
	return dist_2d_vector
	
thres = 50
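#keep the 50 strongest neighbours of each term and fill 'termneighbour' for both
#directions (direct links, direction '1', then transposed links, direction '0')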
dist_2d_vector=export_nfirst(dist_2d,dist_1d,'1',thres)
fonctions_bdd.remplir_table(name_bdd,'termneighbour',dist_2d_vector,"(term1,term2, distances,force_moy,direction)")

dist_2d_vector_trans=export_nfirst(dist_2d_trans,dist_1d_trans,'0',thres)
fonctions_bdd.remplir_table(name_bdd,'termneighbour',dist_2d_vector_trans,"(term1,term2,distances,force_moy,direction)")
#fonctions.ecrire_reseau(dist_mat,years_bins,dist_type,seuil,1,dedoubler(dico_termes,years_bins))