def query_exander(query,N): encore=1 while encore==1: dico_new=0 dico=0 #construit la base name_bdd_new en fonction de la query id_new_list = fonctions_bdd.select_bdd_table_champ_simple(name_bdd_new,'billets','id') N_new = len(id_new_list) id_new=[] for x in id_new_list: id_new.append(x[0]) dico_new = fast_ngram_counter(name_bdd_new,'') print len(dico_new.keys()) dico= fast_ngram_counter(name_bdd,dico_new.keys()) print len(dico.keys()) ratio= out_doc(dico_new,dico) ratio_l = trier_dictionnaire(ratio) steps=100000 nb_question=0 champs_name = "(id,title,date,permalink,site,categorie1,categorie2,categorie3,content,content_lemmatise,href,requete,identifiant_unique)"#on n'enregistre pas le html brut mode_dynamique=0 if mode_dynamique==1: for x in ratio_l: if nb_question<steps: val = x[1] if dico_new[x[0]]!=dico[x[0]]: info ='\n' #affichage des exemples: exemple=0 if exemple >0: nouveaux_billets = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'billets',champs_name[1:-1],"where content_lemmatise like '% " + x[0] +" %'") for billets in nouveaux_billets[:9]: if not billets[0] in id_new: info=info + '*** '+ billets[1] + '(' + billets[4] + ')' + '\n' print info print str(dico_new[x[0]]) +' doc. in ( '+str(float(dico_new[x[0]])/float(N_new)*100.) +'% )' + ' vs ' + str(dico[x[0]])+' doc. out ( '+str(float(dico[x[0]])/float(N)*100.) +'% )' + ' => ratio: ' + str(float(dico_new[x[0]])/float(N_new)/(float(dico[x[0]])/float(N))) var = raw_input('Do you wish to add "' + x[0] + '" to the query ?') if var=='y': query = add_query(query,x[0]) fonctions_bdd.remplir_table(name_bdd_new,'billets',nouveaux_billets,champs_name) nb_question=nb_question+1 if var=='s': steps=0 else: query = add_query(query,x[0]) else: pass else: fileout=open(path_req + 'query_extension.csv','w') print path_req + 'query_extension.csv' fileout.write('nlemme' + '\t' + ' nb doc. 
in ' + '\t' + "pourcentage doc in" +'\t' + ' nb doc out ' + '\t'+'poucentage doc out' +'\t' +' ratio: ' +'\n') for x in ratio_l: fileout.write(str(x[0]) +'\t' + str(dico_new[x[0]]) +'\t'+str(float(dico_new[x[0]])/float(N_new)*100.) + '\t' + str(dico[x[0]])+' \t'+str(float(dico[x[0]])/float(N)*100.) +'\t ' + str(float(dico_new[x[0]])/float(N_new)/(float(dico[x[0]])/float(N))) + '\n') print str(x[0]) +'\t' + str(dico_new[x[0]]) +'\t'+str(float(dico_new[x[0]])/float(N_new)*100.) + '\t' + str(dico[x[0]])+' \t'+str(float(dico[x[0]])/float(N)*100.) +'\t ' + str(float(dico_new[x[0]])/float(N_new)/(float(dico[x[0]])/float(N))) + '\n' print 'query finale' print query var = raw_input('Do you wish to perform a new indexation of the database based on the new query ?') if var == 'n': encore=0 return query
# --- Build the working SQL database from the raw data dump -------------------
# NOTE(review): top-level script section; relies on module-level name_data,
# name_bdd and on the fonctions_bdd helper module defined elsewhere.
print "+++ processing raw database \""+name_data+"\" into SQL database file \""+name_bdd+"\""
sep = ' *** '
##############################################################
####### 1. import posts from an external database ############
##############################################################
# To adjust depending on where we run from:
# name_bdd_new = name_bdd with '_new' inserted before its last two
# dot-separated components (e.g. foo.db.sqlite -> foo_new.db.sqlite).
name_bdd_new = '.'.join(name_bdd.split('.')[:-2]) + '_new.' + '.'.join(name_bdd.split('.')[-2:])
print name_bdd
print name_bdd_new
champs_name = "(title,date,permalink,site,categorie1,categorie2,categorie3,content_lemmatise,content,href,jours,concepts,identifiant_unique,requete,concepts_id)"  # raw html is not stored
# Copy every post from the '_new' database into the main one.
billet_new = fonctions_bdd.select_bdd_table_champ_simple(name_bdd_new,'billets',champs_name[1:-1])
champs=billet_new
fonctions_bdd.remplir_table(name_bdd,'billets',champs,champs_name)
###################################
####### 2. compute side tables ####
###################################
# (Re)create the authors table; the drop fails harmlessly when the table
# does not exist yet (deliberate best-effort).
try:
    fonctions_bdd.detruire_table(name_bdd,'auteurs')
except:
    pass
print " + creation de la table auteurs..."
fonctions_bdd.creer_table_auteurs(name_bdd,'auteurs')
## on alimente la table concepts avec la liste des concepts trie et leur forme principale file_concepts=codecs.open(dictionnaire_treetagged__formemajoritaire_name,'r','utf-8') liste_concepts=[] correspondance_lemme_forme={} for ligne in file_concepts.readlines(): lignev = ligne.split('\t') liste_concepts.append((lignev[0].encode('utf-8','replace'),lignev[1].encode('utf-8','replace'))) correspondance_lemme_forme[lignev[0].encode('utf-8','replace')]=lignev[1].encode('utf-8','replace') print "&&&",len(liste_concepts),"concepts now." ##si necessaire on recree la table concepts #on remplit la table concept #print liste_concepts fonctions_bdd.remplir_table(name_bdd,'concepts',liste_concepts,"(concepts,forme_principale)") contenu = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'concepts','id,concepts') liste_concepts_dico={} for con in contenu: liste_concepts_dico[con[1]]=con[0] def freq_comp(year): y=years.index(year) #for y,year in enumerate(years): #on construit la liste des index des concepts dans l'ordre ngrammes_fit_index=[] Nb_auteurs = fonctions_bdd.count_rows(name_bdd,'auteurs')
def ecrire_tables_cluster_phylo(nodes,edges,sortie,level,time,attribut,sonsbis,fathersbis,dico_termes,indexs_inv,map_dessous,transition,sep_label):
    # Export the phylogenetic cluster graph: fills the SQL tables 'cluster',
    # 'phylo' and 'maps' and writes a Graphviz file
    # <path_req>/site/ExportPhyloDetails.dot drawn period by period.
    #
    # nodes/edges : cluster graph (node id -> label, (id1,id2) -> strength).
    # level/time/attribut/sonsbis/fathersbis : per-node metadata dicts.
    # dico_termes : term id -> term string; indexs_inv maps universal cluster
    #   ids to 'level_number_period' strings; transition maps universal ids
    #   to local ids; sep_label separates the two terms of a node label.
    # NOTE(review): parameter 'sortie' is never used in the visible body.
    dico_termes_inv = invert_dict(dico_termes)
    transition_inv = invert_dict(transition)
    variables_cluster=[]
    variables_phylo=[]
    variables_maps=[]
    # Create the output directory; mkdir fails harmlessly when it exists.
    try:
        print 'path_req' + 'site'
        os.mkdir(path_req + 'site')
    except:
        pass
    fichier_dot=open(path_req + 'site' +'/' + 'ExportPhyloDetails.dot','w')
    fichier_dot.write('digraph arbre_phylogenetique {\n')
    labels_annees=[]  # period labels already emitted
    ranks = {}        # period -> list of node names forced onto the same rank
    for idx,lab in nodes.iteritems():
        #print idx
        id_cluster_univ=transition_inv[idx]
        tim = time[idx]
        lev = level[idx]
        att = attribut[idx]
        son = sonsbis[idx]
        fat = fathersbis[idx]
        # The label holds two main terms joined by sep_label.
        labv = lab.split(sep_label)
        labv.sort()
        lab1 = labv[0]
        lab2 = labv[1]
        lab_1 = dico_termes_inv[lab1]
        lab_2 = dico_termes_inv[lab2]
        label_annee = get_label_annees(tim)
        index_local = indexs_inv[id_cluster_univ]
        [niv,numero,inter] = index_local.split('_')
        if not label_annee in labels_annees:
            # First cluster of this period: emit the period node and chain it
            # to the previous period node to force a timeline backbone.
            labels_annees.append(label_annee)
            ranks[str(tim)]=['P' + str(tim)]
            fichier_dot.write('P' + str(tim) +' [shape = rect ,fontsize=22,label="'+label_annee.replace(' ','-')+ '"]\n')
            if len(labels_annees)>1:
                fichier_dot.write('P' + str(tim) + ' -> ' + mem + '\n')
            mem = 'P' + str(tim)
            # NOTE(review): bare method reference below is a no-op (call
            # parentheses missing) — kept byte-identical; likely a leftover.
            fichier_dot.write
        temp = ranks[str(tim)]
        temp.append('C' + str(tim) + '_' + str(numero))
        ranks[str(tim)]=temp
        # Emit the cluster node; its label lists the concepts it contains.
        fichier_dot.write('C' + str(tim) + '_' + str(numero) +' [style=filled,fontname=Arial,fontsize=12,peripheries=2,color=lightblue2 ,shape=rect, label="')
        for con in map_dessous[id_cluster_univ]:
            con = indexs_inv[con].split('_')[1]
            variables_cluster.append([idx,numero,lab_1,lab_2,lab,att,lev,label_annee,fat,son,idx,con])
            fichier_dot.write('\\n'+dico_termes[int(con)])
        fichier_dot.write('",fontsize=12]\n')
    # ex('CREATE TABLE '+ name_table +' (id INTEGER ,id_cluster INTEGER,label_1 INTEGER,label_2 INTEGER,label VARCHAR(300),attribut VARCHAR(300),level INTEGER,periode VARCHAR(50),concept INT, pseudo VARCHAR(10), cluster_size INT, density VARCHAR(10), nb_fathers INT, nb_sons INT, lettre VARCHAR(3), identifiant_unique VARCHAR(20))')
    variables_cluster_names = "(id_cluster_univ,id_cluster,label_1,label_2,label,attribut,level,periode,nb_fathers , nb_sons,identifiant_unique,concept)"#concept,pseudo , cluster_size , density , lettre , ))"
    fonctions_bdd.remplir_table(name_bdd,'cluster',variables_cluster,variables_cluster_names)
    # Second pass over edges: same-period links go to 'maps', inter-period
    # links to 'phylo' (drawn in red in the .dot file).
    for source,streng in edges.iteritems():
        id_cluster_1_univ=transition_inv[source[0]]
        id_cluster_2_univ=transition_inv[source[1]]
        [niv_1,id_cluster_1,inter_1] = indexs_inv[id_cluster_1_univ].split('_')
        [niv_2,id_cluster_2,inter_2] = indexs_inv[id_cluster_2_univ].split('_')
        label_annee_1 = get_label_annees(int(inter_1))
        label_annee_2 = get_label_annees(int(inter_2))
        if niv_1 == niv_2:
            if inter_1 == inter_2:
                variables_maps.append([id_cluster_1,label_annee_1,source[0],id_cluster_2,label_annee_2,source[1],str(streng)])
                #print streng
                # enable intra-period links:
                #if float(streng)>0.:
                #fichier_dot.write('C'+str(inter_1) + '_' + str(id_cluster_1) + '->' + 'C'+str(inter_2) + '_' + str(id_cluster_2) + ' ' + '[style="setlinewidth(2)",color=green]\n')
            else:
                variables_phylo.append([id_cluster_1,label_annee_1,source[0],id_cluster_2,label_annee_2,source[1],str(streng)])
                fichier_dot.write('C'+str(inter_1) + '_' + str(id_cluster_1) + '->' + 'C'+str(inter_2) + '_' + str(id_cluster_2) + ' ' + '[style="setlinewidth(2)",color=red]\n')
    # Force all clusters of one period onto the same horizontal rank.
    for inter,ra in ranks.iteritems():
        fichier_dot.write('{rank=same;')
        for x in ra:
            fichier_dot.write(str(x)+' ')
        fichier_dot.write('}\n')
    fichier_dot.write('}\n')
    # NOTE(review): fichier_dot is never closed here — presumably flushed at
    # interpreter exit; confirm before relying on the file downstream.
    variables_phylo_names = "(id_cluster_1,periode_1,id_cluster_1_univ,id_cluster_2,periode_2,id_cluster_2_univ,strength)"
    variables_maps_names = "(id_cluster_1,periode_1,id_cluster_1_univ,id_cluster_2,periode_2,id_cluster_2_univ,strength)"
    fonctions_bdd.remplir_table(name_bdd,'phylo',variables_phylo,variables_phylo_names)
    fonctions_bdd.remplir_table(name_bdd,'maps',variables_maps,variables_maps_names)
# Load the forbidden-source list: first tab-separated column of each line,
# with its trailing character stripped.
# NOTE(review): 'file' and 'sites_interdits' are defined before this chunk.
for lignes in file.readlines():
    sites_interdits.append(lignes.split('\t')[0][:-1])
print " - on charge la liste des sources inerdites..."
print " - remplissage de la table auteurs..."
# Collect the distinct, cleaned site names of all posts (skipping forbidden
# sources) and store them in the 'auteurs' table.
sortie = fonctions_bdd.select_bdd_table(name_bdd,'billets','site',requete)
sites = set()
for sor in sortie:
    names =sor[0].split(" *%* ")  # a post may list several sites
    for nom in names:
        site_name=text_processing.nettoyer_url(nom)
        if site_name not in sites_interdits:
            sites.add(site_name)
sites=list(sites)
fonctions_bdd.remplir_table(name_bdd,'auteurs',sites,"(auteurs)")
print " - recuperation des ids des auteurs dans la table \"auteurs\" (index SQL) pour reinjecter dans la table \"billets\"..."
# Map each author name to its SQL id, then walk the posts again to link
# post ids to author ids.
auteurs = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'auteurs','id,auteurs')
dic_auteurs ={}
for aut in auteurs:
    dic_auteurs[aut[1]] = aut[0]
site_billets = fonctions_bdd.select_bdd_table_champ_simple(name_bdd,'billets','id,site')
auteur_index=[]
for sit in site_billets:
    id_b= sit[0]
    names =sit[1].split(" *%* ")
    for nom in names:
        sit_name = text_processing.nettoyer_url(nom)
        if sit_name in dic_auteurs:
            # (this branch's body continues beyond this chunk of the file)
# (Re)create the concept2billets table linking each concept to the posts it
# occurs in; the drop fails harmlessly when the table does not exist yet.
try:
    fonctions_bdd.drop_table(name_bdd,'concept2billets')
except:
    pass
fonctions_bdd.creer_table_concept2billets(name_bdd,'concept2billets')
con2bill = []
# concepts_index: (post_id, [concept ids...]) pairs, built before this chunk.
for couple in concepts_index:
    id_b = couple[0]
    for con in couple[1]:
        # One row per (concept, post): concept id, post id, day of the post,
        # query, and a 'concept_post' unique identifier.
        con2bill.append([con,id_b,billet_jour[id_b],requete,str(con)+'_'+str(id_b)])
fonctions_bdd.remplir_table(name_bdd,'concept2billets',con2bill,"(concept,id_b,jours,requete,identifiant_unique)")
# Next, fill the socsem table linking actor indexes to concept indexes and
# to the day of the link.
def aggreger_periode(liensem):
    # Aggregate semantic links over the configured year bins.
    # NOTE(review): this definition continues beyond this chunk of the file;
    # only its first lines are visible here.
    #lienssem.append([con1,con2,jours,b_id,requete,str(b_id)+'_' + str(con1) + '_' + str(con2)])
    #fonctions_bdd.remplir_table(name_bdd,'sem_weighted',lienssem_weighted,"(concept1,concept2,periode,cooccurrences,requete,identifiant_unique)")
    years_bins=parameters.years_bins
    lienssem_weighted_dict={}
    lienssem_weighted=[]
    requete=''
# --- Load run parameters from the parameters module --------------------------
name_data = parameters.name_data
requete = parameters.requete
user_interface=parameters.user_interface
name_data_real = parameters.name_data_real
#lemmadictionnary = parameters.lemmadictionary# if null, the lemma dictionary is (re)computed for the current query, otherwise it is left as is
path_req = parameters.path_req
sep = parameters.sep
build_link_tables=parameters.build_link_tables
language = parameters.language
###################################
####### 3. index the posts ########
###################################
###################################
##### build the link tables #######
###################################
# Fill the 'soc' table linking author indexes to each other and to the day
# of the link; the drop fails harmlessly when the table does not exist yet.
try:
    fonctions_bdd.detruire_table(name_bdd,'soc')
except:
    pass
fonctions_bdd.creer_table_soc(name_bdd,'soc')
lienssoc = misc.build_social_net(requete,name_bdd,sep,name_data)
fonctions_bdd.remplir_table(name_bdd,'soc',lienssoc,"(auteur1,auteur2,jours,id_b,requete,identifiant_unique)")
print "\n--- finished inserting data in table soc."
        # NOTE(review): tail of a function whose head lies before this chunk
        # (apparently iterating '(x, y) -> valeurs' items of a 2-D distance
        # dict into dict_mono_dict); indentation below is reconstructed —
        # confirm against the full file.
        y=couple[1]
        if x in dict_mono_dict:
            temp = dict_mono_dict[x]
        else:
            temp = {}
        # Average the per-period values over the n year bins.
        temp[y]=moy = float(sum(valeurs.values())/n)
        dict_mono_dict[x]=temp
    return dict_mono_dict
n=len(years_bins)
# Collapse the 2-D distance dicts into per-term average-strength dicts.
dist_1d = turn1d_moy(dist_2d)
dist_1d_trans = turn1d_moy(dist_2d_trans)
def export_nfirst(dist_2d,dist_1d,direction,nfirst):
    # For each term keep its nfirst strongest neighbours (sorted by average
    # force, descending) and flatten them into rows for the 'termneighbour'
    # table: (term1, term2, comma-joined per-period distances, mean force
    # formatted to 3 decimals, direction flag).
    dist_2d_vector = []
    for x,vois_forcemoy in dist_1d.iteritems():
        vois_forcemoy_sorted = sorted(vois_forcemoy.iteritems(), key=operator.itemgetter(1),reverse=True)
        for voisin_force in vois_forcemoy_sorted[:nfirst]:
            voisin = voisin_force[0]
            force =voisin_force[1]
            dist_2d_vector.append((x,voisin,','.join(map(str,add_zeros(dist_2d[(x,voisin)],years_bins))),str("%.3f" %(force)),direction))
    return dist_2d_vector
thres = 50  # number of neighbours kept per term
dist_2d_vector=export_nfirst(dist_2d,dist_1d,'1',thres)
fonctions_bdd.remplir_table(name_bdd,'termneighbour',dist_2d_vector,"(term1,term2, distances,force_moy,direction)")
dist_2d_vector_trans=export_nfirst(dist_2d_trans,dist_1d_trans,'0',thres)
fonctions_bdd.remplir_table(name_bdd,'termneighbour',dist_2d_vector_trans,"(term1,term2,distances,force_moy,direction)")
#fonctions.ecrire_reseau(dist_mat,years_bins,dist_type,seuil,1,dedoubler(dico_termes,years_bins))