	ngrammes_fit_index.append(clique_index)
	print " + concept index list built"
	# count, for every term, the number of distinct authors who use it
	for aut, clique in ngrammes_auteurs_fit.iteritems():
		for terme in set(clique):
			dictionnaire_frequence_exact_auteur[terme] = dictionnaire_frequence_exact_auteur[terme] + 1
	print " + per-author term counts accumulated"
	file_freq_exact = path_req + 'years/' + requete + str(year) + '_' + 'frequences_exactes.csv'
	fichier_out = open(file_freq_exact, 'w')

	def fmt(value):
		# fixed-precision rendering; renamed from format() so as not to
		# shadow the built-in
		return "%.9f" % value

	# one tab-separated row per term: lemma id, surface form, document
	# frequency (over N billets), author frequency (over Nb_auteurs),
	# with ',' as the decimal separator
	for x in dictionnaire_frequence_exact:
		fichier_out.write(str(x) + '\t' + str(correspondance_lemme_forme[x]) + '\t' + fmt(float(dictionnaire_frequence_exact[x]) / N).replace('.', ',') + '\t' + fmt(float(dictionnaire_frequence_exact_auteur[x]) / float(Nb_auteurs)).replace('.', ',') + '\n')
	fichier_out.close()
	print " + exact frequencies computed " + file_freq_exact

# create the per-year output directory, then compute the exact
# frequencies of every year in parallel and merge the results
try:
	os.mkdir(path_req + 'years/')
except OSError:
	pass  # directory already exists
pool_size = int(multiprocessing.cpu_count())
pool = multiprocessing.Pool(processes=pool_size)
pool.map(freq_comp, years)
fusion_years.fusion('freq')
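# A minimal sketch (not part of the pipeline) of how the
# 'frequences_exactes.csv' files written above could be read back.
# The column layout is assumed from the write() call: lemma id, surface
# form, document frequency, author frequency, tab-separated, with ','
# as the decimal separator. read_exact_frequencies is a hypothetical helper.
import csv

def read_exact_frequencies(path):
	rows = {}
	with open(path) as handle:
		for lemme, forme, freq_doc, freq_aut in csv.reader(handle, delimiter='\t'):
			# undo the '.' -> ',' localization applied on export
			rows[lemme] = (forme, float(freq_doc.replace(',', '.')), float(freq_aut.replace(',', '.')))
	return rows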
	dictionnaire_gramme_year[y] = x
fonctions_lib.dumpingin(dictionnaire_gramme_year, name_export_pkl, requete)
# split by period:
print dictionnaire_gramme_year.keys()
# then iterate year by year
try:
	os.mkdir(path_req + 'years/')
except OSError:
	pass  # directory already exists
for y, year in enumerate(years):
	# sort by frequency and export the final lexicon with its occurrence counts
	print '\n'
	print year
	dico_final = misc.freq_tri(dictionnaire_gramme_year[y], freqmin, int(math.floor(top * 1.1)), language, ng_filter)  # sort our dictionary
	filename = path_req + 'years/' + requete + '_' + str(freqmin) + '_' + str(year) + '_' + 'liste_n-grammes_freq_divers.csv'
	filename_redond = path_req + 'years/' + requete + '_' + str(freqmin) + '_' + str(year) + '_' + 'liste_n-grammes_freq_divers_noredond.csv'
	filename_redond_leven = path_req + 'years/' + requete + '_' + str(freqmin) + '_' + str(year) + '_' + 'liste_n-grammes_freq_divers_leven_noredond.csv'
	misc.ecrire_liste_lemmes_freq(dico_final, Nb_rows, filename, lemme_maj, freqmin, ng_filter)  # write the above list to the file `filename`
	print "\n+++" + str(len(dico_final)) + " n-lemmas created."
	# leven.pack_rendondance(filename, filename_redond, maxTermLength, freqmin, language, redondance_manuelle, ng_filter, user_interface)
	leven.pack_rendondance_exact(filename, filename_redond, maxTermLength, freqmin, language, ng_filter, user_interface)
	print "\n"
	# number of billets whose day falls inside the current year window,
	# used by the Levenshtein pass below
	Nb_rows = fonctions_bdd.count_rows_where(name_bdd, 'billets', " where jours IN ('" + "','".join(list(map(str, year))) + "') ")
	print Nb_rows
	leven.pack_leven(filename_redond, filename_redond_leven, language, user_interface, freqmin, Nb_rows)
fusion_years.fusion('redond')
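# Illustrative note on the where-clause built above: `year` is iterated as
# a collection of day values, so the join produces (example days made up):
#   example_days = ['20080101', '20080102']
#   " where jours IN ('" + "','".join(list(map(str, example_days))) + "') "
#   -> " where jours IN ('20080101','20080102') "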
#	l = distribution_distance.items()
#	l.sort(key=itemgetter(1), reverse=True)
#	dico_final_top = {}
#	synonymes_potentiels = open(path_req + 'synonymes.txt', 'w')
#	for x in l[:10000]:
#		couple = x[0]
#		# if p_cooccurrences[(couple[0], couple[0], 0)] * NN > freqmin and p_cooccurrences[(couple[1], couple[1], 0)] * NN > freqmin:
#		# print dico_termes[couple[0]] + '\t' + dico_termes[couple[1]] + '\t' + str(float(distribution_distance[couple]))
#		synonymes_potentiels.write(dico_termes[couple[0]] + '\t' + dico_termes[couple[1]] + '\t' + str(float(distribution_distance[couple])) + '\n')
#	timeapres = timeavt
#	timeavt = time()
#	print 'duration of the last step: ' + str(timeavt - timeapres) + '\n'
#	print "cooccurrence matrix built"
	muti = build_mutual_information(p_cooccurrences, p_cooccurrences_ordre1, nb_billets, occurrences, top_concepts_dict)
	xhi2val = xhi2(muti)
	export_concepts_xhi2(xhi2val, p_cooccurrences, p_cooccurrences_ordre1, dico_termes, dico_lemmes, year, occurrences)

# the multiprocessing variant is kept for reference; the sequential loop
# below is the one currently in use
# pool_size = int(multiprocessing.cpu_count())
# pool = multiprocessing.Pool(processes=pool_size)
# print years
# pool.map(xhi2_comp, years)
for year in years:
	xhi2_comp(year)
fusion_years.fusion('conceptsxhi2')
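# A minimal, hypothetical sketch of the mutual-information scoring that
# build_mutual_information above is assumed to perform: pointwise mutual
# information of each concept pair from joint and marginal cooccurrence
# probabilities (the real implementation lives elsewhere in the code base).
import math

def pmi_sketch(p_joint, p_marginal):
	# p_joint: {(i, j): p(i, j)}, p_marginal: {i: p(i)}
	scores = {}
	for (i, j), p in p_joint.iteritems():
		if p > 0:
			# pmi(i, j) = log(p(i, j) / (p(i) * p(j)))
			scores[(i, j)] = math.log(p / (p_marginal[i] * p_marginal[j]))
	return scores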