def fast_ngram_counter(name_bdd, concept_list=''):
    """Count n-grams over the 'billets' table with a pool of worker processes.

    The rows of ``billets`` are split into sequences of 5000 rows. Each
    sequence is handed to ``fast_ngram_counter_x`` through
    ``multiprocessing.Pool.map`` and the per-sequence dictionaries are then
    folded together with ``fonctions_lib.merge``, using addition as the
    combining function.

    Relies on the module-level names ``sample``, ``freqmin``, ``top``,
    ``language`` and ``ng_filter``.

    Args:
        name_bdd: Database identifier, forwarded to
            ``fonctions_bdd.count_rows`` and to every worker.
        concept_list: Forwarded to every worker. When left at its default
            (``''``), the merged dictionary is additionally passed through
            ``misc.freq_tri`` before being returned.

    Returns:
        The merged dictionary, or the result of ``misc.freq_tri`` applied to
        it when ``concept_list == ''``.
    """
    import multiprocessing

    Nb_rows = fonctions_bdd.count_rows(name_bdd, 'billets')
    size_seq = 5000
    # Explicit floor division: same result as the historical Python 2 `/` on
    # integers, and still an integer if true division is ever enabled.
    nb_sequences = Nb_rows // size_seq
    # One extra task covers the trailing, possibly partial, sequence.
    nb_tasks = nb_sequences + 1

    inputs = [
        (x, size_seq, Nb_rows, sample, nb_sequences, concept_list, name_bdd)
        for x in range(nb_tasks)
    ]

    pool_size = min(nb_tasks, multiprocessing.cpu_count() * 4)
    pool = multiprocessing.Pool(processes=pool_size)
    try:
        pool_outputs = pool.map(fast_ngram_counter_x, inputs)
    finally:
        # Always release the worker processes, even when a worker raised;
        # otherwise every call leaves `pool_size` idle processes behind.
        pool.close()
        pool.join()

    # Fold the per-sequence dictionaries into one, adding values on merge.
    dictionnaire_gramme = {}
    for dictionnaire_gramme_x in pool_outputs:
        dictionnaire_gramme = fonctions_lib.merge(
            dictionnaire_gramme, dictionnaire_gramme_x, lambda x, y: x + y
        )

    if concept_list == '':
        # No concept list supplied: sort/filter the dictionary by frequency.
        dictionnaire_gramme = misc.freq_tri(
            dictionnaire_gramme,
            freqmin,
            int(math.floor(top * 1.1)),
            language,
            ng_filter,
        )
    return dictionnaire_gramme
# Store each pool output under its position in `pool_outputs`, then dump the
# resulting per-period dictionary via fonctions_lib.dumpingin.
for y, x in enumerate(pool_outputs):
    dictionnaire_gramme_year[y] = x
fonctions_lib.dumpingin(dictionnaire_gramme_year, name_export_pkl, requete)

# Split by period:
print(dictionnaire_gramme_year.keys())

# Then iterate year by year.
try:
    os.mkdir(path_req + 'years/')
except OSError:
    # Best effort: the directory may already exist. Only OS-level failures
    # are ignored, so programming errors and Ctrl-C are no longer swallowed.
    pass

for y, year in enumerate(years):
    # Sort by frequency and export the final lexicon with its occurrences.
    print('\n')
    print(year)
    # Sort/filter this period's dictionary.
    dico_final = misc.freq_tri(
        dictionnaire_gramme_year[y],
        freqmin,
        int(math.floor(top * 1.1)),
        language,
        ng_filter,
    )
    # NOTE(review): the two "noredond" names have no '_' between freqmin and
    # year, unlike `filename`. Kept as-is because other code may rely on
    # these exact file names - confirm before normalising.
    filename = (path_req + 'years/' + requete + '_' + str(freqmin) + '_'
                + str(year) + '_' + 'liste_n-grammes_freq_divers.csv')
    filename_redond = (path_req + 'years/' + requete + '_' + str(freqmin)
                       + str(year) + '_'
                       + 'liste_n-grammes_freq_divers_noredond.csv')
    filename_redond_leven = (path_req + 'years/' + requete + '_'
                             + str(freqmin) + str(year) + '_'
                             + 'liste_n-grammes_freq_divers_leven_noredond.csv')
    # Write the sorted list to `filename`.
    # NOTE(review): `Nb_rows` is reassigned further down in this loop, so
    # from the second iteration on this call receives the previous period's
    # row count - confirm whether that is intended.
    misc.ecrire_liste_lemmes_freq(
        dico_final, Nb_rows, filename, lemme_maj, freqmin, ng_filter
    )
    print("\n+++" + str(len(dico_final)) + " n-lemmes crees.")
    # Disabled alternative:
    # leven.pack_rendondance(filename,filename_redond,maxTermLength,freqmin,language,redondance_manuelle,ng_filter,user_interface)
    leven.pack_rendondance_exact(
        filename, filename_redond, maxTermLength, freqmin, language,
        ng_filter, user_interface
    )
    print("\n")
    # Count the rows whose `jours` value is one of this period's entries.
    # `year` is iterated here, so it is presumably a collection of day values.
    Nb_rows = fonctions_bdd.count_rows_where(
        name_bdd,
        'billets',
        " where jours IN ('" + "','".join(map(str, year)) + "') ",
    )
    print(Nb_rows)
    leven.pack_leven(
        filename_redond, filename_redond_leven, language, user_interface,
        freqmin, Nb_rows
    )

fusion_years.fusion('redond')