ngrammes_fit_index.append(clique_index)
	print "    + concept index list created"


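	# per-author (document) frequency: each term counts at most once per author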
	for aut, clique in ngrammes_auteurs_fit.iteritems():
		for terme in set(clique):
			dictionnaire_frequence_exact_auteur[terme] += 1
	print "    + exact author frequencies computed"

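	# export the exact frequencies (per occurrence and per author) as a tab-separated CSV with decimal commas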
	file_freq_exact = path_req + 'years/' + requete + str(year) + '_' + 'frequences_exactes.csv'
	fichier_out = open(file_freq_exact, 'w')
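	# helper: render a float with 9 decimal places (note: shadows the built-in format())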
	def format(value):
	    return "%.9f" % value

	for x in dictionnaire_frequence_exact:	
		fichier_out.write(str(x) + '\t' + str(correspondance_lemme_forme[x]) + '\t' + format(float(dictionnaire_frequence_exact[x])/N).replace('.',',') + '\t' + format(float(dictionnaire_frequence_exact_auteur[x])/float(Nb_auteurs)).replace('.',',') + '\n')
	fichier_out.close()
	print "    + exact frequencies computed   " + file_freq_exact

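# create the per-year output directory if it does not already exist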
try:
	os.mkdir(path_req + 'years/')
except OSError:
	pass

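# compute the per-year frequency tables in parallel, one worker per CPU core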
pool_size = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=pool_size)
pool.map(freq_comp, years)
pool.close()
pool.join()

fusion_years.fusion('freq')
Example n. 2
		dictionnaire_gramme_year[y]=x
	fonctions_lib.dumpingin(dictionnaire_gramme_year,name_export_pkl,requete)


# split into periods:
print dictionnaire_gramme_year.keys()
# then iterate year by year
try:
	os.mkdir(path_req + 'years/')
except OSError:
	pass
for y,year in enumerate(years):
	# sort by frequency and export the final lexicon with its occurrence counts
	print '\n'
	print year
	
	dico_final = misc.freq_tri(dictionnaire_gramme_year[y],freqmin,int(math.floor(top*1.1)),language,ng_filter)  # filter and sort the dictionary by frequency
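	# three per-year outputs: raw frequency list, redundancy-packed list, Levenshtein-deduplicated list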
	filename = path_req +'years/'+ requete + '_' + str(freqmin) + '_' +str(year) + '_'+ 'liste_n-grammes_freq_divers.csv'
	filename_redond = path_req + 'years/' + requete + '_' + str(freqmin) + '_' + str(year) + '_' + 'liste_n-grammes_freq_divers_noredond.csv'
	filename_redond_leven = path_req + 'years/' + requete + '_' + str(freqmin) + '_' + str(year) + '_' + 'liste_n-grammes_freq_divers_leven_noredond.csv'
	misc.ecrire_liste_lemmes_freq(dico_final,Nb_rows,filename,lemme_maj,freqmin,ng_filter)  # write the list above to the file filename
	print "\n+++" + str(len(dico_final)) + " n-lemmas created."
	#leven.pack_rendondance(filename,filename_redond,maxTermLength,freqmin,language,redondance_manuelle,ng_filter,user_interface)
	leven.pack_rendondance_exact(filename,filename_redond,maxTermLength,freqmin,language,ng_filter,user_interface)
	print "\n"
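	# count the posts whose day falls inside the current period; passed to the Levenshtein pass below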
	Nb_rows = fonctions_bdd.count_rows_where(name_bdd,'billets'," where jours IN ('" + "','".join(list(map(str,year))) + "') ")
	print Nb_rows
	leven.pack_leven(filename_redond,filename_redond_leven,language,user_interface,freqmin,Nb_rows)

fusion_years.fusion('redond')
Example n. 3
	# 
	# l = distribution_distance.items()
	# l.sort(key=itemgetter(1),reverse=True)
	# dico_final_top={}
	# 
	# synonymes_potentiels = open(path_req + 'synonymes.txt','w')
	# 
	# for x in l[:10000]:
	# 	couple=x[0]
	# 	#if p_cooccurrences[(couple[0],couple[0],0)]*NN>freqmin and p_cooccurrences[(couple[1],couple[1],0)]*NN>freqmin:
	# 		#print dico_termes[couple[0]] + '\t'+dico_termes[couple[1]] + '\t' + str(float(distribution_distance[couple])) 
	# 	synonymes_potentiels.write(dico_termes[couple[0]] + '\t'+dico_termes[couple[1]] + '\t' + str(float(distribution_distance[couple]))  + '\n')
	# 
	# timeapres = timeavt
	# timeavt = time()
	# print 'duration of the last step: ' + str(timeavt - timeapres) + '\n'
	# print "co-occurrence matrix built"

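	# build mutual-information scores from the co-occurrence probabilities, rank them with a chi-2 statistic, and export the top concept pairs for this year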
	muti = build_mutual_information(p_cooccurrences,p_cooccurrences_ordre1,nb_billets,occurrences,top_concepts_dict)
	xhi2val = xhi2(muti)
	export_concepts_xhi2(xhi2val,p_cooccurrences,p_cooccurrences_ordre1,dico_termes,dico_lemmes,year,occurrences)
	
# 	
# pool_size = int(multiprocessing.cpu_count())
# pool = multiprocessing.Pool(processes=pool_size)
# print years
# pool.map(xhi2_comp, years)
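# sequential fallback for the multiprocessing version commented out above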
for year in years:
	xhi2_comp(year)
fusion_years.fusion('conceptsxhi2')