Beispiel #1
0
def fast_ngram_counter_x(input):
	"""Build the n-gram dictionary for one chunk of rows of the 'billets' table.

	``input`` is a single indexable bundle holding, in order:
	    [0] x             -- index of the chunk to process
	    [1] size_seq      -- number of rows in a full chunk
	    [2] Nb_rows       -- row count used to size the last chunk and shown in the progress message
	    [3] sample        -- upper bound on the size of the last chunk (also forwarded to the query helper)
	    [4] nb_sequences  -- number of full chunks; chunk ``nb_sequences`` is the last one
	    [5] concept_list  -- '' to call ngramme_build, anything else is forwarded to ngramme_find
	    [6] name_bdd      -- forwarded as first argument to the fonctions_bdd query helper

	Also reads the module-level names ``requete``, ``maxTermLength`` and ``language``.

	Returns the dictionary produced by threading every fetched row through
	text_processing.ngramme_build / ngramme_find.
	"""
	x, size_seq, Nb_rows, sample, nb_sequences, concept_list, name_bdd = [input[k] for k in range(7)]

	# Chunks before the last one are full-sized; the last one takes whatever
	# remains, never more than ``sample``.
	offset = size_seq * x
	if x >= nb_sequences:
		row_count = min(Nb_rows - offset, sample)
	else:
		row_count = size_seq
	limit_clause = str(offset) + ',' + str(row_count)

	# The literal 1 is passed in the ``where`` position of the helper.
	rows = fonctions_bdd.select_bdd_table_where_limite(name_bdd, 'billets', 'content_lemmatise', sample, requete, 1, limit_clause, Nb_rows)

	ngrams = {}
	for done, row in enumerate(rows, 1):
		# Progress line every 500 rows and once more on the final row of the chunk.
		if done % 500 == 0 or done == len(rows):
			print('---------' + str(done) + ' traités (export ngrammes sur ' + str(Nb_rows) + ' billets)')
		lemmatised = row[0]
		if concept_list == '':
			ngrams = text_processing.ngramme_build(lemmatised.split(), maxTermLength, ngrams, language, 'billet')
		else:
			ngrams = text_processing.ngramme_find(lemmatised, ngrams, concept_list)
	return ngrams
def do_calculation(year):
	"""Accumulate n-gram counts over all 'billets' rows whose ``jours`` is in ``year``.

	``year`` is an iterable of day identifiers; each one is converted with
	``str`` before being placed in the SQL conditions.

	The rows are fetched chunk by chunk (to limit memory use); each chunk is
	processed with text_processing.ngramme_build in 'absolu' mode and merged
	into the running total by summing the values of shared keys.

	Reads the module-level names ``name_bdd``, ``sample``, ``requete``,
	``maxTermLength`` and ``language``.

	Returns the merged dictionary.
	"""
	print(str(year) + ' being processed ')
	# Materialise once: the values are needed for two different clauses, and
	# a one-shot iterable would otherwise be exhausted after its first use.
	days = list(map(str, year))

	# Count the matching rows first so the work can be split into chunks.
	count_where = " jours IN ('" + "','".join(days) + "') "
	Nb_rows = len(fonctions_bdd.select_bdd_table_champ_simple(name_bdd, 'billets', 'jours', count_where))

	# The work has to be split here because it otherwise uses too much RAM.
	if sample < Nb_rows:
		size_seq = sample
		nb_sequences = 0
	else:
		size_seq = 10000
		# Explicit floor division: same result for ints, and never a float.
		nb_sequences = Nb_rows // size_seq

	# Row filter for the chunked queries; it only depends on ``year`` so it is
	# built once, outside the loop.
	# NOTE(review): the OR chain is not parenthesised -- confirm how
	# select_bdd_table_where_limite combines it with ``requete``.
	where = ' or '.join(" jours = '" + day + "'" for day in days)

	dictionnaire_gramme = {}  # running total over all chunks
	billetprocessed_after_requete = 0  # counts the number of processed posts
	for x in range(nb_sequences + 1):
		lim_d = str(size_seq * x)
		if x < nb_sequences:
			duration = str(size_seq)
		else:
			# Last chunk: whatever remains, never more than ``sample``.
			duration = str(min(Nb_rows - size_seq * x, sample))
		contenu = fonctions_bdd.select_bdd_table_where_limite(name_bdd, 'billets', 'content_lemmatise', sample, requete, where, lim_d + ',' + duration, Nb_rows)

		dictionnaire_gramme_x = {}
		for billetlemm in contenu:
			billetprocessed_after_requete += 1
			# Progress line every 500 posts and when the row count is reached.
			if billetprocessed_after_requete % 500 == 0 or billetprocessed_after_requete == Nb_rows:
				print('---------' + str(billetprocessed_after_requete) + ' traités (export ngrammes sur ' + str(Nb_rows) + ' billets)')
			billet_lemmatise = billetlemm[0]
			dictionnaire_gramme_x = text_processing.ngramme_build(billet_lemmatise.split(), maxTermLength, dictionnaire_gramme_x, language, 'absolu')
		# Fold this chunk's counts into the total, adding values of shared keys.
		dictionnaire_gramme = fonctions_lib.merge(dictionnaire_gramme, dictionnaire_gramme_x, lambda a, b: a + b)
	return dictionnaire_gramme
Beispiel #3
0
	# Fragment: the enclosing statement is not visible here. Processes the
	# 'billets' rows chunk by chunk and merges each chunk's result into
	# dictionnaire_gramme, which is then dumped with fonctions_lib.dumpingin.
	for x in range(nb_sequences+1):

		dictionnaire_gramme_x={}
		# chunk x starts at row size_seq*x
		lim_d = str(size_seq*x)
		if x<nb_sequences:
			duration = str(size_seq)
		else:
			# last chunk: the remaining rows, never more than sample
			duration = str(min(Nb_rows - size_seq*x,sample))
		contenu = fonctions_bdd.select_bdd_table_where_limite(name_bdd,'billets','content_lemmatise',sample,requete,where,lim_d+','+duration,Nb_rows)
		for billetlemm in contenu:
			billetprocessed_after_requete=1+billetprocessed_after_requete
			# NOTE(review): the counter is compared with the size of the current
			# chunk, not with Nb_rows -- confirm where the counter is initialised.
			if not billetprocessed_after_requete%500 or billetprocessed_after_requete == len(contenu) : 
				print '---------'+str(billetprocessed_after_requete)+ ' traités (export ngrammes sur '+str(Nb_rows)+ ' billets)'
			billet_lemmatise =  billetlemm[0]
			dictionnaire_gramme_x = text_processing.ngramme_build(billet_lemmatise.split(),maxTermLength,dictionnaire_gramme_x,language,'absolu')
		# merge this chunk into the total, adding the values of shared keys
		dictionnaire_gramme=fonctions_lib.merge(dictionnaire_gramme, dictionnaire_gramme_x, lambda x,y:x+y)
	fonctions_lib.dumpingin(dictionnaire_gramme,name_export_pkl,requete)



# sort by frequency and export the final lexicon together with its occurrence counts
print '\n'

# The third argument is top*1.1 rounded down -- presumably a margin over the
# requested ``top`` entries; confirm against misc.freq_tri.
dico_final = misc.freq_tri(dictionnaire_gramme,freqmin,int(math.floor(top*1.1)),language,ng_filter)# sort our dictionary
#print dico_final

# redondance=0#pas otimise le machin, attention!!! revoir avant d'activer la fonctionnalite
# #redondance = 0, on n'utilise pas l'algo qui calcule avec la redondance !
# if redondance == 1:
# 	print "\n--- beginning processing redundant nlemmes "