def fast_ngram_counter_x(input):
    """Count n-grams over one chunk of the 'billets' table.

    ``input`` is a sequence read positionally:

    0. chunk index (multiplied by the chunk size to get the row offset)
    1. chunk size (rows per full chunk)
    2. total number of rows, forwarded to the query helper
    3. sample size; also caps the row count of the last chunk
    4. number of full chunks
    5. concept list; the empty string selects free n-gram building,
       anything else selects lookup of the listed concepts
    6. database name

    Also reads the module-level names ``requete``, ``maxTermLength`` and
    ``language``.

    Returns the dictionary produced by ``text_processing.ngramme_build``
    or ``text_processing.ngramme_find`` after the last row of the chunk.
    """
    chunk_index = input[0]
    chunk_size = input[1]
    total_rows = input[2]
    sample = input[3]
    nb_sequences = input[4]
    concept_list = input[5]
    name_bdd = input[6]

    ngram_counts = {}

    # "offset,count" limit string: full chunks fetch chunk_size rows, the
    # trailing chunk fetches what is left, capped by the sample size.
    offset = str(chunk_size * chunk_index)
    row_count = str(
        chunk_size
        if chunk_index < nb_sequences
        else min(total_rows - chunk_size * chunk_index, sample)
    )

    # presumably an always-true filter (no restriction) -- TODO confirm
    # against fonctions_bdd.select_bdd_table_where_limite
    where = 1
    rows = fonctions_bdd.select_bdd_table_where_limite(
        name_bdd, 'billets', 'content_lemmatise', sample, requete, where,
        offset + ',' + row_count, total_rows)

    for processed, row in enumerate(rows, 1):
        # Progress message every 500 rows and on the last row of the chunk.
        if processed % 500 == 0 or processed == len(rows):
            print('---------' + str(processed) + ' traités (export ngrammes sur ' + str(total_rows) + ' billets)')
        lemmatised_text = row[0]
        if concept_list == '':
            ngram_counts = text_processing.ngramme_build(
                lemmatised_text.split(), maxTermLength, ngram_counts,
                language, 'billet')
        else:
            ngram_counts = text_processing.ngramme_find(
                lemmatised_text, ngram_counts, concept_list)
    return ngram_counts
def do_calculation(year):
    """Count n-grams over every post whose ``jours`` value is in ``year``.

    ``year`` is an iterable of day values; each one is converted with
    ``str`` and embedded in the SQL filter.  Rows are fetched in chunks
    (the original author notes that a single query uses too much RAM),
    each chunk is counted into its own dictionary, and the per-chunk
    dictionaries are merged by adding values.

    Reads the module-level names ``name_bdd``, ``sample``, ``requete``,
    ``maxTermLength`` and ``language``.

    Returns the merged n-gram dictionary.
    """
    print(str(year) + ' being processed ')

    # First query only serves to count the matching rows.
    in_clause = " jours IN ('" + "','".join(map(str, year)) + "') "
    total_rows = len(fonctions_bdd.select_bdd_table_champ_simple(
        name_bdd, 'billets', 'jours', in_clause))

    # The work must be split here because it takes too much RAM otherwise.
    if sample < total_rows:
        chunk_size = sample
        nb_sequences = 0
    else:
        chunk_size = 10000
        # Floor division: same result as the former int "/" on Python 2,
        # and still an int where true division is in effect.
        nb_sequences = total_rows // chunk_size

    # Filter used by the chunked queries.  It does not depend on the chunk,
    # so it is built once instead of on every iteration.
    where = ' or '.join(" jours = '" + day + "'" for day in map(str, year))

    ngram_counts = {}  # merged n-gram dictionary
    processed = 0      # number of posts processed so far, across chunks
    for chunk_index in range(nb_sequences + 1):
        chunk_counts = {}
        offset = str(chunk_size * chunk_index)
        if chunk_index < nb_sequences:
            row_count = str(chunk_size)
        else:
            # Trailing chunk: whatever is left, capped by the sample size.
            row_count = str(min(total_rows - chunk_size * chunk_index, sample))
        rows = fonctions_bdd.select_bdd_table_where_limite(
            name_bdd, 'billets', 'content_lemmatise', sample, requete, where,
            offset + ',' + row_count, total_rows)
        for row in rows:
            processed += 1
            # Progress message every 500 posts and when the total is reached.
            if processed % 500 == 0 or processed == total_rows:
                print('---------' + str(processed) + ' traités (export ngrammes sur ' + str(total_rows) + ' billets)')
            chunk_counts = text_processing.ngramme_build(
                row[0].split(), maxTermLength, chunk_counts, language,
                'absolu')
        # Fold this chunk into the running total, combining values with "+".
        ngram_counts = fonctions_lib.merge(
            ngram_counts, chunk_counts, lambda left, right: left + right)
    return ngram_counts
# Chunked n-gram counting: fetch the posts chunk by chunk, count each chunk
# into its own dictionary and merge it into dictionnaire_gramme.
for x in range(nb_sequences + 1):
    dictionnaire_gramme_x = {}
    lim_d = str(size_seq * x)
    # Full chunks fetch size_seq rows; the trailing chunk fetches what is
    # left, capped by the sample size.
    duration = str(size_seq if x < nb_sequences else min(Nb_rows - size_seq * x, sample))
    contenu = fonctions_bdd.select_bdd_table_where_limite(
        name_bdd, 'billets', 'content_lemmatise', sample, requete, where,
        lim_d + ',' + duration, Nb_rows)
    for billetlemm in contenu:
        billetprocessed_after_requete += 1
        # NOTE(review): the counter is not reset inside this loop, yet it is
        # compared with the size of the current chunk; do_calculation
        # compares against Nb_rows instead -- confirm which is intended.
        if billetprocessed_after_requete % 500 == 0 or billetprocessed_after_requete == len(contenu):
            print('---------' + str(billetprocessed_after_requete) + ' traités (export ngrammes sur ' + str(Nb_rows) + ' billets)')
        billet_lemmatise = billetlemm[0]
        dictionnaire_gramme_x = text_processing.ngramme_build(
            billet_lemmatise.split(), maxTermLength, dictionnaire_gramme_x,
            language, 'absolu')
    # Fold this chunk into the running total, combining values with "+".
    dictionnaire_gramme = fonctions_lib.merge(
        dictionnaire_gramme, dictionnaire_gramme_x,
        lambda left, right: left + right)

fonctions_lib.dumpingin(dictionnaire_gramme, name_export_pkl, requete)

# Sort by frequency and export the final lexicon with its occurrence counts.
print('\n')
# Sort the dictionary.
dico_final = misc.freq_tri(
    dictionnaire_gramme, freqmin, int(math.floor(top * 1.1)), language,
    ng_filter)
#print dico_final
# redondance=0  # not optimised, beware! review before enabling this feature
# #redondance = 0: the algorithm that accounts for redundancy is not used
# if redondance == 1:
# print "\n--- beginning processing redundant nlemmes "