def save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_parl):
    """Write per-word tf-idf-like statistics to ``tfidf_like_parametros.csv``.

    One CSV row per word in *parl_counter*: the precomputed tf-idf-like score
    (looked up in *sort_tfidf_like*), the word's term frequency, the maximum
    possible entropy, the word's entropy over the documents, the probability
    the word is "political", and its entropy across deputies.

    Parameters:
        parl_counter: word -> count mapping for the parliament corpus.
        sort_tfidf_like: iterable of (word, score) pairs; turned into a dict.
        counter_list: per-document counters (its length bounds the entropy).
        tot_counter: aggregate counter over all documents.
        counter_list_parl: per-deputy counters.
    """
    dic = dict(sort_tfidf_like)
    # log2(#documents) is loop-invariant: hoist it out of the per-word loop.
    max_entropy = math.log2(len(counter_list))
    # `with` guarantees the file is closed even if a TfIdf call raises
    # (the original open()/close() pair leaked the handle on error).
    with open(dir_out + "tfidf_like_parametros.csv", 'w') as f:
        # Header kept byte-identical to the original output
        # (sic: "palvra" is preserved so downstream consumers keep working).
        f.write("palavra;valor;frequencia;entropia maxima;entropia da palvra;prob_politica;entropia entre deputados\n")
        for word in parl_counter:
            f.write(";".join([
                word,
                str(dic[word]),
                '%.4f' % TfIdf.tf(word, parl_counter),
                '%.4f' % max_entropy,
                '%.4f' % TfIdf.entropy(word, tot_counter, counter_list),
                '%.4f' % TfIdf.parl_prob(word, parl_counter, counter_list),
                '%.4f' % TfIdf.parl_entropy(word, tot_counter, counter_list_parl),
            ]) + "\n")
def tfidf_month(tw_month, random_list):
    """Rank one month of parliament-tweet words by a tf * idf-like score.

    Parameters:
        tw_month: per-deputy nested lists of tokenized tweets for the month.
        random_list: background collection forwarded to ``docs_counters`` to
            build the per-document counters.

    Returns:
        List of (word, score) pairs sorted by score, descending.
    """
    # Flatten deputy -> tweets -> tokens into a single token stream.
    tweets = list(itertools.chain.from_iterable(itertools.chain.from_iterable(tw_month)))
    tot_counter = Counter(tweets)
    # One token counter per deputy.  (Removed the leftover debug `print(tw)`
    # that dumped every deputy's full token list to stdout.)
    dep_counts = [Counter(itertools.chain.from_iterable(dep)) for dep in tw_month]
    docs_counter = docs_counters(random_list, tot_counter)
    tfidf = TfIdf()
    tfidf_like = [
        tfidf.tf(word, tot_counter)
        * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, dep_counts)
        for word in tot_counter
    ]
    return sorted(zip(tot_counter.keys(), tfidf_like), key=lambda x: x[1], reverse=True)
docs_bgr_counter.append(bgr_counter)

# Aggregate bigram counts over every document counter.
# (Replaces the fragile `cond and a or b` idiom — which falls through to
# `y[k]` whenever the accumulated count is falsy — with an explicit .get().)
tot_counter = dict()
for y in docs_bgr_counter:
    for k in y.keys():
        tot_counter[k] = tot_counter.get(k, 0) + y[k]

# NOTE(review): `f` is never closed and the `for row` loop appears to
# continue past this excerpt, so the open() is left un-`with`ed rather than
# re-indenting unseen continuation lines — confirm against the full file.
f = open(dir_out + 'param_beta.csv', 'rt')
reader = csv.DictReader(f)
params = list()
for row in reader:
    like_scores = list()
    beta_scores = list()
    pow_scores = list()  # renamed from `exp`, which shadowed the builtin
    # Hoist the per-row parameter parsing out of the inner bigram loop.
    b1 = float(row["b1"])
    b2 = float(row["b2"])
    for bgr in bgr_counter:
        freq = tfidf.tf(bgr, bgr_counter)
        like_scores.append(freq * idf_like(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter))
        beta_scores.append(freq * idf_beta(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter, b1, b2))
        pow_scores.append(freq * idf_pow(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter, b1, b2))
    # Pair each bigram with its score and rank descending.
    like_list = sorted(zip(bgr_counter.keys(), like_scores), key=lambda x: x[1], reverse=True)
    beta_list = sorted(zip(bgr_counter.keys(), beta_scores), key=lambda x: x[1], reverse=True)
    pow_list = sorted(zip(bgr_counter.keys(), pow_scores), key=lambda x: x[1], reverse=True)
    params.append(row["param"])
    with open(dir_pck + str(row["param"]) + "_like" + ".pck", 'wb') as handle:
        pickle.dump(like_list, handle)
# NOTE(review): this first dump almost certainly belongs inside a
# `with open(...) as handle:` statement just before this excerpt — confirm
# against the full file before relying on `handle` here.
pickle.dump(alea_processed, handle)
with open(dir_out + "list_alea_trigrams.pck", 'wb') as handle:
    pickle.dump(alea_tri_processed, handle)

# Bigram frequency distributions: the parliament corpus as a whole, one per
# deputy, and one per random (background) collection.
bgr_counter = parl_bigrams.ngram_fd
parl_bgr_counter = [l.ngram_fd for l in parl_processed]
docs_bgr_counter = [l.ngram_fd for l in alea_processed]
docs_bgr_counter.append(bgr_counter)

tfidf = TfIdf()

# tf * smoothed idf for every parliament bigram, ranked descending.
tfidf_smooth = list()
for bgr in bgr_counter:
    tfidf_smooth.append(tfidf.tf(bgr, bgr_counter) * tfidf.idf_smooth(bgr, docs_bgr_counter))
dic_tfidf_smooth = sorted(zip(bgr_counter.keys(), tfidf_smooth), key=lambda x: x[1], reverse=True)

# Aggregate bigram counts over all documents.
# (Replaces the fragile `cond and a or b` idiom with an explicit .get().)
tot_counter = dict()
for y in docs_bgr_counter:
    for k in y.keys():
        tot_counter[k] = tot_counter.get(k, 0) + y[k]

# tf * likelihood-based idf for every parliament bigram, ranked descending.
tfidf_like = list()
for bgr in bgr_counter:
    tfidf_like.append(tfidf.tf(bgr, bgr_counter) * tfidf.idf_like(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter))
dic_tfidf_like = sorted(zip(bgr_counter.keys(), tfidf_like), key=lambda x: x[1], reverse=True)
with open(dir_ale + "coleta3.pck", 'rb') as data_file:
    dataset.append(remove_irrelevant(pickle.load(data_file)))

print("process tfidf")

# Per dataset, build one dict per idf variant mapping word -> tf * idf.
tfidf_entropy = list()
tfidf_smooth = list()
tfidf_like = list()
for i, data in enumerate(dataset):
    print("dataset: " + str(i))
    entropy_scores = dict()
    smooth_scores = dict()
    like_scores = dict()
    for word in data:
        tf = TfIdf.tf(word, data)
        entropy_scores[word] = tf * TfIdf.idf_entropy(word, i, dataset)
        smooth_scores[word] = tf * TfIdf.idf_smooth(word, dataset)
        like_scores[word] = tf * TfIdf.idf_like(word, i, dataset)
    tfidf_smooth.append(smooth_scores)
    tfidf_like.append(like_scores)
    tfidf_entropy.append(entropy_scores)

print("save tfidf")
with open(dir_out + "tfidf_entropy.pck", 'wb') as handle:
    pickle.dump(tfidf_entropy, handle)
with open(dir_out + "tfidf_smooth.pck", 'wb') as handle:
    pickle.dump(tfidf_smooth, handle)
dir_parl = "/Users/lucasso/Documents/pck/" dir_out = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/" file_parl = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/random_pck/docs/deputados.pck" tfidf_n = list() tf_log_idf = list() tfidf_like = list() corr = "" with open(file_parl, 'rb') as handle: parl_counter = pickle.load(handle) tot_counter,counter_list,_ = loadCounters(dir_in) tot_counter_dep,counter_list_dep,pck= loadCounters(dir_parl) tfidf = TfIdf() for word in parl_counter: tf = tfidf.tf(word, parl_counter) idf = tfidf.idf(word,counter_list) log_idf = tfidf.idf_smooth(word,counter_list) ent_idf = tfidf.idf_like(word,parl_counter, tot_counter, counter_list, counter_list_dep) tfidf_n.append(tf*idf) tf_log_idf.append(tf*log_idf) tfidf_like.append(tf*ent_idf) dic_tfidf= list(zip(parl_counter.keys(), tfidf_n)) dic_tf_log_idf= list(zip(parl_counter.keys(), tf_log_idf)) dic_tfidf_like= list(zip(parl_counter.keys(), tfidf_like)) """ corr += "tfidf X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf] ,[v for i,v in dic_tf_log_idf]))+"\n" corr += "tfidf X tfidf_like: "+str(stats.spearmanr([v for i,v in dic_tfidf],[v for i,v in dic_tfidf_like]))+"\n" corr += "tfidf_like X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf_like] , [v for i,v in dic_tf_log_idf]))+"\n"
parl_counters = list() for parl in parl_tw_processed: tw = list(itertools.chain.from_iterable(parl)) parl_counters.append(Counter(tw)) docs_counter =list() docs_counter.append(tot_counter) docs_counter.append(coleta1) docs_counter.append(coleta2) tfidf = TfIdf() tfidf_like = list() for word in tot_counter: tfidf_like.append(tfidf.tf(word,tot_counter)*tfidf.idf_like(word,tot_counter,tot_counter,docs_counter, parl_counters)) sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like)) sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True) with open(dir_rob+"sort_tfidf_like.pck", 'wb') as handle: pickle.dump(sort_tfidf_like, handle) with open(dir_rob+"tfidf_like.pck", 'wb') as handle: pickle.dump(tfidf_like, handle) with open(dir_rob+"parl_tw_processed.pck", 'wb') as handle: pickle.dump(parl_tw_processed, handle) f = open(dir_rob+"10k_tfidf_like.txt", 'w') for w,i in sort_tfidf_like[:10000]:
# Flatten parliamentarian -> tweets -> tokens into one stream.  (Dropped the
# redundant inner list() around chain.from_iterable — chain accepts any
# iterable, so materializing it first was a throwaway copy.)
tweets = list(itertools.chain.from_iterable(itertools.chain.from_iterable(parl_tweets)))
tot_counter = Counter(tweets)

# One counter per random (background) collection, with the aggregate
# parliament counter appended as the final document.
docs_counter = [Counter(itertools.chain.from_iterable(alea_tw)) for alea_tw in alea_tweets]
docs_counter.append(tot_counter)

# One token counter per parliamentarian.
parl_counters = [Counter(itertools.chain.from_iterable(parl)) for parl in parl_tweets]

tfidf = TfIdf()
tfidf_like_bi_trigrams = [
    tfidf.tf(word, tot_counter)
    * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, parl_counters)
    for word in tot_counter
]
sort_tfidf_like = sorted(zip(tot_counter.keys(), tfidf_like_bi_trigrams), key=lambda x: x[1], reverse=True)

with open(dir_out + "sort_tfidf_like_bi_trigram.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)