# Imports needed by the functions below. tf_idf and cosine_sim are
# project-local modules; init_clusters, extract_data, write_common_emoji_cl,
# fill_empty, modelo_BM25, printv, emoji_dict and the ID_M_BOW /
# ROUND_DIGITS / RESULT_LENGTH constants are defined elsewhere in the
# repository.
import copy
import operator
import os
import time
from collections import Counter, defaultdict

import numpy as np

import cosine_sim
import tf_idf


def classify_article_words(article_words, corpus_tfidfs_per_category, corpus_idf):
    '''
    Find the category that best matches the given article among the given categories.
    -Input: (1) list of article terms
            (2) TF*IDF weights for each document in the corpus
            (3) IDF for the entire corpus
    -Return: top category and a dictionary with the article's match score for each category
    '''
    # Compute the article's TF and TF*IDF weights
    article_tfs = tf_idf.tf(article_words)
    article_tfidfs = tf_idf.tf_idf(article_tfs, corpus_idf,
                                   len(corpus_tfidfs_per_category))

    # Score the article against every category by cosine similarity
    sim_scores = {}
    for cat_name, cat_tfidf_scores in corpus_tfidfs_per_category.iteritems():
        sim_scores[cat_name] = \
            cosine_sim.cosine_similarity_dict(article_tfidfs, cat_tfidf_scores)

    # Sort by value (match score), descending, and take the top category
    match = sorted(sim_scores.iteritems(),
                   key=operator.itemgetter(1), reverse=True)[0][0]
    return match, sim_scores
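# cosine_sim.cosine_similarity_dict is not shown in this file. A minimal
# sketch of what it is assumed to compute -- the cosine of the angle between
# two sparse {term: weight} vectors. The dict-based signature comes from the
# call above; the body is an assumption, not the repo's implementation.
import math

def cosine_similarity_dict(vec_a, vec_b):
    # Dot product over the terms shared by the two sparse vectors
    dot = sum(w * vec_b[t] for t, w in vec_a.iteritems() if t in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.itervalues()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.itervalues()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)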
def common_term(fname, m, rand_n=400, sep='\t'):
    # Search common terms in happy tweets: a random sample of each cluster and the noise
    print 'Extract cluster-tweet terms'
    data = file('txt\\' + fname + '.txt').readlines()[1:]
    n = len(data)
    data = [data[i][:-1].split(sep) for i in xrange(n)]
    term = [data[i][-1].split(',') for i in xrange(n)]
    clid = [int(data[i][0]) for i in xrange(n)]
    k = max(clid) + 1
    term_cl = init_clusters(k + 1, range(-1, k),
                            [[term[i] for i in xrange(n) if clid[i] == j]
                             for j in xrange(-1, k)])

    print 'Random sampling of cluster-tweet and noise'
    term_r = []
    for cl in xrange(-1, k):
        term_r += list(np.random.permutation(term_cl[cl])[:rand_n])

    print 'Count terms'
    counter = Counter()
    counter = tf_idf.tf(counter, term_r, type='term')
    common = counter.most_common()[:m]

    print 'Write results'
    f = os.open('txt\\' + fname + '_common.txt', os.O_RDWR | os.O_CREAT)
    os.write(f, 'term,count\n')
    # Guard against fewer than m distinct terms in the sample
    for i in xrange(min(m, len(common))):
        os.write(f, '%s,%d\n' % (common[i][0], common[i][1]))
    os.close(f)
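# init_clusters is defined elsewhere in the repository. From its call sites
# (two or three positional arguments) it appears to map cluster keys to
# value lists. A minimal sketch consistent with those calls -- an assumption,
# not the repo's actual helper; the first argument is treated as a redundant
# length hint.
def init_clusters_sketch(k, keys, values=None):
    # Map each cluster key to its value list, or to an empty list
    if values is None:
        return dict((key, []) for key in keys)
    return dict(zip(keys, values))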
def calculate_corpus_tf_idf(corpus_reader):
    '''
    Calculate the TF*IDF weight for each document in the given corpus.
    -Input: CorpusReader (either Tagged or PlainText)
    -Return: (1) a dictionary mapping document name -> {term: TF*IDF weight}
             (2) a dictionary mapping term -> IDF
    '''
    st_time = time.time()

    # Term Frequency for each document
    tfs_per_document = defaultdict(Counter)
    for document in corpus_reader.fileids():
        terms_in_document = corpus_reader.words(document)
        tfs_per_document[document] = tf_idf.tf(terms_in_document)

    # Inverse Document Frequency across the corpus
    idfs = tf_idf.idf(tfs_per_document)

    # Key is document name, value is a dict of {term: TF*IDF score}
    tfidfs_per_document = {}
    for document, tfs in tfs_per_document.iteritems():
        tfidfs_per_document[document] = tf_idf.tf_idf(tfs, idfs, len(tfs_per_document))

    print "time to compute TF-IDF weights for corpus: %.3f sec" % (time.time() - st_time)
    return tfidfs_per_document, idfs
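# A possible end-to-end use of the two functions above, assuming the corpus
# is a directory of plain-text files read with NLTK's PlaintextCorpusReader.
# The directory layout and the article tokens are illustrative.
def _example_classify():
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    reader = PlaintextCorpusReader('corpus/', r'.*\.txt')
    tfidfs_per_category, idfs = calculate_corpus_tf_idf(reader)
    article_words = ['stock', 'market', 'shares', 'trading']
    best_category, scores = classify_article_words(article_words,
                                                   tfidfs_per_category, idfs)
    print best_category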
def getConsultapadrao():
    # Default test query: five stemmed Portuguese documents about chess
    # ("pawn", "knight", "rook", ...), a stopword list, the query
    # 'xadrez peã caval torr' and the token separators.
    M = ['O peã e o caval são pec de xadrez. O caval é o melhor do jog.',
         'A jog envolv a torr, o peã e o rei.',
         'O peã lac o boi',
         'Caval de rodei!',
         'Polic o jog no xadrez.']
    stopwords = ['a', 'o', 'e', 'é', 'de', 'do', 'no', 'são']
    q = 'xadrez peã caval torr'
    separadores = [' ', ',', '.', '!', '?']
    (ponderada_docs, ponderada_consulta,
     incidencias, n) = tf_idf.tf(M, stopwords, q, separadores, True)
    # Score the documents against the query with BM25 (k1=1, b=0.75)
    return modelo_BM25(ponderada_docs, ponderada_consulta, incidencias, n, 1, 0.75)
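# modelo_BM25 is defined elsewhere. For reference, a sketch of the standard
# Okapi BM25 score it presumably implements, written over plain token lists
# and independent of the repo's own data structures (illustrative only).
import math

def bm25_score_sketch(query_terms, doc, docs, k1=1.0, b=0.75):
    # Sum over query terms of IDF(t) * saturated, length-normalised TF
    avgdl = float(sum(len(d) for d in docs)) / len(docs)
    score = 0.0
    for t in query_terms:
        n_t = sum(1 for d in docs if t in d)  # documents containing t
        idf = math.log((len(docs) - n_t + 0.5) / (n_t + 0.5) + 1)
        tf = doc.count(t)
        score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(doc) / avgdl))
    return score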
def m_hot_words_tfidf(data):
    printv('Calculating idf score.')
    idf = tf_idf.idf([v[ID_M_BOW] for v in data.values()])

    all_result = dict()
    for doc_id, bow in data.items():
        printv('Calculating tf-idf score for ' + doc_id)
        result = list()
        for term in set(bow[ID_M_BOW]):
            tf = tf_idf.tf(term, bow[ID_M_BOW])
            result.append((term, round(tf_idf.tfidf(term, tf, idf), ROUND_DIGITS)))
        # Keep the RESULT_LENGTH highest-scoring terms for this document
        all_result[doc_id] = dict()
        for word, score in sorted(result, key=lambda x: x[1], reverse=True)[:RESULT_LENGTH]:
            all_result[doc_id][word] = score
    return all_result
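# The tf_idf module used by this function is not shown; judging from the
# calls above it exposes per-term helpers (a different interface from the
# Counter-based one used elsewhere in this file). A minimal sketch under the
# standard definitions -- an assumption, not the actual implementation.
import math

def tf_sketch(term, bow):
    # Relative frequency of term within one bag of words
    return float(bow.count(term)) / len(bow)

def idf_sketch(bows):
    # term -> log(N / document frequency) over all bags of words
    n = len(bows)
    df = {}
    for bow in bows:
        for term in set(bow):
            df[term] = df.get(term, 0) + 1
    return dict((term, math.log(float(n) / count)) for term, count in df.items())

def tfidf_sketch(term, tf_value, idf_scores):
    return tf_value * idf_scores.get(term, 0.0)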
def common_emoji_cl(fname, m, names=False):
    # High tf-idf emoji of each cluster
    print 'Extract terms from %s' % fname
    [emoji, emoji_cl] = extract_data(fname)
    N = len(emoji)
    k = len(emoji_cl)
    N_cl = [len(emoji_cl[i]) for i in xrange(1, k + 1)]

    print 'Count emoji tf'
    counter = Counter()
    counter = tf_idf.count(emoji, counter, ref=emoji_dict, type='emoji')
    counter_cl = [tf_idf.tf(counter, emoji_cl[i + 1], ref=emoji_dict, type='emoji')
                  for i in xrange(k)]

    print 'Calculate cluster emoji tf-idf'
    # counter_cl is a plain k-element list, so index it 0..k-1
    tfIdf = [tf_idf.tf_idf(counter, counter_cl[i], N, N_cl[i]) for i in xrange(k)]
    term = tfIdf[0][0]
    tfIdf = [tfIdf[i][1] for i in xrange(k)]

    print 'Write results'
    # mtx: term/score pairs per cluster (assumed format; the writer's
    # expected input is not shown in this file)
    mtx = [zip(term, tfIdf[i]) for i in xrange(k)]
    write_common_emoji_cl(fname, mtx, names)
def to_wordurl(docs):
    # Return the vocabulary of the given documents (the term-frequency keys)
    tfdic = tf_idf.tf(docs)
    return tfdic.keys()
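# A possible call, assuming tf_idf.tf accepts a bare token list here as it
# does in classify_article_words above (tokens illustrative):
#   vocab = to_wordurl(['spam', 'eggs', 'spam'])   # -> ['spam', 'eggs']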
def common_term_cls(fname, m, cls=False, top_user=100, sep='\t'):
    # Search common/representative terms of clusters by the tf-idf algorithm
    '''The logic: find terms that are special to the cluster, but not driven
    by a single user posting many tweets in the cluster.'''
    print 'Extract cluster-tweet terms'
    data = file('txt\\' + fname + '.txt').readlines()[1:]
    n = len(data)
    data = [data[i][:-1].split(sep) for i in xrange(n)]
    clid = [int(data[i][0]) for i in xrange(n)]
    user = [data[i][1] for i in xrange(n)]
    term = [data[i][-1].split(',') for i in xrange(n)]

    print 'Process term by clusters and users'
    if not cls:
        cls = range(max(clid) + 1)
    term_cl = init_clusters(len(cls), cls)
    user_cl = init_clusters(len(cls), cls)
    for cl in cls:
        term_cl[cl] = [term[i] for i in xrange(n) if clid[i] == cl]
        user_cl[cl] = [user[i] for i in xrange(n) if clid[i] == cl]

    print 'Count all-term tf'
    counter = Counter()
    counter = tf_idf.tf(counter, term, 'term')

    print 'Remove common terms'
    data = file('txt\\' + fname + '_common.txt').readlines()[1:]
    l = len(data)
    data = [data[i][:-1].split(',') for i in xrange(l)]
    common = set([data[i][0] for i in xrange(l)])
    term_remove = []
    for key in counter.keys():
        words = key.split(' ')
        if words[0] in common or words[-1] in common:  # first/last word is a common term
            term_remove += [key]
    for key in term_remove:
        del counter[key]

    tfidf_cl = init_clusters(len(cls), cls)
    for cl in cls:
        print 'Calculate term and user tf-idf -- Cluster%d' % cl
        counter_copy = copy.copy(counter)
        remove_copy = copy.copy(term_remove)
        tfidf_cl[cl] = tf_idf_cl(term_cl[cl], user_cl[cl], counter_copy,
                                 top_user, n, remove_copy)

    print 'Remove shared terms and self-merge'
    mtx = [tfidf_cl[cl] for cl in cls]
    mtx = tf_idf.screen(mtx, m, type='term')

    print 'Write results'
    f = os.open('txt\\' + fname + '_tfIdf.txt', os.O_RDWR | os.O_CREAT)
    os.write(f, ','.join(['cluster%d,' % cl for cl in cls]) + '\n')
    os.write(f, ','.join(['term,tf-idf' for cl in cls]) + '\n')
    k = len(cls)
    mtx = [fill_empty(list(mtx[i]), m, 'f') for i in xrange(k)]
    for i in xrange(m):
        os.write(f, ','.join(['%s,%0.3f' % (mtx[j][i][0], mtx[j][i][1])
                              for j in xrange(k)]) + '\n')
    os.close(f)
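# The two steps are designed to run in sequence: common_term writes
# txt\<fname>_common.txt, which common_term_cls then reads to filter its
# candidates. A typical run (file name illustrative):
def _example_pipeline():
    common_term('happy_tweets', 100)      # writes txt\happy_tweets_common.txt
    common_term_cls('happy_tweets', 20)   # writes txt\happy_tweets_tfIdf.txt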
def tf_idf_test(fname, cl=0, sep='\t'):
    data = file('txt\\' + fname + '.txt').readlines()[1:]
    n = len(data)
    data = [data[i][:-1].split(sep) for i in xrange(n)]
    user = [data[i][1] for i in xrange(n) if int(data[i][0]) == cl]
    term = [data[i][-1].split(',') for i in xrange(n)]
    term_cl = [term[i] for i in xrange(n) if int(data[i][0]) == cl]

    # Keep the N most active users of the cluster
    names = list(set(user))
    m = len(user)
    count = [len([user[i] for i in xrange(m) if user[i] == name]) for name in names]
    idx = np.argsort(count)[::-1]
    #N = int(len(idx) * 0.1)
    N = 50
    names = [names[i] for i in idx[:N]]
    term_user = init_clusters(len(names), names)
    for name in names:
        term_user[name] = [term_cl[i] for i in xrange(m) if user[i] == name]

    print 'Count all tf'
    counter = Counter()
    counter = tf_idf.tf(counter, term, 'term')

    print 'Count cl tf'
    # Zeroed copy of counter: same keys, all counts reset to 0
    counter_i = copy.copy(counter)
    counter_i.subtract(counter)
    counter_term = tf_idf.tf(counter_i, term_cl, 'term')
    remove = [t for t in counter_term if counter_term[t] == 0]
    for t in remove:
        del counter_term[t]

    print 'Count user tf'
    print '#_name', len(names)
    counter_user = init_clusters(len(names), names)
    for name in names:
        print '-- %s' % name
        counter_i = copy.copy(counter_term)
        counter_i.subtract(counter_term)
        counter_user[name] = tf_idf.tf(counter_i, term_user[name], 'term')

    print 'Calculate cl tfidf'
    term_tfidf = tf_idf.tf_idf(counter, counter_term, n)

    print 'Calculate user tfidf'
    user_tfidf = init_clusters(len(names), names)
    for name in names:
        user_tfidf[name] = tf_idf.tf_idf(counter_term, counter_user[name], m)

    print 'Sort tfidf'
    # Largest per-user score for each term (alternative: std over users)
    user_tfidf = [max([user_tfidf[name][t] for name in names]) for t in term_tfidf]
    #user_tfidf = [np.std([user_tfidf[name][t] for name in names]) for t in term_tfidf]
    terms = term_tfidf.keys()
    n = len(terms)
    tfidf = [term_tfidf[terms[i]] / (1 + user_tfidf[i]) for i in xrange(n)]
    term_tfidf = [term_tfidf[terms[i]] for i in xrange(n)]

    # Write the top-10 terms under each of the three rankings
    f = os.open('txt\\tfidf_test.txt', os.O_RDWR | os.O_CREAT)
    idx = np.argsort(term_tfidf)[::-1]
    os.write(f, 'term_tfidf,' + ','.join([terms[i] for i in idx[:10]]) + '\n')
    os.write(f, ',' + ','.join(['%0.4f' % term_tfidf[i] for i in idx[:10]]) + '\n')
    idx = np.argsort(user_tfidf)[::-1]
    os.write(f, 'user_tfidf,' + ','.join([terms[i] for i in idx[:10]]) + '\n')
    os.write(f, ',' + ','.join(['%0.4f' % user_tfidf[i] for i in idx[:10]]) + '\n')
    idx = np.argsort(tfidf)[::-1]
    os.write(f, 'tfidf,' + ','.join([terms[i] for i in idx[:10]]) + '\n')
    os.write(f, ',' + ','.join(['%0.4f' % tfidf[i] for i in idx[:10]]) + '\n')
    os.close(f)
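# The copy-then-subtract pattern used above (and in tf_idf_cl below) builds
# a Counter with the same keys but every count reset to zero, so that
# tf_idf.tf can tally a subset of the data over the full vocabulary.
# A standalone illustration:
def _example_zeroed_counter():
    counter = Counter({'cat': 3, 'dog': 1})
    zeroed = copy.copy(counter)
    zeroed.subtract(counter)   # subtract keeps zero counts, unlike '-'
    print zeroed               # Counter({'cat': 0, 'dog': 0})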
def tf_idf_cl(term_cl, user, counter, top_user, N, term_remove):
    # Representative terms of one cluster: cluster tf-idf, normalised by how
    # concentrated each term is among the cluster's most active users
    names = list(set(user))
    m = len(user)
    count = [len([user[i] for i in xrange(m) if user[i] == name]) for name in names]
    idx = np.argsort(count)[::-1]
    if top_user > len(idx):
        top_user = len(idx)
        print '---- cluster has less than %d unique users' % top_user
    names = [names[i] for i in idx[:top_user]]
    term_user = init_clusters(len(names), names)
    for name in names:
        term_user[name] = [term_cl[i] for i in xrange(m) if user[i] == name]

    print '-- Count cls-term tf'
    # Zeroed copy of counter: same keys, all counts reset to 0
    counter_i = copy.copy(counter)
    counter_i.subtract(counter)
    counter_term = tf_idf.tf(counter_i, term_cl, 'term')

    print '-- Count user-term tf'
    counter_user = init_clusters(len(names), names)
    for name in names:
        counter_i = copy.copy(counter_term)
        counter_i.subtract(counter_term)
        counter_user[name] = tf_idf.tf(counter_i, term_user[name], 'term')

    print '-- Clean counters'
    if '' in counter.keys():
        del counter['']
        del counter_term['']
        for name in names:
            del counter_user[name]['']

    # Drop terms that never occur in this cluster, plus the caller's
    # remove list; the deletes are guarded because a removed term may be
    # absent from this cluster's counters (assumed intent of the original
    # leaked-variable loop)
    for t in counter_term:
        if counter_term[t] == 0:
            term_remove += [t]
    for t in term_remove:
        if t in counter_term:
            del counter_term[t]
        for name in names:
            counter_user[name].pop(t, None)

    print '-- Calculate cls-term tfidf'
    term_tfidf = tf_idf.tf_idf(counter, counter_term, N)

    print '-- Calculate user-term tfidf'
    user_tfidf = init_clusters(len(names), names)
    for name in names:
        user_tfidf[name] = tf_idf.tf_idf(counter_term, counter_user[name], m)

    print '-- Calculate term norm-tfidf'
    # Spread of each term's tf-idf across users; a term dominated by one
    # user gets a large std and is pushed down
    #user_tfidf = [max([user_tfidf[name][t] for name in names]) for t in term_tfidf]
    user_tfidf = [np.std([user_tfidf[name][t] for name in names]) for t in term_tfidf]
    #user_tfidf = [0 for t in term_tfidf]
    terms = term_tfidf.keys()
    n = len(terms)
    tfidf = [term_tfidf[terms[i]] / (1 + user_tfidf[i]) for i in xrange(n)]
    return [(terms[i], tfidf[i]) for i in xrange(n)]
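# A small worked example of the final normalisation, with hypothetical
# numbers: a term spread evenly across users keeps its cluster score, while
# a term dominated by one user is pushed down.
def _example_norm_tfidf():
    cluster_score = 0.8
    even = np.std([0.1, 0.1, 0.1])    # 0.0   -> evenly spread across users
    skew = np.std([0.9, 0.0, 0.0])    # ~0.42 -> dominated by one user
    print cluster_score / (1 + even)  # 0.8
    print cluster_score / (1 + skew)  # ~0.56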