def classify_article_words(article_words, corpus_tfidfs_per_category, corpus_idf):
	'''
	Find the category that best matches the given article among the given
	categories.
	-Input: (1) list of article terms, (2) TF*IDF weights for each category
	 in the corpus, (3) IDF for the entire corpus
	-Return: the top category and a dictionary mapping each category to its
	 match score for the article
	'''
	# Find article TF and TFIDF
	article_tfs = tf_idf.tf(article_words)
	article_tfidfs = tf_idf.tf_idf(article_tfs, corpus_idf, len(corpus_tfidfs_per_category))
	
	# find the best match among the categories
	sim_scores = {}
	for cat_name, cat_tfidf_scores in corpus_tfidfs_per_category.iteritems():
		sim_scores[cat_name] = \
			cosine_sim.cosine_similarity_dict(article_tfidfs, cat_tfidf_scores)
		
	# pick the category with the highest match score
	match = max(sim_scores.iteritems(), key=operator.itemgetter(1))[0]
	
	return match, sim_scores
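A minimal usage sketch tying this classifier to calculate_corpus_tf_idf (Example #3 below); my_corpus_reader is a hypothetical variable, and each corpus document is assumed to stand for one category:

# Hypothetical wiring of this module's two entry points (names are illustrative)
corpus_tfidfs, corpus_idf = calculate_corpus_tf_idf(my_corpus_reader)
article_words = ['stock', 'market', 'rally', 'earnings']
best_category, scores = classify_article_words(article_words, corpus_tfidfs, corpus_idf)
print best_category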
Example #2
def common_term(fname,m,rand_n=400,sep='\t'): 
	# search for common terms in happy tweets: a random sample of each cluster plus noise
	print 'Extract cluster-tweet terms'
	data=open('txt\\'+fname+'.txt').readlines()[1:]	# skip header line
	n=len(data)
	data=[data[i][:-1].split(sep) for i in xrange(n)]
	term=[data[i][-1].split(',') for i in xrange(n)]	# last column: comma-separated terms
	clid=[int(data[i][0]) for i in xrange(n)]	# first column: cluster id (-1 = noise)

	k=max(clid)+1
	term_cl=init_clusters(k+1,range(-1,k),[[term[i] for i in xrange(n) if clid[i]==j] for j in xrange(-1,k)])

	print 'Random sampling of cluster-tweet and noise'
	term_r=[]
	for cl in xrange(-1,k):
		term_r+=list(np.random.permutation(term_cl[cl])[:rand_n])

	print 'Count terms'
	counter=Counter()
	counter=tf_idf.tf(counter,term_r,type='term')
	common=counter.most_common()[:m]

	print 'Write results'
	f=open('txt\\'+fname+'_common.txt','w')	# plain open() truncates stale content, unlike os.O_RDWR|os.O_CREAT
	f.write('term,count\n')
	for i in xrange(m):
		f.write('%s,%d\n'%(common[i][0],common[i][1]))
	f.close()
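The tf_idf.tf called here takes a Counter plus a list of token lists and returns the updated Counter, a different signature from the other examples on this page. A minimal sketch consistent with that call pattern (an assumption, not the project's actual implementation):

from collections import Counter

def tf(counter, docs, type='term'):
    # fold every token from a list of token lists into the running Counter
    for tokens in docs:
        counter.update(tokens)
    return counter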
Example #3
def calculate_corpus_tf_idf(corpus_reader):
	'''
	Calculate the TF*IDF weight for each document in the given corpus.
	-Input: CorpusReader (either Tagged or PlainText)
	-Return:
	 (1) A dictionary whose keys are document names and whose values are
	 dictionaries mapping terms to TF*IDF weights
	 (2) A dictionary whose keys are terms and whose values are their IDF
	'''
	st_time = time.time()

	# Term Frequency for each document
	tfs_per_document = defaultdict(Counter)
	for document in corpus_reader.fileids():
		terms_in_document = corpus_reader.words(document)
		tfs_per_document[document] = tf_idf.tf(terms_in_document)

	# Inverse Document Frequency
	idfs = tf_idf.idf(tfs_per_document)

	# key is document name, value is a dict mapping each term to its TF*IDF score
	tfidfs_per_document = {}
	for document, tfs in tfs_per_document.iteritems():
		tfidfs_per_document[document] = tf_idf.tf_idf(tfs, idfs, len(tfs_per_document))

	print "time to compute TF-IDF weights for corpus: %.3f sec" % (time.time()-st_time)
	return tfidfs_per_document, idfs
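For example, the reader argument can be an NLTK corpus reader; the directory and file pattern below are illustrative:

from nltk.corpus.reader import PlaintextCorpusReader

# one .txt file per document under an illustrative corpus directory
reader = PlaintextCorpusReader('corpus/business', r'.*\.txt')
tfidfs_per_document, idfs = calculate_corpus_tf_idf(reader)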
Example #4
def getConsultapadrao():
    # Toy corpus of stemmed Portuguese sentences about chess, used to exercise the BM25 model
    M=['O peã e o caval são pec de xadrez. O caval é o melhor do jog.',
       'A jog envolv a torr, o peã e o rei.',
       'O peã lac o boi',
       'Caval de rodei!',
       'Polic o jog no xadrez.']
    stopwords=['a', 'o', 'e', 'é', 'de', 'do', 'no', 'são']
    q='xadrez peã caval torr'    # query: "chess pawn knight rook" (stemmed)
    separadores=[' ',',','.','!','?']
    (ponderada_docs, ponderada_consulta, incidencias, n) = tf_idf.tf(M, stopwords, q, separadores, True)
    # score the documents with BM25 using k1=1 and b=0.75
    return modelo_BM25(ponderada_docs, ponderada_consulta, incidencias, n, 1, 0.75)
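modelo_BM25 itself is not shown; for reference, a minimal sketch of the Okapi BM25 score it presumably computes, with k1=1 and b=0.75 as passed above (names and the exact weighting variant are assumptions):

import math

def bm25_score(query_terms, doc_terms, doc_freq, n_docs, avg_len, k1=1.0, b=0.75):
    # doc_freq[t]: number of documents containing term t
    score = 0.0
    for t in query_terms:
        tf = doc_terms.count(t)
        if tf == 0 or t not in doc_freq:
            continue
        idf = math.log(1 + (n_docs - doc_freq[t] + 0.5) / (doc_freq[t] + 0.5))
        norm = k1 * (1 - b + b * len(doc_terms) / float(avg_len))
        score += idf * tf * (k1 + 1) / (tf + norm)
    return score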
Example #5
def m_hot_words_tfidf(data):
    printv('Calculating idf score.')
    idf = tf_idf.idf([v[ID_M_BOW] for v in data.values()])
    all_result = dict()
    for doc_id, bow in data.items():
        printv('Calculating tf-idf score for ' + doc_id)
        result = list()
        for term in set(bow[ID_M_BOW]):
            tf = tf_idf.tf(term, bow[ID_M_BOW])
            result.append((term, round(tf_idf.tfidf(term, tf, idf), ROUND_DIGITS)))
        all_result[doc_id] = dict()
        # keep only the RESULT_LENGTH highest-scoring terms per document
        for word, score in sorted(result, key=lambda x: x[1], reverse=True)[:RESULT_LENGTH]:
            all_result[doc_id][word] = score
    return all_result
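The per-term helpers used above (tf over one bag of words, idf over all bags, and their product) are not shown; a minimal sketch matching these call signatures, as an assumption about the project's tf_idf module:

import math

def tf(term, bow):
    # relative frequency of one term within a single bag of words
    return bow.count(term) / float(len(bow))

def idf(bows):
    # inverse document frequency for every term across a list of bags
    n = len(bows)
    df = {}
    for bag in bows:
        for term in set(bag):
            df[term] = df.get(term, 0) + 1
    return dict((term, math.log(n / float(count))) for term, count in df.items())

def tfidf(term, tf_value, idf_table):
    return tf_value * idf_table.get(term, 0.0)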
Example #6
def common_emoji_cl(fname,m,names=False):
	# high tf-idf emoji of each cluster
	print 'Extract terms from %s'%fname
	[emoji,emoji_cl]=extract_data(fname)
	N=len(emoji)
	k=len(emoji_cl)
	N_cl=[len(emoji_cl[i]) for i in xrange(1,k+1)]

	print 'Count emoji tf'
	counter=Counter()
	counter=tf_idf.count(emoji,counter,ref=emoji_dict,type='emoji')
	# counter_cl[i] holds the counts for cluster i+1 (emoji_cl is keyed 1..k)
	counter_cl=[tf_idf.tf(counter,emoji_cl[i+1],ref=emoji_dict,type='emoji') for i in xrange(k)]

	print 'Calculate cluster emoji tf-idf'
	tfIdf=[tf_idf.tf_idf(counter,counter_cl[i],N,N_cl[i]) for i in xrange(k)]
	term=tfIdf[0][0]
	tfIdf=[tfIdf[i][1] for i in xrange(k)]

	print 'Write results'
	write_common_emoji_cl(fname,tfIdf,names)
Example #7
def to_wordurl(docs):
    tfdic = tf_idf.tf(docs)
    return tfdic.keys()
Example #8
def common_term_cls(fname,m,cls=False,top_user=100,sep='\t'): 
	# search common/representative terms of clusters with the tf-idf algorithm
	'''the logic is to find terms that are special for the cluster
		but not for any one user posting tweets in the cluster'''

	print 'Extract cluster-tweet terms'
	data=open('txt\\'+fname+'.txt').readlines()[1:]	# skip header line
	n=len(data)
	data=[data[i][:-1].split(sep) for i in xrange(n)]
	clid=[int(data[i][0]) for i in xrange(n)]
	user=[data[i][1] for i in xrange(n)]
	term=[data[i][-1].split(',') for i in xrange(n)]
	
	print 'Process term by clusters and users'
	if not cls:
		cls=range(max(clid)+1)		
	term_cl=init_clusters(len(cls),cls)
	user_cl=init_clusters(len(cls),cls)	
	for cl in cls:
		term_cl[cl]=[term[i] for i in xrange(n) if clid[i]==cl]
		user_cl[cl]=[user[i] for i in xrange(n) if clid[i]==cl]

	print 'Count all-term tf'
	counter=Counter()
	counter=tf_idf.tf(counter,term,'term')
	
	print 'Remove common terms'
	data=open('txt\\'+fname+'_common.txt').readlines()[1:]	# skip header line
	l=len(data)
	data=[data[i][:-1].split(',') for i in xrange(l)]
	common=set([data[i][0] for i in xrange(l)])
	term_remove=[]
	for key in counter.keys():
		l=key.split(' ')
		if l[0] in common or l[-1] in common: #first/last word is common term
			term_remove+=[key]
	for key in term_remove:
		del counter[key]

	tfidf_cl=init_clusters(len(cls),cls)
	for cl in cls:
		print 'Calculate term and user tf-idf -- Cluster%d'%cl
		counter_copy=copy.copy(counter)
		remove_copy=copy.copy(term_remove)
		tfidf_cl[cl]=tf_idf_cl(term_cl[cl],user_cl[cl],counter_copy,top_user,n,remove_copy)

	print 'Remove shared term and self-merging'
	mtx=[tfidf_cl[cl] for cl in cls]
	mtx=tf_idf.screen(mtx,m,type='term')

	print 'Write results'
	f=open('txt\\'+fname+'_tfIdf.txt','w')	# plain open() truncates any stale content
	f.write(','.join(['cluster%d,'%cl for cl in cls])+'\n')
	f.write(','.join(['term,tf-idf' for cl in cls])+'\n')

	k=len(cls)
	mtx=[fill_empty(list(mtx[i]),m,'f') for i in xrange(k)]
	for i in xrange(m):
		f.write(','.join(['%s,%0.3f'%(mtx[j][i][0],mtx[j][i][1]) for j in xrange(k)])+'\n')
	f.close()
Example #9
def tf_idf_test(fname,cl=0,sep='\t'):
	data=open('txt\\'+fname+'.txt').readlines()[1:]	# skip header line
	n=len(data)
	data=[data[i][:-1].split(sep) for i in xrange(n)]
	user=[data[i][1] for i in xrange(n) if int(data[i][0])==cl]
	term=[data[i][-1].split(',') for i in xrange(n)]
	term_cl=[term[i] for i in xrange(n) if int(data[i][0])==cl]

	names=list(set(user))
	m=len(user)
	count=[len([user[i] for i in xrange(m) if user[i]==name]) for name in names]
	idx=np.argsort(count)[::-1]
	#N=int(len(idx)*0.1)
	N=50
	names=[names[i] for i in idx[:N]]
	term_user=init_clusters(len(names),names)	
	for name in names:
		term_user[name]=[term_cl[i] for i in xrange(m) if user[i]==name]

	print 'Count all tf'
	counter=Counter()
	counter=tf_idf.tf(counter,term,'term')

	print 'Count cl tf'
	counter_i=copy.copy(counter)
	counter_i.subtract(counter)	# zero out every count but keep the full key set
	counter_term=tf_idf.tf(counter_i,term_cl,'term')

	# drop terms that never occur in this cluster
	remove=[t for t in counter_term if counter_term[t]==0]
	for t in remove:
		del counter_term[t]

	print 'Count user tf'
	print '#_name',len(names)
	counter_user=init_clusters(len(names),names)
	for name in names:
		print '-- %s'%name
		counter_i=copy.copy(counter_term)
		counter_i.subtract(counter_term)	# zeroed copy with the cluster's key set
		counter_user[name]=tf_idf.tf(counter_i,term_user[name],'term')
	
	print 'Calculate cl tfidf'
	term_tfidf=tf_idf.tf_idf(counter,counter_term,n)

	print 'Calculate user tfidf'
	user_tfidf=init_clusters(len(names),names)
	for name in names:
		user_tfidf[name]=tf_idf.tf_idf(counter_term,counter_user[name],m)
	
	print 'Sort tfidf'
	user_tfidf=[max([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	#user_tfidf=[np.std([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	
	term=term_tfidf.keys()
	n=len(term)
	tfidf=[term_tfidf[term[i]]/(1+user_tfidf[i]) for i in xrange(n)]
	term_tfidf=[term_tfidf[term[i]] for i in xrange(n)]

	f=open('txt\\tfidf_test.txt','w')	# plain open() truncates any stale content
	idx=np.argsort(term_tfidf)[::-1]
	f.write('term_tfidf,'+','.join([term[i] for i in idx[:10]])+'\n')
	f.write(','+','.join(['%0.4f'%term_tfidf[i] for i in idx[:10]])+'\n')

	idx=np.argsort(user_tfidf)[::-1]
	f.write('user_tfidf,'+','.join([term[i] for i in idx[:10]])+'\n')
	f.write(','+','.join(['%0.4f'%user_tfidf[i] for i in idx[:10]])+'\n')

	idx=np.argsort(tfidf)[::-1]
	f.write('tfidf,'+','.join([term[i] for i in idx[:10]])+'\n')
	f.write(','+','.join(['%0.4f'%tfidf[i] for i in idx[:10]])+'\n')

	f.close()
Example #10
def tf_idf_cl(term_cl,user,counter,top_user,N,term_remove):
	# rank a cluster's terms by tf-idf, normalized by the per-user tf-idf spread
	names=list(set(user))
	m=len(user)
	count=[len([user[i] for i in xrange(m) if user[i]==name]) for name in names]
	idx=np.argsort(count)[::-1]
	if top_user>len(idx):
		print '---- cluster has less than %d unique users'%top_user
		top_user=len(idx)
	names=[names[i] for i in idx[:top_user]]
	term_user=init_clusters(len(names),names)
	for name in names:
		term_user[name]=[term_cl[i] for i in xrange(m) if user[i]==name]

	print '-- Count cls-term tf'
	counter_i=copy.copy(counter)
	counter_i.subtract(counter)	# zero out every count but keep the full key set
	counter_term=tf_idf.tf(counter_i,term_cl,'term')

	print '-- Count user-term tf'
	counter_user=init_clusters(len(names),names)
	for name in names:
		counter_i=copy.copy(counter_term)
		counter_i.subtract(counter_term)	# zeroed copy with the cluster's key set
		counter_user[name]=tf_idf.tf(counter_i,term_user[name],'term')

	print '-- Clean counters'
	if '' in counter:
		del counter['']
		counter_term.pop('',None)
		for name in names:
			counter_user[name].pop('',None)

	# collect terms that never occur in this cluster, then purge them everywhere;
	# pop() is used because term_remove may hold keys that are already absent
	for term in counter_term:
		if counter_term[term]==0:
			term_remove+=[term]
	for term in term_remove:
		counter_term.pop(term,None)
		for name in names:
			counter_user[name].pop(term,None)

	print '-- Calculate cls-term tfidf'
	term_tfidf=tf_idf.tf_idf(counter,counter_term,N)

	print '-- Calculate user-term tfidf'
	user_tfidf=init_clusters(len(names),names)
	for name in names:
		user_tfidf[name]=tf_idf.tf_idf(counter_term,counter_user[name],m)

	print '-- Calculate term norm-tfidf'
	# spread of each term's tf-idf across users; a large spread means a few users dominate it
	#user_tfidf=[max([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	user_tfidf=[np.std([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	#user_tfidf=[0 for term in term_tfidf]

	term=term_tfidf.keys()
	n=len(term)
	tfidf=[term_tfidf[term[i]]/(1+user_tfidf[i]) for i in xrange(n)]

	return [(term[i],tfidf[i]) for i in xrange(n)]
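The final normalization divides each term's cluster-level tf-idf by one plus the spread of its per-user tf-idf, so terms driven by a handful of users drop in the ranking. A toy illustration of the effect (all numbers invented):

import numpy as np

cluster_tfidf = 0.8
even_users   = [0.20, 0.22, 0.19, 0.21]  # everyone uses the term similarly
skewed_users = [0.80, 0.01, 0.00, 0.01]  # one user dominates the term

print cluster_tfidf / (1 + np.std(even_users))    # ~0.79: barely penalized
print cluster_tfidf / (1 + np.std(skewed_users))  # ~0.60: pushed down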