forked from randomjohn/project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarity.py
127 lines (117 loc) · 4.51 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Given the file name of a term-document matrix, load it into memory,
# compute a tf-idf matrix, and then compute pairwise document similarity
# as a normalized dot product (cosine similarity).
import math
import os
from numpy import *
import codecs
import clusters
def load_tdf(tdf_name="out/tdf.txt"):
    """Load a term document matrix from a tab-delimited text file.

    The first line holds the column titles; its first cell is the label of
    the row-name column and is dropped.  Every following line holds a row
    name followed by numeric cells.  (The original author notes the file is
    generated by get_counts.py.)

    Blank lines are skipped so that a trailing empty line does not produce a
    ragged row.

    Returns a tuple ``(rownames, colnames, data)`` where ``rownames`` and
    ``colnames`` are lists of strings and ``data`` is a numpy matrix of
    floats with one row per data line.

    Raises IndexError if the file contains no lines at all, and ValueError
    if a data cell cannot be parsed as a float.
    """
    # open() works on Python 2 and 3 (file() is Python 2 only); the with
    # block guarantees the handle is closed.
    with open(tdf_name) as handle:
        lines = handle.readlines()

    # first line is column titles
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        stripped = line.strip()
        if not stripped:
            continue
        fields = stripped.split('\t')
        # the row name is the first cell, the data is the remainder
        rownames.append(fields[0])
        data.append([float(cell) for cell in fields[1:]])
    return rownames, colnames, matrix(data)
def compute_idf(tdf):
    """Compute the inverse document frequency of each word.

    ``tdf`` is the term document matrix with one row per document and one
    column per word (the layout the rest of this file uses: row names are
    the documents).

    For a word that occurs in ``df`` of the ``N`` documents the weight is
    ``1.0 + log(N / df)``, so words found in every document get 1.0 and
    rarer words get larger weights.  A word that occurs in no document gets
    1.0, which avoids dividing by zero.

    Returns a 1 x n_words numpy matrix of weights.
    """
    counts = array(tdf, dtype=float)
    # the documents are the rows, so N is the row count
    total_documents = counts.shape[0]
    # number of documents in which each word appears at least once
    doc_freq = (counts > 0).sum(axis=0)
    weights = []
    for df in doc_freq:
        if df == 0:
            weights.append(1.0)
        else:
            weights.append(1.0 + math.log(total_documents / float(df)))
    return matrix(weights)
def compute_tf_idf_matrix(tdf, idf):
    """Convert a term document matrix into a tf-idf matrix.

    ``tdf`` holds one row per document and one column per word; ``idf``
    holds one weight per word (a 1 x n_words matrix or a flat sequence).
    Each column of ``tdf`` is multiplied by the matching weight.

    The inputs are copied as floats, so the caller's matrix is not modified
    and integer counts are not truncated when multiplied by fractional
    weights.

    Returns a new numpy matrix with the same shape as ``tdf``.
    """
    counts = array(tdf, dtype=float)
    # flatten so the weights broadcast across every row
    weights = array(idf, dtype=float).ravel()
    return matrix(counts * weights)
def normalize_tf_idf(tfidf):
    """Scale each row of ``tfidf`` to norm 1, in place.

    With unit-length rows the cosine similarity of two rows is just their
    dot product.  Rows whose norm is zero are left unchanged instead of
    being divided by zero (which would turn them into NaN).

    Returns the same object that was passed in.
    """
    for row in range(shape(tfidf)[0]):
        length = linalg.norm(tfidf[row])
        if length > 0:
            tfidf[row] = tfidf[row] / length
    return tfidf
def compute_similarity(v1, v2):
    """Compute the cosine similarity between two vectors.

    ``v1`` and ``v2`` may be sequences, arrays or single-row matrices of
    equal length (typically tf-idf weighted rows); they do not need to be
    normalized.

    If either vector has zero norm the cosine is undefined and 1.0 is
    returned, which is the fallback the original code intended.  numpy
    division by zero does not raise ZeroDivisionError, so the case is
    tested explicitly rather than caught.

    Returns a plain Python float.
    """
    a = array(v1, dtype=float).ravel()
    b = array(v2, dtype=float).ravel()
    norm_a = linalg.norm(a)
    norm_b = linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 1.0
    return float(inner(a, b) / norm_a / norm_b)
def compute_similarity_normalized(v1, v2):
    """Compute the cosine similarity of two vectors that already have norm 1.

    For unit-length vectors the cosine similarity is simply the dot
    product, so no division is performed.  If the inputs are not
    unit-length the result is the plain dot product, not a cosine.

    ``v1`` and ``v2`` may be sequences, arrays or single-row matrices of
    equal length.  Returns a plain Python float.
    """
    a = array(v1, dtype=float).ravel()
    b = array(v2, dtype=float).ravel()
    return float(inner(a, b))
def compute_similarity_matrix(rownames, tfidf):
    """Compute a similarity matrix of documents (blogs).

    ``rownames`` supplies the number of documents and ``tfidf`` their
    tf-idf rows, indexed in the same order.  Each pair is scored with
    compute_similarity; the diagonal is set to 1.0 and only the upper
    triangle is computed, then mirrored, because similarity is symmetric.

    A progress line is printed after every 1000 pairs, and a completion
    line at the end.

    Returns a list of lists of floats of size len(rownames) squared.
    """
    n = len(rownames)
    # initialize to 0
    sim_mat = [[0.0] * n for _ in range(n)]
    # floor division keeps the pair count an integer on Python 2 and 3
    total = str(n * (n - 1) // 2)
    done = 0
    for i in range(n):
        sim_mat[i][i] = 1.0  # a blog is completely similar to itself
        for j in range(i + 1, n):
            score = compute_similarity(tfidf[i], tfidf[j])
            sim_mat[i][j] = score
            sim_mat[j][i] = score
            done += 1
            if done % 1000 == 0:
                print(" --> Completed " + str(done) + " of " + total)
    print(" --> Similarity matrix computation completed.")
    return sim_mat
def write_similarity(sim, names, filename="similarity.txt"):
    """Write a similarity matrix to a tab-separated text file.

    The file is created as ``out/<filename>`` under the current working
    directory, encoded as iso-8859-1.  The first line is ``Blog`` followed
    by every name; each following line is a name followed by that row of
    ``sim`` formatted with ``%f``.

    ``sim`` is indexed as ``sim[i][j]`` in the order of ``names``.  The
    ``out`` directory must already exist.  Returns None.
    """
    path = os.path.join(os.getcwd(), 'out', filename)
    # the with block closes (and therefore flushes) the file even on error
    with codecs.open(path, 'w', encoding='iso-8859-1') as out:
        out.write('Blog')
        for blog in names:
            out.write('\t%s' % blog)
        out.write('\n')
        for i, blog in enumerate(names):
            out.write(blog)
            for j in range(len(names)):
                out.write('\t%f' % sim[i][j])
            out.write('\n')
    return
def main():
    """Cluster the blogs in out/tdf.txt and write the assignments to disk.

    Loads the term document matrix, weights it by tf-idf, asks
    clusters.kcluster for 15 clusters and writes one
    ``<cluster index>\\t<row name>`` line per clustered row to
    out/blog_clus.txt.  Progress messages are printed along the way.
    """
    print("Loading term document matrix")
    rownames, colnames, counts = load_tdf("out/tdf.txt")
    print("Computing IDF")
    idf = compute_idf(counts)
    print("Computing TF-IDF matrix")
    tfidf = compute_tf_idf_matrix(counts, idf)
    n_clus = 15
    print("Computing " + str(n_clus) + " clusters")
    # NOTE(review): compute_similarity_normalized returns a similarity
    # (larger means closer) and the rows of tfidf are not normalized before
    # this call -- confirm that is what clusters.kcluster expects for its
    # ``distance`` argument.
    blog_clus = clusters.kcluster(tfidf, distance=compute_similarity_normalized, k=n_clus)
    # the with block closes the output file even if a write fails
    with open("out/blog_clus.txt", 'w') as f:
        for cluster_index, members in enumerate(blog_clus):
            for row_index in members:
                f.write(str(cluster_index) + "\t" + rownames[row_index] + "\n")
    # Optional full pairwise similarity output, currently disabled:
    #print "Computing similarity matrix"
    #sim=compute_similarity_matrix(rownames,tfidf)
    #print "Writing similarity matrix"
    #write_similarity(sim,rownames,filename="similarity.txt")
# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()